diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 237e9242b3..c33228b186 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -55,8 +55,16 @@ else() endif() # Setup VERSION -set(VERSION_STRING "2.6.0") -rocm_setup_version(VERSION ${VERSION_STRING}) +set(VERSION_STRING "2.6.0.") + +# Check if BUILD_NUMBER is defined in a Jenkins environment +if($ENV{BUILD_NUMBER}) + string(CONCAT BUILD_VERSION ${VERSION_STRING} $ENV{BUILD_NUMBER}) +else() + string(CONCAT BUILD_VERSION ${VERSION_STRING} "0") +endif() + +rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION) list(APPEND CMAKE_PREFIX_PATH /opt/rocm @@ -79,27 +87,12 @@ include_directories(src/collectives) include_directories(src/collectives/device) set(CU_SOURCES - src/bootstrap.cu - src/collectives/all_gather.cu - src/collectives/all_reduce.cu - src/collectives/broadcast.cu - src/collectives/reduce.cu - src/collectives/reduce_scatter.cu - src/collectives/device/functions.cu - src/init.cu - src/misc/enqueue.cu - src/misc/group.cu - src/misc/ibvwrap.cu - src/misc/nvmlwrap_stub.cu - src/misc/rings.cu - src/misc/utils.cu - src/ring.cu - src/transport.cu - src/transport/net.cu - src/transport/net_ib.cu - src/transport/net_socket.cu - src/transport/p2p.cu - src/transport/shm.cu) + src/collectives/device/all_reduce.cu + src/collectives/device/all_gather.cu + src/collectives/device/reduce.cu + src/collectives/device/broadcast.cu + src/collectives/device/reduce_scatter.cu + src/collectives/device/functions.cu) set(CPP_SOURCES) foreach(filename ${CU_SOURCES}) @@ -111,20 +104,34 @@ foreach(filename ${CU_SOURCES}) list(APPEND CPP_SOURCES ${cpp_filename}) endforeach(filename) -list(APPEND CPP_SOURCES src/collectives/device/all_gather_0.cpp) -list(APPEND CPP_SOURCES src/collectives/device/all_reduce_0.cpp) -list(APPEND CPP_SOURCES src/collectives/device/all_reduce_1.cpp) -list(APPEND CPP_SOURCES src/collectives/device/all_reduce_2.cpp) -list(APPEND CPP_SOURCES 
src/collectives/device/all_reduce_3.cpp) -list(APPEND CPP_SOURCES src/collectives/device/broadcast_0.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_0.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_1.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_2.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_3.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_0.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_1.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_2.cpp) -list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_3.cpp) +set(CC_SOURCES + src/init.cc + src/collectives/all_reduce.cc + src/collectives/all_gather.cc + src/collectives/reduce.cc + src/collectives/broadcast.cc + src/collectives/reduce_scatter.cc + src/channel.cc + src/misc/trees.cc + src/misc/rings.cc + src/misc/argcheck.cc + src/misc/group.cc + src/misc/utils.cc + src/misc/ibvwrap.cc + src/misc/nvmlwrap_stub.cc + src/misc/topo.cc + src/transport/net.cc + src/transport/net_ib.cc + src/transport/net_socket.cc + src/transport/p2p.cc + src/transport/shm.cc + src/transport.cc + src/bootstrap.cc + src/enqueue.cc) + +foreach(filename ${CC_SOURCES}) + list(APPEND CPP_SOURCES ${filename}) +endforeach(filename) add_library(rccl ${CPP_SOURCES}) @@ -132,18 +139,20 @@ if(TRACE) add_definitions(-DENABLE_TRACE) endif() +if(PROFILE) + add_definitions(-DENABLE_PROFILING) +endif() + target_link_libraries(rccl PRIVATE --amdgpu-target=gfx803 PRIVATE --amdgpu-target=gfx900 - PRIVATE --amdgpu-target=gfx906 - PRIVATE --amdgpu-target=gfx908) + PRIVATE --amdgpu-target=gfx906) if("${HIP_COMPILER}" MATCHES "clang") target_compile_options(rccl PRIVATE --amdgpu-target=gfx803 PRIVATE --amdgpu-target=gfx900 PRIVATE --amdgpu-target=gfx906 - PRIVATE --amdgpu-target=gfx908 PRIVATE -fgpu-rdc) target_link_libraries(rccl PRIVATE -fgpu-rdc) target_include_directories(rccl PRIVATE /opt/rocm/hsa/include) diff --git 
a/projects/rccl/Jenkinsfile b/projects/rccl/Jenkinsfile index c53a81da98..d07a0b8140 100644 --- a/projects/rccl/Jenkinsfile +++ b/projects/rccl/Jenkinsfile @@ -80,7 +80,7 @@ rcclCI: sudo dpkg -i package/*.deb """ - + //platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""") } diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt index 6b9c6a3138..60db84a684 100644 --- a/projects/rccl/LICENSE.txt +++ b/projects/rccl/LICENSE.txt @@ -1,5 +1,5 @@ - Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/projects/rccl/Makefile b/projects/rccl/Makefile index 605e3bfaad..caed3d42ac 100644 --- a/projects/rccl/Makefile +++ b/projects/rccl/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/docs/Doxyfile b/projects/rccl/docs/Doxyfile index 3d28cf5388..42dae7cc30 100644 --- a/projects/rccl/docs/Doxyfile +++ b/projects/rccl/docs/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. 
This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. 
The .bib @@ -706,7 +706,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -858,7 +858,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -885,13 +885,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). 
-EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -928,7 +928,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -937,7 +937,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. 
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1144,7 +1144,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1156,7 +1156,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1169,7 +1169,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. 
Doxygen # will adjust the colors in the style sheet and background images according to @@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1316,7 +1316,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1329,7 +1329,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1360,7 +1360,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. 
Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1540,7 +1540,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1548,7 +1548,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1608,7 +1608,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1624,7 +1624,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. 
-EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1634,7 +1634,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1698,7 +1698,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1714,7 +1714,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1725,7 +1725,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1736,7 +1736,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1744,7 +1744,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. 
# This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1844,14 +1844,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1896,7 +1896,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1915,7 +1915,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2009,7 +2009,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2050,7 +2050,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. 
-INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2058,7 +2058,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2068,7 +2068,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2077,7 +2077,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2106,13 +2106,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2161,14 +2161,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. 
If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2217,7 +2217,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2361,26 +2361,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. 
If left blank, it is assumed @@ -2388,12 +2388,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/projects/rccl/docs/source/allapi.rst b/projects/rccl/docs/source/allapi.rst index 364cb40d6b..cc54d2419c 100644 --- a/projects/rccl/docs/source/allapi.rst +++ b/projects/rccl/docs/source/allapi.rst @@ -1,5 +1,5 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ======= @@ -8,4 +8,4 @@ All API .. doxygenindex:: - + diff --git a/projects/rccl/docs/source/api.rst b/projects/rccl/docs/source/api.rst index b0b44bb5b9..8e316acda1 100644 --- a/projects/rccl/docs/source/api.rst +++ b/projects/rccl/docs/source/api.rst @@ -1,5 +1,5 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: === diff --git a/projects/rccl/docs/source/index.rst b/projects/rccl/docs/source/index.rst index 3d9b62aa4a..04943ca350 100644 --- a/projects/rccl/docs/source/index.rst +++ b/projects/rccl/docs/source/index.rst @@ -7,10 +7,10 @@ Welcome to RCCL's documentation! ================================== .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: - library + library api allapi diff --git a/projects/rccl/docs/source/library.rst b/projects/rccl/docs/source/library.rst index cbb0b95048..a7fae1dafc 100644 --- a/projects/rccl/docs/source/library.rst +++ b/projects/rccl/docs/source/library.rst @@ -1,6 +1,6 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ====== @@ -10,4 +10,4 @@ RCCL Introduction ------------ -The RCCL is an AMD port of NCCL. +The RCCL is an AMD port of NCCL. 
diff --git a/projects/rccl/ext-net/dummy/Makefile b/projects/rccl/ext-net/dummy/Makefile index d1eb4c5a62..efa841c53c 100644 --- a/projects/rccl/ext-net/dummy/Makefile +++ b/projects/rccl/ext-net/dummy/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/ext-net/dummy/plugin.c b/projects/rccl/ext-net/dummy/plugin.c index f11b36590d..67d7d88411 100644 --- a/projects/rccl/ext-net/dummy/plugin.c +++ b/projects/rccl/ext-net/dummy/plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/hipify.sh b/projects/rccl/hipify.sh deleted file mode 100755 index e389fb519e..0000000000 --- a/projects/rccl/hipify.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- -FILES=" -./src/nccl.h.in -./src/bootstrap.cu -./src/collectives/all_gather.cu -./src/collectives/all_reduce.cu -./src/collectives/broadcast.cu -./src/collectives/collectives.h -./src/collectives/device/all_gather.cu -./src/collectives/device/all_gather.h -./src/collectives/device/all_reduce.cu -./src/collectives/device/all_reduce.h -./src/collectives/device/broadcast.cu -./src/collectives/device/broadcast.h -./src/collectives/device/common.h -./src/collectives/device/common_kernel.h -./src/collectives/device/functions.cu -./src/collectives/device/ll_kernel.h -./src/collectives/device/primitives.h -./src/collectives/device/reduce.cu -./src/collectives/device/reduce.h -./src/collectives/device/reduce_kernel.h -./src/collectives/device/reduce_scatter.cu -./src/collectives/device/reduce_scatter.h -./src/collectives/reduce.cu -./src/collectives/reduce_scatter.cu -./src/include/bootstrap.h -./src/include/common_coll.h -./src/include/core.h -./src/include/debug.h -./src/include/enqueue.h -./src/include/group.h -./src/include/ibvwrap.h -./src/include/nccl_net.h -./src/include/net.h -./src/include/nvlink.h -./src/include/nvmlwrap.h -./src/include/param.h -./src/include/ring.h -./src/include/rings.h -./src/include/shm.h -./src/include/socket.h -./src/include/topo.h -./src/include/transport.h -./src/include/utils.h -./src/init.cu -./src/misc/enqueue.cu -./src/misc/group.cu -./src/misc/ibvwrap.cu -./src/misc/nvmlwrap.cu -./src/misc/rings.cu -./src/misc/utils.cu -./src/ring.cu -./src/transport.cu -./src/transport/net.cu -./src/transport/net_ib.cu -./src/transport/net_socket.cu -./src/transport/p2p.cu -./src/transport/shm.cu -" - -for f in $FILES -do - sed -i \ - -e 's@cuda_runtime.h@hip/hip_runtime_api.h@g' \ - -e 's@cuda_fp16.h@hip/hip_fp16.h@g' \ - -e 's/cudaDeviceCanAccessPeer/hipDeviceCanAccessPeer/g' \ - -e 's/cudaDeviceEnablePeerAccess/hipDeviceEnablePeerAccess/g' \ - -e 's/cudaDeviceGetPCIBusId/hipDeviceGetPCIBusId/g' \ - -e 
's/cudaErrorPeerAccessAlreadyEnabled/hipErrorPeerAccessAlreadyEnabled/g' \ - -e 's/cudaError_t/hipError_t/g' \ - -e 's/cudaEventCreateWithFlags/hipEventCreateWithFlags/g' \ - -e 's/cudaEventDestroy/hipEventDestroy/g' \ - -e 's/cudaEventDisableTiming/hipEventDisableTiming/g' \ - -e 's/cudaEventRecord/hipEventRecord/g' \ - -e 's/cudaEvent_t/hipEvent_t/g' \ - -e 's/cudaFree/hipFree/g' \ - -e 's/cudaFreeHost/hipHostFree/g' \ - -e 's/cudaGetDevice/hipGetDevice/g' \ - -e 's/cudaGetErrorString/hipGetErrorString/g' \ - -e 's/cudaGetLastError/hipGetLastError/g' \ - -e 's/cudaHostAlloc/hipHostMalloc/g' \ - -e 's/cudaHostAllocMapped/hipHostMallocMapped/g' \ - -e 's/cudaHostGetDevicePointer/hipHostGetDevicePointer/g' \ - -e 's/cudaHostRegister/hipHostRegister/g' \ - -e 's/cudaHostRegisterMapped/hipHostRegisterMapped/g' \ - -e 's/cudaHostUnregister/hipHostUnregister/g' \ - -e 's/cudaIpcCloseMemHandle/hipIpcCloseMemHandle/g' \ - -e 's/cudaIpcGetMemHandle/hipIpcGetMemHandle/g' \ - -e 's/cudaIpcMemHandle_t/hipIpcMemHandle_t/g' \ - -e 's/cudaIpcMemLazyEnablePeerAccess/hipIpcMemLazyEnablePeerAccess/g' \ - -e 's/cudaIpcOpenMemHandle/hipIpcOpenMemHandle/g' \ - -e 's/cudaMalloc/hipMalloc/g' \ - -e 's/cudaMemcpy/hipMemcpy/g' \ - -e 's/cudaMemcpyAsync/hipMemcpyAsync/g' \ - -e 's/cudaMemcpyDefault/hipMemcpyDefault/g' \ - -e 's/cudaMemcpyDeviceToDevice/hipMemcpyDeviceToDevice/g' \ - -e 's/cudaMemoryTypeDevice/hipMemoryTypeDevice/g' \ - -e 's/cudaMemset/hipMemset/g' \ - -e 's/cudaPointerAttributes/hipPointerAttribute_t/g' \ - -e 's/cudaPointerGetAttributes/hipPointerGetAttributes/g' \ - -e 's/cudaSetDevice/hipSetDevice/g' \ - -e 's/cudaStreamCreateWithFlags/hipStreamCreateWithFlags/g' \ - -e 's/cudaStreamDestroy/hipStreamDestroy/g' \ - -e 's/cudaStreamNonBlocking/hipStreamNonBlocking/g' \ - -e 's/cudaStreamWaitEvent/hipStreamWaitEvent/g' \ - -e 's/cudaStream_t/hipStream_t/g' \ - -e 's/cudaSuccess/hipSuccess/g' \ - $f -done diff --git a/projects/rccl/makefiles/common.mk 
b/projects/rccl/makefiles/common.mk index 83a2a3951a..2ad5c73200 100644 --- a/projects/rccl/makefiles/common.mk +++ b/projects/rccl/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -16,7 +16,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include -CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) @@ -36,15 +36,16 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 # Include Volta support if we're using CUDA9 or above -ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) else NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) endif #$(info NVCC_GENCODE is ${NVCC_GENCODE}) -CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden -CXXFLAGS += -Wall -Wno-sign-compare +CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden +CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla +CXXFLAGS += -I $(CUDA_INC) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt @@ -68,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3 endif ifneq ($(VERBOSE), 
0) -NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra +NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra else .SILENT: diff --git a/projects/rccl/makefiles/formatting.mk b/projects/rccl/makefiles/formatting.mk index 4a4ab885cf..a543131d59 100644 --- a/projects/rccl/makefiles/formatting.mk +++ b/projects/rccl/makefiles/formatting.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index f9cee6a5a8..bab58ec0bf 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 3 -NCCL_PATCH := 7 +NCCL_MINOR := 4 +NCCL_PATCH := 8 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/pkg/Makefile b/projects/rccl/pkg/Makefile index 04b23da70e..ab6487be9b 100644 --- a/projects/rccl/pkg/Makefile +++ b/projects/rccl/pkg/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/debian/Makefile b/projects/rccl/pkg/debian/Makefile index 439635f948..7884cf2545 100644 --- a/projects/rccl/pkg/debian/Makefile +++ b/projects/rccl/pkg/debian/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/redhat/Makefile b/projects/rccl/pkg/redhat/Makefile index ffcc973bcd..0808478624 100644 --- a/projects/rccl/pkg/redhat/Makefile +++ b/projects/rccl/pkg/redhat/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/redhat/nccl.spec.in b/projects/rccl/pkg/redhat/nccl.spec.in index 65a2c60154..f9d83a30df 100644 --- a/projects/rccl/pkg/redhat/nccl.spec.in +++ b/projects/rccl/pkg/redhat/nccl.spec.in @@ -1,6 +1,6 @@ Name: libnccl -Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} -Release: ${pkg:Revision} +Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} +Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} Summary: NVIDIA Collectives Communication Library (NCCL) Runtime Group: Development/Libraries diff --git a/projects/rccl/pkg/srctxz/Makefile b/projects/rccl/pkg/srctxz/Makefile index 1cb7c06a99..01cab95a43 100644 --- a/projects/rccl/pkg/srctxz/Makefile +++ b/projects/rccl/pkg/srctxz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ + -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ $< > $@ diff --git a/projects/rccl/pkg/srctxz/create_srctxz.sh.in b/projects/rccl/pkg/srctxz/create_srctxz.sh.in index 0b8e6d2b4c..11bdd52db7 100644 --- a/projects/rccl/pkg/srctxz/create_srctxz.sh.in +++ b/projects/rccl/pkg/srctxz/create_srctxz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major} NCCL_MINOR=${nccl:Minor} NCCL_PATCH=${nccl:Patch} NCCL_SUFFIX=${nccl:Suffix} +NCCL_BUILD=${pkg:Revision} -NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}" +NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" tar --exclude build \ --exclude ".git*" \ diff --git a/projects/rccl/pkg/txz/Makefile b/projects/rccl/pkg/txz/Makefile index fa587ef186..b7d9aa53c8 100644 --- a/projects/rccl/pkg/txz/Makefile +++ b/projects/rccl/pkg/txz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/txz/create_txz.sh.in b/projects/rccl/pkg/txz/create_txz.sh.in index 73922e0929..deae854830 100644 --- a/projects/rccl/pkg/txz/create_txz.sh.in +++ b/projects/rccl/pkg/txz/create_txz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index 481000ad16..452adf52ae 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -9,41 +9,48 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \ - misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \ - transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ - collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \ + misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \ + transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \ + collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a +##### pkgconfig files +PKGCONFIGFILE := nccl.pc ##### dirs BUILDDIR ?= $(abspath ../build) INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj +PKGDIR := $(BUILDDIR)/lib/pkgconfig ##### target files +CUDARTLIB ?= cudart_static INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) -LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o) +PKGTARGET := $(PKGCONFIGFILE) +LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) -LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt +LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a - ##### rules build : lib staticlib -lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) +lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) -devicelib: 
$(INCDIR)/nccl.h +$(DEVICELIB): ALWAYS_REBUILD $(MAKE) -C collectives/device +# Empty target to force rebuild +ALWAYS_REBUILD: + -include $(DEPFILES) $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) @@ -51,7 +58,7 @@ $(INCDIR)/nccl.h : nccl.h.in # NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) @$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) mkdir -p $(INCDIR) - printf "Generating %-35s > %s\n" $< $@ + @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ @@ -59,14 +66,14 @@ $(INCDIR)/nccl.h : nccl.h.in -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ $< > $@ -$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib +$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) -$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib +$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) $(eval TMP := $(shell mktemp -d)) @@ -75,6 +82,15 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib ar cr $@ $(LIBOBJ) $(TMP)/*.o rm -Rf $(TMP) +$(PKGDIR)/nccl.pc : nccl.pc.in + mkdir -p $(PKGDIR) + @printf "Generating %-35s > %s\n" $< $@ + sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ + -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ + -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ + -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ + $< > $@ + $(INCDIR)/%.h : %.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) @@ -85,27 +101,34 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h mkdir -p $(INCDIR) cp -f $< $@ -$(OBJDIR)/%.o : %.cu +$(PKGDIR)/%.pc : %.pc + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(PKGDIR) + cp -f $< $@ + 
+$(OBJDIR)/%.o : %.cc @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@ - @$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) @rm -f $(@:%.o=%.d.tmp) clean : - rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR} + rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} $(MAKE) -C collectives/device clean install : lib mkdir -p $(PREFIX)/lib + mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include - cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/ + cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ + cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ -FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') +FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc new file mode 100644 index 0000000000..d7c2ac6760 --- /dev/null +++ b/projects/rccl/src/bootstrap.cc @@ -0,0 +1,467 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "core.h" +#include "utils.h" +#include "bootstrap.h" +#include "net.h" +#include "socket.h" +#include +#include + +// Always use sockets for bootstrap +struct bootstrapNetHandle { + union socketAddress connectAddr; +}; + +struct bootstrapNetComm { + int fd; +}; + +/* Init functions */ +static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; +static union socketAddress bootstrapNetIfAddrs[MAX_IFS]; +static int bootstrapNetIfs = -1; +pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t bootstrapNetInit() { + if (bootstrapNetIfs == -1) { + pthread_mutex_lock(&bootstrapNetLock); + if (bootstrapNetIfs == -1) { + bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + if (bootstrapNetIfs <= 0) { + WARN("Bootstrap : no socket interface found"); + return ncclInternalError; + } else { + char line[1024]; + char addrline[1024]; + line[0] = '\0'; + for (int i=0; ifd = -1; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) { + if (dev >= bootstrapNetIfs) return ncclInternalError; + memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr)); + return ncclSuccess; +} + +/* Socket Interface Selection type */ +enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; + +static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) { + struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; + static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large"); + // if dev >= 0, listen based on dev + if (dev >= 0) { + NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr))); + } else if (dev == findSubnetIf) { + // handle stores a remote address + // need to find a local addr that is in the same 
network as the remote addr + union socketAddress localAddr; + char ifName[MAX_IF_NAME_SIZE]; + if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + WARN("NET/Socket : No usable listening interface found"); + return ncclSystemError; + } + // pass the local address back + memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr)); + } // Otherwise, handle stores a local address + struct bootstrapNetComm* comm; + NCCLCHECK(bootstrapNetNewComm(&comm)); + NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + *listenComm = comm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) { + struct bootstrapNetComm* comm; + NCCLCHECK(bootstrapNetNewComm(&comm)); + struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; + NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); + *sendComm = comm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { + struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm; + struct bootstrapNetComm* rComm; + NCCLCHECK(bootstrapNetNewComm(&rComm)); + struct sockaddr_in sockaddr; + socklen_t socklen = sizeof(struct sockaddr_in); + SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); + *recvComm = rComm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetClose(void* opaqueComm) { + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm; + if (comm) { + close(comm->fd); + free(comm); + } + return ncclSuccess; +} + +static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return 
ncclSuccess; } + +// Additional sync functions +static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) { + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm; + NCCLCHECK(socketSend(comm->fd, &size, sizeof(int))); + NCCLCHECK(socketSend(comm->fd, data, size)); + return ncclSuccess; +} +static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) { + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm; + int recvSize; + NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int))); + if (recvSize > size) { + WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size); + return ncclInternalError; + } + NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size))); + return ncclSuccess; +} + +ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) { + struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; + NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str)); + return ncclSuccess; +} + +struct extId { + ncclNetHandle_t extHandleRoot; + void* extListenComm; + uint64_t hostHash; + pid_t pid; + int fd; + pthread_t boostrapThread; +}; + +struct extInfo { + int rank; + int nranks; + ncclNetHandle_t extHandleListenRoot; + ncclNetHandle_t extHandleListen; +}; + +#include + +static ncclResult_t setFilesLimit() { + struct rlimit filesLimit; + SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); + filesLimit.rlim_cur = filesLimit.rlim_max; + SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); + return ncclSuccess; +} + +static void *bootstrapRoot(void* commId) { + struct extInfo info; + struct extId* id = (struct extId*)commId; + ncclNetHandle_t *rankHandles = NULL; + ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange + ncclNetHandle_t zero = { 0 }; // for sanity checking + void* tmpComm; + ncclResult_t res; + setFilesLimit(); + + TRACE(NCCL_INIT, "BEGIN"); + /* Receive addresses from 
all ranks */ + int nranks = 0, c = 0; + do { + NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out); + NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out); + NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out); + + if (c == 0) { + nranks = info.nranks; + NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out); + NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out); + } + + if (nranks != info.nranks) { + WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks); + goto out; + } + + if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) { + WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); + goto out; + } + + // Save the connection handle for that rank + memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t)); + memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t)); + + ++c; + } while (c < nranks); + TRACE(NCCL_INIT, "COLLECTED HANDLES"); + + // Send the connect handle for the next rank in the AllGather ring + for (int r=0; rextListenComm); + free(commId); + if (rankHandles) free(rankHandles); + if (rankHandlesRoot) free(rankHandlesRoot); + + TRACE(NCCL_INIT, "DONE"); + return NULL; +} + +ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) { + struct extId* id = (struct extId*)commId; + id->hostHash = getHostHash(); + NCCLCHECK(bootstrapNetListen(idFromEnv ? 
dontCareIf : 0, &id->extHandleRoot, &id->extListenComm)); + ncclUniqueId* threadIdCopy; + NCCLCHECK(ncclCalloc(&threadIdCopy, 1)); + memcpy(threadIdCopy, id, sizeof(ncclUniqueId)); + pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy); + return ncclSuccess; +} + +ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) { + static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); + extId* id = (extId*)out; + + char* env = getenv("NCCL_COMM_ID"); + if (env) { + if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) { + WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); + return ncclInvalidArgument; + } + id->pid = -1; + } else { + id->pid = getpid(); + NCCLCHECK(bootstrapCreateRoot(out, false)); + } + + return ncclSuccess; +} + +struct unexConn { + int peer; + void* comm; + struct unexConn* next; +}; + +struct extState { + void* extBstrapListenComm; + void* extBstrapRingRecvComm; + void* extBstrapRingSendComm; + ncclNetHandle_t* peerBstrapHandles; + struct unexConn* unexpectedConnections; + int rank; + int nranks; + int dev; +}; + +ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) { + struct extId* id = (struct extId*)commId; + bool idFromEnv = id->pid < 0; + struct extState* state; + NCCLCHECK(ncclCalloc(&state, 1)); + state->rank = rank; + state->nranks = nranks; + *commState = state; + + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + + struct extInfo info = { 0 }; + info.rank = rank; + info.nranks = nranks; + void *tmpSendComm, *tmpRecvComm; + // Pass the remote address to listen via info + if (idFromEnv) { + memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t)); + memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t)); + } + // listen will return the local address via info (specify interface type 'findSubnetIf') + state->dev = idFromEnv ? 
findSubnetIf : 0; + void* extBstrapListenCommRoot; + NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm)); + NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot)); + + // stagger connection times to avoid an overload of the root at very high rank counts + if (nranks > 128) { + long msec = rank; + struct timespec tv; + tv.tv_sec = msec / 1000; + tv.tv_nsec = 1000000 * (msec % 1000); + TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); + (void) nanosleep(&tv, NULL); + } + + // send info on my listening socket to root + NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm)); + NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info))); + NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); + + // get info on my "next" rank in the bootstrap ring from root + ncclNetHandle_t extHandleNext; + NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm)); + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext))); + NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); + NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot)); + + NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm)); + // Accept the connect request from the previous rank in the AllGather ring + NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm)); + + // AllGather all listen handlers + NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks)); + memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t)); + NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t))); + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + + return ncclSuccess; +} + +ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { + struct extState* state = (struct extState*)commState; + char* data = (char*)allData; + int rank 
= state->rank; + int nranks = state->nranks; + + TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size); + + /* Simple ring based AllGather + * At each step i receive data from (rank-i-1) from left + * and send previous step's data from (rank-i) to right + */ + for (int i=0; iextBstrapRingSendComm, data+sslice*size, size)); + // Recv slice from the left + NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size)); + } + + TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); + return ncclSuccess; +} + +ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) { + struct extState* state = (struct extState*)commState; + void* tmpSendComm; + NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm)); + NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int))); + NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size)); + NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); + return ncclSuccess; +} + +ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) { + // New unex + struct unexConn* unex; + NCCLCHECK(ncclCalloc(&unex, 1)); + unex->peer = peer; + unex->comm = comm; + + // Enqueue + struct unexConn* list = state->unexpectedConnections; + if (list == NULL) { + state->unexpectedConnections = unex; + return ncclSuccess; + } + while (list->next) list = list->next; + list->next = unex; + return ncclSuccess; +} + +void* unexpectedDequeue(struct extState* state, int peer) { + struct unexConn* elem = state->unexpectedConnections; + struct unexConn* prev = NULL; + while (elem) { + if (elem->peer == peer) { + if (prev == NULL) { + state->unexpectedConnections = elem->next; + } else { + prev->next = elem->next; + } + void* comm = elem->comm; + free(elem); + return comm; + } + prev = elem; + elem = elem->next; + } + return NULL; +} + +// We can't know who we'll receive from, so we need to receive everything at once +ncclResult_t bootstrapRecv(void* 
commState, int peer, void* data, int size) { + struct extState* state = (struct extState*)commState; + + void* tmpRecvComm; + + // Search unexpected connections first + if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) { + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size)); + NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); + return ncclSuccess; + } + + // Then look for new connections + while (1) { + NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm)); + int newPeer; + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int))); + if (newPeer == peer) { + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size)); + NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); + return ncclSuccess; + } + // Unexpected connection. Save for later. + NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm)); + } +} + +ncclResult_t bootstrapClose(void* commState) { + struct extState* state = (struct extState*)commState; + if (state->unexpectedConnections != NULL) { + WARN("Unexpected connections are not empty.\n"); + return ncclInternalError; + } + NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm)); + NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm)); + NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm)); + + free(state->peerBstrapHandles); + free(state); + + return ncclSuccess; +} diff --git a/projects/rccl/src/bootstrap.cu b/projects/rccl/src/bootstrap.cu deleted file mode 100644 index 13c6e922b1..0000000000 --- a/projects/rccl/src/bootstrap.cu +++ /dev/null @@ -1,249 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "nccl.h" -#include "core.h" -#include "utils.h" -#include "bootstrap.h" -#include "net.h" -#include -#include - -// Always use sockets for bootstrap -ncclNet_t* ncclBootstrapNet = &ncclNetSocket; - -static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; } - -// Additional sync functions based on async + test for bootstrap, using host ptrs. 
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) { - void* request; - NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request)); - int done = 0; - while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL)); - return ncclSuccess; -} -static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) { - void* request; - NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request)); - int done = 0; - while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL)); - return ncclSuccess; -} - -struct extId { - ncclNetHandle_t extHandleRoot; - void* extListenComm; - uint64_t hostHash; - pid_t pid; - int fd; - pthread_t boostrapThread; -}; - -struct extInfo { - int rank; - int nranks; - ncclNetHandle_t extHandleListenFromRoot; - ncclNetHandle_t extHandleRing; -}; - -#include - -static ncclResult_t setFilesLimit() { - struct rlimit filesLimit; - SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit"); - filesLimit.rlim_cur = filesLimit.rlim_max; - SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit"); - return ncclSuccess; -} - -static void *bootstrapRoot(void* commId) { - struct extInfo info; - struct extId* id = (struct extId*)commId; - ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange - ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation - ncclNetHandle_t zero = { 0 }; // for sanity checking - void* tmpComm; - ncclResult_t res; - setFilesLimit(); - - /* Receive addresses from all ranks */ - int nranks = 0, c = 0; - do { - NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out); - NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out); - NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out); - - if (c == 0) { - extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t)); - extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t)); - if (extHandleBstrap == NULL 
|| extHandleRing == NULL) { - WARN("Bootstrap thread : failed to allocate memory"); - goto out; - } - nranks = info.nranks; - } - - if (nranks != info.nranks) { - WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks); - goto out; - } - - if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) { - WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); - goto out; - } - - // Save the connection handle for connecting back to the ranks - memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t)); - // Save the connection handle for the AllGather ring - memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t)); - - ++c; - } while (c < nranks); - - // Send the connect handle for the next rank in the AllGather ring - for (int r=0; rextListenComm); - free(commId); - free(extHandleBstrap); - free(extHandleRing); - return NULL; -} - -ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) { - struct extId* id = (struct extId*)commId; - id->hostHash = getHostHash(); - NCCLCHECK(bootstrapListen(idFromEnv ? 
dontCareIf : 0, &id->extHandleRoot, &id->extListenComm)); - ncclUniqueId* threadIdCopy; - NCCLCHECK(ncclCalloc(&threadIdCopy, 1)); - memcpy(threadIdCopy, id, sizeof(ncclUniqueId)); - pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy); - return ncclSuccess; -} - -ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) { - static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); - extId* id = (extId*)out; - - char* env = getenv("NCCL_COMM_ID"); - if (env) { - if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) { - WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); - return ncclInvalidArgument; - } - id->pid = -1; - } else { - id->pid = getpid(); - NCCLCHECK(bootstrapCreateRoot(out, false)); - } - - return ncclSuccess; -} - -struct extState { - void* extBstrapRingRecvComm; - void* extBstrapRingSendComm; - ncclNetHandle_t extBstrapRootHandle; - int rank; - int nranks; - int dev; -}; - -ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) { - struct extId* id = (struct extId*)commId; - bool idFromEnv = id->pid < 0; - struct extState* state; - NCCLCHECK(ncclCalloc(&state, 1)); - state->rank = rank; - state->nranks = nranks; - *commState = state; - void* extBstrapRootListenComm; // comm on which we accept root's connections - - struct extInfo info = { 0 }; - info.rank = rank; - info.nranks = nranks; - void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm; - // Pass the remote address to listen via info - if (idFromEnv) { - memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - } - // listen will return the local address via info (specify interface type 'findSubnetIf') - state->dev = idFromEnv ? 
findSubnetIf : 0; - NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm)); - NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring - - memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - // send info on my listening sockets to root - NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm)); - NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info))); - NCCLCHECK(bootstrapCloseSend(tmpSendComm)); - - // get info on my "next" rank in the bootstrap ring from root - ncclNetHandle_t extHandleNext; - NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm)); - NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext))); - NCCLCHECK(bootstrapCloseRecv(tmpRecvComm)); - - NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm)); - // Accept the connect request from the previous rank in the AllGather ring - NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm)); - NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm)); - NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm)); - - return ncclSuccess; -} - -ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { - struct extState* state = (struct extState*)commState; - char* data = (char*)allData; - int rank = state->rank; - int nranks = state->nranks; - - TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size); - - /* Simple ring based AllGather - * At each step i receive data from (rank-i-1) from left - * and send previous step's data from (rank-i) to right - */ - for (int i=0; iextBstrapRingSendComm, data+sslice*size, size)); - // Recv slice from the left - NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size)); - } - - TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); - return ncclSuccess; -} - -ncclResult_t 
bootstrapClose(void* commState) { - struct extState* state = (struct extState*)commState; - - NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm)); - NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm)); - - free(state); - - return ncclSuccess; -} diff --git a/projects/rccl/src/channel.cc b/projects/rccl/src/channel.cc new file mode 100644 index 0000000000..5a5903d3c8 --- /dev/null +++ b/projects/rccl/src/channel.cc @@ -0,0 +1,57 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "channel.h" +#include "param.h" + +NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES); + +ncclResult_t initChannel(struct ncclComm* comm, int channelid) { + struct ncclChannel* channel = comm->channels+channelid; + channel->id = channelid; + + // Setup intermediate buffering + channel->buffSize = ncclParamBuffsize(); + + // Ring index to user rank table. + NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); + NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); + + // Communication structures with peers. + NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks)); + NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks)); + for (size_t i=0; inRanks; ++i) { + channel->peers[i].send.comm = comm; + channel->peers[i].recv.comm = comm; + } + + // Per-channel operation list. 
+ NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); + return ncclSuccess; +} + +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { + // Operation list + NCCLCHECK(ncclCudaHostFree(channel->collectives)); + + // Free Ring index to rank tables + free(channel->ring.userRanks); + CUDACHECK(hipFree(channel->ring.devUserRanks)); + + // Free transport proxy resources + for (int r=0; rpeers+r; + if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); + if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); + } + + // Free the peer structures. + CUDACHECK(hipFree(channel->devPeers)); + free(channel->peers); + + return ncclSuccess; +} diff --git a/projects/rccl/src/collectives/all_gather.cc b/projects/rccl/src/collectives/all_gather.cc new file mode 100644 index 0000000000..1959420e36 --- /dev/null +++ b/projects/rccl/src/collectives/all_gather.cc @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "collectives.h" + +NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { + struct ncclInfo info = { ncclCollAllGather, "AllGather", + sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ + ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cu deleted file mode 100644 index 7ad36c777b..0000000000 --- a/projects/rccl/src/collectives/all_gather.cu +++ /dev/null @@ -1,33 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "collectives.h" - -ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1)); - } - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { - return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype, - ncclSum, 0, comm, stream); -} diff --git a/projects/rccl/src/collectives/all_reduce.cc b/projects/rccl/src/collectives/all_reduce.cc new file mode 100644 index 0000000000..4051da8b59 --- /dev/null +++ b/projects/rccl/src/collectives/all_reduce.cc @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "collectives.h" + +NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream); +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) { + struct ncclInfo info = { ncclCollAllReduce, "AllReduce", + sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ + ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cu deleted file mode 100644 index 234af2c898..0000000000 --- a/projects/rccl/src/collectives/all_reduce.cu +++ /dev/null @@ -1,33 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "collectives.h" - -ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks)); - } - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) { - return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype, - op, 0, comm, stream); -} diff --git a/projects/rccl/src/collectives/broadcast.cc b/projects/rccl/src/collectives/broadcast.cc new file mode 100644 index 0000000000..f096ac1f72 --- /dev/null +++ b/projects/rccl/src/collectives/broadcast.cc @@ -0,0 +1,27 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "collectives.h" + +NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream) { + struct ncclInfo info = { ncclCollBroadcast, "Broadcast", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} +/* Deprecated original "in place" function, similar to MPI */ +NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream); +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, hipStream_t stream) { + return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); +} + diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cu deleted file mode 100644 index a2b65995f8..0000000000 --- a/projects/rccl/src/collectives/broadcast.cu +++ /dev/null @@ -1,43 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "collectives.h" - -ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm)); - NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1)); - } - - return ncclSuccess; -} - -/* Deprecated original "in place" function, similar to MPI */ -NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream) { - return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype, - ncclSum, root, comm, stream); -} - -NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream) { - return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype, - ncclSum, root, comm, stream); -} 
diff --git a/projects/rccl/src/collectives/collectives.h b/projects/rccl/src/collectives/collectives.h index 5b2f0f13f4..c56d90888e 100644 --- a/projects/rccl/src/collectives/collectives.h +++ b/projects/rccl/src/collectives/collectives.h @@ -1,5 +1,6 @@ +#include "hip/hip_runtime.h" /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -8,9 +9,7 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - -#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll) +#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll) #define NCCL_COLL_NAME(coll, op, dtype) \ coll##_##op##_##dtype @@ -19,13 +18,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed coll##Kernel_##op##_##dtype /* Declare all collective operations */ -#define DECL_COLL4(coll, op, dtype) \ +#define DECL_COLL5(coll, op, dtype) \ extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \ - extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \ + extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \ + +#define DECL_COLL4(coll, op, dtype) \ + DECL_COLL5(coll, op, dtype) \ + DECL_COLL5(coll##LL, op, dtype) #define DECL_COLL3(coll, op, dtype) \ - DECL_COLL4(coll##LL, op, dtype) \ - DECL_COLL4(coll, op, dtype) + DECL_COLL4(coll##Ring, op, dtype) #define DECL_COLL2(coll, op) \ DECL_COLL3(coll, op, i8) \ @@ -53,15 +55,22 @@ typedef enum { ncclCollBroadcast, 
ncclCollReduce, ncclCollAllGather, ncclCollRed DECL_ALL_COLLS -#define ALLREDUCE_SUBSTEPS 2 -#define ALLREDUCE_BUFCHUNKS 2 -#define ALLGATHER_SUBSTEPS 2 -#define ALLGATHER_BUFCHUNKS 2 -#define REDUCESCATTER_SUBSTEPS 2 -#define REDUCESCATTER_BUFCHUNKS 2 -#define BROADCAST_SUBSTEPS 8 -#define BROADCAST_BUFCHUNKS 2 -#define REDUCE_SUBSTEPS 8 -#define REDUCE_BUFCHUNKS 2 +// CHUNKSIZE must be a multiple of SLICESIZE +//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) +//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) +//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) +//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) +//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) +//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) +#define ALLREDUCE_SLICESTEPS 4 +#define ALLREDUCE_CHUNKSTEPS 4 +#define ALLGATHER_SLICESTEPS 4 +#define ALLGATHER_CHUNKSTEPS 4 +#define REDUCESCATTER_SLICESTEPS 4 +#define REDUCESCATTER_CHUNKSTEPS 4 +#define BROADCAST_SLICESTEPS 1 +#define BROADCAST_CHUNKSTEPS 1 +#define REDUCE_SLICESTEPS 1 +#define REDUCE_CHUNKSTEPS 1 #endif diff --git a/projects/rccl/src/collectives/device/Makefile b/projects/rccl/src/collectives/device/Makefile index e2bcd49007..0ee587bd9a 100644 --- a/projects/rccl/src/collectives/device/Makefile +++ b/projects/rccl/src/collectives/device/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu -LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \ - $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \ - $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \ - $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \ - $(OBJDIR)/functions.o - LIBSRCFILES += functions.cu DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) -DEPENDFILES := $(DEPFILES:%.d=%.dep) +DEPENDFILES:= $(DEPFILES:%.d=%.dep) STATICLIB := $(OBJDIR)/colldevice.a DEVOBJ := $(OBJDIR)/devlink.o +RULESFILE := $(OBJDIR)/Makefile.rules NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" @@ -33,6 +28,16 @@ all: $(STATICLIB) # Dummy rule so that the extra dependency (%.dep) files are preserved by make all_deps: $(DEPENDFILES) +# Auto-generating the rules per op/reduction/datatype/algorithm +$(RULESFILE) : + @printf "Generating %-35s > %s\n" rules $@ + @mkdir -p $(OBJDIR) + @./gen_rules.sh $(OBJDIR) > $@ + +-include $(RULESFILE) + +LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o + -include $(DEPFILES) $(STATICLIB): $(LIBOBJ) $(DEVOBJ) @@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep mkdir -p `dirname $@` $(NVCC) $(NVCUFLAGS) -dc $< -o $@ -$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) 
-DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@ - # ... and create the device-side linked object with all those. $(DEVOBJ) : $(LIBOBJ) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ diff --git a/projects/rccl/src/collectives/device/all_gather.cu b/projects/rccl/src/collectives/device/all_gather.cu index 0f572ce7cb..3fd3e0c63e 100644 --- a/projects/rccl/src/collectives/device/all_gather.cu +++ b/projects/rccl/src/collectives/device/all_gather.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,6 +11,4 @@ #define UNROLL 4 -#if NCCL_OP == 0 IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8); -#endif diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h index 1e086c8b64..0b89d3a1f8 100644 --- a/projects/rccl/src/collectives/device/all_gather.h +++ b/projects/rccl/src/collectives/device/all_gather.h @@ -1,81 +1,44 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template __attribute__((noinline)) -__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) { +__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x; const int bid = args->bid; - __shared__ T* sharedNextOutput; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - int prevdirect = 0; - int nextdirect = 0; - - WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->next_hdp_reg); - - typedef Primitives Prims; - + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - STORE(ring->recv.conn.opCount, args->opCount); - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - if (prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - 
while (LOAD(ptr) == nullptr); - sharedNextOutput = (T*)LOAD(ptr); - STORE(ptr, nullptr); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*realChunkSize; /////////////// begin AllGather steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU @@ -83,130 +46,53 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) { offset = chunkOffset + rankDest * size; if (thisInput + chunkOffset == thisOutput + offset) { // In place - Prims::Copy(tid, nthreads, - thisInput + chunkOffset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.directSend(thisInput+chunkOffset, offset, nelem); } else { - Prims::DoubleCopy(tid, nthreads, - thisInput + chunkOffset, - thisOutput + offset, - nextdirect ? 
(sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem); } - NEXT_STEP; // Increases step, poffset, noffset - // k-2 steps: copy to next GPU - if (prevdirect) { - for (int j=1; jdevUserRanks[nranks-j]; - offset = chunkOffset + rankDest * size; - - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - Prims::Copy(tid, nthreads, - NULL, - NULL, - 0, 0, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - for (int j=1; jdevUserRanks[nranks-j]; - offset = chunkOffset + rankDest * size; - - Prims::DoubleCopy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - - // Make final copy from buffer to dest. - rankDest = ring->devUserRanks[1]; + for (int j=1; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - // Here we need to copy from buffer to this output. - Prims::Copy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.directRecvCopySend(thisOutput+offset, offset, nelem); } - } - if (tid == 0) { - waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS)); - STORE(ring->send.conn.head, 0ULL); - STORE(ring->recv.conn.tail, 0ULL); - __threadfence_system(); - STORE(ring->recv.conn.opCount, args->opCount+1); + // Make final copy from buffer to dest. + rankDest = ring->devUserRanks[1]; + offset = chunkOffset + rankDest * size; + + // Final wait/copy. 
+ prims.directRecv(thisOutput+offset, offset, nelem); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; +template +__attribute__((noinline)) +__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { } template __attribute__((noinline)) -__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) { +__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives LL; + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t 
gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -216,57 +102,35 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) { /////////////// begin AllGather steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(chunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; if (thisInput + chunkOffset == thisOutput + offset) { // In place - LL::ReduceCopy( - thisInput + chunkOffset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); + LLprims.send(thisInput+chunkOffset, nelem); } else { - LL::ReduceCopy( - thisInput + chunkOffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); + LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem); } - POST_SIZE; - - NEXT_STEP_LL; // k-2 steps: copy to next GPU for (int j=1; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvCopySend(thisOutput+offset, nelem); } // step k-1: final store rankDest = ring->devUserRanks[1]; offset = chunkOffset + rankDest * size; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recv(thisOutput+offset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); } + +template +__attribute__((noinline)) +__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/all_reduce.cu b/projects/rccl/src/collectives/device/all_reduce.cu index caa1479c12..704197160e 100644 --- a/projects/rccl/src/collectives/device/all_reduce.cu +++ b/projects/rccl/src/collectives/device/all_reduce.cu @@ -1,5 +1,6 @@ 
/************************************************************************* * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,12 +11,7 @@ #define UNROLL 4 -#if NCCL_OP == 0 IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum); -#elif NCCL_OP == 1 IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd); -#elif NCCL_OP == 2 IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax); -#endif +IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax); \ No newline at end of file diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index b75223b13e..f319b4333e 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -1,243 +1,181 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template __attribute__((noinline)) -__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) { +__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x; const int bid = args->bid; - __shared__ T* sharedNextOutput; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - int prevdirect = 0; - int nextdirect = 0; - - WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->next_hdp_reg); - - typedef Primitives Prims; - + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; - //const int rank = comm->rank; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - STORE(ring->recv.conn.opCount, args->opCount); - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - if (prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextdirect) { - void* volatile* ptr = 
&(ring->devMemSend->ptrExchange); - while (LOAD(ptr) == nullptr); - sharedNextOutput = (T*)LOAD(ptr); - STORE(ptr, nullptr); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; +#ifdef ENABLE_PROFILING + auto devProf = comm->devProf; + uint64_t clk, t0 = 0ULL, ws, wr; + if (tid == 0) clk = clock64(); +#endif // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize; /////////////// begin AllReduce steps /////////////// ssize_t offset; - int maxOffset; + int nelem; int slice; // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - - NEXT_STEP; // Increases step, poffset, noffset + INIT_COUNTER; + prims.send(thisInput+offset, 
nelem); + ACCUMULATE_COUNTER(send); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + INIT_COUNTER; + prims.recvReduceSend(thisInput+offset, nelem); + ACCUMULATE_COUNTER(recvReduceSend); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::ReduceCopy(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - thisOutput + offset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + INIT_COUNTER; + prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem); + ACCUMULATE_COUNTER(directRecvReduceCopySend); // k-2 steps: copy to next GPU - if (prevdirect) { - for (int j=1; jdevUserRanks[nranks - j]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + for (int j=1; jdevUserRanks[nranks-j]; + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? 
(sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - Prims::Copy(tid, nthreads, - NULL, - NULL, - 0, 0, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - for (int j=1; jdevUserRanks[nranks - j]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - Prims::DoubleCopy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - - // Make final copy from buffer to dest. - slice = ring->devUserRanks[1]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - // Here we need to copy from buffer to this output. - Prims::Copy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + INIT_COUNTER; + prims.directRecvCopySend(thisOutput+offset, offset, nelem); + ACCUMULATE_COUNTER(directRecvCopySend); } - } - if (tid == 0) { - // Wait for next to have consumed all data before we reset the flag - waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS)); - STORE(ring->send.conn.head, 0ULL); - STORE(ring->recv.conn.tail, 0ULL); - __threadfence_system(); - STORE(ring->recv.conn.opCount, args->opCount+1); + // Make final copy from buffer to dest. + slice = ring->devUserRanks[1]; + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); + + // Final wait/copy. 
+ INIT_COUNTER; + prims.directRecv(thisOutput+offset, offset, nelem); + ACCUMULATE_COUNTER(directRecv); } +#ifdef ENABLE_PROFILING + if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST); +#endif } -#include "ll_kernel.h" +template +__attribute__((noinline)) +__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = blockDim.x; + const int bid = args->bid; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* tree = &channel->tree; + const ssize_t size = args->N; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = args->lastChunkSize; + const ssize_t loopSize = args->nChannels*chunkSize; -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + do { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclPrimitives prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + prims.send(thisInput+offset, nelem); + } else { + prims.recvReduceSend(thisInput+offset, nelem); + } + } + } while(0); + + do { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclPrimitives prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < 
size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + prims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + prims.recv(thisOutput+offset, nelem); + } else { + prims.recvCopySend(thisOutput+offset, nelem); + } + } + } while(0); +} template __attribute__((noinline)) -__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) { +__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives LL; + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*nranks*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*nranks*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset 
= 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -247,89 +185,100 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; - int maxOffset; + int nelem; int slice; // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); // k-2 steps: copy to next GPU for (int j=1; jdevUserRanks[nranks - j]; + slice = ring->devUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - nextOutput 
+ noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvCopySend(thisOutput+offset, nelem); } // Make final copy from buffer to dest. slice = ring->devUserRanks[1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recv(thisOutput+offset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); +} + +template +__attribute__((noinline)) +__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* tree = &channel->tree; + const ssize_t size = args->N; + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + do { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclLLPrimitives LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } while(0); + + do { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclLLPrimitives LLprims(tid, 
nthreads, &tree->up, tree->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } while(0); } diff --git a/projects/rccl/src/collectives/device/all_reduce_1.cpp b/projects/rccl/src/collectives/device/all_reduce_1.cpp deleted file mode 100644 index dda4b5d517..0000000000 --- a/projects/rccl/src/collectives/device/all_reduce_1.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 1 -#include "device/all_reduce.cu" diff --git a/projects/rccl/src/collectives/device/all_reduce_2.cpp b/projects/rccl/src/collectives/device/all_reduce_2.cpp deleted file mode 100644 index 745435b60f..0000000000 --- a/projects/rccl/src/collectives/device/all_reduce_2.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 2 -#include "device/all_reduce.cu" diff --git a/projects/rccl/src/collectives/device/all_reduce_3.cpp b/projects/rccl/src/collectives/device/all_reduce_3.cpp deleted file mode 100644 index d7f45f03dd..0000000000 --- a/projects/rccl/src/collectives/device/all_reduce_3.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 3 -#include "device/all_reduce.cu" diff --git a/projects/rccl/src/collectives/device/broadcast.cu b/projects/rccl/src/collectives/device/broadcast.cu index 4125de41f9..c4b1cbc5e9 100644 --- a/projects/rccl/src/collectives/device/broadcast.cu +++ b/projects/rccl/src/collectives/device/broadcast.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -10,6 +11,4 @@ #define UNROLL 4 -#if NCCL_OP == 0 -IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8); -#endif +IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8); \ No newline at end of file diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h index 5fafbaf6aa..3c54de9dd8 100644 --- a/projects/rccl/src/collectives/device/broadcast.h +++ b/projects/rccl/src/collectives/device/broadcast.h @@ -1,184 +1,101 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - template __attribute__((noinline)) -__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x; const int bid = args->bid; - __shared__ T* sharedNextOutput; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - int prevdirect = 0; - int nextdirect = 0; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + const ssize_t size = args->N; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + 
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; + const int rank = ring->devUserRanks[0]; + const int nextRank = ring->devUserRanks[1]; + const int root = args->root; +#ifdef ENABLE_PROFILING + auto devProf = comm->devProf; + uint64_t clk, t0 = 0ULL, ws, wr; + if (tid == 0) clk = clock64(); +#endif - WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->next_hdp_reg); + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; - typedef Primitives Prims; + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); + + if (rank == root) { + if (thisInput == thisOutput) { + INIT_COUNTER; + prims.send(thisInput+offset, nelem); + ACCUMULATE_COUNTER(send); + } else { + INIT_COUNTER; + prims.copySend(thisInput+offset, thisOutput+offset, nelem); + ACCUMULATE_COUNTER(copySend); + } + } else if (nextRank == root) { + INIT_COUNTER; + prims.recv(thisOutput+offset, nelem); + ACCUMULATE_COUNTER(recv); + } else { + INIT_COUNTER; + prims.recvCopySend(thisOutput+offset, nelem); + ACCUMULATE_COUNTER(recvCopySend); + } + } +#ifdef ENABLE_PROFILING + if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST); +#endif +} + +template +__attribute__((noinline)) 
+__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { } + +template +__attribute__((noinline)) +__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / BROADCAST_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - STORE(ring->recv.conn.opCount, args->opCount); - if (nextRank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - if (rank != root && prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextRank != root && nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - while (LOAD(ptr) == nullptr); - sharedNextOutput = (T*)LOAD(ptr); - STORE(ptr, nullptr); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - - // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; - - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = 
min(chunkSize, size-offset); - - if (rank == root) { - if (thisInput == thisOutput) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - } else { - Prims::DoubleCopy(tid, nthreads, - thisInput + offset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - } - } else if (nextRank == root) { - if (prevdirect) maxOffset = 0; // Only wait for signals - Prims::Copy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - if (prevdirect) { - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } else { - Prims::DoubleCopy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - nextdirect ? 
(sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (nextRank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1)); - STORE(ring->send.conn.head, 0ULL); - } - STORE(ring->recv.conn.tail, 0ULL); - __threadfence_system(); - STORE(ring->recv.conn.opCount, args->opCount+1); - } -} - -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; - -template -__attribute__((noinline)) -__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { - const int tid = threadIdx.x; - const int bid = args->bid; - const int llNthreads = args->nThreads; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int rank = comm->rank; - const int nextRank = ring->devUserRanks[1]; - const int root = args->root; - - typedef LLPrimitives LL; - - const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine 
*)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -186,46 +103,21 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (rank == root) { - WAIT_NEXT; if (thisInput == thisOutput) { - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.send(thisInput+offset, nelem); } else { - LL::ReduceCopy( - thisInput + offset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.copySend(thisInput + offset, thisOutput + offset, nelem); } - POST_SIZE; - NEXT_STEP_LL; } else if (nextRank == root) { - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recv(thisOutput + offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvCopySend(thisOutput + offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template +__attribute__((noinline)) +__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/broadcast_0.cpp b/projects/rccl/src/collectives/device/broadcast_0.cpp deleted file mode 100644 index 75b75ad9cf..0000000000 --- a/projects/rccl/src/collectives/device/broadcast_0.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 0 -#include "device/broadcast.cu" diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index 819f3a12ab..fd26814b0f 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -1,5 +1,6 @@ +#include "hip/hip_runtime.h" /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -8,18 +9,38 @@ #ifndef NCCL_DEVICE_COMMON_H_ #define NCCL_DEVICE_COMMON_H_ -#include - #include "../collectives.h" -#include "core.h" +#include "devcomm.h" #include "nccl.h" - #include -typedef void(*ncclKern_t)(struct CollectiveArgs* args); -#define NCCL_FUNC4(coll, op, dtype) \ +// Exit If Abort Barrier across CTA: make sure all threads exit consistently +// Each thread sets a predicate to true if abort == 1 +// all CTA's threads enter the barrier and do a popc on their predicates being True +// If any of the thread's predicate was True, all the threads call exit() +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#define exitIfAbortBarrier(abort, abortCount) \ + if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \ + __syncthreads(); \ + if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; } +#else +static inline __device__ void exitIfAbortBarrier(int abort) { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc)); + asm ("}"); + if (popc) { asm volatile ("exit;"); } 
+} +#endif + +#define NCCL_FUNC5(coll, op, dtype) \ NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) \ + NCCL_COLL_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + NCCL_FUNC5(coll##Ring, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -64,20 +85,13 @@ typedef void(*ncclKern_t)(struct CollectiveArgs* args); NCCL_FUNCS2A(ncclAllReduce) } // Must be consistent with the ncclFuncSet enum -using ncclKern_t = void (*)(struct CollectiveArgs*); +using ncclFunc_t = void (*)(struct CollectiveArgs*); -static const __device__ constexpr ncclKern_t ncclFuncs[]{ -#if defined(__HIP_DEVICE_COMPILE__) - NCCL_FUNCS2B(ncclBroadcast), - NCCL_FUNCS2A(ncclReduce), - NCCL_FUNCS2B(ncclAllGather), - NCCL_FUNCS2A(ncclReduceScatter), - NCCL_FUNCS2A(ncclAllReduce) -#endif +static const __device__ constexpr ncclFunc_t ncclFuncs[]{ // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. -#if __CUDA_ARCH__ +#if defined(__HIP_DEVICE_COMPILE__) NCCL_FUNCS2B(ncclBroadcast), NCCL_FUNCS2A(ncclReduce), NCCL_FUNCS2B(ncclAllGather), @@ -88,82 +102,89 @@ static const __device__ constexpr ncclKern_t ncclFuncs[]{ template struct Caller { - static - __device__ void call(ncclColl* const c) noexcept + static __device__ __host__ + void call(ncclColl* const c) noexcept { constexpr unsigned short m = f + (l - f) / 2; - return (c->funcIndex < m) ? Caller::call(c) : Caller::call(c); + return (c->funcIndex < m) ? 
Caller::call(c) : Caller::call(c); } }; template struct Caller{ - static - __device__ void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); } + static __device__ __host__ + void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); } }; inline __device__ -void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept -{ +void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept { if (c->funcIndex < 72) { - if (c->funcIndex % 2) ncclBroadcastLL_copy_i8(&c->args); - else ncclBroadcast_copy_i8(&c->args); + if (c->funcIndex % 2) ncclBroadcastRingLL_copy_i8(&c->args); + else ncclBroadcastRing_copy_i8(&c->args); } else if (c->funcIndex < 144) Caller<72, 144>::call(c); else if (c->funcIndex < 216) { - if (c->funcIndex % 2) ncclAllGatherLL_copy_i8(&c->args); - else ncclAllGather_copy_i8(&c->args); + if (c->funcIndex % 2) ncclAllGatherRingLL_copy_i8(&c->args); + else ncclAllGatherRing_copy_i8(&c->args); } else Caller<216, 360>::call(c); } -static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { +static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) { int* d = (int*)dst; int* s = (int*)src; - __syncthreads(); + // When aggregation is effective, if some threads have aborted inside the LL kernel, + // make sure the rest of the threads abort as well + exitIfAbortBarrier(0, abortCount); for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; __syncthreads(); } -static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) { - load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid); +static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) { + load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount); if (tid == 0) hostColl->active = 0; } /* Functions for aggregation case */ -#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ +#define 
IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ - coll##Kernel, ctype>(args); \ + coll##Kernel, ctype>(args); \ } + +#if NCCL_OP == 0 /* Kernels with the first operation inlined */ -#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \ +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \ __launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int tid = threadIdx.x; \ int bid = blockIdx.x; \ __shared__ struct ncclColl localColl; \ + __shared__ uint32_t abortCount; \ + if (tid == 0) abortCount = 0; \ + __syncthreads(); \ \ - struct ncclComm* comm = firstColl.args.comm; \ - struct ncclRing* ring = comm->rings+bid; \ + struct ncclDevComm* comm = firstColl.args.comm; \ + struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ + channel->abortCount = &abortCount; \ if (bid == 0) { \ /* To optimize for latency, (only) the first operation is passed as argument.*/ \ c = &firstColl; \ } else { \ c = &localColl; \ - load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \ + load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \ } \ while (1) { \ - if (tid < c->nThreads) { \ + if (tid < c->args.nThreads) { \ if (c->funcIndex == fIndex) { \ - coll##Kernel, ctype>(&c->args); \ + coll##Kernel, ctype>(&c->args); \ } else { \ NCCL_CALL_FUNCTIONS(c); \ } \ } \ int nextIndex = c->nextIndex; \ - if (tid == 0) ring->collFifoHead = nextIndex; \ + if (tid == 0) channel->collFifoHead = nextIndex; \ \ if (c->active == 2) { \ return; \ @@ -171,15 +192,21 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ \ /* Load next collective operation*/ \ c = &localColl; /* for bid 0 */ \ - load_coll(c, ring->devCollectives+nextIndex, tid); \ + load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \ } \ } +#else +#define 
IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) +#endif + +// Only generate inline kernels for LL +#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \ + IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ - IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \ - IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \ + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \ @@ -192,4 +219,6 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \ IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) +#define COLL_UNROLL 2 + #endif diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h index e8194bf4e3..7cf85671a3 100644 --- a/projects/rccl/src/collectives/device/common_kernel.h +++ b/projects/rccl/src/collectives/device/common_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -8,25 +8,25 @@ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ -#include "core.h" +#include "devcomm.h" #include #include -#include +#include // Define min for ssize_t static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; } typedef uint64_t PackType; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) template struct MULTI { - __device__ PackType operator()(const PackType x, const PackType y) const - { - return FUNC()(x, y); - } + __device__ PackType operator()(const PackType x, const PackType y) const + { + return FUNC()(x, y); + } }; #else @@ -205,15 +205,7 @@ struct MULTI { } }; -#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) - -#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a)) - -template -__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { - size_t ptrval = reinterpret_cast(ptr); - return reinterpret_cast(ALIGNUP(ptrval, align)); -} +#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) template inline __device__ T vFetch(const volatile T* ptr) { @@ -225,7 +217,7 @@ void vStore(volatile T* ptr, const T val) { *ptr = val; } -#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)) +#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)) template<> inline __device__ half vFetch(const volatile half* ptr) { half r; @@ -251,26 +243,6 @@ void vStore(volatile half* ptr, const half val) { } #endif -template -__attribute__((noinline)) -__device__ inline void ReduceCopy( - const int tid, const int nthreads, - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, - volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, const int N) { - for (int idx = tid; idx < N; idx += nthreads) { - T val = vFetch(src0+idx); - if (TWO_INPUTS) 
{ - val = FUNC()(val, vFetch(src1+idx)); - } - vStore(dest0+idx, val); - if (TWO_OUTPUTS) { - vStore(dest1+idx, val); - } - } -} - typedef ulong2 Pack128; template @@ -281,8 +253,8 @@ struct MULTI128 { } }; -inline __device__ void Fetch128(Pack128& v, Pack128* p) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +inline __device__ void Fetch128(Pack128& v, const Pack128* p) { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) v.x = p->x; v.y = p->y; #else @@ -290,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, Pack128* p) { #endif } inline __device__ void Store128(Pack128* p, Pack128& v) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) p->x = v.x; p->y = v.y; #else @@ -298,67 +270,104 @@ inline __device__ void Store128(Pack128* p, Pack128& v) { #endif } -#define WARP_SIZE 32 -template -__attribute__((noinline)) -__device__ inline void ReduceCopy128b( const int w, const int nw, const int t, - Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1, - const int N) { - Pack128 t0[UNROLL]; - Pack128 t1[UNROLL]; - const Pack128* src0_end = src0 + N; - const int inc = nw * UNROLL * WARP_SIZE; - const int offset = w * UNROLL * WARP_SIZE + t; - src0 += offset; if (TWO_INPUTS) src1 += offset; - dest0 += offset; if (TWO_OUTPUTS) dest1 += offset; +template +__device__ void ReduceCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], + const int offset, const int N) { + for (int idx = offset+tid; idx < offset+N; idx += nthreads) { + T val = vFetch(srcs[0]+idx); + #pragma unroll + for (int i=1; i()(t0[u], t1[u]); - Store128(dest0+u*WARP_SIZE, t0[u]); - if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]); - } - src0 += inc; if (TWO_INPUTS) src1 += inc; - dest0 += inc; if (TWO_OUTPUTS) dest1 += inc; + #pragma unroll + for (int i=0; i -__attribute__((noinline)) -__device__ inline void 
ReduceOrCopy(const int tid, const int nthreads, - volatile T * __restrict__ dest0, volatile T * __restrict__ dest1, - const volatile T * __restrict__ src0, const volatile T * __restrict__ src1, +#define WARP_SIZE 64 + +template +__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t, + int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], + const int elemOffset, const int Npack) { + const int inc = nw * UNROLL * WARP_SIZE; + int offset = w * UNROLL * WARP_SIZE + t; + + const Pack128* srcs[MAXSRCS]; + for (int i=0; i()(vals[u], vals2[u]); + } + #pragma unroll 1 + for (int i=MINSRCS; i()(vals[u], vals2[u]); + } + + // Store + for (int i = 0; i < MINDSTS; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + #pragma unroll 1 + for (int i=MINDSTS; i +__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } + +// Try to limit consecutive load/stores to 8. +// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise +#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) + +template +__device__ void ReduceOrCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], int N) { int Nrem = N; if (Nrem <= 0) return; - int Npreamble = (Nrem(tid, nthreads, src0, src1, dest0, dest1, Npreamble); - - Nrem -= Npreamble; - if (Nrem == 0) return; - - dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; } - src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; } + if (Npreamble) { + ReduceCopyMulti(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble); + Nrem -= Npreamble; + if (Nrem == 0) return; + } + int offset = Npreamble; // stage 2: fast path: use 128b loads/stores to do the bulk of the work, // assuming the pointers we have are all 128-bit alignable. 
@@ -366,35 +375,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads, int nw = nthreads / WARP_SIZE; // Number of warps int t = tid % WARP_SIZE; // Thread (inside the warp) - const int PackFactor = sizeof(Pack128) / sizeof(T); + const int packFactor = sizeof(Pack128) / sizeof(T); // stage 2a: main loop - int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads)) - * (UNROLL * nthreads); // round down + int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE)) + * (AUTOUNROLL * WARP_SIZE); // round down + int Nelem2a = Npack2a * packFactor; - ReduceCopy128b(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a); + ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a); - int Ndone2a = Nalign2a * PackFactor; - Nrem -= Ndone2a; + Nrem -= Nelem2a; if (Nrem == 0) return; - dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; } - src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; } + offset += Nelem2a; // stage 2b: slightly less optimized for section when we don't have full - // UNROLLs + // unrolling - int Nalign2b = Nrem / PackFactor; + int Npack2b = Nrem / packFactor; + int Nelem2b = Npack2b * packFactor; - ReduceCopy128b(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b); + ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b); - int Ndone2b = Nalign2b * PackFactor; - Nrem -= Ndone2b; + Nrem -= Nelem2b; if (Nrem == 0) return; - dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; } - src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; } + offset += Nelem2b; // stage 2c: tail - ReduceCopy(tid, nthreads, src0, src1, dest0, dest1, Nrem); + ReduceCopyMulti(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem); } #endif // COMMON_KERNEL_H_ diff --git a/projects/rccl/src/collectives/device/functions.cu b/projects/rccl/src/collectives/device/functions.cu index bc7c175fc5..ed67c1b9df 100644 --- a/projects/rccl/src/collectives/device/functions.cu +++ 
b/projects/rccl/src/collectives/device/functions.cu @@ -1,15 +1,13 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "collectives.h" #include "common.h" - - // Workaround for https://reviews.llvm.org/D55580 __device__ void ncclWorkaroundClangD55580() {} diff --git a/projects/rccl/src/collectives/device/gen_rules.sh b/projects/rccl/src/collectives/device/gen_rules.sh new file mode 100755 index 0000000000..4413213e1e --- /dev/null +++ b/projects/rccl/src/collectives/device/gen_rules.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +dir=$1 + +targets="GENOBJS := \\\\\n" + +for base in all_reduce all_gather broadcast reduce reduce_scatter; do + opn=0 + for op in sum prod min max; do + dtn=0 + for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do + echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" + echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" + echo " mkdir -p ${dir}" + echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" + echo "" + targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" + dtn=$(($dtn + 1)) + done + opn=$(($opn + 1)) + done +done +echo -e "$targets" diff --git a/projects/rccl/src/collectives/device/ll_kernel.h b/projects/rccl/src/collectives/device/ll_kernel.h deleted file mode 100644 index ca7e4d63e5..0000000000 --- a/projects/rccl/src/collectives/device/ll_kernel.h +++ /dev/null @@ -1,186 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_LL_KERNEL_H_ -#define NCCL_LL_KERNEL_H_ - -static __device__ __attribute__((noinline)) uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) - using Vec = uint32_t __attribute__((ext_vector_type(4))); - Vec i4; - do { - asm volatile ("flat_load_dwordx4 %0, %1, glc\n" - "s_waitcnt vmcnt(0)\n" - "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src)); - } while (i4[1] != flag || i4[3] != flag); - uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32); - return val64; -#else - uint32_t data1, flag1, data2, flag2; - do { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); - } while ((flag1 != flag) || (flag2 != flag)); - uint64_t val64 = data1 + (((uint64_t)data2) << 32); - return val64; -#endif -} - -static __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) - using Vec = uint32_t __attribute__((ext_vector_type(4))); - Vec i4; - i4[0] = val & 0xffffffff; - i4[1] = flag; - i4[2] = (val >> 32); - i4[3] = flag; - asm volatile ("flat_store_dwordx4 %0, %1, glc\n" - "s_waitcnt vmcnt(0)\n" - "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4)); -#else - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); -#endif -} - -// Using memcpy handles misaligned pointers. 
-static __device__ uint64_t readAL(uint64_t* src) { - uint64_t val; - memcpy((char*)&val, (char*)src, sizeof(uint64_t)); - return val; -} -static __device__ void storeAL(uint64_t* dst, uint64_t val) { - memcpy((char*)dst, (char*)&val, sizeof(uint64_t)); -} - -template -class LLPrimitives { - private: - template - __attribute__((noinline)) - static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - if (size <= 0) return; - size_t size64 = size * sizeof(T) / sizeof(uint64_t); - uint64_t* src1A = (uint64_t*)src1; - uint64_t* dst1A = (uint64_t*)dst1; - int offset = threadIdx.x; - // Do multiples of 64 bits -#pragma unroll 1 - for (; offset < size64; offset += nthreads) { - uint64_t val; - if (HAS_SRC1) { - val = readAL(src1A+offset); - if (HAS_SRC2) val = MULTI()(readLL(src2+offset, iflag), val); - } else if (HAS_SRC2) { - val = readLL(src2+offset, iflag); - } - if (HAS_DST1) storeAL(dst1A+offset, val); - if (HAS_DST2) storeLL(dst2+offset, val, oflag); - } - // Finish last word - int sizeDone = size64*(sizeof(uint64_t)/sizeof(T)); - int sizeRem = size - sizeDone; - if (threadIdx.x == 0 && sizeRem) { - const T* src1B = src1 + sizeDone; - T* dst1B = dst1 + sizeDone; - - uint64_t lastVal; - T* vals = (T*)&lastVal; - - if (HAS_SRC2) { - uint64_t lastVal2 = readLL(src2+size64, iflag); - T* src2B = (T*)&lastVal2; - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = HAS_SRC1 ? 
FUNC()(src2B[offset], src1B[offset]) : src2B[offset]; - } - } else if (HAS_SRC1) { - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = src1B[offset]; - } - } - if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag); - if (HAS_DST1) { - for (int offset = 0; offset < sizeRem; offset++) { - dst1B[offset] = vals[offset]; - } - } - } - } - public: - static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, 
oflag, nthreads); - } -}; - -// Common macros - -#define STEP_TO_SLOT(step) \ - (step % NCCL_LL_CHUNKS) - -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) -#define SYNC __syncthreads() -#else -#define SYNC asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)) -#endif - -#define WAIT_NEXT \ - if (tid == 0) { \ - while (sendHead + NCCL_LL_CHUNKS <= step) { \ - sendHead = LOAD(sendHeadPtr); \ - } \ - } \ - SYNC; - -#define POST_SIZE \ - if (tid == 0 && sizesFifo) { STORE(sizesFifo + step % NCCL_LL_CHUNKS, (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T))); } - -#define ACK_PREV \ - SYNC; \ - if (tid == 0) STORE(recvHeadPtr,step); - -#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \ - if (step > LOAD(&ring->send.conn.llLastCleaning) + NCCL_LL_CLEAN_FREQ) { \ - /* Reset all flags */ \ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \ - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \ - const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \ - for (int i=0; isend.conn.llLastCleaning, step); }\ - } \ - STORE(&ring->send.conn.llStep, step); \ -} while (0); - -#endif diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index 436063c1f9..81a4d4cb7f 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -10,229 +10,635 @@ #include #include "reduce_kernel.h" // for reduction funcs +#include "common.h" +#define SPINS_BEFORE_CHECK_ABORT 1000000 -/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy. - * - * In order to reduce the reptetion of template arguments, the operations - * are bundled as static methods of the Primitives class. - * - * Each primitive operation copies/reduces a contiguous buffer and syncs - * an optional set of flags against a sub-step counter. The sync value is - * based on the step parameter. Sync flags must be of type WaitFlag or - * PostFlag. The primitive routines wait for all WaitFlag args to attain - * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of - * corresponding substep by previous step) before executing the transfer. - * After each substep is transfered, all PostFlag arguments get updated to - * the value SUBSTEPS*step+substep+1. - */ +// Unroll unconditionally the first send/recv since nsend/nrecv should be at +// least 1 if SEND/RECV is set. +#define FOR_SEND(func, ...) do { \ + if (SEND) { \ + /* Send to far first, then close */ \ + for (int i=1; i(Flag1, Flag2, ...) -template __device__ -bool AnyAre() { return false; } - -template -__device__ -bool AnyAre(FIRST_T first, TAIL_Ts... tail) { - return std::is_same::value || AnyAre(tail...); -} - - -// Wait on all WaitFlags, ignore PostFlags -__device__ -static void WaitOnFlags(uint64_t val) { } - -template __device__ -static void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - flag.wait(val); - WaitOnFlags(val, tail...); -} - -template __device__ -static void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) { - WaitOnFlags(val, tail...); -} - - -// Post all PostFlags, ignore WaitFlags -__device__ -static void PostToFlags(uint64_t val) { } - -template __device__ -static void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... 
tail) { - PostToFlags(val, tail...); -} - -template __device__ -static void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) { - flag.post(val); - PostToFlags(val, tail...); -} - - -// Post sizes for PostFlags, ignore WaitFlags -__device__ -static void PostSizeToFlags(uint64_t step, int size) { } - -template __device__ -static void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) { - PostSizeToFlags(step, size, tail...); -} - -template __device__ -static void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) { - flag.postSize(step, size); - PostSizeToFlags(step, size, tail...); -} - - -// Create pointer arithmetic syntax that doesn't break for std::nullptr_t -template __device__ -static Tptr ptradd(Tptr ptr, int i) { - return ptr + i; -} - -__device__ -static std::nullptr_t ptradd(std::nullptr_t ptr, int i) { - return nullptr; -} - -// use different unroll numbers for all primitives for best throughput -#define COPY_UNROLL 4 -#define REDUCE_UNROLL 2 -#define DOUBLECOPY_UNROLL 2 -#define REDUCECOPY_UNROLL 2 +#define FOR_RECV(func, ...) do { \ + if (RECV) { \ + /* Recv from close first, then far */ \ + func(0, ##__VA_ARGS__); \ + for (int i=1; i > -class Primitives { +template +class ncclPrimitives { private: - template // either WaitFunc or PostFunc - static __device__ __attribute__((noinline)) void - GenericOp(const int tid, const int nthreads, - const T* src1, - const SRC2_T src2, - T* dst1, - DST2_T dst2, - int len, int maxoffset, uint64_t step, SYNC_Ts... 
flags) { + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + const int stepSize; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead[NSEND]; + const T* recvDirectBuff[NRECV]; + T* sendDirectBuff[NSEND]; + const T* recvBuff[NRECV]; + T* sendBuff[NSEND]; + struct ncclDevComm* comm; + uint32_t* abortCount; - enum { noSrc2 = std::is_same::value }; - enum { noDst2 = std::is_same::value }; - static_assert(noSrc2 || std::is_same::value, - "src2 must be of type T* or std::nullptr_t"); - static_assert(noDst2 || std::is_same::value, - "dst2 must be of type T* or std::nullptr_t"); + __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } + __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } + __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); } + __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); } - using OpType = typename std::conditional, REDOP>::type; - - int sliceSize = len / SUBSTEPS; - int sliceOffset = 0; - -#pragma unroll 1 - for (int sub=0; sub(flags...)) { - if (tid == 0) { - WaitOnFlags(SUBSTEPS*step + sub + 1, flags...); - } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) - __syncthreads(); + __device__ void barrier() { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + __syncthreads(); #else - asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); #endif - } - ReduceOrCopy - < - UNROLL, - OpType, - T, - !std::is_same::value, // HAS_DEST1 - !std::is_same::value // HAS_SRC1 - > - ( - tid, nthreads, - ptradd(dst1, sliceOffset), - ptradd(dst2, sliceOffset), - ptradd(src1, sliceOffset), - ptradd(src2, sliceOffset), - realSize - ); - if (AnyAre(flags...)) { - __syncthreads(); - if(tid == 0) - PostSizeToFlags(SUBSTEPS*step+sub, 
realSize*sizeof(T), flags...); - __threadfence_system(); - if(tid == 0) - PostToFlags(SUBSTEPS*step + sub + 1, flags...); + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch) { + // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch + STORE(comm->fatalDevError, ncclDevAssertedMismatch); + } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = LOAD(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } + + __device__ void waitRecv(int i) { + spins = 0; + mismatch = 0; + recvStep[i] += SLICESTEPS; + if (tid == i) { +#ifdef ENABLE_PROFILING + auto devProf = comm->devProf; + uint64_t t0 = clock64(); +#endif + while (LOAD(waitPtr) < recvStep[i]) { + if (checkAbort(recvConn[i]->opCountRem)) break; + } +#ifdef ENABLE_PROFILING + __atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST); +#endif + } + } + + __device__ void waitSend(int i) { + spins = 0; + mismatch = 0; + sendStep[i] += SLICESTEPS; + if (tid == WARP_SIZE+i) { +#ifdef ENABLE_PROFILING + auto devProf = comm->devProf; + uint64_t t0 = clock64(); +#endif + while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { + sendConnHead[i] = LOAD(waitPtr); + if (checkAbort(sendConn[i]->opCountRem)) break; + } +#ifdef ENABLE_PROFILING + __atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST); +#endif + } + } + + inline __device__ void postRecv(int i) { + STORE(recvConn[i]->head, recvStep[i]); + } + + inline __device__ void postSend(int i) { + if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1); + STORE(sendConn[i]->tail, 
sendStep[i]); + } + + __device__ void postSendSize(int i, int size) { + if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size); + } + + template + __device__ const T* directRecvPtr(int i, int directOffset) { + return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i); + } + + template + __device__ T* directSendPtr(int i, int directOffset) { + return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i); + } + + template + __device__ void + GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { + int offset = 0; + int sliceSize = stepSize * SLICESTEPS; + + const T* srcs[RECV*NRECV+SRC]; + srcs[0] = SRC ? srcPtr : directRecvPtr(0, directOffset); + if (RECV) { + if (SRC) srcs[1] = recvPtr(0); + for (int i=1; i(0, directOffset); + if (SEND) { + if (DST) dsts[1] = directSendPtr(0, directOffset); + for (int i=1; i(i, directOffset); + } + + #pragma unroll 1 + for (int slice=0; slice 0) { + barrier(); + if (DIRECTRECV && recvDirectBuff[0]) { + // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy + if (SEND) { + ReduceOrCopyMulti(tid, nthreads, 1, srcs, nsend, dsts+1, realSize); + } + } else { + ReduceOrCopyMulti(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); } } - sliceOffset += sliceSize; + exitIfAbortBarrier(abort, abortCount); + if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T)); + if (SEND) __threadfence_system(); + if (tid == 0) FOR_SEND(postSend); + if (tid == 0) FOR_RECV(postRecv); + } + for (int i=0; ibuff); + recvStep[i] = LOAD(&recvConn[i]->step); + recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); + // Return credits in case we rounded up. 
+ if (tid == 0) STORE(recvConn[i]->head, recvStep[i]); + if (tid == i) { + waitPtr = LOAD(&recvConn[i]->tail); + STORE(recvConn[i]->opCountLoc, opCount); + } + recvDirectBuff[i] = NULL; + if (directBuff && recvConn[i]->direct) { + recvDirectBuff[i] = directBuff; + if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff); + } + nrecv++; + } + + __device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { + sendConn[i] = conn; + sendBuff[i] = (T*)LOAD(&sendConn[i]->buff); + sendStep[i] = LOAD(&sendConn[i]->step); + sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); + if (tid == WARP_SIZE+i) { + waitPtr = LOAD(&sendConn[i]->head); + sendConnHead[i] = LOAD(waitPtr); + STORE(sendConn[i]->opCountLoc, opCount); + } + sendDirectBuff[i] = NULL; + if (directBuff && sendConn[i]->direct) { + void* volatile* ptr = sendConn[i]->ptrExchange; + while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL); + __syncthreads(); + if (tid == 0) STORE(ptr, NULL); + } + nsend++; + } + + __device__ void saveRecvConn(int i) { + if (tid == i) { + STORE(&recvConn[i]->step, recvStep[i]); + __threadfence_system(); + __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST); + } + } + + __device__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + STORE(&sendConn[i]->step, sendStep[i]); + __threadfence_system(); + __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST); } } public: - template - static __device__ void - Copy(const int tid, const int nthreads, const T* src, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... 
flags) { - GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...); + __device__ + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { + // Make sure step is updated before we read it + abortCount = channel->abortCount; + __syncthreads(); + + // disable directBuff + for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0); + for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0); } - template - static __device__ void - DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...); + __device__ void + send(const T* src, int nelem) { + GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0); + } + __device__ void + directSend(const T* src, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset); } - template - static __device__ void - Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...); + __device__ void + recv(T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0); + } + __device__ void + directRecv(T* dst, int directOffset, int nelem) { + GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset); } - template - static __device__ void - ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... 
flags) { - GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...); + __device__ void + copySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ void + directCopySend(const T* src, T* dst, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset); + } + + __device__ void + recvCopySend(T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0); + } + __device__ void + directRecvCopySend(T* dst, int directOffset, int nelem) { + GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset); + } + + __device__ void + recvReduceCopy(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0); + } + + __device__ void + recvReduceSend(const T* src, int nelem) { + GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0); + } + + __device__ void + recvReduceCopySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ void + directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) { + // Direct is only for the send part + GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset); + } + + __device__ ~ncclPrimitives() { + // Save steps for next collective. Have thread 0 do it to be compatible + // with the way LL works. 
+ for (int i=0; i +class ncclLLPrimitives { + private: + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + volatile uint64_t* postPtr; + volatile int* fifoPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead; + union ncclLLFifoLine* recvBuff[NRECV]; + union ncclLLFifoLine* sendBuff[NSEND]; + struct ncclDevComm* comm; + uint32_t* abortCount; + + __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } + __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } + +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else + // Exit If Abort Barrier : make sure all threads exit consistently + // Each thread sets a predicate to true if val == 1 + // all CTA's threads enter the barrier and do a popc on their predicates being True + // If any of the thread's predicate was True, all the threads call exit() + __device__ void exitIfAbortLocalBarrier() { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads)); + asm ("}"); + if (popc) { + // Make sure threads not participating in the operation get the abort and all threads exit + exitIfAbortBarrier(1); + } + } +#endif + + __device__ void barrier() { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + __syncthreads(); +#else + asm volatile ("bar.sync 1, %0;" 
:: "r"(nthreads)); +#endif + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + STORE(comm->fatalDevError, ncclDevSuspectedMismatch); + } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = LOAD(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } + + __device__ void waitSend(int i, int nbytes) { + spins = 0; + mismatch = 0; + if (tid == WARP_SIZE+i) { + while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) { + sendConnHead = LOAD(waitPtr); + if (checkAbort(sendConn[i]->opCountRem)) break; + } + if (fifoPtr) { + int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; + STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size); + } + } + } + + __device__ void postRecv(int i) { + recvStep[i]++; + if (tid == i) STORE(postPtr, recvStep[i]); + } + + __device__ void postSend(int i, int offset) { + // LL Cleanup : write all flags in the slice to make sure we don't have + // data corruption when flag loops over. 
+ if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { + for (int o = offset; oopCountRem)); + uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32); +#else + do { + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + if (checkAbort(recvConn[i]->opCountRem)) break; + } while ((flag1 != flag) || (flag2 != flag)); + uint64_t val64 = data1 + (((uint64_t)data2) << 32); +#endif + return val64; + } + + __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + using Vec = uint32_t __attribute__((ext_vector_type(4))); + Vec i4; + i4[0] = val & 0xffffffff; + i4[1] = flag; + i4[2] = (val >> 32); + i4[3] = flag; + asm volatile ("flat_store_dwordx4 %0, %1, glc\n" + "s_waitcnt vmcnt(0)\n" + "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4)); +#else + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); +#endif + } + + // Using memcpy handles misaligned pointers. + __device__ uint64_t readAL(uint64_t* src) { + uint64_t val; + memcpy((char*)&val, (char*)src, sizeof(uint64_t)); + return val; + } + + __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { + memcpy((char*)dst, (char*)&val, nbytes); + } + + template + __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { + uint32_t nbytes = nelem < 0 ? 
0 : nelem*sizeof(T); + FOR_SEND(waitSend, nbytes*2); + barrier(); + uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); + uint64_t* srcPack = (uint64_t*)srcPtr; + uint64_t* dstPack = (uint64_t*)dstPtr; + int offset = tid; + // Do multiples of 64 bits + #pragma unroll 1 + for (; offset()(readLL(0, offset), val); + for (int i=1; i()(readLL(i, offset), val); + } + } + + // Send : inter-node, then intra-node, then local + if (SEND) { + for (int i=1; illBuff; + recvStep[i] = recvConn[i]->step; + if (tid == i) { + postPtr = recvConn[i]->head; + STORE(recvConn[i]->opCountLoc, opCount); + } + nrecv++; + } + + __device__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendConn[i] = conn; + sendBuff[i] = sendConn[i]->llBuff; + sendStep[i] = sendConn[i]->step; + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + fifoPtr = sendConn[i]->fifo; + sendConnHead = LOAD(waitPtr); + STORE(sendConn[i]->opCountLoc, opCount); + } + nsend++; + } + + __device__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST); + __threadfence_block(); + } + } + + __device__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST); + __threadfence_block(); + } + } + + public: + __device__ + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { + // Make sure step is updated before we read it. 
+ abortCount = channel->abortCount; + barrier(); + + for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + } + + __device__ void send(const T* src, int nelem) { + return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recv(T* dst, int nelem) { + return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceSend(const T* src, int nelem) { + return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); + } + + __device__ void copySend(const T* src, T* dst, int nelem) { + return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ ~ncclLLPrimitives() { + // Save steps for the next operation + for (int i=0; iwait_send_cycle[blockIdx.x])); \ + wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); } + +#define ACCUMULATE_COUNTER(prim) \ + if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \ + + ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \ + + wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \ + __ATOMIC_SEQ_CST); \ + __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); } +#else +#define INIT_COUNTER +#define ACCUMULATE_COUNTER(prim) +#endif + +#endif diff --git a/projects/rccl/src/collectives/device/reduce.cu b/projects/rccl/src/collectives/device/reduce.cu index bd1d23ce79..dbfa1b7fad 100644 --- a/projects/rccl/src/collectives/device/reduce.cu +++ b/projects/rccl/src/collectives/device/reduce.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright 
(c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,12 +11,7 @@ #define UNROLL 4 -#if NCCL_OP == 0 IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum); -#elif NCCL_OP == 1 IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd); -#elif NCCL_OP == 2 IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin); -#elif NCCL_OP == 3 IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax); -#endif diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h index c7d6eb11b7..fca4714faf 100644 --- a/projects/rccl/src/collectives/device/reduce.h +++ b/projects/rccl/src/collectives/device/reduce.h @@ -1,153 +1,82 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - template __attribute__((noinline)) -__device__ void ncclReduceKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x; const int bid = args->bid; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->next_hdp_reg); - - typedef Primitives Prims; - + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCE_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * REDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = ring->devUserRanks[0]; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - STORE(ring->recv.conn.opCount, args->opCount); - - if (rank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - 
waitOpCountNext.wait(args->opCount); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (prevRank == root) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else if (rank == root) { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); + prims.recvReduceSend(thisInput+offset, nelem); } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (rank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1)); - 
STORE(ring->send.conn.head, 0ULL); - } - STORE(ring->recv.conn.tail, 0ULL); - __threadfence_system(); - STORE(ring->recv.conn.opCount, args->opCount+1); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template +__attribute__((noinline)) +__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { } template __attribute__((noinline)) -__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int nranks = comm->nRanks; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; const int rank = comm->rank; + const int nranks = comm->nRanks; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - typedef LLPrimitives LL; - - const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union 
ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -155,39 +84,17 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (prevRank == root) { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); } else if (rank == root) { - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceSend(thisInput+offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template +__attribute__((noinline)) +__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/reduce_0.cpp b/projects/rccl/src/collectives/device/reduce_0.cpp deleted file mode 100644 index f1b83bc655..0000000000 --- a/projects/rccl/src/collectives/device/reduce_0.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 0 -#include "device/reduce.cu" diff --git a/projects/rccl/src/collectives/device/reduce_1.cpp b/projects/rccl/src/collectives/device/reduce_1.cpp deleted file mode 100644 index 63b157075e..0000000000 --- a/projects/rccl/src/collectives/device/reduce_1.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 1 -#include "device/reduce.cu" diff --git a/projects/rccl/src/collectives/device/reduce_2.cpp b/projects/rccl/src/collectives/device/reduce_2.cpp deleted file mode 100644 index 7c84b0ada3..0000000000 --- a/projects/rccl/src/collectives/device/reduce_2.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 2 -#include "device/reduce.cu" diff --git a/projects/rccl/src/collectives/device/reduce_3.cpp b/projects/rccl/src/collectives/device/reduce_3.cpp deleted file mode 100644 index c590bdd3c6..0000000000 --- a/projects/rccl/src/collectives/device/reduce_3.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 3 -#include "device/reduce.cu" diff --git a/projects/rccl/src/collectives/device/reduce_kernel.h b/projects/rccl/src/collectives/device/reduce_kernel.h index 86e0f56a12..4c5caa9f28 100644 --- a/projects/rccl/src/collectives/device/reduce_kernel.h +++ b/projects/rccl/src/collectives/device/reduce_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -19,7 +19,7 @@ struct FuncNull { } }; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) //we really don't need any specializations and we don't need //to break things into uint32_t @@ -164,30 +164,31 @@ struct FuncMin { } }; +#define MASK0 0x00ff00ff +#define MASK1 0xff00ff00 +static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) { + /* This can be used both for signed and unsigned 8-bit addition */ + const uint32_t x0 = x & MASK0; + const uint32_t x1 = x & MASK1; + const uint32_t y0 = y & MASK0; + const uint32_t y1 = y & MASK1; + const uint32_t r0 = (x0+y0); + const uint32_t r1 = (x1+y1); + return (r0 & MASK0) | (r1 & MASK1); +} + template<> struct FuncSum { - union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 
700) - int32_t rv; - asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); +#endif #endif } __device__ int8_t operator()(const int8_t x, const int8_t y) const { @@ -196,28 +197,16 @@ struct FuncSum { }; template<> struct FuncSum { - union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); +#endif #endif } __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const { @@ -227,22 +216,6 @@ struct FuncSum { static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { /* This can be used both for signed and unsigned 8-bit multiplication */ -#if (__CUDA_ARCH__ >= 300) - uint32_t rv; - asm("{ .reg .u32 t0, t1, t2, t3;\n\t" - " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t" - " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t" - " shl.b32 
t3, t3, 16;\n\t" - " shl.b32 t2, t2, 16;\n\t" - " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t" - " shl.b32 t1, t1, 8;\n\t" - " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t" - " and.b32 t1, t1, 0xff00ff00;\n\t" - " and.b32 t0, t0, 0x00ff00ff;\n\t" - " or.b32 %0, t0, t1;\n\t" - "}" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else union converter { uint32_t storage; char4 a; }; converter cx, cy, cr; cx.storage = x; @@ -252,7 +225,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { cr.a.z = cx.a.z * cy.a.z; cr.a.w = cx.a.w * cy.a.w; return cr.storage; -#endif } template<> @@ -278,17 +250,12 @@ template<> struct FuncMax { union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -298,6 +265,7 @@ struct FuncMax { cr.a.z = max(cx.a.z, cy.a.z); cr.a.w = max(cx.a.w, cy.a.w); return cr.storage; +#endif #endif } __device__ int8_t operator()(const int8_t x, const int8_t y) const { @@ -308,17 +276,12 @@ template<> struct FuncMax { union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif 
(__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -328,6 +291,7 @@ struct FuncMax { cr.a.z = max(cx.a.z, cy.a.z); cr.a.w = max(cx.a.w, cy.a.w); return cr.storage; +#endif #endif } __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const { @@ -339,17 +303,12 @@ template<> struct FuncMin { union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -359,6 +318,7 @@ struct FuncMin { cr.a.z = min(cx.a.z, cy.a.z); cr.a.w = min(cx.a.w, cy.a.w); return cr.storage; +#endif #endif } __device__ int8_t operator()(const int8_t x, const int8_t y) const { @@ -369,17 +329,12 @@ template<> struct FuncMin { union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; 
- asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -389,6 +344,7 @@ struct FuncMin { cr.a.z = min(cx.a.z, cy.a.z); cr.a.w = min(cx.a.w, cy.a.w); return cr.storage; +#endif #endif } __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const { @@ -480,6 +436,6 @@ struct FuncMin { } }; -#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #endif // REDUCE_KERNEL_H_ diff --git a/projects/rccl/src/collectives/device/reduce_scatter.cu b/projects/rccl/src/collectives/device/reduce_scatter.cu index efff65deba..82cb408a16 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.cu +++ b/projects/rccl/src/collectives/device/reduce_scatter.cu @@ -11,12 +11,7 @@ #define UNROLL 4 -#if NCCL_OP == 0 IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum); -#elif NCCL_OP == 1 IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd); -#elif NCCL_OP == 2 IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax); -#endif +IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax); \ No newline at end of file diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h index bb738766f1..c768d6a365 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.h +++ b/projects/rccl/src/collectives/device/reduce_scatter.h @@ -1,166 +1,93 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template __attribute__((noinline)) -__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x; const int bid = args->bid; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->next_hdp_reg); - - typedef Primitives Prims; - + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - STORE(ring->recv.conn.opCount, args->opCount); - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - __syncthreads(); - - uint64_t step = 
0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*realChunkSize; /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - - NEXT_STEP; // Increases step, poffset, noffset + prims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + prims.recvReduceSend(thisInput+offset, nelem); } - // step k-1: 
reduce this buffer and data, which will produce the final - // result that we store in this data and push to the next GPU + // step k-1: reduce this buffer and data, which will produce the final result rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - thisOutput + chunkOffset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } - - if (tid == 0) { - waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS)); - STORE(ring->send.conn.head, 0ULL); - STORE(ring->recv.conn.tail, 0ULL); - __threadfence_system(); - STORE(ring->recv.conn.opCount, args->opCount+1); + prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; +template +__attribute__((noinline)) +__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { } template __attribute__((noinline)) -__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; - struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives LL; + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); 
const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -170,37 +97,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(chunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final @@ -208,13 +119,10 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * 
size; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + chunkOffset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); } + +template +__attribute__((noinline)) +__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/reduce_scatter_0.cpp b/projects/rccl/src/collectives/device/reduce_scatter_0.cpp deleted file mode 100644 index 936f164605..0000000000 --- a/projects/rccl/src/collectives/device/reduce_scatter_0.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 0 -#include "device/reduce_scatter.cu" diff --git a/projects/rccl/src/collectives/device/reduce_scatter_1.cpp b/projects/rccl/src/collectives/device/reduce_scatter_1.cpp deleted file mode 100644 index 3dbd2466d7..0000000000 --- a/projects/rccl/src/collectives/device/reduce_scatter_1.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 1 -#include "device/reduce_scatter.cu" diff --git a/projects/rccl/src/collectives/device/reduce_scatter_2.cpp b/projects/rccl/src/collectives/device/reduce_scatter_2.cpp deleted file mode 100644 index 7302f55739..0000000000 --- a/projects/rccl/src/collectives/device/reduce_scatter_2.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 2 -#include "device/reduce_scatter.cu" diff --git a/projects/rccl/src/collectives/device/reduce_scatter_3.cpp b/projects/rccl/src/collectives/device/reduce_scatter_3.cpp deleted file mode 100644 index 95a2fc93b7..0000000000 --- a/projects/rccl/src/collectives/device/reduce_scatter_3.cpp +++ /dev/null @@ -1,8 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#define NCCL_OP 3 -#include "device/reduce_scatter.cu" diff --git a/projects/rccl/src/collectives/reduce.cc b/projects/rccl/src/collectives/reduce.cc new file mode 100644 index 0000000000..f53437f86d --- /dev/null +++ b/projects/rccl/src/collectives/reduce.cc @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "collectives.h" + +NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { + struct ncclInfo info = { ncclCollReduce, "Reduce", + sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ + REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cu deleted file mode 100644 index 89dc804b7f..0000000000 --- a/projects/rccl/src/collectives/reduce.cu +++ /dev/null @@ -1,33 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "collectives.h" - -ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm)); - NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1)); - } - - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype, - op, root, comm, stream); -} diff --git a/projects/rccl/src/collectives/reduce_scatter.cc b/projects/rccl/src/collectives/reduce_scatter.cc new file mode 100644 index 0000000000..0ded7c557a --- /dev/null +++ b/projects/rccl/src/collectives/reduce_scatter.cc @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "collectives.h" + +NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream); +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) { + struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter", + sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ + REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cu deleted file mode 100644 index f73d50948d..0000000000 --- a/projects/rccl/src/collectives/reduce_scatter.cu +++ /dev/null @@ -1,32 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include "enqueue.h" -#include "collectives.h" - -ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1)); - } - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream); -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) { - return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, - op, 0, comm, stream); -} diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc new file mode 100644 index 0000000000..0c7b897ec4 --- /dev/null +++ b/projects/rccl/src/enqueue.cc @@ -0,0 +1,441 @@ +/************************************************************************* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "checks.h" +#include "param.h" + +#include "collectives/collectives.h" + +// Only generate inline kernels for LL +#define NCCL_FUNC5(coll, op, dtype) \ + NCCL_KERN_NAME(coll##LL, op, dtype), \ + NCCL_KERN_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + NCCL_FUNC5(coll##Ring, op, dtype) + +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3A(coll, op) \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, u8), \ + NCCL_FUNC4(coll, op, i32), \ + NCCL_FUNC4(coll, op, u32), \ + NCCL_FUNC4(coll, op, i64), \ + NCCL_FUNC4(coll, op, u64), \ + NCCL_FUNC4(coll, op, f16), \ + NCCL_FUNC4(coll, op, f32), \ + NCCL_FUNC4(coll, op, f64) +#define NCCL_FUNCS3B(coll, op) \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8) + +// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums. 
+#define NCCL_FUNCS2A(coll) \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum) +#define NCCL_FUNCS2B(coll) \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy) + +typedef void(*ncclKern_t)(struct ncclColl); +// Must be consistent with the ncclFuncSet enum +static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { + NCCL_FUNCS2B(ncclBroadcast), + NCCL_FUNCS2A(ncclReduce), + NCCL_FUNCS2B(ncclAllGather), + NCCL_FUNCS2A(ncclReduceScatter), + NCCL_FUNCS2A(ncclAllReduce) +}; + +/*****************************************************************************/ +/* Launch system : synchronization and CUDA kernel launch */ +/*****************************************************************************/ + +ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { + if (cgMode & 0x01) { + CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices, + // These flags are to reduce the latency of using this API + 0)); + return ncclSuccess; + } + int savedDev; + CUDACHECK(hipGetDevice(&savedDev)); + for (int i = 0; i < numDevices; i++) { + hipLaunchParams* params = paramsList+i; + CUDACHECK(hipSetDevice(cudaDevs[i])); + hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args))); + } + CUDACHECK(hipSetDevice(savedDev)); + return ncclSuccess; +} + +ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) { + params->gridDim.x = std::min(params->gridDim.x, comm->nChannels); + + // Set active = 2 for the last operation + for (int r=0; rgridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2); + } + + // Find the first operation, 
choose the kernel accordingly and pass it + // as the first argument. + struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart; + memcpy(&comm->args, coll, sizeof(struct ncclColl)); + // As we pass that coll directly, we can free it immediately. + STORE(&coll->active, 0); + + params->func = (void *)ncclKerns[coll->funcIndex]; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = LOAD(ptr); + bool done = false; + while (done == false) { + if (val >= comm->intraRanks) { + WARN("Trying to launch too many collectives"); + return ncclInvalidUsage; + } + if (val+1 == comm->intraRanks) { + // Reset the barrier. + comm->intraBarrier[comm->intraPhase^1] = 0; + *isLast = 1; + return ncclSuccess; + } + done = __sync_bool_compare_and_swap(ptr, val, val+1); + val++; + } + *isLast = 0; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = LOAD(ptr); + if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { + WARN("Trying to launch too many collectives"); + return ncclInternalError; + } + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + while (LOAD(ptr) < comm->intraRanks) pthread_yield(); + comm->intraPhase ^= 1; + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { + if (comm->nRanks == 1) return ncclSuccess; + hipLaunchParams* params = comm->myParams; + + NCCLCHECK(setupLaunch(comm, params)); + + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Enqueue event in user stream + CUDACHECK(hipEventRecord(comm->doneEvent, 
comm->userStream)); + // Create dependency between user stream and internal NCCL stream + CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); + params->stream = comm->groupStream; + } else { + if (comm->userStream != params->stream) { + // Stream changed from last call, create dependency against last NCCL kernel launch + CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + params->stream = comm->userStream; + } + + int isLast = 0; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + + if (isLast) { + if (comm->launchMode == ncclComm::GROUP) { + // I'm the last. Launch all operations. + NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); + } + NCCLCHECK(ncclCpuBarrierLast(comm)); + } + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { + if (comm->nRanks == 1) return ncclSuccess; + // We can't print the CG mode before the first barrier happened. + if (comm->rank == 0 && *comm->intraCGMode & 0x10) { + *comm->intraCGMode ^= 0x10; + INFO(NCCL_INIT,"Launch mode %s%s%s", + comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", + *comm->intraCGMode ? "/CGMD" : "", + (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); + } + + NCCLCHECK(ncclCpuBarrierOut(comm)); + + hipLaunchParams *params = comm->myParams; + if (comm->launchMode == ncclComm::PARALLEL) { + hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args))); + } + // Start the network proxies as soon as the kernel has been launched. We can't + // perform any CUDA call between the two or having a hipFree between the CUDA + // launch and the transportStartProxy call could cause a deadlock. + // Also, starting the proxies after the CUDA launch seems to be better for + // performance (latency). 
+ for (int r=0; rgridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + channel->collStart = channel->collFifoTail; + channel->collCount = 0; + } + params->gridDim.x = params->blockDim.x = 0; + NCCLCHECK(transportStartProxy(comm)); + return ncclSuccess; +} + +ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { + hipLaunchParams *params = comm->myParams; + // Enqueue event after NCCL kernel + CUDACHECK(hipEventRecord(comm->doneEvent, params->stream)); + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Create dependency between NCCL internal stream and user stream + CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + comm->userStreamSet = false; + return ncclSuccess; +} + +/*****************************************************************************/ +/* Enqueueing system : computation of kernel and proxy operations parameters */ +/*****************************************************************************/ + +static ncclResult_t getPatternInfo(struct ncclInfo* info) { + if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom; + else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo; + else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing; + else if (info->coll == ncclCollAllReduce) { + if (info->nBytes <= info->comm->treeThreshold) + info->pattern = ncclPatternTreeUpDown; + else + info->pattern = ncclPatternRingTwice; + } + else { + WARN("Unknown collective %d", info->coll); + return ncclInternalError; + } + return ncclSuccess; +} + +static ncclResult_t getLoopInfo(struct ncclInfo* info) { + switch (info->pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + info->nstepsPerLoop = 
info-> nchunksPerLoop = 1; break; + case ncclPatternRing: + info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; + case ncclPatternRingTwice: + info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; + default: + WARN("Unknown pattern %d\n", info->pattern); + return ncclInternalError; + } + return ncclSuccess; +} + +static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { + // Compute thresholds and limits that users can override + ssize_t perThreadLLThreshold = std::min(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD); + int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); + + // First compute nThreads + int nt = NCCL_LL_MIN_NTHREADS; + while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2; + + // Then compute nChannels + int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold); + if (nc == 0) nc = 1; + if (nc > info->comm->nChannels) nc = info->comm->nChannels; + + // Check if we have a fixed LL threshold, otherwise compute it. + int perThreadThreshold = info->comm->threadThreshold; + if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4; + ssize_t llThreshold = info->comm->llThreshold >= 0 ? 
+ info->comm->llThreshold : + nc*nt*info->nchunksPerLoop*perThreadThreshold; + + if (info->nBytes <= llThreshold) { + *llMode = 1; + *nChannels = nc; + *nThreads = nt; + } else { + *llMode = 0; + *nChannels = info->comm->nChannels; + *nThreads = info->comm->nThreads; + } +} + +static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { + // Set nstepsPerLoop and nchunksPerLoop + NCCLCHECK(getPatternInfo(info)); + NCCLCHECK(getLoopInfo(info)); + + coll->args.root = info->root; + coll->args.N = info->count; + coll->args.ThisInput = info->sendbuff; + coll->args.ThisOutput = info->recvbuff; + coll->args.comm = info->comm->devComm; + coll->args.opCount = info->comm->opCount; + + // Compute llMode, nChannels, nThreads + int llMode; + getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode); + + int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0; + coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode); + + int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps; + int sliceSteps = (llMode|treeMode) ? 
1 : info->sliceSteps; + int chunkSize = stepSize*chunkSteps; + + // Compute lastChunkSize + if (treeMode == 1 && llMode == 0) { + if (info->pattern == ncclPatternTreeUpDown) { + // Optimize chunkSize / nSteps + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; + } + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + } else if (llMode == 1) { + int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); + const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; + coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t)); + coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + } + + // Compute nSteps for proxies + size_t nBytes = llMode ? 
info->nBytes*2 : info->nBytes; + + int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize))); + proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyArgs->sliceSteps = sliceSteps; + proxyArgs->chunkSteps = chunkSteps; + proxyArgs->llMode = llMode; + proxyArgs->opCount = info->comm->opCount; + TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", + coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads, + nLoops, proxyArgs->nsteps, info->comm); + return ncclSuccess; +} + +static ncclResult_t saveKernel(struct ncclInfo* info) { + if (info->comm->nRanks == 1) { + if (info->sendbuff != info->recvbuff) + CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream)); + return ncclSuccess; + } + + struct ncclColl coll; + struct ncclProxyArgs proxyArgs; + memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); + NCCLCHECK(computeColl(info, &coll, &proxyArgs)); + + info->comm->myParams->blockDim.x = std::max(info->comm->myParams->blockDim.x, coll.args.nThreads); + if (info->comm->userStreamSet == false) { + info->comm->userStream = info->stream; + info->comm->userStreamSet = true; + } else if (info->stream != info->comm->userStream) { + WARN("Error : mixing different streams within a group call is not supported."); + return ncclInvalidUsage; + } + for (int bid=0; bidcomm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels); + + if (channel->collCount == NCCL_MAX_OPS) { + WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); + return ncclInvalidUsage; + } + + // Proxy + proxyArgs.channel = channel; + NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); + + info->comm->myParams->gridDim.x++; + + int opIndex = channel->collFifoTail; + struct 
ncclColl* c = channel->collectives+opIndex; + volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; + while (LOAD(activePtr) != 0) sched_yield(); + + memcpy(c, &coll, sizeof(struct ncclColl)); + + c->args.bid = bid; + STORE(&c->active, 1); + opIndex = (opIndex+1)%NCCL_MAX_OPS; + c->nextIndex = opIndex; + channel->collFifoTail = opIndex; + channel->collCount++; + } + /*if (llMode == 0)*/ info->comm->opCount++; + return ncclSuccess; +} + + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + if (info->comm == NULL) return ncclInvalidArgument; + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + + // Launch asynchronously if needed + if (ncclAsyncMode()) { + ncclResult_t ret = ncclSuccess; + int savedDev = -1; + if (info->comm->checkPointers) { + CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end); + CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end); + } + // Check arguments + NCCLCHECKGOTO(ArgsCheck(info), ret, end); + // Always register comm even in case of error to make sure ncclGroupEnd + // cleans it up. 
+ NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); + NCCLCHECKGOTO(saveKernel(info), ret, end); +end: + if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev)); + ncclAsyncErrCheck(ret); + return ret; + } else { + NCCLCHECK(ArgsCheck(info)); + NCCLCHECK(saveKernel(info)); + NCCLCHECK(ncclBarrierEnqueue(info->comm)); + NCCLCHECK(ncclBarrierEnqueueWait(info->comm)); + NCCLCHECK(ncclEnqueueEvents(info->comm)); + return ncclSuccess; + } +} diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h new file mode 100644 index 0000000000..3d0f07aa95 --- /dev/null +++ b/projects/rccl/src/include/alloc.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOC_H_ +#define NCCL_ALLOC_H_ + +#include "nccl.h" +#include "checks.h" +#include + +static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { + CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped)); + memset(*ptr, 0, size); + *devPtr = *ptr; + return ncclSuccess; +} + +static inline ncclResult_t ncclCudaHostFree(void* ptr) { + CUDACHECK(hipHostFree(ptr)); + return ncclSuccess; +} + +template +static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { + void* p = malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memset(p, 0, nelem*sizeof(T)); + *ptr = (T*)p; + return ncclSuccess; +} + +template +static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) { + if (isFineGrain) + CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained)); + else + CUDACHECK(hipMalloc(ptr, nelem*sizeof(T))); + CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); + return ncclSuccess; +} + +template +static 
ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault)); + return ncclSuccess; +} + +#endif diff --git a/projects/rccl/src/include/argcheck.h b/projects/rccl/src/include/argcheck.h new file mode 100644 index 0000000000..0d6cca7c30 --- /dev/null +++ b/projects/rccl/src/include/argcheck.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ARGCHECK_H_ +#define NCCL_ARGCHECK_H_ + +#include "core.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); + +#endif diff --git a/projects/rccl/src/include/bootstrap.h b/projects/rccl/src/include/bootstrap.h index 278593c8cd..dacbc7c5e1 100644 --- a/projects/rccl/src/include/bootstrap.h +++ b/projects/rccl/src/include/bootstrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,9 +9,12 @@ #include "nccl.h" +ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); +ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); ncclResult_t bootstrapClose(void* commState); #endif diff --git a/projects/rccl/src/include/channel.h b/projects/rccl/src/include/channel.h new file mode 100644 index 0000000000..c01d942e4f --- /dev/null +++ b/projects/rccl/src/include/channel.h @@ -0,0 +1,14 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHANNEL_H_ +#define NCCL_CHANNEL_H_ +#include "core.h" + +ncclResult_t initChannel(struct ncclComm* comm, int channelid); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); + +#endif diff --git a/projects/rccl/src/include/checks.h b/projects/rccl/src/include/checks.h new file mode 100644 index 0000000000..5636338d94 --- /dev/null +++ b/projects/rccl/src/include/checks.h @@ -0,0 +1,73 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHECKS_H_ +#define NCCL_CHECKS_H_ + +#include "debug.h" + +// Check CUDA calls +#define CUDACHECK(cmd) do { \ + hipError_t e = cmd; \ + if( e != hipSuccess ) { \ + WARN("Cuda failure '%s'", hipGetErrorString(e)); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUDACHECKGOTO(cmd, res, label) do { \ + hipError_t e = cmd; \ + if( e != hipSuccess ) { \ + WARN("Cuda failure '%s'", hipGetErrorString(e)); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +#include +// Check system calls +#define SYSCHECK(call, name) do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ +} while (false) + +#define SYSCHECKVAL(call, name, retval) do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (false) + +#define SYSCHECKSYNC(call, name, retval) do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ +} while(true) + +// Propagate errors up +#define NCCLCHECK(call) do { \ + ncclResult_t res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + return res; \ + } \ +} while (0); + +#define NCCLCHECKGOTO(call, res, label) do { \ + res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#endif diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h new file mode 100644 index 0000000000..57a9b12c48 --- /dev/null +++ b/projects/rccl/src/include/comm.h @@ -0,0 +1,117 @@ 
+/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COMM_H_ +#define NCCL_COMM_H_ + +#define MAXCHANNELS 16 +#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ + +#define CACHE_LINE_SIZE 64 +#define MEM_ALIGN 4096 +#define CUDA_IPC_MIN 2097152UL + +struct ncclSendMem { + union { + struct { + uint64_t head; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + void* ptrExchange; + char pad2[CACHE_LINE_SIZE-sizeof(void*)]; + uint64_t opCount; + }; + char pad3[MEM_ALIGN]; + }; +}; + +struct ncclRecvMem { + union { + struct { + uint64_t tail; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + uint64_t opCount; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; + }; + char pad4[MEM_ALIGN]; + }; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; + char buff[1]; // Actually larger than that +}; + +struct ncclComm { + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + + void* bootstrap; + + int rank; // my rank in the communicator + int nRanks; // number of GPUs in communicator + int cudaDev; // my cuda device index + int nvmlDev; // my NVML device number + + enum { GROUP, PARALLEL } launchMode; + hipStream_t userStream; + bool userStreamSet; + hipEvent_t doneEvent; + bool checkPointers; + + // Counter to make sure collectives match (needed for bcast/reduce + // where syncs are not symmetric). 
+ uint64_t opCount; + + // Channels for collectives + int nChannels; + int nThreads; + + // Low-latency algorithm threshold + ssize_t llThreshold; + ssize_t threadThreshold; + + // Tree algorithm threshold + ssize_t treeThreshold; + + // An internal CUDA stream for NCCL kernel CGMD launches + int groupCudaStream; + hipStream_t groupStream; + + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + + // Device side of the communicator + struct ncclDevComm *devComm; + // Host copy of the devComm (to free CUDA allocs) + struct ncclDevComm hostDevComm; + + // Intra-process sync + int intraRank; + int intraRanks; + int* intraBarrier; + int intraPhase; + + // Storage for deferred intra-process launch + hipLaunchParams * intraParams; + hipLaunchParams *myParams; + int* intraCudaDevs; + int* intraCGMode; // Whether we can use CUDA9 CGMD or not + int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not + struct ncclColl args; + struct ncclColl* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; +}; + +#endif diff --git a/projects/rccl/src/include/common_coll.h b/projects/rccl/src/include/common_coll.h deleted file mode 100644 index be9aa0023f..0000000000 --- a/projects/rccl/src/include/common_coll.h +++ /dev/null @@ -1,196 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COMMON_COLL_H_ -#define COMMON_COLL_H_ - -#include "core.h" -#include "enqueue.h" -#include "collectives/collectives.h" - -static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { - hipPointerAttribute_t attr; - hipError_t err = hipPointerGetAttributes(&attr, pointer); - if (err != hipSuccess || attr.devicePointer == NULL) { - WARN("%s : %s is not a valid pointer", opname, ptrname); - return ncclInvalidArgument; - } -#if CUDART_VERSION >= 10000 - if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) { -#else - if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) { -#endif - WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { - if (ptr == NULL) { - WARN("%s : %s argument is NULL", opname, ptrname); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) { - NCCLCHECK(PtrCheck(comm, opname, "comm")); - // First, the easy ones - if (root < 0 || root >= comm->nRanks) { - WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks); - return ncclInvalidArgument; - } - if (type < 0 || type >= ncclNumTypes) { - WARN("%s : invalid type %d", opname, type); - return ncclInvalidArgument; - } - if (op < 0 || op >= ncclNumOps) { - WARN("%s : invalid reduction operation %d", opname, op); - return ncclInvalidArgument; - } - - if (comm->checkPointers) { - // Check CUDA device pointers - if (strcmp(opname, "Broadcast") != 0 || 
comm->rank == root) { - NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname)); - } - if (strcmp(opname, "Reduce") != 0 || comm->rank == root) { - NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname)); - } - } - return ncclSuccess; -} - -static __inline__ int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - -// In : comm, nbytes ; Out : nrings, nthreads, ll -// - We start with the minimum number of threads possible (64) and see if the size fits in LL; -// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default) -// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads -// This ensures we don't use a large number of rings with a small number of threads -// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads -// we use NCCL_THREAD_THRESHOLD when we reach the max -// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting -// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too -static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) { - *ll = 0; - int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */ - if (comm->llThreshold >= 0) { /* user sets total LL threshold */ - if (nbytes > comm->llThreshold) { /* non-LL */ - *nthreads = comm->nThreads; - *nrings = comm->nRings; - return; - } else { - llEnforced = 1; /* user wants to use LL */ - } - } - int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */ - size_t nr; - int ll_max_nthreads = 
std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */ - int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS; - ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD); - while (nt < ll_max_nthreads && *ll == 0) { - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks)); - if (nr <= maxRings) { /* avoid using few threads but many rings */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1; - } - if (*ll == 0) { - nt = nt << 1; - } - } - if (*ll == 1) { - *nthreads = nt; - *nrings = (int)nr; - return; /* we can use smaller number of threads to make LL work, stop here */ - } - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1; - *nthreads = *ll ? ll_max_nthreads : comm->nThreads; - *nrings = *ll ? 
(int)nr : comm->nRings; -} - -static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream, size_t nbytes, int loopFactor) { - int llMode, nBlocks, nThreads; - ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode); - comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads); - if (comm->userStreamSet == false) { - comm->userStream = stream; - comm->userStreamSet = true; - } else if (stream != comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } - int lastChunkSize = 0; - if (llMode == 1) { - int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype); - const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize; - lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor); - ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype)); - } - for (int bid=0; bidrings+(comm->myParams->gridDim.x % comm->nRings); - if (ring->collCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); - return ncclInvalidUsage; - } - - comm->myParams->gridDim.x++; - - int opIndex = ring->collFifoTail; - struct ncclColl* c = ring->collectives+opIndex; - volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; - while (LOAD(activePtr) != 0) sched_yield(); - - struct CollectiveArgs* args = &c->args; - args->root = root; - args->N = count; - args->ThisInput = sendbuff; - args->ThisOutput = recvbuff; - args->comm = comm->devComm; - args->opCount = comm->opCount; - args->bid = bid; - args->nRings = nBlocks; - args->nThreads = nThreads; - args->lastChunkSize = lastChunkSize; - - c->nThreads = nThreads; - c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode); - STORE(&c->active, 1); - opIndex = (opIndex+1)%NCCL_MAX_OPS; - c->nextIndex = opIndex; - ring->collFifoTail = opIndex; - 
ring->collCount++; - } - /*if (llMode == 0)*/ comm->opCount++; - return ncclSuccess; -} - -extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl); - -#endif diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h index 2e803facbc..8a08b914b0 100644 --- a/projects/rccl/src/include/core.h +++ b/projects/rccl/src/include/core.h @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,313 +7,20 @@ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ -#define NCCL_MAX_OPS 2048 - +#include +#include #include "nccl.h" -#include "transport.h" #include "debug.h" +#include "checks.h" +#include "alloc.h" +#include "transport.h" +#include "devcomm.h" +#include "comm.h" +#include "info.h" +#include "argcheck.h" #include -#include // std::min/std::max #include #include -#include -#include - -#define MAXRINGS 16 -#define MAXTHREADS 256 -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - -// Rings / LL tuning -#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL for Volta and above -#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS 256 -#define NCCL_LL_MIN_NTHREADS 256 - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network 
may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -struct ncclConnInfo { - // Regular comm mechanism - char *buff; // Local for recv, remote for send - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - uint64_t *opCount; // Local for recv, remote for send - - int direct; // Direct communication - void **ptrExchange; // Pointer exchange for direct communication - - int *fifo; // Size fifo for proxy - - // Low latency mechanism - char *llBuff; // Local for recv, remote for send - uint64_t *llHead; // Local for send, remote for recv - int *llFifo; // LL Size fifo for proxy - uint64_t llStep; // Keep where we are - uint64_t llLastCleaning; -}; - -struct ncclConnector { - struct transportProxyInfo* proxyInfo; - struct ncclTransport* transport; - void* transportResources; // Host-side resources - struct ncclConnInfo conn; -}; - -#define CACHE_LINE_SIZE 64 -#define MEM_ALIGN 4096 -#define SIZES_FIFO_SIZE 16 -#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ - -#define NCCL_LL_CHUNKS 8 -#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K -#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t))) -#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS) -#define NCCL_LL_CLEAN_FREQ 0x10000000 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t llHead; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; - 
uint64_t opCount; - char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[SIZES_FIFO_SIZE]; - int llSizesFifo[SIZES_FIFO_SIZE]; - }; - char pad5[MEM_ALIGN]; - }; - char llBuff[NCCL_LL_BUFF_SIZE]; - char buff[1]; // Actually larger than that -}; - -struct ncclRing { - union { - struct { - int id; - int nthreads; - // Per ring resources - struct ncclSendMem* devMemSend; // CUDA-size resources - struct ncclRecvMem* devMemRecv; // CUDA-size resources - int buffSize; - int devMemSendSize; // Keep the size for IPCs - int devMemRecvSize; // Keep the size for IPCs - struct ncclConnector send; - struct ncclConnector recv; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; - - // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register - // allows software to explicitly initiate a flush read to HDP memory. See more - // descriptions in primitives.h. - uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) - uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) - - // Operation list for aggregation - struct ncclColl* collectives; - struct ncclColl* devCollectives; - int collStart; - int collCount; - int collFifoHead; // Only used by GPU - int collFifoTail; // Only used by CPU - }; - int data[0x80]; - }; -}; -static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size"); - -#pragma pack(push) /* push current alignment to stack */ -#pragma pack(4) /* set alignment to 4 bytes boundary */ -/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclColl. 
*/ -struct CollectiveArgs { - struct ncclComm* comm; - uint64_t opCount; - - // local and remote input, output, and buffer - const void * ThisInput; - void * ThisOutput; - - // general parameters - size_t N; - uint32_t root; - uint8_t bid; - uint8_t nRings; - uint16_t nThreads; - - int lastChunkSize; -}; -struct ncclColl { - union { - struct { - struct CollectiveArgs args; - uint16_t nThreads; - uint16_t funcIndex; - uint16_t nextIndex; - uint8_t active; - }; - int data[0x10]; - }; -}; -static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); -#pragma pack(pop) /* restore original alignment from stack */ - -struct ncclComm { - struct ncclRing rings[MAXRINGS]; - - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - - enum { GROUP, PARALLEL } launchMode; - hipStream_t userStream; - bool userStreamSet; - hipEvent_t doneEvent; - bool checkPointers; - - // Counter to make sure collectives match (needed for bcast/reduce - // where syncs are not symmetric). 
- uint64_t opCount; - - // Rings for collectives - int nRings; - int nThreads; - - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; - - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - hipStream_t groupStream; - - // Device copy of the communicator - struct ncclComm *devComm; - - // Intra-process sync - int intraRank; - int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - hipLaunchParams* intraParams; - hipLaunchParams* myParams; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclColl args; - struct ncclColl* argsptr; -}; - -// Convert volatile access to atomic -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) -#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) -#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) -#else -#define LOAD(VAR) *(VAR) -#define STORE(DST, SRC) *(DST) = (SRC) -#endif - -// Check CUDA calls -#define CUDACHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - WARN("Cuda failure '%s'", hipGetErrorString(e)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, res, label) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ - WARN("Cuda failure '%s'", hipGetErrorString(e)); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -#include -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == 
EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, res, label) do { \ - res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ -} while (0); #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ @@ -333,51 +39,27 @@ struct ncclComm { #endif // end PROFAPI int ncclCudaCompCap(); +ncclResult_t ncclNvlinkGpu(int* nvlink); +int64_t ncclTreeThreshold(); -#include -static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { - CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped)); - memset(*ptr, 0, size); - *devPtr = *ptr; - return ncclSuccess; -} - -static inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(hipHostFree(ptr)); - return ncclSuccess; -} - -template -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; +static __inline__ int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; } - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} - -template -static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) { - if (isFineGrain) { - hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), 
hipDeviceMallocFinegrained); - if (e != hipSuccess) { - *ptr = 0; - return ncclInvalidUsage; - } - } - else - CUDACHECK(hipMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); - return ncclSuccess; -} - -template -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault)); - return ncclSuccess; } #endif // end include guard diff --git a/projects/rccl/src/include/cpuset.h b/projects/rccl/src/include/cpuset.h new file mode 100644 index 0000000000..98b93de87d --- /dev/null +++ b/projects/rccl/src/include/cpuset.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CPUSET_H_ +#define NCCL_CPUSET_H_ + +// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t + +static int hexToInt(char c) { + int v = c - '0'; + if (v < 0) return -1; + if (v > 9) v = 10 + c - 'a'; + if ((v < 0) || (v > 15)) return -1; + return v; +} + +#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) + +ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) { + uint32_t cpumasks[CPU_SET_N_U32]; + int m = CPU_SET_N_U32-1; + cpumasks[m] = 0; + for (int o=0; o=0; o--) { + if (c == 0 && m8[o] == 0) continue; + sprintf(str+c, "%02x", m8[o]); + c+=2; + if (o && o%4 == 0) { + sprintf(str+c, ","); + c++; + } + } + str[c] = '\0'; + return ncclSuccess; +} + +#endif diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h index 1ef87d9f6a..c3e8fa04bd 100644 --- a/projects/rccl/src/include/debug.h +++ b/projects/rccl/src/include/debug.h @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -25,7 +24,8 @@ extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen); +extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); +extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); @@ -108,7 +108,7 @@ static inline void initDebug() { break; case 'h': // %h = hostname char hostname[1024]; - getHostName(hostname, 1024); + getHostName(hostname, 1024, '.'); dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h new file mode 100644 index 0000000000..30eccab7b8 --- /dev/null +++ b/projects/rccl/src/include/devcomm.h @@ -0,0 +1,259 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_H_ +#define NCCL_DEVICE_H_ + +#include "nccl.h" +#include + +// Convert volatile access to atomic +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) +#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) +#else +#define LOAD(VAR) *(VAR) +#define STORE(DST, SRC) *(DST) = (SRC) +#endif + +#define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 + +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +union ncclLLFifoLine { + /* Flags have to be *after* data, because otherwise, an incomplete receive + from the network may receive the flag but not the data. + Note this is assuming that either we receive contiguous chunks of data + (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). 
*/ + struct { + uint32_t data1; + uint32_t flag1; + uint32_t data2; + uint32_t flag2; + }; + uint64_t v[2]; + int4 i4; +}; + +#define MAXTHREADS 256 +#define NCCL_LL_MAX_NTHREADS MAXTHREADS +#define NUM_LINES_PER_THREAD 8 +#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) +#ifdef DEBUG_LL +#define NCCL_LL_CLEAN_MASK 0x00000ff8 +#define NCCL_LL_FLAG_MAX 0x00001000 +#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX)) +#else +#define NCCL_LL_CLEAN_MASK 0x7ffffff8 +#define NCCL_LL_FLAG(a) ((uint32_t)(a)) +#endif +// Make sure the clean mask will last for at least NCCL_STEPS +static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); + +struct ncclConnInfo { + // Regular comm mechanism + char *buff; // Local for recv, remote for send + uint64_t *tail; // Local for recv, remote for send + uint64_t *head; // Local for send, remote for recv + uint64_t *opCountLoc; // opCount of local rank + uint64_t *opCountRem; // opCount of remote rank + + int direct; // Direct communication + void **ptrExchange; // Pointer exchange for direct communication + + int *fifo; // Size fifo for proxy + + uint64_t step; // Keep where we are + + // Low latency mechanism + union ncclLLFifoLine *llBuff; // Local for recv, remote for send + uint64_t llLastCleaning; + + // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register + // allows software to explicitly initiate a flush read to HDP memory. See more + // descriptions in primitives.h.
+ uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) + uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) +}; + +struct ncclConnector { + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; + void* transportResources; // Host-side resources + struct ncclConnInfo conn; + struct ncclComm *comm; +}; + +struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclDevComm; + +#pragma pack(push) /* push current alignment to stack */ +#pragma pack(4) /* set alignment to 4 bytes boundary */ +/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ +/* to make sure reads to host from the CUDA kernel are aligned. */ +/* Make sure to adjust padding at the end of ncclColl. 
*/ +struct CollectiveArgs { + struct ncclDevComm* comm; + uint64_t opCount; + + // local and remote input, output, and buffer + const void * ThisInput; + void * ThisOutput; + + // general parameters + size_t N; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint16_t nThreads; + + int lastChunkSize; +}; +struct ncclColl { + union { + struct { + struct CollectiveArgs args; + uint16_t funcIndex; + uint16_t nextIndex; + uint8_t active; + }; + int data[0x10]; + }; +}; +static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); + +struct ncclChannel { + union { + struct { + struct ncclRing ring; + struct ncclTree tree; + + int id; + int nthreads; + int buffSize; + + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; + + // Operation list for aggregation + struct ncclColl* collectives; + struct ncclColl* devCollectives; + int collStart; + int collCount; + int collFifoHead; // Only used by GPU + int collFifoTail; // Only used by CPU + + uint32_t* abortCount; + }; + int data[0x80]; + }; +}; +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); +#pragma pack(pop) /* restore original alignment from stack */ + +#define MAXCHANNELS 16 + +#ifdef ENABLE_PROFILING +struct ncclProf { + union { + struct { + uint64_t total_cycle; + uint64_t wait_send_cycle[MAXCHANNELS]; + uint64_t wait_recv_cycle[MAXCHANNELS]; + // primitive cycles + uint64_t send_cycle; + uint64_t directSend_cycle; + uint64_t recv_cycle; + uint64_t directRecv_cycle; + uint64_t copySend_cycle; + uint64_t directCopySend_cycle; + uint64_t recvCopySend_cycle; + uint64_t directRecvCopySend_cycle; + uint64_t recvReduceCopy_cycle; + uint64_t recvReduceSend_cycle; + uint64_t recvReduceCopySend_cycle; + uint64_t directRecvReduceCopySend_cycle; + // primitive bytes + uint64_t send_byte; + uint64_t directSend_byte; + uint64_t recv_byte; + uint64_t directRecv_byte; + uint64_t copySend_byte; + uint64_t
directCopySend_byte; + uint64_t recvCopySend_byte; + uint64_t directRecvCopySend_byte; + uint64_t recvReduceCopy_byte; + uint64_t recvReduceSend_byte; + uint64_t recvReduceCopySend_byte; + uint64_t directRecvReduceCopySend_byte; + }; + int data[0x80]; + }; +}; +#endif + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +struct ncclDevComm { + int rank; + int nRanks; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + volatile ncclDevError_t *fatalDevError; + + // Channels, device side + struct ncclChannel* channels; + +#ifdef ENABLE_PROFILING + // Profiling counters + struct ncclProf* devProf; +#endif +}; + +#endif diff --git a/projects/rccl/src/include/enqueue.h b/projects/rccl/src/include/enqueue.h index f17639826e..c40957df91 100644 --- a/projects/rccl/src/include/enqueue.h +++ b/projects/rccl/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -11,12 +11,14 @@ #include "core.h" #include "group.h" -typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); +// Channels / LL tuning +#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings +#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL +#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs +#define NCCL_THREAD_THRESHOLD_VEGA 8 // Per thread size before we switch to non-LL for VEGA +#define NCCL_LL_MIN_NTHREADS 256 -ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff, - void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, - ncclComm_t comm, hipStream_t stream); +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); ncclResult_t ncclCpuBarrierOut(ncclComm_t comm); diff --git a/projects/rccl/src/include/ibvwrap.h b/projects/rccl/src/include/ibvwrap.h index 4f3e8311dc..0943f9962c 100644 --- a/projects/rccl/src/include/ibvwrap.h +++ b/projects/rccl/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/info.h b/projects/rccl/src/include/info.h new file mode 100644 index 0000000000..dfb8c2f280 --- /dev/null +++ b/projects/rccl/src/include/info.h @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INFO_H_ +#define NCCL_INFO_H_ + +#include "nccl.h" + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclColl_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + hipStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclPattern_t pattern; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + +#endif diff --git a/projects/rccl/src/include/nccl_net.h b/projects/rccl/src/include/nccl_net.h index ce3f6cab6d..797c759e69 100644 --- a/projects/rccl/src/include/nccl_net.h +++ b/projects/rccl/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -58,8 +58,51 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v1_t; -typedef ncclNet_v1_t ncclNet_t; +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Return the device path in /sys. NCCL will call free on this path. + ncclResult_t (*pciPath)(int dev, char** path); + // Return whether this device supports host pointers and/or CUDA pointers + // as data from the current GPU. Supported types should be composed with + // NCCL_PTR_HOST and NCCL_PTR_CUDA. + ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v2_t; -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1 +typedef ncclNet_v2_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2 #endif // end include guard diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h index ebc967782c..950b5e5c0c 100644 --- a/projects/rccl/src/include/net.h +++ b/projects/rccl/src/include/net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -13,11 +13,6 @@ extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -/* Socket Interface Selection type */ -typedef enum { findSubnetIf = -1, - dontCareIf = -2 -} ncclSocketIfSl_t; - // Translation to external API static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } @@ -26,15 +21,16 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK( static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; } +static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** 
request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; } static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } -extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/projects/rccl/src/include/nvlink.h b/projects/rccl/src/include/nvlink.h index 28976386bb..5806b4d511 100644 --- a/projects/rccl/src/include/nvlink.h +++ b/projects/rccl/src/include/nvlink.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -19,6 +19,7 @@ enum ncclNvLinkDeviceType { ncclNvLinkDeviceGpu, ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) }; static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { @@ -26,7 +27,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); char* rPath = realpath(classPath, NULL); int fd; - SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd); + if ((fd = open(rPath, O_RDONLY)) == -1) { + // Could not find device. It might be because we're in a VM and + // we don't see the whole machine. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); + return ncclSystemError; + } free(rPath); char pciClass[9]; strncpy(pciClass, "0x000000", 9); @@ -36,6 +43,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* if (strcmp(pciClass, "0x068000") == 0) { // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) *type = ncclNvLinkDeviceSwitch; + } else if (strcmp(pciClass, "0x068001") == 0) { + // PCI device is of type "Bridge: IBM Device 04ea" + *type = ncclNvLinkDeviceBridge; } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) *type = ncclNvLinkDeviceGpu; @@ -49,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* /* Get the maximum number of NVLinks based on the GPU generation */ static ncclResult_t getMaxNvlinks(int* maxLinks) { int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(hipGetDevice(&cudaDev)); int ccMajor; - CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev)); + CUDACHECK(hipDeviceGetAttribute(&ccMajor, 
hipDeviceAttributeComputeCapabilityMajor, cudaDev)); // 6 for Volta, 4 for Pascal *maxLinks = (ccMajor > 6) ? 6 : 4; // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks); @@ -68,18 +78,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { if (res != ncclSuccess) return 0; for(int l=0; l 6 ? 6 : 4; - for(int l=0; l +#include +#include "nvmlwrap.h" #include "topo.h" #define CONNECT_NVLINK 0x10 #define CONNECT_NVSWITCH 0x100 -static int getNumNvlinks(const char* busId) { - return 0; +enum ncclNvLinkDeviceType { + ncclNvLinkDeviceGpu, + ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) +}; + +static int getNvlinkGpu(const char* busId1, const char* busId2) { + int links = 0; + return CONNECT_NVLINK*links; } #endif diff --git a/projects/rccl/src/include/nvmlwrap.h b/projects/rccl/src/include/nvmlwrap.h index ddfd233d74..f658279807 100644 --- a/projects/rccl/src/include/nvmlwrap.h +++ b/projects/rccl/src/include/nvmlwrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,7 +7,7 @@ #ifndef NCCL_NVMLWRAP_H_ #define NCCL_NVMLWRAP_H_ -#include "core.h" +#include "nccl.h" //#define NVML_DIRECT 1 #ifdef NVML_DIRECT @@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) NVMLCHECK(nvmlDeviceGetIndex(device, index)); return ncclSuccess; } -static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - NVMLCHECK(nvmlDeviceSetCpuAffinity(device)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - NVMLCHECK(nvmlDeviceClearCpuAffinity(device)); - return ncclSuccess; -} static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device)); return ncclSuccess; @@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult)); return ncclSuccess; } +static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber)); + return ncclSuccess; +} #else // Dynamically handle dependencies on NVML @@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void); ncclResult_t wrapNvmlShutdown(void); ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device); -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device); ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci); ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); 
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber); + #endif // NVML_DIRECT #endif // End include guard diff --git a/projects/rccl/src/include/param.h b/projects/rccl/src/include/param.h index dd5f697e34..54317571e7 100644 --- a/projects/rccl/src/include/param.h +++ b/projects/rccl/src/include/param.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) { s++; strncpy(envValue, line+s, 1024); setenv(envVar, envValue, 0); - char *str = getenv(envVar); } if (line) free(line); fclose(file); diff --git a/projects/rccl/src/include/ring.h b/projects/rccl/src/include/ring.h deleted file mode 100644 index fa5e09959f..0000000000 --- a/projects/rccl/src/include/ring.h +++ /dev/null @@ -1,14 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_RING_H_ -#define NCCL_RING_H_ -#include "core.h" - -ncclResult_t initRing(struct ncclComm* comm, int ringid); -ncclResult_t freeRing(struct ncclRing* ring); - -#endif diff --git a/projects/rccl/src/include/rings.h b/projects/rccl/src/include/rings.h index 3b4c311102..f634cbe071 100644 --- a/projects/rccl/src/include/rings.h +++ b/projects/rccl/src/include/rings.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -9,14 +9,13 @@ #define NCCL_RINGS_H_ static int getDefaultThreads() { - // On Kepler, rings are doubled later. -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) return 256; -#else +#else // On Kepler, rings are doubled later. return ncclCudaCompCap() == 3 ? 128 : 256; #endif } -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next); +ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut); #endif diff --git a/projects/rccl/src/include/shm.h b/projects/rccl/src/include/shm.h index 850ecae5ce..17861bed62 100644 --- a/projects/rccl/src/include/shm.h +++ b/projects/rccl/src/include/shm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h index 624af403f8..68ce235d62 100644 --- a/projects/rccl/src/include/socket.h +++ b/projects/rccl/src/include/socket.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -18,8 +18,9 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // sleep interval in usec -#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec) +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) /* Common socket address storage structure for IPv4/IPv6 */ union socketAddress { @@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) { return buf; } -static inline short socketToPort(struct sockaddr *saddr) { +static inline uint16_t socketToPort(struct sockaddr *saddr) { return ntohs(saddr->sa_family == AF_INET ? 
((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port); } @@ -60,9 +61,12 @@ static inline int envSocketFamily(void) { } static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +#ifdef ENABLE_TRACE char line[1024]; +#endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; + bool searchExact = prefixList && prefixList[0] == '='; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; @@ -89,7 +93,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre } // check against user specified interfaces - if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) { + if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } @@ -106,7 +110,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre // Store the IP address int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); memcpy(addrs+found, interface->ifa_addr, salen); - INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); found++; } } @@ -159,7 +162,10 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { } static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) { - char line[1024], line_a[1024]; +#ifdef ENABLE_TRACE + char line[1024]; +#endif + char line_a[1024]; int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); @@ -183,7 +189,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); found++; if (found == maxIfs) break; } @@ -336,8 +342,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); #endif - /* Put the socket in listen mode */ - SYSCHECK(listen(sockfd, 128), "listen"); + /* Put the socket in listen mode + * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn + */ + SYSCHECK(listen(sockfd, 16384), "listen"); *fd = sockfd; return ncclSuccess; } @@ -367,14 +375,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { #endif int ret; - int retries = 0; + int timedout_retries = 0; + int refused_retries = 
0; retry: SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); if (ret == 0) return ncclSuccess; - if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) { - INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \ - usleep(SLEEP_INT); - goto retry; + if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { + if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } } WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno)); return ncclSystemError; @@ -382,12 +394,12 @@ retry: #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { +static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) { int bytes = 0; char* data = (char*)ptr; do { - if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT); + if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 
0 : MSG_DONTWAIT); if (op == NCCL_SOCKET_RECV && bytes == 0) { WARN("Net : Connection closed by remote peer"); return ncclSystemError; @@ -405,9 +417,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off return ncclSuccess; } +static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { + return socketProgressOpt(op, fd, ptr, size, offset, 0); +} + static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) { while (*offset < size) - NCCLCHECK(socketProgress(op, fd, ptr, size, offset)); + NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1)); return ncclSuccess; } diff --git a/projects/rccl/src/include/topo.h b/projects/rccl/src/include/topo.h index d14e38690e..69cd100743 100644 --- a/projects/rccl/src/include/topo.h +++ b/projects/rccl/src/include/topo.h @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -12,78 +11,35 @@ #include #include #include -#include -#include -#include +#include -#define BUSID_SIZE (sizeof("0000:00:00.0")) -#define BUSID_REDUCED_SIZE (sizeof("0000:00")) +ncclResult_t getCudaPath(int cudaDev, char** path); -static bool isEPYC() { - std::ifstream cpuinfo("/proc/cpuinfo"); - std::string line; - int needed = 2; - static bool vendor_id = true, cpu_family = false, initialized = false; - if (initialized) return (vendor_id && cpu_family); - while (std::getline(cpuinfo, line)) { - if (line.compare(0, 9, "vendor_id") == 0) { - if(line.find("AuthenticAMD") == std::string::npos) - vendor_id = false; - needed --; - } - if (line.compare(0, 10, "cpu family") == 0) { - std::string family_str = line.substr(line.find(": ") + 2); - if (std::stoi(family_str) >= 23) - cpu_family = true; - needed --; - } - if (!needed) - break; - } - initialized = true; - return (vendor_id && cpu_family); -} +static int getNumaId(char *path) { + char npath[PATH_MAX]; + snprintf(npath, PATH_MAX, "%s/numa_node", path); + npath[PATH_MAX-1] = '\0'; -static ncclResult_t getCudaPath(int cudaDev, char** path) { - char busId[BUSID_SIZE]; - CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); - for (int i=0; i +#include "nvmlwrap.h" #define NTRANSPORTS 3 @@ -19,11 +21,13 @@ struct ncclRing; struct ncclConnector; struct ncclComm; -#define RANK_INFO_SIZE 64 -typedef char ncclTinfo_t[RANK_INFO_SIZE]; - -struct ncclInfo { - ncclTinfo_t tinfo[NTRANSPORTS]; +struct ncclPeerInfo { + int rank; + int cudaDev; + int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; }; // Used to hold the transport connection values @@ -34,18 +38,47 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; + +struct ncclProxyArgs; +typedef ncclResult_t 
(*proxyProgressFunc_t)(struct ncclProxyArgs*); + struct ncclProxyArgs { - struct ncclRing* ring; - int substeps; + proxyProgressFunc_t progress; + struct ncclChannel* channel; + struct ncclConnector* connector; + int sliceSteps; + int chunkSteps; int nsteps; uint64_t opCount; int llMode; - bool needProxy; - int active; // add component before this line -- it is left out during initialization + int state; // add component before this line -- it is left out during initialization + + // Internal state + uint64_t head; + uint64_t tail; + uint64_t end; + void* requests[NCCL_STEPS]; + int idle; + + // Element linking + pthread_mutex_t mutex; + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; +}; + +struct ncclProxyPool; +struct ncclProxyState { + pthread_cond_t cond; + pthread_mutex_t mutex; + bool stop; + struct ncclProxyArgs* ops; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; }; struct ncclTransportComm { - ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*); + ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -53,8 +86,7 @@ struct ncclTransportComm { struct ncclTransport { const char name[4]; - ncclResult_t (*fillInfo)(ncclTinfo_t*, int, uint64_t); - ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*); + ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*); ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*); struct ncclTransportComm send; struct ncclTransportComm recv; @@ -64,37 +96,17 @@ struct ncclTransport { typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); -#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS - -struct transportProxyInfo { - struct ncclComm* comm; - pthread_t 
thread; - threadFunc_t func; - volatile int proxyReady; - struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE]; - volatile uint64_t argsFifoHead; - volatile uint64_t argsFifoTail; - pthread_cond_t cond; - pthread_mutex_t mutex; -}; - -ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm); -ncclResult_t transportDestroyProxy(struct ncclConnector* connector); - enum proxyMode { proxyRing = 0, proxyFrom = 1, proxyTo = 2 }; -static int proxyPatternRing = proxyRing; -static inline int proxyPatternFrom(int root) { return 1+root; } -static inline int proxyPatternTo(int root) { return -1-root; } -static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); } -static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; } - -ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm); -ncclResult_t transportStartProxies(struct ncclComm* comm); +ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr); +ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks); +ncclResult_t transportStartProxy(struct ncclComm* comm); +ncclResult_t transportCreateProxy(struct ncclComm* comm); +ncclResult_t transportDestroyProxy(struct ncclComm* comm); #include @@ -106,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) { } } -inline void transportProxyIdle(int idle) { - sched_yield(); -} - #endif diff --git a/projects/rccl/src/include/trees.h b/projects/rccl/src/include/trees.h new file mode 100644 index 0000000000..7eadd8556e --- /dev/null +++ b/projects/rccl/src/include/trees.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TREES_H_ +#define NCCL_TREES_H_ + +ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0); +ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1); + +#endif diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h index 0ed875c161..2282f5cce3 100644 --- a/projects/rccl/src/include/utils.h +++ b/projects/rccl/src/include/utils.h @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,7 +11,7 @@ #include "nccl.h" #include -ncclResult_t getHostName(char* hostname, int maxlen); +ncclResult_t getHostName(char* hostname, int maxlen, const char delim); uint64_t getnHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); @@ -21,6 +22,6 @@ struct netIf { }; int parseStringList(const char* string, struct netIf* ifList, int maxList); -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize); +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); #endif diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc new file mode 100644 index 0000000000..320b5d4f35 --- /dev/null +++ b/projects/rccl/src/init.cc @@ -0,0 +1,1369 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "core.h" +#include "channel.h" +#include "param.h" +#include "nvmlwrap.h" +#include "rings.h" +#include "trees.h" +#include "bootstrap.h" +#include "transport.h" +#include "group.h" +#include "utils.h" +#include "net.h" +#include "checks.h" +#include "enqueue.h" +#include "topo.h" +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#include "nvlink_stub.h" +#else +#include "nvlink.h" +#endif +#include "cpuset.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STR2(v) #v +#define STR(v) STR2(v) + +int ncclDebugLevel; +uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT +pthread_mutex_t ncclDebugOutputLock; +FILE *ncclDebugFile = stdout; + +#ifdef ENABLE_TRACE +std::chrono::high_resolution_clock::time_point ncclEpoch; +#endif + +#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream +#else +#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream +#endif + +NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); + +NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); + +ncclNet_t* ncclNet = NULL; + +// We define this as weak to let tests redefine their own +#pragma weak ncclNvlinkGpu +ncclResult_t ncclNvlinkGpu(int* nvlink) { + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + *nvlink = getNvlinkGpu(busId, NULL); + return ncclSuccess; +} +// We define this as weak to let tests redefine their own +#pragma weak ncclCudaCompCap +int ncclCudaCompCap() 
{ + int cudaDev; + if (hipGetDevice(&cudaDev) != hipSuccess) return 0; + int ccMajor; + if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0; + return ccMajor; +} +int ncclCudaFullCompCap() { + int cudaDev; + if (hipGetDevice(&cudaDev) != hipSuccess) return 0; + int ccMajor, ccMinor; + if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0; + if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0; + return ccMajor*10+ccMinor; +} + +// Returns ncclInternalError if anything fails, causing that network to be ignored. +ncclResult_t initNet(ncclNet_t* net) { + int ndev; + if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; + if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; + if (ndev <= 0) return ncclSystemError; + return ncclSuccess; +} + +ncclResult_t initNetPlugin(ncclNet_t** net) { + void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); + if (netPluginLib == NULL) { + // dlopen does not guarantee to set errno, but dlerror only gives us a + // string, so checking errno doesn't hurt to try to provide a better + // error message + if (errno == ENOENT) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so)."); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); + } + return ncclSuccess; + } + ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); + if (extNet == NULL) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); + goto cleanup; + } + if (initNet(extNet) == ncclSuccess) { + *net = extNet; + return ncclSuccess; + } +cleanup: + if (netPluginLib != NULL) dlclose(netPluginLib); + return ncclSuccess; +} + +ncclResult_t initNet() { + // Always initialize bootstrap network + NCCLCHECK(bootstrapNetInit()); + + 
NCCLCHECK(initNetPlugin(&ncclNet)); + if (ncclNet != NULL) return ncclSuccess; + if (initNet(&ncclNetIb) == ncclSuccess) { + ncclNet = &ncclNetIb; + } else { + NCCLCHECK(initNet(&ncclNetSocket)); + ncclNet = &ncclNetSocket; + } + return ncclSuccess; +} + +NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); +NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); +NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0); + +int ncclThreadThreshold(int minCompCap, int multiNode) { + int threshold = ncclParamThreadThreshold(); + if (threshold == -2) { // user has not set this env variable +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + threshold = NCCL_THREAD_THRESHOLD_VEGA; +#else + threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD; +#endif + // multiply by 2 if running on multiple nodes + if (multiNode) { + threshold *= 2; + } + } + return threshold; +} + +bool useFineGrainVramPcie = false; + +void parseHsaForceFineGrainVramPcie() { + char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE"); + if (str && strlen(str) > 0) { + errno = 0; + int64_t v = strtoll(str, NULL, 0); + if (errno || (v != 0 && v != 1)) { + INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \ + } else { + useFineGrainVramPcie = v; + INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \ + } + } +} + +pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; +static bool initialized = false; +static ncclResult_t ncclInit() { + if (initialized) return ncclSuccess; + pthread_mutex_lock(&initLock); + if (!initialized) { + initEnv(); + initDebug(); + initNet(); + initialized = true; + } + // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env + parseHsaForceFineGrainVramPcie(); + pthread_mutex_unlock(&initLock); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclGetVersion, int* version); +ncclResult_t ncclGetVersion(int* version) { + if (version == NULL) 
return ncclInvalidArgument; + *version = NCCL_VERSION_CODE; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); +ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { + NCCLCHECK(ncclInit()); + NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); + return bootstrapGetUniqueId(out); +} + +// Prevent compiler from optimizing out these operations +void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) { + comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1; +} + +static ncclResult_t commFree(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + +#ifdef ENABLE_PROFILING + struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)); + CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost)); + uint64_t wait_send_cycle = 0, wait_recv_cycle = 0; + for (int chan=0; channChannels; chan++) { + wait_send_cycle += prof->wait_send_cycle[chan]; + wait_recv_cycle += prof->wait_recv_cycle[chan]; + } + #define VEGA_GPU_RTC_FREQUENCY 2.7E7 + if (comm->rank == 0) { + INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", "w_send", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS"); + INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); + } + INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", + comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, + (double)wait_send_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, + (double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, + (prof->send_cycle) ? (double)prof->send_byte*comm->nChannels/((double)prof->send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->recvReduceSend_cycle) ? 
(double)prof->recvReduceSend_byte*comm->nChannels/((double)prof->recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->directRecvReduceCopySend_cycle) ? (double)prof->directRecvReduceCopySend_byte*comm->nChannels/((double)prof->directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->directRecvCopySend_cycle) ? (double)prof->directRecvCopySend_byte*comm->nChannels/((double)prof->directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->directRecv_cycle) ? (double)prof->directRecv_byte*comm->nChannels/((double)prof->directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->copySend_cycle) ? (double)prof->copySend_byte*comm->nChannels/((double)prof->copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->recv_cycle) ? (double)prof->recv_byte*comm->nChannels/((double)prof->recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (prof->recvCopySend_cycle) ? (double)prof->recvCopySend_byte*comm->nChannels/((double)prof->recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0); + free(prof); + CUDACHECK(hipFree(comm->hostDevComm.devProf)); +#endif + + free(comm->peerInfo); + + if (comm->bootstrap) + NCCLCHECK(bootstrapClose(comm->bootstrap)); + + CUDACHECK(hipFree(comm->hostDevComm.channels)); + CUDACHECK(hipFree(comm->devComm)); + + for (int channel=0; channelnChannels; channel++) + NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); + + if (comm->doneEvent != NULL) + CUDACHECK(hipEventDestroy(comm->doneEvent)); + + if (comm->launchMode == ncclComm::GROUP) { + CUDACHECK(hipStreamDestroy(comm->groupStream)); + } + + // Last rank frees shared resources between threads + int isLast; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + if (isLast) { + free(comm->intraBarrier); + free(comm->intraParams); + free(comm->intraCudaDevs); + free(comm->intraCGMode); + free(comm->intraCC); + } + CUDACHECK(hipHostFree((void *)comm->abortFlag)); + CUDACHECK(hipHostFree((void *)comm->fatalDevError)); + + // Poison comm to try and catch a double 
free + commPoison(comm); + + free(comm); + return ncclSuccess; +} + +static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { + if (ndev < 1) { + WARN("invalid device count (%d) requested", ndev); + return ncclInvalidArgument; + } + if (rank >= ndev || rank < 0) { + WARN("rank %d exceeds ndev=%d", rank, ndev); + return ncclInvalidArgument; + } + + // Try to create a CUDA object right away. If there is something wrong with + // the device we're on (failure cause #1) , better know it early. + hipEvent_t doneEvent; + CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming)); + + struct ncclComm* comm; + NCCLCHECK(ncclCalloc(&comm, 1)); + + comm->rank = comm->hostDevComm.rank =rank; + comm->nRanks = comm->hostDevComm.nRanks = ndev; + hipGetDevice(&comm->cudaDev); + getNvmlDevice(comm->cudaDev, &comm->nvmlDev); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + + comm->doneEvent = doneEvent; + comm->llThreshold = ncclParamLlThreshold(); + comm->treeThreshold = ncclParamTreeThreshold(); + comm->checkPointers = ncclParamCheckPointers() == 1 ? 
true : false; +#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + comm->groupCudaStream = ncclParamGroupCudaStream(); +#else + // Don't allow the user to overload the default setting in older CUDA builds + comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; +#endif + comm->fatalError = ncclSuccess; + + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t))); + STORE(comm->fatalDevError, ncclDevSuccess); + + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t))); + STORE(comm->abortFlag, 0); + + comm->argsptr = &comm->args; +#ifdef ENABLE_PROFILING + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1)); +#endif + + *comret = comm; + return ncclSuccess; +} + +static ncclResult_t devCommSetup(ncclComm_t comm) { + // Duplicate the channels on the device + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels)); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels)); + + // Copy userRanks and peers + for (int r=0; rnChannels; r++) { + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); + } + + // Duplicate the dev comm on the device + NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); + NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1)); + return ncclSuccess; +} + +// Pre-process the string so that running "strings" on the lib can quickly reveal the version. +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip" +#else +#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." 
STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) +#endif +static void showVersion() { + static int shown = 0; + if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { + printf("%s\n", VERSION_STRING); + fflush(stdout); + if (ncclDebugFile != stdout) + INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files + shown = 1; + } +} + +static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) { + info->rank = rank; + CUDACHECK(hipGetDevice(&info->cudaDev)); + NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) + info->hostHash=getHostHash()+commHash; + info->pidHash=getPidHash()+commHash; + + // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the + // cudaDev is a CUDA runtime dev number which could be different from the + // NVML device number. Then we get the busID from NVML to be sure it is + // consistent with NVML remote PCI bus Ids. + CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#else + nvmlDevice_t nvmlDevice; + NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); + nvmlPciInfo_t pciInfo; + NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); + strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); +#endif + return ncclSuccess; +} + +static ncclResult_t setCpuAffinity(int cudaDev); + +template +static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { + for (int t=0; tsend : &transport->recv; + ncclTvalue_t ret = 0; + NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo)); + if (ret > 0) { + cpu_set_t affinitySave; + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + setCpuAffinity(cudaDev); + 
connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId)); + sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + return ncclSuccess; + } + } + WARN("No transport found !"); + return ncclInternalError; +} + +static int log2(int n) { + int l = 0; + while (n>>=1) l++; + return l; +} + +static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) { + int nvlink; + NCCLCHECK(ncclNvlinkGpu(&nvlink)); + float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us + float ringlatinter = 6; + float treelatintra = 4; + float treelatinter = 15; + float treebw; + if (!nvlink) { + treebw = ringbw * 2 / 3; + } else { + treebw = ringbw * 3 / 4; + if (nnodes == 2) treebw *= 2; + } + float ringlat = ringlatinter*(nranks-1); + float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1); + if (nnodes < 2 || ringlat <= treelat) + *treeThreshold = 0; + else if (treebw > ringbw) + *treeThreshold = 0x7fffffffffffffff; + else + *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat)); + return ncclSuccess; +} + +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) { + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + NCCLCHECK(initChannel(comm, channelId)); + + struct ncclChannel* channel = comm->channels+channelId; + struct ncclRing* ring = &channel->ring; + + // Reorganize ranks to start with rank. 
+ int shift; + for (shift = 0; shiftuserRanks[i] = ringRanks[(i+shift)%nranks]; + } + int prev = ring->prev = ring->userRanks[nranks-1]; + int next = ring->next = ring->userRanks[1]; + + struct ncclTree* tree = &channel->tree; + tree->up = -1; + tree->down[0] = tree->down[1] = tree->down[2] = -1; + + // + // Find per-node masters and connect them via a binary tree + // + + int nMasters = 0; + for (int r=0; rtreeThreshold == -2) + NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold)); + + if (comm->treeThreshold > 0) { + // Compute tree depth. Not an exact value but a good approximation in most + // cases and consistent across nodes + tree->depth = nranks/nMasters + log2(nMasters); + + // Find my master : go backwards in the ring to find my root + int master = 0; + for (int i = 0; iuserRanks[(nranks-i)%nranks]; + if (treeMasters[r]) { + master = r; + break; + } + } + + int* ranks; + NCCLCHECK(ncclCalloc(&ranks, nMasters)); + int i = 0, masterIndex = -1; + // Build binary tree + for (int r=0; rnChannels, 2)) { + btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1; + } else { + btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1; + } + + // + // Now build the full tree, combining the intra-node ring and the + // inter-node binary tree. 
+ // + + if (rank == master) { + int nDown = 0; + if (btreeUp != -1) tree->up = ranks[btreeUp]; + if (treeMasters[next] == 0) tree->down[nDown++] = next; + if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0]; + if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1]; + } else { + tree->up = prev; + if (treeMasters[next] == 0) tree->down[0] = next; + } + free(ranks); + } + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + return ncclSuccess; +} + +static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { + for (int r=0; r 0) { + connectTransport[r] = t; + break; + } + } + } + return ncclSuccess; +} + +#define MAXWIDTH 20 +#define PREFIXLEN 15 +#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) +void dumpMatrix(int* connectMatrix, int nranks) { + char line[STRLENGTH+1]; + line[STRLENGTH] = '\0'; + memset(line, ' ', STRLENGTH); + for (int j=0; jmyParams = comm->intraParams+comm->intraRank; + params->args =(void **)&comm->argsptr; + params->stream = NULL; + params->sharedMem = 0; + params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; + params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1; + return ncclSuccess; +} + +// Allocate/Set Intra Process Structures and set CG options +ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { + comm->intraRank = rank; + comm->intraRanks = ranks; + comm->intraPhase = 0; + + // Alloc shared structures + if (rank == 0) { + assert(comm == comm0); + int* bar; + NCCLCHECK(ncclCalloc(&bar, 2)); + bar[0] = bar[1] = 0; + comm->intraBarrier = bar; + NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); + NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); + int* CGMode; + NCCLCHECK(ncclCalloc(&CGMode, 1)); + *CGMode = 0x11; + comm->intraCGMode = CGMode; + int* CC; + NCCLCHECK(ncclCalloc(&CC, 1)); + *CC = ncclCudaFullCompCap(); + comm->intraCC = CC; + } 
else { + comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); + comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams); + comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs); + comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); + comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); + } + comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; + NCCLCHECK(initParams(comm)); + + int cgMdLaunch = 1; + + // Set CG Mode + comm->launchMode = ncclComm::GROUP; + char* str = getenv("NCCL_LAUNCH_MODE"); + if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { + comm->launchMode = ncclComm::PARALLEL; + } + if (comm->launchMode == ncclComm::GROUP) { + CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking)); +#if CUDART_VERSION >= 9000 + if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) { + // Check whether the GPU supports Cooperative Group Multi Device Launch + (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); + } +#endif + } + + // Disable cgMdLaunch if any rank does not support it + if (cgMdLaunch == 0) { + *comm->intraCGMode = 0x10; + } + return ncclSuccess; +} + +static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { + TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); + uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ + struct ncclConnect connect; + struct ncclConnector* conn; + for (int i=0; ipeers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; ipeers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } 
+ memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; ipeers[peer].send; + if (conn->connected) {++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + for (int i=0; ipeers[peer].recv; + if (conn->connected) {++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); + return ncclSuccess; +} + +static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { + // We use 3 AllGathers + // 1. { peerInfo, comm } + // 2. ConnectTransport[nranks], ConnectValue[nranks] + // 3. 
{ nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] } + + int rank = comm->rank; + int nranks = comm->nRanks; + uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES); + TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); + NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); + + // AllGather1 - begin + struct { + struct ncclPeerInfo peerInfo; + struct ncclComm* comm; + } *allGather1Data; + + NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); + allGather1Data[rank].comm = comm; + NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); + + NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); + for (int i = 0; i < nranks; i++) { + memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); + } + // AllGather1 data is used again below + // AllGather1 - end + + // AllGather2 - begin + size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks; + void *allGather2Data; + NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks)); + int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank); + ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks); + + NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize)); + + int* connectTransport; + ncclTvalue_t* connectValue; + NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); + NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks)); + for (int i = 0; i < nranks; i++) { + memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks); + memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks); + } + free(allGather2Data); 
+ // AllGather2 - end + + //if (rank == 0) dumpMatrix(connectTransport, nranks); + //if (rank == 0) dumpMatrixTvalue(connectValue, nranks); + + // Get my rings + int nrings; + int* prev, *next, *treeIn, *treeOut; + NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS)); + comm->nThreads = getDefaultThreads(); + NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut)); + TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings); + assert(nrings <= MAXCHANNELS); + free(connectTransport); + free(connectValue); + + // AllGather3 - begin + struct { + int nThreads; + int nrings; + int cudaCompCap; + int prev[MAXCHANNELS]; + int next[MAXCHANNELS]; + } *allGather3Data; + + NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + allGather3Data[rank].nThreads = comm->nThreads; + allGather3Data[rank].nrings = nrings; + allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); + for (int r=0; rbootstrap, allGather3Data, sizeof(*allGather3Data))); + + // Find max nThreads + for (int i=0; inThreads = std::max(allGather3Data[i].nThreads, comm->nThreads); + + // Determine the minimum CUDA Compute capability of all GPUs + int myCompCap = allGather3Data[rank].cudaCompCap; + int minCompCap = myCompCap; + for (int i = 0; i < nranks; i++) + minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap); + + // Determine thread threshold across all GPUs + int nnodes = 0; + for (int r=0; rthreadThreshold = ncclThreadThreshold(minCompCap, nnodes); + + // Find min nrings across ranks + for (int i=0; inChannels = nrings; + + // Unpack the per ring prev/next arrays + for (int i = 0; i < nranks; i++) { + for (int r = 0; r < nrings; r++) { + prev[r*nranks+i] = allGather3Data[i].prev[r]; + next[r*nranks+i] = allGather3Data[i].next[r]; + } + } + 
free(allGather3Data); + // AllGather3 - end + + int *rings; + NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); + NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next)); + free(prev); + free(next); + TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings); + + // Connect with prev/next for each ring + struct ncclConnect *connect; + NCCLCHECK(ncclCalloc(&connect, 2)); + for (int r=0; rchannels+r; + NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks)); + NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); + NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up)); + NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down)); + } + if (comm->treeThreshold > 0) { + char line[1024]; + line[0]='\0'; + for (int c=0; cchannels[c].tree; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d", + c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Trees%s", line); + } + if (rank == 0) { + char treeline[64]; + snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold); + INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap, + comm->treeThreshold == 0 ? "disabled" : + comm->treeThreshold == 0x7fffffffffffffff ? 
"enabled for all sizes" : + treeline); + } + + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings); + free(connect); + free(rings); + free(treeIn); + free(treeOut); + + // Compute intra ranks (using AllGather1 data) + int intraRank0 = -1, intraRank = -1, intraRanks = 0; + for (int i = 0; i < nranks; i++) { + if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) && + (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) { + if (intraRanks == 0) intraRank0 = i; + if (i == rank) intraRank = intraRanks; + intraRanks++; + } + } + TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", + rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0); + if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) { + WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", + rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0); + return ncclInternalError; + } + NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm)); + + // Done with AllGather1 data + free(allGather1Data); + + if (nnodes) NCCLCHECK(transportCreateProxy(comm)); + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + return ncclSuccess; +} + +static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { + CPU_ZERO_S(sizeof(cpu_set_t), mask); + char* cudaPath; + NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); + char path[PATH_MAX]; + strncpy(path, cudaPath, PATH_MAX-1); + snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus"); + path[PATH_MAX-1] = '\0'; + int fd; + SYSCHECKVAL(open(path, O_RDONLY), "open", fd); + char affinityStr[sizeof(cpu_set_t)*2 + 1]; + int r = read(fd, affinityStr, sizeof(cpu_set_t)*2); + if (r > 0) { + affinityStr[r] = '\0'; + NCCLCHECK(ncclStrToCpuset(affinityStr, mask)); + } + close(fd); + 
free(cudaPath); + return ncclSuccess; +} + +NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); + +static ncclResult_t setCpuAffinity(int cudaDev) { + // Query the CPU affinity set we were provided + cpu_set_t mask; + SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); + +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + // Find the CPUs that are local to the supplied GPU + cpu_set_t gpuMask; + NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); + +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr)); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + cpu_set_t finalMask; + if (ncclParamIgnoreCpuAffinity()) + // Ignore the CPU affinity set and use the GPU one instead + finalMask = gpuMask; + else + // Use a subset of the GPU affinity set + CPU_AND(&finalMask, &mask, &gpuMask); + + // If there is a non empty set, use it to set affinity + if (CPU_COUNT(&finalMask)) { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); + INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr); + SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); + } + return ncclSuccess; +} + +ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + cpu_set_t affinitySave; + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + + NCCLCHECK(wrapNvmlSymbols()); + NCCLCHECK(wrapNvmlInit()); + + // Make sure all host memory allocation are close to the GPU + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + NCCLCHECK(setCpuAffinity(cudaDev)); + ncclResult_t res; + + NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); + 
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); + NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); + + sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); + + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev); + + return ncclSuccess; +cleanup: + *newcomm = NULL; + sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + return res; +} + +NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); +ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + char* env = getenv("NCCL_COMM_ID"); + if (env && myrank == 0) { + NCCLCHECK(bootstrapCreateRoot(&commId, true)); + } + + NCCLCHECK(ncclInit()); + if (myrank == 0) showVersion(); + + // Make sure the CUDA runtime is initialized. + CUDACHECK(hipFree(NULL)); + + NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm")); + if (nranks < 1 || myrank < 0 || myrank >= nranks) { + WARN("Invalid rank requested : %d/%d", myrank, nranks); + return ncclInvalidArgument; + } + + if (ncclAsyncMode()) { + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank); + } else { + return ncclCommInitRankSync(newcomm, nranks, commId, myrank); + } +} + +static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) { + struct ncclPeerInfo* allInfo; + NCCLCHECK(ncclCalloc(&allInfo, nranks)); + for (int rank=0; ranknChannels = nrings; + comms[rank]->nThreads = nthreads; + comms[rank]->threadThreshold = threadThreshold; + } + + struct ncclConnect* connect; + NCCLCHECK(ncclCalloc(&connect, 2*nranks)); + for (int r=0; rchannels+r; + struct ncclRing *ring = &channel->ring; + NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn)); + // Make sure we don't use 
trees, we cannot use them with initAll + comms[rank]->treeThreshold = 0; + int prev = channel->ring.prev = ring->userRanks[nranks-1]; + int next = channel->ring.next = ring->userRanks[1]; + struct ncclConnector* recv = &channel->peers[prev].recv; + struct ncclConnector* send = &channel->peers[next].send; + NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id)); + NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id)); + } + for (int rank=0; rankchannels+r; + struct ncclRing *ring = &channel->ring; + struct ncclConnector* recv = &channel->peers[ring->prev].recv; + struct ncclConnector* send = &channel->peers[ring->next].send; + NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv)); + NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send)); + } + } + free(connect); + free(allInfo); + free(rings); + free(treeIn); + free(treeOut); + return ncclSuccess; +} + + +NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); +ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { + NCCLCHECK(ncclInit()); + NCCLCHECK(wrapNvmlSymbols()); + NCCLCHECK(wrapNvmlInit()); + showVersion(); + + INFO(NCCL_INIT,"nranks %d", ndev); + + NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); + if (ndev < 1) { + WARN("Invalid device count requested : %d", ndev); + return ncclInvalidArgument; + } + + ncclResult_t res; + int savedDevice; + int rank, cudaDev; + ncclComm_t comm = NULL; + int* ncclDevList = NULL; + NCCLCHECK(ncclCalloc(&ncclDevList, ndev)); + for (int i=0; irank; +#endif + CUDACHECK(hipGetDevice(&savedDevice)); + int commDevice = comm->cudaDev; + + if (savedDevice != commDevice) { + CUDACHECK(hipSetDevice(commDevice)); + } + + TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError); + + 
CUDACHECK(hipStreamSynchronize(comm->groupStream)); + NCCLCHECK(transportDestroyProxy(comm)); + NCCLCHECK(commFree(comm)); + + if (savedDevice != commDevice) + CUDACHECK(hipSetDevice(savedDevice)); + + TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); + + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); +ncclResult_t ncclCommDestroy(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev); + + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } + + return commDestroy(comm); +} + +NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); +ncclResult_t ncclCommAbort(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + // Ask anything that might still be running on the device to quit + STORE(comm->abortFlag, 1); + + // do not destroy comm because kernel maybe still running + // return commDestroy(comm); + return ncclSuccess; +} + +NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); +const char* ncclGetErrorString(ncclResult_t code) { + switch (code) { + case ncclSuccess : return "no error"; + case ncclUnhandledCudaError : return "unhandled cuda error"; + case ncclSystemError : return "unhandled system error"; + case ncclInternalError : return "internal error"; + case ncclInvalidArgument : return "invalid argument"; + case ncclInvalidUsage : return "invalid usage"; + default : return "unknown result code"; + } +} + +NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { + NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); + NCCLCHECK(PtrCheck(asyncError, 
"ncclGetAsyncError", "asyncError")); + + // Check device reported error + static ncclDevError_t printedDevErr = ncclDevSuccess; + switch(LOAD(comm->fatalDevError)) { + case ncclDevSuccess : + break; + case ncclDevAssertedMismatch : + if (printedDevErr != ncclDevAssertedMismatch) { + WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); + printedDevErr = ncclDevAssertedMismatch; + } + if (comm->fatalError == ncclSuccess) { + comm->fatalError = ncclInvalidUsage; + } + break; + case ncclDevSuspectedMismatch : + if (printedDevErr != ncclDevSuspectedMismatch) { + WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); + printedDevErr = ncclDevSuspectedMismatch; + } + break; + default: + WARN("Unknown device error %d", *comm->fatalDevError); + return ncclInternalError; + } + *asyncError = comm->fatalError; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { + NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); + NCCLCHECK(PtrCheck(count, "CommCount", "count")); + *count = comm->nRanks; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { + NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); + NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); + *devid = comm->cudaDev; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { + NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); + NCCLCHECK(PtrCheck(rank, 
"CommUserRank", "rank")); + *rank = comm->rank; + return ncclSuccess; +} diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu deleted file mode 100644 index 95f70cde2b..0000000000 --- a/projects/rccl/src/init.cu +++ /dev/null @@ -1,970 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "nccl.h" -#include "core.h" -#include "ring.h" -#include "param.h" -#include "nvmlwrap.h" -#include "rings.h" -#include "bootstrap.h" -#include "transport.h" -#include "common_coll.h" -#include "group.h" -#include "utils.h" -#include "net.h" -#include "topo.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define STR2(v) #v -#define STR(v) STR2(v) - -int ncclDebugLevel; -uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT -pthread_mutex_t ncclDebugOutputLock; -FILE *ncclDebugFile = stdout; - -#ifdef ENABLE_TRACE -std::chrono::high_resolution_clock::time_point ncclEpoch; -#endif - -#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) -#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream -#else -#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream -#endif - -NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); - -NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); - -ncclNet_t* ncclNet = NULL; - -// We define this as weak to let tests redefine their own -#pragma weak ncclCudaCompCap -int ncclCudaCompCap() { - int cudaDev; - if (hipGetDevice(&cudaDev) != hipSuccess) return 0; - int ccMajor; - if 
(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0; - return ccMajor; -} -int ncclCudaFullCompCap() { - int cudaDev; - if (hipGetDevice(&cudaDev) != hipSuccess) return 0; - int ccMajor, ccMinor; - if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0; - if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0; - return ccMajor*10+ccMinor; -} - -// Returns ncclInternalError if anything fails, causing that network to be ignored. -ncclResult_t initNet(ncclNet_t* net) { - int ndev; - if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; - if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) { - INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t initNetPlugin(ncclNet_t** net) { - void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); - if (netPluginLib == NULL) { - // dlopen does not guarantee to set errno, but dlerror only gives us a - // string, so checking errno doesn't hurt to try to provide a better - // error message - if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "No network plugin found."); - } else { - INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror()); - } - return ncclSuccess; - } - ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); - if (extNet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol"); - goto cleanup; - } - if (initNet(extNet) == ncclSuccess) { - *net = extNet; - return ncclSuccess; - } -cleanup: - if (netPluginLib != NULL) dlclose(netPluginLib); - return ncclSuccess; -} - -ncclResult_t initNet() { - // Always initialize sockets as we use it for bootstrap - NCCLCHECK(initNet(&ncclNetSocket)); - - 
NCCLCHECK(initNetPlugin(&ncclNet)); - if (ncclNet != NULL) { - INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName()); - return ncclSuccess; - } - if (initNet(&ncclNetIb) == ncclSuccess) { - ncclNet = &ncclNetIb; - } else { - ncclNet = &ncclNetSocket; - } - INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName()); - return ncclSuccess; -} - -NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); -NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); - -int ncclThreadThreshold(int minCompCap, int multiNode) { - int threshold = ncclParamThreadThreshold(); - if (threshold == -2) { // user has not set this env variable - threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD; - // multiply by 2 if running on multiple nodes - if (multiNode) { - threshold *= 2; - } - } - return threshold; -} - -bool useFineGrainVramPcie = false; - -void parseHsaForceFineGrainVramPcie() { - char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE"); - if (str && strlen(str) > 0) { - errno = 0; - int64_t v = strtoll(str, NULL, 0); - if (errno || (v != 0 && v != 1)) { - INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \ - } else { - useFineGrainVramPcie = v; - INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \ - } - } -} - -pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; -static bool initialized = false; -static ncclResult_t ncclInit() { - if (initialized) return ncclSuccess; - pthread_mutex_lock(&initLock); - if (!initialized) { - initEnv(); - initDebug(); - initNet(); - // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env - parseHsaForceFineGrainVramPcie(); - initialized = true; - } - pthread_mutex_unlock(&initLock); - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclGetVersion, int* version); -ncclResult_t ncclGetVersion(int* version) { - if (version == NULL) return ncclInvalidArgument; - *version = NCCL_VERSION_CODE; - return ncclSuccess; 
-} - -NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); -ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { - NCCLCHECK(ncclInit()); - NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); - return bootstrapGetUniqueId(out); -} - -static ncclResult_t commFree(ncclComm_t comm) { - if (comm == NULL) - return ncclSuccess; - - CUDACHECK(hipFree(comm->devComm)); - - for (int ring=0; ringnRings; ring++) - NCCLCHECK(freeRing(comm->rings+ring)); - - if (comm->doneEvent != NULL) - CUDACHECK(hipEventDestroy(comm->doneEvent)); - - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(hipStreamDestroy(comm->groupStream)); - } - - // Last rank frees shared resources between threads - int isLast; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - if (isLast) { - free(comm->intraBarrier); - free(comm->intraParams); - free(comm->intraCudaDevs); - free(comm->intraCGMode); - free(comm->intraCC); - } - - free(comm); - return ncclSuccess; -} - -static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { - if (ndev < 1) { - WARN("invalid device count (%d) requested", ndev); - return ncclInvalidArgument; - } - if (rank >= ndev || rank < 0) { - WARN("rank %d exceeds ndev=%d", rank, ndev); - return ncclInvalidArgument; - } - - // Try to create a CUDA object right away. If there is something wrong with - // the device we're on (failure cause #1) , better know it early. - hipEvent_t doneEvent; - CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming)); - - struct ncclComm* comm; - NCCLCHECK(ncclCalloc(&comm, 1)); - - INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev); - comm->rank = rank; - comm->nRanks = ndev; - hipGetDevice(&comm->cudaDev); - comm->doneEvent = doneEvent; - comm->llThreshold = ncclParamLlThreshold(); - comm->checkPointers = ncclParamCheckPointers() == 1 ? 
true : false; -#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) - comm->groupCudaStream = ncclParamGroupCudaStream(); -#else - // Don't allow the user to overload the default setting in older CUDA builds - comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; -#endif - - comm->argsptr = &comm->args; - - *comret = comm; - return ncclSuccess; -} - -static ncclResult_t devCommSetup(ncclComm_t comm) { - // Fully duplicate the comm on the device - NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); - // Copy the comm on the device - NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); - // Copy userRanks - for (int r=0; rnRings; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks)); - } - return ncclSuccess; -} - -// Pre-process the string so that running "strings" on the lib can quickly reveal the version. -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) -#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip" -#else -#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." 
STR(CUDA_MINOR) -#endif -static void showVersion() { - static int shown = 0; - if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { - printf("%s\n", VERSION_STRING); - fflush(stdout); - if (ncclDebugFile != stdout) - INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files - shown = 1; - } -} - -static ncclResult_t fillInfo(struct ncclInfo* info, int rank, uint64_t commHash) { - for (int t=0; ttinfo+t, rank, commHash)); - } - return ncclSuccess; -} - -bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice); - -template -static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) { - for (int t=0; tsend : &transport->recv; - ncclTvalue_t ret = 0; - NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t)); - if (ret > 0) { - cpu_set_t affinitySave; - nvmlDevice_t nvmlDevice; - int cudaDev; - CUDACHECK(hipGetDevice(&cudaDev)); - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - SetCpuAffinity(cudaDev, &nvmlDevice); - NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring)); - *transportRet = transport; - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - return ncclSuccess; - } - } - WARN("No transport found !"); - *transportRet = NULL; - return ncclInternalError; -} - -static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) { - NCCLCHECK(initRing(comm, ringid)); - - struct ncclRing* ring = comm->rings+ringid; - // Reorganize ranks to start with rank. 
- int shift; - for (shift = 0; shiftuserRanks[i] = ringRanks[(i+shift)%nranks]; - } - int prev = ring->userRanks[nranks-1]; - int next = ring->userRanks[1]; - - NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring)); - NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring)); - NCCLCHECK(transportCreateProxy(0, ring, comm)); - NCCLCHECK(transportCreateProxy(1, ring, comm)); - return ncclSuccess; -} - -static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { - for (int r=0; r 0) { - connectTransport[r] = t; - break; - } - } - } - return ncclSuccess; -} - -static void swap(void* mem1, void* mem2, int size) { - char tmp[size]; - memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size); -} - -#define MAXWIDTH 64 -#define PREFIXLEN 15 -#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) -void dumpMatrix(int* connectMatrix, int nranks) { - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - for (int j=0; jmyParams = comm->intraParams+comm->intraRank; - params->args = (void **)&comm->argsptr; - params->stream = NULL; - params->sharedMem = 0; - params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; - params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1; - return ncclSuccess; -} - -// Allocate/Set Intra Process Structures and set CG options -ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { - comm->intraRank = rank; - comm->intraRanks = ranks; - comm->intraPhase = 0; - - // Alloc shared structures - if (rank == 0) { - assert(comm == comm0); - int* bar; - NCCLCHECK(ncclCalloc(&bar, 2)); - bar[0] = bar[1] = 0; - comm->intraBarrier = bar; - NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); - NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); - int* CGMode; - 
NCCLCHECK(ncclCalloc(&CGMode, 1)); - *CGMode = 0x11; - comm->intraCGMode = CGMode; - int* CC; - NCCLCHECK(ncclCalloc(&CC, 1)); - *CC = ncclCudaFullCompCap(); - comm->intraCC = CC; - } else { - comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); - comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams); - comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs); - comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); - comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); - } - comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; - NCCLCHECK(initParams(comm)); - - int cgMdLaunch = 1; - - // Set CG Mode - comm->launchMode = ncclComm::GROUP; - char* str = getenv("NCCL_LAUNCH_MODE"); - if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { - comm->launchMode = ncclComm::PARALLEL; - } - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking)); -#if CUDART_VERSION >= 9000 - if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) { - // Check whether the GPU supports Cooperative Group Multi Device Launch - (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); - } -#endif - } - - // Disable cgMdLaunch if any rank does not support it - if (cgMdLaunch == 0) { - *comm->intraCGMode = 0x10; - } - return ncclSuccess; -} - -static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { - int rank = comm->rank; - int nranks = comm->nRanks; - void* commState; - uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES); - TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); - NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState)); - - struct ncclInfo* allInfo; - NCCLCHECK(ncclCalloc(&allInfo, nranks)); - NCCLCHECK(fillInfo(allInfo+rank, rank, commHash)); - NCCLCHECK(bootstrapAllGather(commState, 
allInfo, sizeof(struct ncclInfo))); - - int* connectTransport; - ncclTvalue_t* connectValue; - NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); - NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks)); - - NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank)); - NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int)))); - NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t)))); - //if (rank == 0) dumpMatrix(connectTransport, nranks); - //if (rank == 0) dumpMatrixTvalue(connectValue, nranks); - - // Get my rings - int nrings; - int* prev, *next; - NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS)); - NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS)); - comm->nThreads = getDefaultThreads(); - NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next)); - free(connectTransport); - free(connectValue); - - // Find max nThreads - int allData[nranks]; - allData[rank] = comm->nThreads; - NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); - for (int i=0; inThreads = std::max(allData[i], comm->nThreads); - if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads); - - // Determine the minimum CUDA Compute capability of all GPUs - int myCompCap = ncclCudaCompCap(); - int minCompCap = myCompCap; - allData[rank] = myCompCap; - NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); - for (int i=0; inRings = nrings; - for (int r=0; rrings+r; - NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank)); - int prev_offset = ring->userRanks[nranks-1]*2+1; - int next_offset = ring->userRanks[1]*2; - NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2)); - NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send)); - NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv)); - } - free(connectData); - 
free(rings); - free(allInfo); - - // Intra-process barrier setup - struct rankInfo { - uint64_t hostHash; - uint64_t pidHash; - struct ncclComm* comm; - } rankInfos[nranks]; - rankInfos[rank].hostHash = getHostHash(); - rankInfos[rank].pidHash = getPidHash(); - rankInfos[rank].comm = comm; - NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo))); - - // Compute intra ranks - int intraRank0 = -1, intraRank = -1, intraRanks = 0; - int multiNode = 0; - for (int r=0; rthreadThreshold = ncclThreadThreshold(minCompCap, multiNode); - - // Barrier - bootstrapClose(commState); - return ncclSuccess; -} - -bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) - if (numa_available() < 0) { - WARN("System does not support NUMA API!"); - return false; - } - char* cudaPath; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); - strcat(cudaPath, "/numa_node"); - int fd; - SYSCHECKVAL(open(cudaPath, O_RDONLY), "open", fd); - char numa_node[5]; - int len; - SYSCHECKVAL(read(fd, numa_node, 4), "read", len); - SYSCHECK(close(fd), "close"); - errno = 0; - long node = strtol(numa_node, NULL, 10); - if (errno == ERANGE || errno == EINVAL) { - INFO(NCCL_ALL,"%s: Call to strtol returned %s", __func__, strerror(errno)); - free(cudaPath); - return false; - } - numa_run_on_node(node); - numa_set_preferred(node); - free(cudaPath); - return true; -#else - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - if (hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != hipSuccess) return false; - if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false; - if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) { - WARN("Failed to set CPU affinity"); - return false; - } - return true; -#endif -} - -ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { - cpu_set_t affinitySave; - sched_getaffinity(0, sizeof(cpu_set_t), 
&affinitySave); - - NCCLCHECK(wrapNvmlSymbols()); - NCCLCHECK(wrapNvmlInit()); - - // Make sure all host memory allocation are close to the GPU - int cudaDev; - nvmlDevice_t nvmlDevice; - CUDACHECK(hipGetDevice(&cudaDev)); - SetCpuAffinity(cudaDev, &nvmlDevice); - ncclResult_t res; - - NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); - NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); - NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); - - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); - - INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks); - - return ncclSuccess; -cleanup: - *newcomm = NULL; - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - return res; -} - -NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); -ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { - char* env = getenv("NCCL_COMM_ID"); - if (env && myrank == 0) { - NCCLCHECK(bootstrapCreateRoot(&commId, true)); - } - - NCCLCHECK(ncclInit()); - if (myrank == 0) showVersion(); - - INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks); - - // Make sure the CUDA runtime is initialized. 
- CUDACHECK(hipFree(NULL)); - - NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm")); - if (nranks < 1 || myrank < 0 || myrank >= nranks) { - WARN("Invalid rank requested : %d/%d", myrank, nranks); - return ncclInvalidArgument; - } - - if (ncclAsyncMode()) { - int cudaDev; - CUDACHECK(hipGetDevice(&cudaDev)); - return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank); - } else { - return ncclCommInitRankSync(newcomm, nranks, commId, myrank); - } -} - -static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) { - struct ncclInfo* allInfo; - NCCLCHECK(ncclCalloc(&allInfo, nranks)); - for (int rank=0; ranknRings = nrings; - comms[rank]->nThreads = nthreads; - comms[rank]->threadThreshold = threadThreshold; - } - - for (int r=0; rprev and prevRank->next - struct ncclRing *ring = comms[rank]->rings+r; - int prevRank = ring->userRanks[nranks-1]; - struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1; - struct ncclConnect* rankPrevConnect = connect+2*rank; - swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect)); - } - for (int rank=0; rankrings+r; - NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send)); - NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv)); - } - } - free(rings); - free(allInfo); - return ncclSuccess; -} - - -NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); -ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { - NCCLCHECK(ncclInit()); - NCCLCHECK(wrapNvmlSymbols()); - NCCLCHECK(wrapNvmlInit()); - showVersion(); - - INFO(NCCL_INIT,"nranks %d", ndev); - - NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); - if (ndev < 1) { - WARN("Invalid device count requested : %d", ndev); - return ncclInvalidArgument; - } - - ncclResult_t res; - int savedDevice; - int rank, cudaDev; - ncclComm_t comm = NULL; - nvmlDevice_t nvmlDevice; - int ncclDevList[ndev]; - for 
(int i=0; icudaDev; - - if (savedDevice != commDevice) { - CUDACHECK(hipSetDevice(commDevice)); - } - - NCCLCHECK(commFree(comm)); - - if (savedDevice != commDevice) - CUDACHECK(hipSetDevice(savedDevice)); - - return ncclSuccess; -} - -NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); -const char* ncclGetErrorString(ncclResult_t code) { - switch (code) { - case ncclSuccess : return "no error"; - case ncclUnhandledCudaError : return "unhandled cuda error"; - case ncclSystemError : return "unhandled system error"; - case ncclInternalError : return "internal error"; - case ncclInvalidArgument : return "invalid argument"; - case ncclInvalidUsage : return "invalid usage"; - default : return "unknown result code"; - } -} - -NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { - NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); - NCCLCHECK(PtrCheck(count, "CommCount", "count")); - *count = comm->nRanks; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { - NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); - NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); - *devid = comm->cudaDev; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { - NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); - NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); - *rank = comm->rank; - return ncclSuccess; -} diff --git a/projects/rccl/src/misc/argcheck.cc b/projects/rccl/src/misc/argcheck.cc new file mode 100644 index 0000000000..b906a68f5c --- /dev/null +++ b/projects/rccl/src/misc/argcheck.cc @@ -0,0 +1,69 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "argcheck.h" + +static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { + hipPointerAttribute_t attr; + hipError_t err = hipPointerGetAttributes(&attr, pointer); + if (err != hipSuccess || attr.devicePointer == NULL) { + WARN("%s : %s is not a valid pointer", opname, ptrname); + return ncclInvalidArgument; + } +#if CUDART_VERSION >= 10000 + if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) { +#else + if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) { +#endif + WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); + return ncclInvalidArgument; + } + return ncclSuccess; +} + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { + if (ptr == NULL) { + WARN("%s : %s argument is NULL", opname, ptrname); + return ncclInvalidArgument; + } + return ncclSuccess; +} + +ncclResult_t ArgsCheck(struct ncclInfo* info) { + NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); + // First, the easy ones + if (info->root < 0 || info->root >= info->comm->nRanks) { + WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); + return ncclInvalidArgument; + } + if (info->datatype < 0 || info->datatype >= ncclNumTypes) { + WARN("%s : invalid type %d", info->opName, info->datatype); + return ncclInvalidArgument; + } + // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars. 
+ info->nBytes = info->count * ncclTypeSize(info->datatype); + if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) { + info->count = info->nBytes; + info->datatype = ncclInt8; + } + if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank + + if (info->op < 0 || info->op >= ncclNumOps) { + WARN("%s : invalid reduction operation %d", info->opName, info->op); + return ncclInvalidArgument; + } + + if (info->comm->checkPointers) { + // Check CUDA device pointers + if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) { + NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); + } + if (info->coll != ncclCollReduce || info->comm->rank == info->root) { + NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); + } + } + return ncclSuccess; +} diff --git a/projects/rccl/src/misc/enqueue.cu b/projects/rccl/src/misc/enqueue.cu deleted file mode 100644 index eb56de55ae..0000000000 --- a/projects/rccl/src/misc/enqueue.cu +++ /dev/null @@ -1,248 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include - -#include "enqueue.h" -#include "common_coll.h" -#include "param.h" - -#include "collectives/collectives.h" - -#define NCCL_FUNC4(coll, op, dtype) \ - NCCL_KERN_NAME(coll, op, dtype), \ - NCCL_KERN_NAME(coll##LL, op, dtype) - -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(coll, op) \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, u8), \ - NCCL_FUNC4(coll, op, i32), \ - NCCL_FUNC4(coll, op, u32), \ - NCCL_FUNC4(coll, op, i64), \ - NCCL_FUNC4(coll, op, u64), \ - NCCL_FUNC4(coll, op, f16), \ - NCCL_FUNC4(coll, op, f32), \ - NCCL_FUNC4(coll, op, f64) -#define NCCL_FUNCS3B(coll, op) \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8) - -// Must be consistent with ncclRedOp_t -#define NCCL_FUNCS2A(coll) \ - NCCL_FUNCS3A(coll, sum ), \ - NCCL_FUNCS3A(coll, prod), \ - NCCL_FUNCS3A(coll, max ), \ - NCCL_FUNCS3A(coll, min ) -#define NCCL_FUNCS2B(coll) \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy) - -typedef void(*ncclKern_t)(struct ncclColl); -// Must be consistent with the ncclFuncSet enum -static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { - NCCL_FUNCS2B(ncclBroadcast), - NCCL_FUNCS2A(ncclReduce), - NCCL_FUNCS2B(ncclAllGather), - NCCL_FUNCS2A(ncclReduceScatter), - NCCL_FUNCS2A(ncclAllReduce) -}; - -ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { - if (cgMode & 0x01) { - CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices, 0)); - return ncclSuccess; - } - int savedDev; - CUDACHECK(hipGetDevice(&savedDev)); - 
for (int i = 0; i < numDevices; i++) { - hipLaunchParams* params = paramsList+i; - CUDACHECK(hipSetDevice(cudaDevs[i])); - hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args))); - } - CUDACHECK(hipSetDevice(savedDev)); - return ncclSuccess; -} - -ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) { - params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings); - - // Set active = 2 for the last operation - for (int r=0; rgridDim.x; r++) { - struct ncclRing* ring = comm->rings+r; - STORE(&ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active, 2); - } - - // Find the first operation, choose the kernel accordingly and pass it - // as the first argument. - struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart; - memcpy(&comm->args, coll, sizeof(struct ncclColl)); - // As we pass that coll directly, we can free it immediately. - STORE(&coll->active, 0); - - params->func = (void *)ncclKerns[coll->funcIndex]; - return ncclSuccess; -} - -ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = LOAD(ptr); - bool done = false; - while (done == false) { - if (val >= comm->intraRanks) { - WARN("Trying to launch too many collectives"); - return ncclInvalidUsage; - } - if (val+1 == comm->intraRanks) { - // Reset the barrier. 
- comm->intraBarrier[comm->intraPhase^1] = 0; - *isLast = 1; - return ncclSuccess; - } - done = __sync_bool_compare_and_swap(ptr, val, val+1); - val++; - } - *isLast = 0; - return ncclSuccess; -} - -ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = LOAD(ptr); - if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { - WARN("Trying to launch too many collectives"); - return ncclInternalError; - } - return ncclSuccess; -} - -ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - while (LOAD(ptr) < comm->intraRanks) pthread_yield(); - comm->intraPhase ^= 1; - return ncclSuccess; -} - -ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { - if (comm->nRanks == 1) return ncclSuccess; - hipLaunchParams* params = comm->myParams; - - NCCLCHECK(setupLaunch(comm, params)); - - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { - // Enqueue event in user stream - CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream)); - // Create dependency between user stream and internal NCCL stream - CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); - params->stream = comm->groupStream; - } else { - if (comm->userStream != params->stream) { - // Stream changed from last call, create dependency against last NCCL kernel launch - CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); - } - params->stream = comm->userStream; - } - - int isLast = 0; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - - if (isLast) { - if (comm->launchMode == ncclComm::GROUP) { - // I'm the last. Launch all operations. 
- NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); - } - NCCLCHECK(ncclCpuBarrierLast(comm)); - } - return ncclSuccess; -} - -ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { - if (comm->nRanks == 1) return ncclSuccess; - // We can't print the CG mode before the first barrier happened. - if (comm->rank == 0 && *comm->intraCGMode & 0x10) { - *comm->intraCGMode ^= 0x10; - INFO(NCCL_INIT,"Launch mode %s%s%s", - comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", - *comm->intraCGMode ? "/CGMD" : "", - (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); - } - - NCCLCHECK(ncclCpuBarrierOut(comm)); - - hipLaunchParams *params = comm->myParams; - if (comm->launchMode == ncclComm::PARALLEL) { - hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args))); - } - // Start the network proxies as soon as the kernel has been launched. We can't - // perform any CUDA call between the two or having a hipFree between the CUDA - // launch and the transportStartProxies call could cause a deadlock. - // Also, starting the proxies after the CUDA launch seems to be better for - // performance (latency). 
- for (int r=0; rgridDim.x; r++) { - struct ncclRing* ring = comm->rings+r; - ring->collStart = ring->collFifoTail; - ring->collCount = 0; - } - params->gridDim.x = params->blockDim.x = 0; - NCCLCHECK(transportStartProxies(comm)); - return ncclSuccess; -} - -ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { - hipLaunchParams *params = comm->myParams; - // Enqueue event after NCCL kernel - CUDACHECK(hipEventRecord(comm->doneEvent, params->stream)); - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { - // Create dependency between NCCL internal stream and user stream - CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); - } - comm->userStreamSet = false; - return ncclSuccess; -} - -ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff, - void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, - ncclComm_t comm, hipStream_t stream) { - if (comm == NULL) return ncclInvalidArgument; - // Launch asynchronously if needed - if (ncclAsyncMode()) { - ncclResult_t ret = ncclSuccess; - int savedDev = -1; - if (comm->checkPointers) { - CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end); - CUDACHECKGOTO(hipSetDevice(comm->cudaDev), ret, end); - } - // Check arguments - NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end); - // Always register comm even in case of error to make sure ncclGroupEnd - // cleans it up. 
- NCCLCHECK(ncclAsyncColl(comm)); - NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end); -end: - if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev)); - ncclAsyncErrCheck(ret); - return ret; - } else { - NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName)); - NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream)); - NCCLCHECK(ncclBarrierEnqueue(comm)); - NCCLCHECK(ncclBarrierEnqueueWait(comm)); - NCCLCHECK(ncclEnqueueEvents(comm)); - return ncclSuccess; - } -} diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cc similarity index 93% rename from projects/rccl/src/misc/group.cu rename to projects/rccl/src/misc/group.cc index 0144bee78d..8b0628197e 100644 --- a/projects/rccl/src/misc/group.cu +++ b/projects/rccl/src/misc/group.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -119,7 +119,7 @@ ncclResult_t ncclGroupEnd() { int savedDev; CUDACHECK(hipGetDevice(&savedDev)); int done = ncclGroupIndex; - int doneArray[ncclGroupIndex]; + int doneArray[MAX_ASYNC_OPS]; for (int i=0; inRings; r++) { - struct ncclRing* ring = comm->rings+r; - for (int i=0; icollCount; i++) { - STORE(&ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active, 0); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + for (int i=0; icollCount; i++) { + STORE(&channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active, 0); } - ring->collFifoTail = ring->collStart; - ring->collCount = 0; + channel->collFifoTail = channel->collStart; + channel->collCount = 0; } comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; comm->userStreamSet = false; diff --git a/projects/rccl/src/misc/ibvwrap.cu b/projects/rccl/src/misc/ibvwrap.cc similarity index 99% rename from projects/rccl/src/misc/ibvwrap.cu rename to projects/rccl/src/misc/ibvwrap.cc index 7ac3431c37..f47c141bc1 100644 --- a/projects/rccl/src/misc/ibvwrap.cu +++ b/projects/rccl/src/misc/ibvwrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/misc/nvmlwrap.cu b/projects/rccl/src/misc/nvmlwrap.cc similarity index 84% rename from projects/rccl/src/misc/nvmlwrap.cu rename to projects/rccl/src/misc/nvmlwrap.cc index f3ee2ac9ae..fbe481fdd8 100644 --- a/projects/rccl/src/misc/nvmlwrap.cu +++ b/projects/rccl/src/misc/nvmlwrap.cc @@ -1,6 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -17,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void); static nvmlReturn_t (*nvmlInternalShutdown)(void); static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); -static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device); -static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device); static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber); + ncclResult_t wrapNvmlSymbols(void) { if (nvmlState == nvmlInitialized) @@ -71,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) { LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); - LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity); - LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity); LOAD_SYM(nvmlhandle, "nvmlErrorString", 
nvmlInternalErrorString); LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo); + LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability); @@ -87,9 +85,8 @@ teardown: nvmlInternalShutdown = NULL; nvmlInternalDeviceGetHandleByPciBusId = NULL; nvmlInternalDeviceGetIndex = NULL; - nvmlInternalDeviceSetCpuAffinity = NULL; - nvmlInternalDeviceClearCpuAffinity = NULL; nvmlInternalDeviceGetPciInfo = NULL; + nvmlInternalDeviceGetMinorNumber = NULL; nvmlInternalDeviceGetNvLinkState = NULL; nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL; nvmlInternalDeviceGetNvLinkCapability = NULL; @@ -156,38 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { return ncclSuccess; } -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - if (nvmlInternalDeviceSetCpuAffinity == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - // Workaround : it seems SetCpuAffinity is not thread safe. 
- static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&lock); - nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device); - pthread_mutex_unlock(&lock); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceSetCpuAffinity() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - if (nvmlInternalInit == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceClearCpuAffinity() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { if (nvmlInternalDeviceGetPciInfo == NULL) { WARN("lib wrapper not initialized."); @@ -202,6 +167,20 @@ ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { return ncclSuccess; } +ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + if (nvmlInternalDeviceGetMinorNumber == NULL) { + WARN("lib wrapper not initialized."); + return ncclInternalError; + } + nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber); + if (ret != NVML_SUCCESS) { + WARN("nvmlDeviceGetMinorNumber() failed: %s ", + nvmlInternalErrorString(ret)); + return ncclSystemError; + } + return ncclSuccess; +} + ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { if (nvmlInternalDeviceGetNvLinkState == NULL) { /* Do not warn, this symbol is optional. 
*/ @@ -209,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link } nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive); if (ret != NVML_SUCCESS) { - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", - nvmlInternalErrorString(ret)); + if (ret != NVML_ERROR_NOT_SUPPORTED) + INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", + nvmlInternalErrorString(ret)); return ncclSystemError; } return ncclSuccess; diff --git a/projects/rccl/src/misc/nvmlwrap_stub.cu b/projects/rccl/src/misc/nvmlwrap_stub.cc similarity index 85% rename from projects/rccl/src/misc/nvmlwrap_stub.cu rename to projects/rccl/src/misc/nvmlwrap_stub.cc index 85a389a1a9..b3bf5b7439 100644 --- a/projects/rccl/src/misc/nvmlwrap_stub.cu +++ b/projects/rccl/src/misc/nvmlwrap_stub.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -27,18 +27,14 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { return ncclSuccess; } -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - return ncclSuccess; -} - ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { return ncclSuccess; } +ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + return ncclSuccess; +} + ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { return ncclSuccess; } @@ -50,4 +46,4 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult) { return ncclSuccess; -} \ No newline at end of file +} diff --git a/projects/rccl/src/misc/rings.cu b/projects/rccl/src/misc/rings.cc similarity index 84% rename from projects/rccl/src/misc/rings.cu rename to projects/rccl/src/misc/rings.cc index 359e26b359..1fc58f08d0 100644 --- a/projects/rccl/src/misc/rings.cu +++ b/projects/rccl/src/misc/rings.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -161,14 +161,25 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) { current[transport] = 0; transport++; - if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; } + if (transport == NTRANSPORTS) { + WARN("Error : Could not find transport to connect next group\n"); + free(p2pConnected); + return ncclInternalError; } } curRank = rank; current[transport]++; } } -NCCL_PARAM(MinNrings, "MIN_NRINGS", 0); +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +#define DEFAULT_MIN_NRINGS 2 +#elif defined(__PPC__) +// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes +#define DEFAULT_MIN_NRINGS 4 +#else +#define DEFAULT_MIN_NRINGS 0 +#endif +NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS); NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0); /* Users can force the number of threads with an environment variable */ @@ -180,8 +191,20 @@ ncclResult_t getEnvThreads(int* nthreads) { return ncclSuccess; } +static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) { + if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS; + for (int r=nrings; r 0) { if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings); NCCLCHECK(getEnvThreads(nthreads)); + for (int r = 0; r<*nrings; r++) { + for (int i = 0; i=0; t--) { for (int i=0; i 1 && nvlink) { + *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut); + } + if (*nrings == 0) { WARN("Could not create rings, falling back on simple ring"); *nrings = 1; @@ -330,15 +373,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS"); minNrings = 0; } - if (minNrings > MAXRINGS) { - if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value 
greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS); - minNrings = MAXRINGS; + if (minNrings > MAXCHANNELS) { + if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS); + minNrings = MAXCHANNELS; } if (maxNrings > 0 && maxNrings <= *nrings) { if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); *nrings = maxNrings; } else { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) int defaultMinNrings = 1; #else int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1; @@ -346,13 +389,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; if (minNrings > 0 && minNrings > *nrings) { if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings); - for (int r=*nrings; r root ? rank-1 : rank) + +/* Btree which alternates leaves and nodes. + * Assumes root is 0, which conveniently builds a tree on powers of two, + * (because we have pow2-1 ranks) which lets us manipulate bits. + * Find first non-zero bit, then : + * Find the parent : + * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) + * xx11[0] -> xx10[0] (3,7,11 below) + * Find the children : + * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) + * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13) + * + * Illustration : + * 0---------------8 + * ______/ \______ + * 4 12 + * / \ / \ + * 2 6 10 \ + * / \ / \ / \ \ + * 1 3 5 7 9 11 13 + */ +ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) { + int up, down0, down1; + int bit; + for (bit=1; bit 1 ? 
bit >> 1 : -1; + *d1 = -1; + return ncclSuccess; + } + + up = (rank ^ bit) | (bit << 1); + if (up >= nranks) up = (rank ^ bit); + *u = up; + + int lowbit = bit >> 1; + // down0 is always within bounds + down0 = lowbit == 0 ? -1 : rank-lowbit; + + down1 = lowbit == 0 ? -1 : rank+lowbit; + // Make sure down1 is within bounds + while (down1 >= nranks) { + down1 = lowbit == 0 ? -1 : rank+lowbit; + lowbit >>= 1; + } + *d0 = down0; *d1 = down1; + + return ncclSuccess; +} + +/* Build a double binary tree. Take the previous tree for the first tree. + * For the second tree, we use a mirror tree (if nranks is odd) + * + * 8---------0---------5 + * ______/ \______ _____/ \______ + * 4 12 1 9 + * / \ / \ / \ + * 2 6 10 3 7 10 + * / \ / \ / \ / \ / \ / \ + * 1 3 5 7 9 11 2 4 6 8 11 12 + * + * or shift it by one rank (if nranks is even) + * + * 8---------0--------------9 + * ______/ \ ______/ \ + * 4 \ 5 \ + * / \ \ / \ \ + * 2 6 10 3 7 11 + * / \ / \ / \ / \ / \ / \ + * 1 3 5 7 9 11 2 4 6 8 10 1 + */ +ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) { + // First tree ... use a btree + ncclGetBtree(nranks, rank, s0, d0_0, d0_1); + // Second tree ... mirror or shift + if (nranks % 2 == 0) { + // shift + int shiftrank = (rank-1+nranks) % nranks; + int u, d0, d1; + ncclGetBtree(nranks, shiftrank, &u, &d0, &d1); + *s1 = u == -1 ? -1 : (u+1) % nranks; + *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; + *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; + } else { + // mirror + int u, d0, d1; + ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1); + *s1 = u == -1 ? -1 : nranks-1-u; + *d1_0 = d0 == -1 ? -1 : nranks-1-d0; + *d1_1 = d1 == -1 ? 
-1 : nranks-1-d1; + } + return ncclSuccess; +} diff --git a/projects/rccl/src/misc/utils.cu b/projects/rccl/src/misc/utils.cc similarity index 79% rename from projects/rccl/src/misc/utils.cu rename to projects/rccl/src/misc/utils.cc index c42b7ca122..614c78b936 100644 --- a/projects/rccl/src/misc/utils.cu +++ b/projects/rccl/src/misc/utils.cc @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,13 +12,31 @@ #include #include -ncclResult_t getHostName(char* hostname, int maxlen) { +#include "nvmlwrap.h" +#include "core.h" + +// Convert a logical cudaDev index to the NVML device minor number +ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) { + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + nvmlDevice_t nvmlDevice; + unsigned int dev; + *nvmlDev = -1; + CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice)); + NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev)); + + *nvmlDev = dev; + + return ncclSuccess; +} + +ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { if (gethostname(hostname, maxlen) != 0) { strncpy(hostname, "unknown", maxlen); return ncclSystemError; } int i = 0; - while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++; + while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++; hostname[i] = '\0'; return ncclSuccess; } @@ -30,7 +49,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file if (ncclDebugLevel <= NCCL_LOG_NONE) return; char hostname[1024]; - 
getHostName(hostname, 1024); + getHostName(hostname, 1024, '.'); int cudaDev; hipGetDevice(&cudaDev); @@ -95,8 +114,8 @@ uint64_t getnHash(const char* string, int n) { */ uint64_t getHostHash(void) { char uname[1024]; - // Start off with the hostname - (void) getHostName(uname, sizeof(uname)); + // Start off with the full hostname + (void) getHostName(uname, sizeof(uname), '\0'); int offset = strlen(uname); int len; // $(readlink /proc/self/ns/uts) @@ -138,8 +157,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) { if (!string) return 0; const char* ptr = string; - // Ignore "^" prefix, will be detected outside of this function - if (ptr[0] == '^') ptr++; + // Ignore "^" or "=" prefix, will be detected outside of this function + if (ptr[0] == '^' || ptr[0] == '=') ptr++; int ifNum = 0; int ifC = 0; @@ -168,8 +187,10 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) { return ifNum; } -static bool matchPrefix(const char* string, const char* prefix) { - return (strncmp(string, prefix, strlen(prefix)) == 0); +static bool matchIf(const char* string, const char* ref, bool matchExact) { + // Make sure to include '\0' in the exact case + int matchLen = matchExact ? 
strlen(string) + 1 : strlen(ref); + return strncmp(string, ref, matchLen) == 0; } static bool matchPort(const int port1, const int port2) { @@ -180,12 +201,12 @@ static bool matchPort(const int port1, const int port2) { } -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) { +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { // Make an exception for the case where no user list is defined if (listSize == 0) return true; for (int i=0; irings+ringid; - ring->id = ringid; - - // Setup intermediate buffering - ring->buffSize = ncclParamBuffsize(); - - // attempt to allocate buffers in fine grain - const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem); - struct ncclSendMem* sendMem; - ncclCudaCalloc((char**)&sendMem, sendSize, true); - ring->devMemSend = sendMem; - - const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - struct ncclRecvMem* recvMem; - ncclCudaCalloc((char**)&recvMem, recvSize, true); - ring->devMemRecv = recvMem; - - TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize); - - // Pre-configure send/recv pointers. Those are the default, they may change later. - if (recvMem){ - ring->recv.conn.buff = recvMem->buff; - ring->recv.conn.llBuff = recvMem->llBuff; - ring->recv.conn.tail = &recvMem->tail; - ring->recv.conn.opCount = &recvMem->opCount; - } else { - ring->recv.conn.buff = 0; - ring->recv.conn.llBuff = 0; - ring->recv.conn.tail = 0; - ring->recv.conn.opCount = 0; - } - ring->recv.conn.direct = 0; - - if (sendMem) { - ring->send.conn.head = &sendMem->head; - ring->send.conn.llHead = &sendMem->llHead; - } else { - ring->send.conn.head = 0; - ring->send.conn.llHead = 0; - } - ring->send.conn.direct = 0; - ring->send.conn.llStep = 0; - ring->send.conn.llLastCleaning = 0; - - // Ring index to user rank table. 
- NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks)); - NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks)); - - // Per-ring operation list. - NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); - return ncclSuccess; -} - -ncclResult_t freeRing(struct ncclRing* ring) { - // Intermediate buffering - CUDACHECK(hipFree(ring->devMemSend)); - CUDACHECK(hipFree(ring->devMemRecv)); - - // Index to rank table - free(ring->userRanks); - CUDACHECK(hipFree(ring->devUserRanks)); - - // Operation list - NCCLCHECK(ncclCudaHostFree(ring->collectives)); - - // Free transport proxy resources - if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources)); - NCCLCHECK(transportDestroyProxy(&ring->send)); - if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources)); - NCCLCHECK(transportDestroyProxy(&ring->recv)); - return ncclSuccess; -} diff --git a/projects/rccl/src/transport.cc b/projects/rccl/src/transport.cc new file mode 100644 index 0000000000..3b08e377cf --- /dev/null +++ b/projects/rccl/src/transport.cc @@ -0,0 +1,249 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" + +extern struct ncclTransport p2pTransport; +extern struct ncclTransport shmTransport; +extern struct ncclTransport netTransport; + +struct ncclTransport ncclTransports[NTRANSPORTS] = { + p2pTransport, + shmTransport, + netTransport, +}; + +#define RECV 0 +#define SEND 1 + +static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { + if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; + + /* In chains, one rank does not need a proxy. Let's figure out which one it is */ + // Which index in the reorganized rings should we compare root against */ + const int myrank = 0, nextrank = 1, prevrank = nranks-1; + int index = pattern == ncclPatternPipelineFrom ? + /* no recv / no send if root = */ + /* bcast */ (type == RECV ? myrank : nextrank ): + /* reduce */ (type == RECV ? prevrank : myrank ); + int rank = ring->userRanks[index]; + return (root != rank); +} + +enum { proxyRecv=0, proxySend=1 }; + +#define PROXYARGS_ALLOCATE_SIZE 32 +struct ncclProxyPool { + struct ncclProxyPool *next; + struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; +}; + +ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) { + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs* elem; + pthread_mutex_lock(&state->mutex); + if (state->pool == NULL) { + // Allocate a new pool of elements + struct ncclProxyPool* newPool; + NCCLCHECK(ncclCalloc(&newPool, 1)); + struct ncclProxyArgs* newElems = newPool->elems; + // Chain newly allocated elements + for (int i=0; ipool = newElems; + // Save the pool memory block for later resource release + newPool->next = state->pools; + state->pools = newPool; + } + elem = state->pool; + state->pool = state->pool->next; + pthread_mutex_unlock(&state->mutex); + elem->next = elem->nextPeer = NULL; + 
*argsptr = elem; + return ncclSuccess; +} + +static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) { + struct ncclComm* comm = connector->comm; + struct ncclProxyState* state = &comm->proxyState; + pthread_mutex_lock(&state->mutex); + if (connector->proxyAppend == NULL) { + // Nothing running for that peer. Add to the circular list + if (state->ops == NULL) { + // Create the list + args->next = args; + state->ops = args; + } else { + // Insert element in the list + args->next = state->ops->next; + state->ops->next = args; + } + connector->proxyAppend = args; + } else { + // There is an active operation already for that peer. + // Add it to the per-peer list + connector->proxyAppend->nextPeer = args; + connector->proxyAppend = args; + } + pthread_mutex_unlock(&state->mutex); +} + +template +static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) { + if (peer < 0) return ncclSuccess; + + struct ncclPeer* peerComm = args->channel->peers+peer; + struct ncclConnector* connector = type == proxyRecv ? 
&peerComm->recv : &peerComm->send; + if (connector->transportComm->proxy == NULL) return ncclSuccess; + + struct ncclProxyArgs* op; + NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op)); + memcpy(op, args, sizeof(struct ncclProxyArgs)); + op->connector = connector; + op->progress = connector->transportComm->proxy; + op->state = ncclProxyOpReady; + ProxyAppend(connector, op); + return ncclSuccess; +} + +ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) { + if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { + struct ncclRing* ring = &args->channel->ring; + if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(ring->prev, args)); + if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(ring->next, args)); + } + if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { + // Tree up + struct ncclTree* tree = &args->channel->tree; + for (int i=0; i(tree->down[i], args)); + NCCLCHECK(SaveProxy(tree->up, args)); + } + if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { + // Tree down + struct ncclTree* tree = &args->channel->tree; + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(tree->down[i], args)); + NCCLCHECK(SaveProxy(tree->up, args)); + } + return ncclSuccess; +} + +void* persistentThread(void *comm_) { + struct ncclComm* comm = (struct ncclComm*)comm_; + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs* op = NULL; + ncclResult_t ret = ncclSuccess; + int idle = 1; + int idleSpin = 0; + while (1) { + do { + if (LOAD(comm->abortFlag)) return NULL; + if (op == NULL) { + pthread_mutex_lock(&state->mutex); + op = state->ops; + if (op == NULL) { + if (state->stop) { + // No more commands to process and proxy has been requested to stop + pthread_mutex_unlock(&state->mutex); + return NULL; + } + 
pthread_cond_wait(&state->cond, &state->mutex); + } + pthread_mutex_unlock(&state->mutex); + } + } while (op == NULL); + op->idle = 0; + if (op->state != ncclProxyOpNone) ret = op->progress(op); + if (ret != ncclSuccess) { + comm->fatalError = ret; + INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); + return NULL; + } + idle &= op->idle; + pthread_mutex_lock(&state->mutex); + if (!idle) idleSpin = 0; + struct ncclProxyArgs *next = op->next; + if (next->state == ncclProxyOpNone) { + struct ncclProxyArgs *freeOp = next; + if (next->nextPeer) { + // Replace next by its next per-peer element. + next = next->nextPeer; + if (op != freeOp) { + next->next = freeOp->next; + op->next = next; + } else { + next->next = next; + } + } else { + // Remove next from circular list + next->connector->proxyAppend = NULL; + if (op != freeOp) { + next = next->next; + op->next = next; + } else { + next = NULL; + } + } + if (freeOp == state->ops) state->ops = next; + freeOp->next = state->pool; + state->pool = freeOp; + } + op = next; + if (op == state->ops) { + if (idle == 1) { + if (++idleSpin == 10) { + sched_yield(); + idleSpin = 0; + } + } + idle = 1; + } + pthread_mutex_unlock(&state->mutex); + } +} + +ncclResult_t transportStartProxy(struct ncclComm* comm) { + pthread_mutex_lock(&comm->proxyState.mutex); + if (comm->proxyState.ops != NULL) + pthread_cond_signal(&comm->proxyState.cond); + pthread_mutex_unlock(&comm->proxyState.mutex); + return ncclSuccess; +} + +ncclResult_t transportCreateProxy(struct ncclComm* comm) { + if (!comm->proxyThread) { + comm->proxyState.cond = PTHREAD_COND_INITIALIZER; + comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER; + comm->proxyState.ops = NULL; + pthread_create(&comm->proxyThread, NULL, persistentThread, comm); + } + return ncclSuccess; +} + +ncclResult_t transportDestroyProxy(struct ncclComm* comm) { + struct ncclProxyState* state = &comm->proxyState; + + // Request the proxy to stop and then wake it + 
pthread_mutex_lock(&state->mutex); + state->stop = true; + pthread_cond_signal(&state->cond); + pthread_mutex_unlock(&state->mutex); + if (comm->proxyThread) pthread_join(comm->proxyThread, NULL); + + // Free off any memory allocated for the proxy arg pools + pthread_mutex_lock(&state->mutex); + struct ncclProxyState* proxyState = &comm->proxyState; + while (proxyState->pools != NULL) { + struct ncclProxyPool *next = proxyState->pools->next; + free(proxyState->pools); + proxyState->pools = next; + } + pthread_mutex_unlock(&state->mutex); + + return ncclSuccess; +} diff --git a/projects/rccl/src/transport.cu b/projects/rccl/src/transport.cu deleted file mode 100644 index 240453cb3d..0000000000 --- a/projects/rccl/src/transport.cu +++ /dev/null @@ -1,190 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "common_coll.h" -#include -#include - -extern struct ncclTransport p2pTransport; -extern struct ncclTransport shmTransport; -extern struct ncclTransport netTransport; - -struct ncclTransport ncclTransports[NTRANSPORTS] = { - p2pTransport, - shmTransport, - netTransport, -}; - -static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) { - struct ncclProxyArgs *fifoArgs = info->argsFifo + (LOAD(&info->argsFifoHead) % TRANSPORT_PROXY_FIFO_SIZE); - pthread_mutex_lock(&info->mutex); - while (LOAD(&fifoArgs->active) == 0) - pthread_cond_wait(&info->cond, &info->mutex); - __sync_synchronize(); - memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs)); - __sync_synchronize(); - STORE(&fifoArgs->active, 0); - pthread_cond_signal(&info->cond); - pthread_mutex_unlock(&info->mutex); - __atomic_fetch_add(&info->argsFifoHead, 1, __ATOMIC_SEQ_CST); -} - -static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) { - if (info == NULL) return NULL; - struct ncclProxyArgs* fifoArgs = info->argsFifo + (LOAD(&info->argsFifoTail) % TRANSPORT_PROXY_FIFO_SIZE); - pthread_mutex_lock(&info->mutex); - while (LOAD(&fifoArgs->active) == 1) - pthread_cond_wait(&info->cond, &info->mutex); - pthread_mutex_unlock(&info->mutex); - __atomic_fetch_add(&info->argsFifoTail, 1, __ATOMIC_SEQ_CST); - return fifoArgs; -} - -static void FifoPushArgs(struct transportProxyInfo* info) { - if (info == NULL) return; - - struct ncclProxyArgs* fifoArgs = info->argsFifo + ((LOAD(&info->argsFifoTail)-1) % TRANSPORT_PROXY_FIFO_SIZE); - if (LOAD(&fifoArgs->active) == 0) return; - - pthread_mutex_lock(&info->mutex); - pthread_cond_signal(&info->cond); - pthread_mutex_unlock(&info->mutex); -} - -static void WaitProxyReady(struct transportProxyInfo* info) { - pthread_mutex_lock(&info->mutex); - while (LOAD(&info->proxyReady) 
== 0) - pthread_cond_wait(&info->cond, &info->mutex); - pthread_mutex_unlock(&info->mutex); -} - -static void SetProxyReady(struct transportProxyInfo* info) { - pthread_mutex_lock(&info->mutex); - STORE(&info->proxyReady, 1); - pthread_cond_signal(&info->cond); - pthread_mutex_unlock(&info->mutex); -} - -static void StopProxy(struct transportProxyInfo* info) { - struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info); - STORE(&fifoArgs->active, -1); - FifoPushArgs(info); -} - -#define RECV 0 -#define SEND 1 - -static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) { - enum proxyMode mode = proxyPatternMode(pattern); - if (mode == proxyRing) return true; - - /* In chains, one rank does not need a proxy. Let's figure out which one it is */ - int root = proxyPatternRoot(pattern); - // Which index in the reorganized rings should we compare root against */ - const int myrank = 0, nextrank = 1, prevrank = nranks-1; - int index = mode == proxyFrom ? - /* no recv / no send if root = */ - /* bcast */ (type == RECV ? myrank : nextrank ): - /* reduce */ (type == RECV ? prevrank : myrank ); - int rank = ring->userRanks[index]; - return (root != rank); -} - -static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) { - struct transportProxyInfo* info = connector->proxyInfo; - if (info == NULL) return; - struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info); - args->needProxy = needProxy; - __sync_synchronize(); - memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs)); - __sync_synchronize(); - STORE(&fifoArgs->active, 1); -} - -ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) { - int llMode, nrings, nthreads; - ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode); - nbytes = llMode ? nbytes * 2 : nbytes; - substeps = llMode ? 1 : substeps; - subchunks = llMode ? 
NCCL_LL_CHUNKS : subchunks; - int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize; - - int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow - int nsteps = nstepsPerRound * nrounds * substeps; - TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm); - TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm); - for (int r=0; rrings+((comm->myParams->gridDim.x+r)%comm->nRings); - struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 }; - SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks)); - SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks)); - } - return ncclSuccess; -} - -ncclResult_t transportStartProxies(ncclComm* comm) { - for (int r=0; rnRings; r++) { - FifoPushArgs(comm->rings[r].send.proxyInfo); - FifoPushArgs(comm->rings[r].recv.proxyInfo); - } - pthread_yield(); // Let other threads run - return ncclSuccess; -} - -void* persistentThread(void *opaqueInfo) { - struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo; - // We need to initialize the context before launching any NCCL cuda kernel, - // otherwise we would create it during the first hipMemcpyAsync inside the - // proxy function and that would cause a deadlock - hipSetDevice(info->comm->cudaDev); - // Signal the main thread the context is created and it can proceed. 
- SetProxyReady(info); - while (1) { - struct ncclProxyArgs args; - FifoPullArgs(info, &args); - if (args.active == -1) { - // Main thread asked to stop - return NULL; - } - ncclResult_t res = info->func(&args); - if (res != ncclSuccess) { - WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res); - } - } -} - -ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) { - struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send; - threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy); - if (proxyfunc) { - TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm); - struct transportProxyInfo* info; - NCCLCHECK(ncclCalloc(&info, 1)); - connector->proxyInfo = info; - info->comm = comm; - info->cond = PTHREAD_COND_INITIALIZER; - info->mutex = PTHREAD_MUTEX_INITIALIZER; - info->func = proxyfunc; - STORE(&info->argsFifoHead, 0); STORE(&info->argsFifoTail, 0); - STORE(&info->proxyReady, 0); - pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info); - // Wait for thread to initialize its CUDA context. - WaitProxyReady(info); - } - return ncclSuccess; -} - -ncclResult_t transportDestroyProxy(struct ncclConnector* connector) { - if (connector->proxyInfo) { - StopProxy(connector->proxyInfo); - pthread_join(connector->proxyInfo->thread, NULL); - free(connector->proxyInfo); - connector->proxyInfo = NULL; - } - return ncclSuccess; -} diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc new file mode 100644 index 0000000000..7991754f65 --- /dev/null +++ b/projects/rccl/src/transport/net.cc @@ -0,0 +1,574 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "transport.h" +#include "nvmlwrap.h" +#include "net.h" +#include "param.h" +#include "topo.h" +#include +#include + +#define NET_MAX_IFS 16 +#define NET_MAX_GPUS 32 + +// Cache GPU-NIC distances to avoid re-computing them +#define NET_TVALUE_UNKNOWN 0ULL +static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN }; +static int ncclNetNDev; + +// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit) +#define NET_BITS_PER_IF 3 +#define NET_BITS_PER_IF_MASK ((1<= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t"); +static ncclTvalue_t getTvalue(short* distances, int ndev) { + ncclTvalue_t tvalue = 0; + for (int d=0; d> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK; +} + +struct netConnectInfo { + ncclNetHandle_t netHandle; +}; + +struct netSendResources { + void* netSendComm; + struct ncclSendMem* hostSendMem; + struct ncclRecvMem* hostRecvMem; + struct ncclSendMem* devHostSendMem; + struct ncclRecvMem* devHostRecvMem; + int netDev; + int useGdr; + int buffSize; + void* mhandle; + void* llMhandle; + struct ncclRecvMem* devRecvMem; + uint64_t step; + uint64_t llLastCleaning; +}; + +struct netRecvResources { + void* netListenComm; + void* netRecvComm; + struct ncclSendMem* hostSendMem; + struct ncclRecvMem* hostRecvMem; + struct ncclSendMem* devHostSendMem; + struct ncclRecvMem* devHostRecvMem; + int netDev; + int useGdr; + int buffSize; + void* mhandle; + void* llMhandle; + struct ncclRecvMem* devRecvMem; + uint64_t step; + uint64_t llLastCleaning; + uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) +}; + +static ncclResult_t netDistance(int cudaDev, int dev, short* distance) { + char* cudaPath = NULL; + char* nicPath = NULL; + ncclResult_t err; + NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); + err = ncclNetPciPath(dev, 
&nicPath); + *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath); + if (nicPath) free(nicPath); + if (cudaPath) free(cudaPath); + return ncclSuccess; +} + +static ncclResult_t netDevices(int* ndev, short** distances) { + NCCLCHECK(ncclNetDevices(ndev)); + if (*ndev == 0) { + WARN("Error : Network returned 0 device"); + return ncclSystemError; + } + if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS; + + *distances = (short*)malloc(*ndev*sizeof(short)); + if (*distances == NULL) return ncclSystemError; + + // Find distance with current GPU + int cudaDev, nvmlDev; + CUDACHECK(hipGetDevice(&cudaDev)); + NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) + char line[1024]; + sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName()); + for (int d=0; d<*ndev; d++) { + NCCLCHECK(netDistance(cudaDev, d, *distances+d)); + sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]); + } + INFO(NCCL_INIT|NCCL_NET, "%s", line); + return ncclSuccess; +} + +/* Determine if we can communicate with the peer */ +ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + ret[0] = ncclNetTvalues[cudaDev]; + if (ret[0] == NET_TVALUE_UNKNOWN) { + if (cudaDev >= NET_MAX_GPUS) { + WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS); + return ncclInternalError; + } + int nDev; + short* distances; + NCCLCHECK(netDevices(&nDev, &distances)); + ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev); + ncclNetNDev = nDev; + free(distances); + } + return ncclSuccess; +} + +static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) { + int bestRank = -1; + int bestScore = 0; + for (int rank=0; rank>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; + if (score >= minScore && score > bestScore) { + bestScore = score; + bestRank = rank; + } + // All 
other values should be the same, stop here for this rank + break; + } + } + } + return bestRank; +} +static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) { + // For the last rank, we don't need the absolute best score, just to be within minScore. + for (int rank=nranks-1; rank>=0; rank--) { + if (groups[rank] != group) continue; + if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue; + if (startRank == rank) continue; + for (int i=0; i>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; + if (score >= minScore) { + return rank; + } + // All other values should be the same, stop here for this rank + break; + } + } + } + return -1; +} + +ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { + int nGroups = groups[nranks-1] + 1; + int *cardUsed, *starts, *ends; + NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups)); + NCCLCHECK(ncclCalloc(&starts, nGroups)); + NCCLCHECK(ncclCalloc(&ends, nGroups)); + + for (int ring = 0; ring<*nringsRet; ring++) { + for (int group = 0; group maxScore) maxScore = getScore(tvalues,d); + int skip = ringId+1; + while (skip) { + for (int d=0; d= netGdrLevel) { + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel); + return ncclSuccess; + } + + // Finally, check if the NIC supports it + int flags; + NCCLCHECK(ncclNetPtrSupport(dev, &flags)); + if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess; + *useGdr = 1; + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read); + return ncclSuccess; +} + +/* Determine if we will use this transport for this peer and return connect + * information for this peer */ 
+ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + struct netSendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + send->transportResources = resources; + + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + resources->netDev = getDev(cudaDev, channelId); + NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr)); + + int sendSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); + + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + if (resources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true)); + } + NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); + resources->buffSize = buffSize; + + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev, + resources->useGdr ? 
"/GDRDMA" : ""); + return ncclSuccess; +} + +ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { + struct netRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; + + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + resources->netDev = getDev(cudaDev, channelId); + NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr)); + + int sendSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); + + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + if (resources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true)); + CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); + } + NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); + resources->buffSize = buffSize; + + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, + resources->useGdr ? "/GDRDMA" : ""); + struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; + NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); + return ncclSuccess; +} + +ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { + // Setup device pointers + struct netSendResources* resources = (struct netSendResources*)send->transportResources; + + // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host + struct ncclRecvMem* recvMem = resources->useGdr ? 
resources->devRecvMem : resources->devHostRecvMem; + send->conn.buff = recvMem->buff; + send->conn.llBuff = resources->devHostRecvMem->llBuff; + + // Head/Tail/Opcount/Fifos are always on host + send->conn.tail = &resources->devHostRecvMem->tail; + send->conn.opCountRem = &resources->devHostRecvMem->opCount; + send->conn.fifo = resources->devHostRecvMem->sizesFifo; + send->conn.head = &resources->devHostSendMem->head; + send->conn.opCountLoc = &resources->devHostSendMem->opCount; + for (int i=0; iconn.fifo[i] = -1; + + // Connect to remote peer + struct netConnectInfo* info = (struct netConnectInfo*)connectInfo; + NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm)); + + NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); + NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff, + NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle)); + + return ncclSuccess; +} + +/* Connect to this peer */ +ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { + // Setup device pointers + struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources; + + // Intermediate buffering on GPU for GPU Direct RDMA + struct ncclRecvMem* recvMem = resources->useGdr ? 
resources->devRecvMem : resources->devHostRecvMem; + recv->conn.buff = recvMem->buff; + recv->conn.llBuff = recvMem->llBuff; + + // Head/Tail/Opcount are always on host + recv->conn.tail = &resources->devHostRecvMem->tail; + recv->conn.opCountLoc = &resources->devHostRecvMem->opCount; + recv->conn.head = &resources->devHostSendMem->head; + recv->conn.opCountRem = &resources->devHostSendMem->opCount; + + // Finish connection establishment from remote peer + NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); + + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle)); + + return ncclSuccess; +} + +ncclResult_t netSendFree(void* transportResources) { + struct netSendResources* resources = (struct netSendResources*)transportResources; + NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle)); + NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); + if (resources->useGdr) + CUDACHECK(hipFree(resources->devRecvMem)); + NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + free(resources); + return ncclSuccess; +} + +ncclResult_t netRecvFree(void* transportResources) { + struct netRecvResources* resources = (struct netRecvResources*)transportResources; + NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle)); + NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); + if (resources->useGdr) + CUDACHECK(hipFree(resources->devRecvMem)); + 
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + free(resources); + return ncclSuccess; +} + +ncclResult_t netSendProxy(struct ncclProxyArgs* args) { + struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources); + if (args->state == ncclProxyOpReady) { + // Update opCount + resources->hostRecvMem->opCount = args->opCount; + + // Round to next multiple of sliceSteps + resources->step = ROUNDUP(resources->step, args->chunkSteps); + args->head = resources->step; + args->tail = resources->step; + args->end = args->head + args->nsteps; + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + args->idle = 1; + if (args->head < args->end) { + if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { + volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->hostRecvMem->tail; + if (args->llMode) { + int buffSlot = args->tail%NCCL_STEPS; + int size = LOAD(sizesFifo+buffSlot); + if (size != -1) { + uint32_t flag = NCCL_LL_FLAG(args->tail + 1); + int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); + size = nFifoLines * sizeof(union ncclLLFifoLine); + union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; + int ready = 1; + for (int i=0; inetSendComm, lines, size, resources->llMhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + STORE(sizesFifo+buffSlot, -1); + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + } else if (args->tail < LOAD(recvTail)) { + struct ncclRecvMem* localMem = resources->useGdr ? 
resources->devRecvMem : resources->hostRecvMem; + int stepSize = args->channel->buffSize/NCCL_STEPS; + // Send through network + int buffSlot = args->tail%NCCL_STEPS; + NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + STORE(sizesFifo+buffSlot, -1); + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + if (args->head < args->tail) { + int done; + int buffSlot = args->head%NCCL_STEPS; + NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL)); + if (done) { + args->head += args->sliceSteps; + STORE(&resources->hostSendMem->head, args->head); + args->idle = 0; + } + } + } + if (args->head == args->end) { + resources->step = args->end; + args->idle = 0; + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + +ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { + struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources); + if (args->state == ncclProxyOpReady) { + // Update opCount + resources->hostSendMem->opCount = args->opCount; + + // Round to next multiple of sliceSteps + resources->step = ROUNDUP(resources->step, args->chunkSteps); + args->head = resources->step; + args->tail = resources->step; + args->end = args->head + args->nsteps; + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + args->idle = 1; + int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS; + if (args->head < args->end) { + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff; + void* mhandle = args->llMode ? 
resources->llMhandle : resources->mhandle; + volatile uint64_t* sendHead = &resources->hostSendMem->head; + if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) { + int buffSlot = args->tail%NCCL_STEPS; + int sliceSize = stepSize * args->sliceSteps; + NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + args->tail += args->sliceSteps; + args->idle = 0; + } + } + if (args->tail > args->head) { + int buffSlot = args->head%NCCL_STEPS; + int done, size; + NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size)); + if (done) { + args->head += args->sliceSteps; + if (args->llMode == 0) { + if (resources->useGdr) { + ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle); + // Flush local HDP register after local read-back finishes + STORE(resources->curr_hdp_reg, 0x1); + TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", resources->curr_hdp_reg); + } + STORE(&resources->hostRecvMem->tail, args->head); + } + args->idle = 0; + } + } + } + if (args->head == args->end) { + resources->step = args->end; + args->idle = 0; + args->state = ncclProxyOpNone; + } + } + return ncclSuccess; +} + +struct ncclTransport netTransport = { + "NET", + netCanConnect, + netGetRings, + { netSendSetup, netSendConnect, netSendFree, netSendProxy }, + { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } +}; diff --git a/projects/rccl/src/transport/net.cu b/projects/rccl/src/transport/net.cu deleted file mode 100644 index 1c09c91378..0000000000 --- a/projects/rccl/src/transport/net.cu +++ /dev/null @@ -1,584 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "transport.h" -#include "nvmlwrap.h" -#include "net.h" -#include "param.h" -#include -#include -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) -#include "nvlink_stub.h" -#else -#include "nvlink.h" -#endif - -#define NET_MAX_IFS 16 - -// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit) -#define NET_BITS_PER_IF 3 -#define NET_BITS_PER_IF_MASK ((1<= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t"); -static ncclTvalue_t getTvalue(short* distances, int ndev) { - ncclTvalue_t tvalue = 0; - for (int d=0; drank = rank; - NCCLCHECK(ncclNetDevices(&info->ndev)); - if (info->ndev == 0) { - WARN("Error : Network returned 0 device"); - return ncclSystemError; - } - if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS; - - // Find distance with current GPU - int cudaDev; - hipGetDevice(&cudaDev); - char* cudaPath; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); - - char line[1024]; - sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName()); - for (int d=0; dndev; d++) { - char* nicPath; - ncclResult_t err = ncclNetPciPath(d, &nicPath); - info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? 
PATH_SOC : pciDistance(nicPath, cudaPath); - sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]); - if (err == ncclSuccess) free(nicPath); - } - INFO(NCCL_INIT|NCCL_NET, "%s", line); - free(cudaPath); - return ncclSuccess; -} - -/* Determine if we can communicate with the peer */ -ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { - struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - ret[0] = getTvalue(myInfo->distances, myInfo->ndev); - return ncclSuccess; -} - -static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) { - int bestRank = -1; - int bestScore = 0; - for (int rank=0; rank>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; - if (score >= minScore && score > bestScore) { - bestScore = score; - bestRank = rank; - } - // All other values should be the same, stop here for this rank - break; - } - } - } - return bestRank; -} -static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) { - // For the last rank, we don't need the absolute best score, just to be within minScore. 
- for (int rank=nranks-1; rank>=0; rank--) { - if (groups[rank] != group) continue; - if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue; - if (startRank == rank) continue; - for (int i=0; i>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; - if (score >= minScore) { - return rank; - } - // All other values should be the same, stop here for this rank - break; - } - } - } - return -1; -} - - -ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - int nGroups = groups[nranks-1] + 1; - int cardUsed[NET_MAX_IFS*nGroups]; - for (int c=0; c= netGdrLevel) { - INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel); - return ncclSuccess; - } - - // Finally, check if the NIC supports it - int flags; - NCCLCHECK(ncclNetPtrSupport(dev, &flags)); - if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess; - *useGdr = 1; - INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read); - return ncclSuccess; -} - -/* Determine if we will use this transport for this peer and return connect - * information for this peer */ -ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct netSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - ring->send.transportResources = resources; - - struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances); - NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr)); - - int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - if (resources->useGdr) { - 
NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size, true)); - } - - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size)); - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size)); - - return ncclSuccess; -} - -ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct netRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - ring->recv.transportResources = resources; - - struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances); - NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr)); - - if (resources->useGdr) { - // Collect HDR register for local GPU to initiate flush after receive - int cudaDev; - hipGetDevice(&cudaDev); - CUDACHECK(hipDeviceGetAttribute((int*)&ring->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, cudaDev)); - } - - int sendSize = sizeof(struct ncclSendMem); - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); - - int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); - - struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : "", - (resources->hostDevMem != NULL) ? 
"/GDCopy" : ""); - struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; - NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); - return ncclSuccess; -} - -ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { - // Setup device pointers - struct netSendResources* resources = (struct netSendResources*)send->transportResources; - - if (resources->useGdr) { - send->conn.buff = resources->devNetMem->buff; - // We don't use devMem for llMode because the CPU has to read the data - send->conn.llBuff = resources->devHostRecvMem->llBuff; - } else { - send->conn.buff = resources->devHostRecvMem->buff; - send->conn.llBuff = resources->devHostRecvMem->llBuff; - } - send->conn.tail = &resources->devHostRecvMem->tail; - send->conn.opCount = &resources->devHostRecvMem->opCount; - send->conn.fifo = resources->devHostRecvMem->sizesFifo; - send->conn.llFifo = resources->devHostRecvMem->llSizesFifo; - - if (resources->hostDevMem == NULL) { - send->conn.head = &resources->devHostSendMem->head; - send->conn.llHead = &resources->devHostSendMem->llHead; - } - - // Connect to remote peer - struct netConnectInfo* info = (struct netConnectInfo*)connectInfo; - NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm)); - return ncclSuccess; -} - -/* Connect to this peer */ -ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { - // Setup device pointers - struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources; - - recv->conn.head = &resources->devHostSendMem->head; - recv->conn.llHead = &resources->devHostSendMem->llHead; - - if (resources->useGdr == 0) { - recv->conn.buff = resources->devHostRecvMem->buff; - recv->conn.llBuff = resources->devHostRecvMem->llBuff; - } - - if (resources->hostDevMem == NULL) { - recv->conn.tail = &resources->devHostRecvMem->tail; - recv->conn.opCount = 
&resources->devHostRecvMem->opCount; - } - - // Finish connection establishment - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); - NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); - - return ncclSuccess; -} - -ncclResult_t netSendFree(void* transportResources) { - struct netSendResources* resources = (struct netSendResources*)transportResources; - NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); - NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); - if (resources->useGdr) - CUDACHECK(hipFree(resources->devNetMem)); - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); - free(resources); - return ncclSuccess; -} - -ncclResult_t netRecvFree(void* transportResources) { - struct netRecvResources* resources = (struct netRecvResources*)transportResources; - NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); - NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); - free(resources); - return ncclSuccess; -} - -ncclResult_t netSendProxy(struct ncclProxyArgs* args) { - struct ncclRing* ring = args->ring; - struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources); - const int llMode = args->llMode; - - volatile uint64_t* prevTail = &resources->hostRecvMem->tail; - struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem; - uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head; - struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem; - char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff; - int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo; - int buffSize = llMode ? 
NCCL_LL_BUFF_SIZE : ring->buffSize; - int sliceSize = buffSize / args->substeps; - - assert(args->substeps <= SIZES_FIFO_SIZE); - - uint64_t head = llMode ? resources->llStep : 0ULL; - uint64_t tail = llMode ? resources->llStep : 0ULL; - uint64_t end = head + args->nsteps; - - int idle = 0; - void* requests[args->substeps]; - - if (!args->needProxy) goto nextColl; - - TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); - TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); - - // Update in case we skipped some collectives - if (llMode == 0) resources->hostRecvMem->opCount = args->opCount; - - while (head < end) { - idle++; - if (llMode) { - if (tail < end && tail < head + args->substeps) { - int slot = tail%args->substeps; - int size = LOAD(&sizesFifo[slot]); - if (size != 0) { - if (size == -1) size = 0; - uint32_t flag = tail + 1; - int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); - size = nFifoLines * sizeof(union ncclLLFifoLine); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize); - for (int i=0; inetSendComm, lines, size, ptrType, requests+slot)); - if (requests[slot] != NULL) { - STORE(&sizesFifo[slot], size); - tail++; - idle = 0; - } - } - } - } else while (tail < LOAD(prevTail)) { - // Send through network - int slot = tail%args->substeps; - //TRACE(NCCL_NET,"head %d tail %d prevTail %d slot %d size %d ptrType %d", head, tail, LOAD(prevTail), slot, LOAD(&sizesFifo[slot]), ptrType); - NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, LOAD(&sizesFifo[slot]), ptrType, requests+slot)); - if (requests[slot] != NULL) { - tail++; - idle = 0; - } - } - if (head < tail) { - int done; - int slot = head%args->substeps; - NCCLCHECK(ncclNetTest(requests[slot], &done, NULL)); - if (done) { - if (llMode) { - STORE(&sizesFifo[slot], 0); - // Make sure size is reset to zero 
before we update the head. - __sync_synchronize(); - } - head++; - STORE(prevHead, head); - idle = 0; - } - } - if (idle) transportProxyIdle(idle); - } - - // Reset - if (llMode == 0) STORE(prevTail, 0); - -nextColl: - if (llMode) { - resources->llStep += args->nsteps; - // Don't forget to ack otherwise the GPU won't be able to push data. - STORE(prevHead, resources->llStep); - if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - memset(localBuff, 0, NCCL_LL_BUFF_SIZE); - resources->llStep += NCCL_LL_CHUNKS; - STORE(prevHead, resources->llStep); - resources->llLastCleaning = resources->llStep; - } - } - return ncclSuccess; -} - -ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { - struct ncclRing* ring = args->ring; - struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources); - int llMode = args->llMode; - - volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head; - struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem; - char* localBuff = llMode ? localMem->llBuff : localMem->buff; - char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL; - int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail; - - int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; - int sliceSize = buffSize / args->substeps; - - uint64_t head = llMode ? resources->llStep : 0ULL; - uint64_t tail = llMode ? 
resources->llStep : 0ULL; - uint64_t end = head + args->nsteps; - - int idle = 0; - void* requests[args->substeps]; - - if (!args->needProxy) goto nextColl; - - TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); - TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); - - if (llMode == 0) { - // Waiting for next opCount is only needed before writing nextTail. - uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount; - transportProxyWait([=] { return *nextOpCount >= args->opCount; }); - } - - while (head < end) { - idle++; - if ((tail < head + args->substeps) && (tail < LOAD(nextHead) + args->substeps) && (tail < end)) { - int slot = tail%args->substeps; - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot)); - if (requests[slot] != NULL) { - tail++; - idle = 0; - } - } - if (tail > head) { - int done; - int slot = head%args->substeps; - int size; - NCCLCHECK(ncclNetTest(requests[slot], &done, &size)); - if (done) { - if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size); - head++; - if (llMode == 0) { - if (ptrType == NCCL_PTR_CUDA) { - ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size); - - // Flush local HDP register after local read-back finishes - STORE(ring->curr_hdp_reg, 0x1); - TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", ring->curr_hdp_reg); - } - //TRACE(NCCL_NET,"head %d tail %d slot %d size %d ptrType %d", head, tail, slot, size, ptrType); - STORE(nextTail, head); - } - idle = 0; - } - } - if (idle) transportProxyIdle(idle); - } - - // Wait for last ack and reset - if (llMode == 0) { - transportProxyWait([=] { return LOAD(nextHead) == head; }); - STORE(nextHead, 0); - } - -nextColl: - if (llMode) { - resources->llStep += args->nsteps; - if (resources->llStep > 
resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - resources->llStep += NCCL_LL_CHUNKS; - while (LOAD(nextHead) < resources->llStep); - resources->llLastCleaning = resources->llStep; - } - } - return ncclSuccess; -} - -struct ncclTransport netTransport = { - "NET", - netFillInfo, - netCanConnect, - netGetRings, - { netSendSetup, netSendConnect, netSendFree, netSendProxy }, - { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } -}; diff --git a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cc similarity index 78% rename from projects/rccl/src/transport/net_ib.cu rename to projects/rccl/src/transport/net_ib.cc index cbe2f9c45f..bfb2d8d437 100644 --- a/projects/rccl/src/transport/net_ib.cu +++ b/projects/rccl/src/transport/net_ib.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -33,6 +33,7 @@ static int ncclNIbDevs = -1; struct ncclIbDev { int device; uint8_t port; + uint8_t link; ibv_context* context; char devName[MAXNAMESIZE]; }; @@ -98,7 +99,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { WARN("NET/IB : No IP interface found."); return ncclInternalError; } - INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); // Detect IB cards int nIbDevs; @@ -108,53 +108,67 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { char* userIbEnv = getenv("NCCL_IB_HCA"); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; + bool searchExact = userIbEnv && userIbEnv[0] == '='; int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError; - for (int d=0; dname); continue; } - int found = 0; - if (context) { - struct ibv_device_attr devAttr; - if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { - WARN("NET/IB : Unable to query device %s", devices[d]->name); + int nPorts = 0; + struct ibv_device_attr devAttr; + memset(&devAttr, 0, sizeof(devAttr)); + if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { + WARN("NET/IB : Unable to query device %s", devices[d]->name); + if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } + continue; + } + for (int port = 1; port <= devAttr.phys_port_cnt; port++) { + struct ibv_port_attr portAttr; + if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) { + WARN("NET/IB : Unable to query port %d", port); continue; } - for (int port = 1; port <= devAttr.phys_port_cnt; port++) { - struct ibv_port_attr portAttr; - if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) { - WARN("NET/IB : Unable to query port %d", port); - continue; - } - if (portAttr.state != IBV_PORT_ACTIVE) continue; - if (portAttr.link_layer != 
IBV_LINK_LAYER_INFINIBAND - && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; + if (portAttr.state != IBV_PORT_ACTIVE) continue; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND + && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; - // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) { - continue; - } - INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); - ncclIbDevs[ncclNIbDevs].device = d; - ncclIbDevs[ncclNIbDevs].port = port; - ncclIbDevs[ncclNIbDevs].context = context; - strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - ncclNIbDevs++; - found++; - pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + // check against user specified HCAs/ports + if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) { + continue; } - - if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } + TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, + portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE"); + ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].port = port; + ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + ncclIbDevs[ncclNIbDevs].context = context; + strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + ncclNIbDevs++; + nPorts++; + pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); } + if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; }; } + if (ncclNIbDevs == 0) { + INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found."); + } else { + char line[1024]; + line[0] = '\0'; + for (int d=0; dfd, &qpInfo, sizeof(qpInfo))); @@ -542,7 +549,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** r->used = 1; r->type = 0; r->verbs = NULL; - r->ibMr = NULL; r->done = 0; r->size = -1; r->free = 0; @@ -588,57 +594,34 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size); #define REG_ALIGN (4096) -// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv -ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) { +ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { + struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; uint64_t addr = (uint64_t)data; - int elem = -1; assert(size > 0); - // Look for an already existing MR - for (int i=0; imrPool[i].mr == NULL) continue; - uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr; - uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length; - if (regAddr <= addr && addr+size <= regAddr+regSize) { - *mrRet = verbs->mrPool+i; - verbs->mrPool[i].refcnt++; - return ncclSuccess; - } - } - - // Find an unused element - if (elem == -1) { - elem = (verbs->mrRotation++); - for (int i=0; imrPool[elem].refcnt > 0) elem++; else break; - } - if (verbs->mrPool[elem].refcnt > 0) { - 
WARN("NET/IB : memory register : no MR available"); - return ncclInternalError; - } - } - - assert(elem < MAX_REQUESTS); - assert(verbs->mrPool[elem].refcnt == 0); - // Deregister / register uint64_t regAddr = addr & (~(REG_ALIGN-1)); uint64_t regSize = addr+size - regAddr; regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN; - if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr)); - NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); - *mrRet = verbs->mrPool+elem; - verbs->mrPool[elem].refcnt++; - TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); + struct ibv_mr* mr; + NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + *mhandle = (void*)mr; + TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey); return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) { +ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { + NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle)); + return ncclSuccess; +} + +ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } + struct ibv_mr* mr = (struct ibv_mr*)mhandle; + // Wait for the receiver to have posted the corresponding receive volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS); volatile uint32_t * readyPtr = &slot->ready; @@ -646,7 +629,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(comm->reqs, &req)); - req->type = type; 
req->verbs = &comm->verbs; req->size = size; @@ -659,8 +641,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** wr.sg_list = NULL; wr.num_sge = 0; } else { - NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr)); - sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey; + sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey; wr.sg_list = &sge; wr.num_sge = 1; } @@ -671,22 +652,22 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** __sync_synchronize(); // order the readyPtr load against rkey load below // Sanity checks to catch user collective call count/size mismatches // plus any potential programming errors - if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) { + if (size > LOAD(&slot->size) || LOAD(&slot->size) <= 0 || LOAD(&slot->addr) == 0 || LOAD(&slot->rkey) == 0 || LOAD(&slot->seq) != comm->fifoHead) { WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x", - size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead); + size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead); return ncclInternalError; } wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - wr.wr.rdma.remote_addr = slot->addr; - wr.wr.rdma.rkey = slot->rkey; + wr.wr.rdma.remote_addr = LOAD(&slot->addr); + wr.wr.rdma.rkey = LOAD(&slot->rkey); wr.imm_data = size; // Send the message size via imm_data __sync_synchronize(); #endif // We must clear slot->ready, but reset other fields to aid // debugging and sanity checks STORE(&slot->ready, 0); - slot->addr = 0ULL; - slot->rkey = slot->size = slot->seq = 0; + STORE(&slot->addr, 0); + STORE(&slot->rkey, 0); STORE(&slot->size, 0); STORE(&slot->seq, 0); comm->fifoHead++; struct ibv_send_wr* bad_wr; @@ -725,14 +706,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, 
uint64_t return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } + struct ibv_mr* mr = (struct ibv_mr*)mhandle; + struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(comm->reqs, &req)); - req->type = type; req->verbs = &comm->verbs; req->size = size; @@ -744,10 +726,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** if (size == 0) { wr.sg_list = NULL; wr.num_sge = 0; - req->ibMr = NULL; } else { - NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr)); - sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey; + sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey; wr.sg_list = &sge; wr.num_sge = 1; } @@ -757,25 +737,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** *request = req; // Post to FIFO to notify sender - NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size)); + NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size)); return ncclSuccess; } -ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) { +ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess; struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(comm->reqs, &req)); req->verbs = &comm->verbs; - NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr)); + struct ibv_mr* mr = (struct ibv_mr*)mhandle; struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = (uint64_t)req; wr.wr.rdma.remote_addr = (uint64_t)data; - wr.wr.rdma.rkey = 
req->ibMr->mr->rkey; + wr.wr.rdma.rkey = mr->rkey; wr.sg_list = &comm->gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; @@ -805,32 +785,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) { } int wrDone = 0; - struct ibv_wc wc; - NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone)); + struct ibv_wc wcs[4]; + NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone)); if (wrDone == 0) return ncclSuccess; - if (wc.status != IBV_WC_SUCCESS) { - WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err); - return ncclSystemError; - } + for (int w=0; wstatus != IBV_WC_SUCCESS) { + WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err); + return ncclSystemError; + } - struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id; - if (doneReq) { - if (wc.opcode == IBV_WC_RECV) { - doneReq->size = wc.byte_len; + struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id; + if (doneReq) { + if (wc->opcode == IBV_WC_RECV) { + doneReq->size = wc->byte_len; #if USE_RDMA_WRITE - } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) { - doneReq->size = wc.imm_data; + } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + doneReq->size = wc->imm_data; #endif - } - if (doneReq->ibMr != NULL) { - doneReq->ibMr->refcnt--; - if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt); - } - doneReq->done = 1; - if (doneReq->free == 1) { - // This is an internal (FIFO post) req. Free it immediately. - doneReq->used = 0; + } + doneReq->done = 1; + if (doneReq->free == 1) { + // This is an internal (FIFO post) req. Free it immediately. 
+ doneReq->used = 0; + } } } } @@ -842,12 +821,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) { close(comm->fd); if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp)); if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); - for (int i=0; iverbs.mrPool[i].mr != NULL) { - if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt); - NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr)); - } - } NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } @@ -864,12 +837,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr)); } if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr)); - for (int i=0; iverbs.mrPool[i].mr != NULL) { - if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt); - NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr)); - } - } NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } @@ -894,6 +861,8 @@ ncclNet_t ncclNetIb = { ncclIbListen, ncclIbConnect, ncclIbAccept, + ncclIbRegMr, + ncclIbDeregMr, ncclIbIsend, ncclIbIrecv, ncclIbFlush, diff --git a/projects/rccl/src/transport/net_socket.cc b/projects/rccl/src/transport/net_socket.cc new file mode 100644 index 0000000000..ec0e50d518 --- /dev/null +++ b/projects/rccl/src/transport/net_socket.cc @@ -0,0 +1,486 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "core.h" +#include "socket.h" +#include "net.h" +#include "param.h" + +#include +#include +#include +#include +#include +#include +#include + +/* Init functions */ +static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; +static union socketAddress ncclNetIfAddrs[MAX_IFS]; +static int ncclNetIfs = -1; +pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { + if (ncclNetIfs == -1) { + pthread_mutex_lock(&ncclSocketLock); + if (ncclNetIfs == -1) { + ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + if (ncclNetIfs <= 0) { + WARN("NET/Socket : no interface found"); + return ncclInternalError; + } else { + char line[1024]; + char addrline[1024]; + line[0] = '\0'; + for (int i=0; i= ncclNetIfs) return ncclInternalError; + memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); + return ncclSuccess; +} + +/* Communication functions */ + +#define MAX_SOCKETS 64 +#define MAX_THREADS 16 +#define MAX_REQUESTS 128 +#define MAX_QUEUE_LEN MAX_REQUESTS +#define MIN_CHUNKSIZE (64*1024) + +NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); +NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); + +struct ncclSocketHandle { + union socketAddress connectAddr; + int nSocks; + int nThreads; +}; + +struct ncclSocketTask { + int op; + void* data; + int size; + int fd; + int offset; + int used; + ncclResult_t result; +}; + +struct ncclSocketRequest { + int op; + void* data; + int size; + int ctrlFd; + int used; + struct ncclSocketComm* comm; + struct ncclSocketTask* tasks[MAX_SOCKETS]; + int nSubs; +}; + +struct ncclSocketTaskQueue { + int next; + struct ncclSocketTask* tasks; +}; + +enum threadState {start, stop}; + +struct ncclSocketThreadResources { + struct ncclSocketTaskQueue threadTaskQueue; + enum threadState state; + struct 
ncclSocketComm* comm; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; +}; + +struct ncclSocketListenComm { + int fd; + int nSocks; + int nThreads; +}; + +struct ncclSocketComm { + int ctrlFd; + int fds[MAX_SOCKETS]; + int nSocks; + int nThreads; + int nextFd; + struct ncclSocketRequest requests[MAX_REQUESTS]; + pthread_t helperThread[MAX_THREADS]; + struct ncclSocketThreadResources threadResources[MAX_THREADS]; +}; + +void* persistentSocketThread(void *args_) { + struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_; + struct ncclSocketComm* comm = resource->comm; + volatile enum threadState* state = &resource->state; + struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue; + int nSocksPerThread = comm->nSocks / comm->nThreads; + while (1) { + int idle = 1; + int mark = myQueue->next; // mark newest task seen + for (int i=0; itasks+i+j; + if (r != NULL && r->used == 1 && r->offset < r->size) { + r->result = socketProgress(r->op, r->fd, r->data, r->size, &r->offset); + if (r->result != ncclSuccess) { + WARN("NET/Socket : socket progress error"); + return NULL; + } + idle = 0; + if (r->offset < r->size) repeat = 1; + } + } + } while (repeat); + } + if (idle) { + pthread_mutex_lock(&resource->threadLock); + while (mark == myQueue->next && LOAD(state) != stop) { // no new tasks, wait + pthread_cond_wait(&resource->threadCond, &resource->threadLock); + } + pthread_mutex_unlock(&resource->threadLock); + } + if (LOAD(state) == stop) return NULL; + } +} + +ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { + int nSocksPerThread = ncclParamSocketNsocksPerThread(); + int nThreads = ncclParamSocketNthreads(); + if (nThreads > MAX_THREADS) { + WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS); + nThreads = MAX_THREADS; + } + if (nThreads == -2 || nSocksPerThread == -2) { + // Auto-detection + int autoNt=1, autoNs=1; + char vendorPath[PATH_MAX]; + 
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); + char* rPath = realpath(vendorPath, NULL); + int fd = open(rPath, O_RDONLY); + free(rPath); + if (fd == -1) { + // Could not find device vendor. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno)); + goto end; + } + char vendor[7]; + strncpy(vendor, "0x0000", 7); + int len; + SYSCHECKVAL(read(fd, vendor, 6), "read", len); + SYSCHECK(close(fd), "close"); + if (strcmp(vendor, "0x1d0f") == 0) { // AWS + autoNt = 2; + autoNs = 8; + } +end: + if (nThreads == -2) nThreads = autoNt; + if (nSocksPerThread == -2) nSocksPerThread = autoNs; + } + int nSocks = nSocksPerThread * nThreads; + if (nSocks > MAX_SOCKETS) { + nSocksPerThread = MAX_SOCKETS/nThreads; + WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread); + nSocks = nSocksPerThread * nThreads; + } + *ns = nSocks; + *nt = nThreads; + INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); + return ncclSuccess; +} + +ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) { + NCCLCHECK(ncclCalloc(comm, 1)); + (*comm)->fd = -1; + return ncclSuccess; +} + +ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { + NCCLCHECK(ncclCalloc(comm, 1)); + (*comm)->ctrlFd = -1; + for (int i=0; i < MAX_SOCKETS; i++) { + (*comm)->fds[i] = -1; + } + (*comm)->nextFd = 0; + return ncclSuccess; +} + +ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { + if (dev < 0) { // data transfer socket is based on specified dev + return ncclInternalError; + } + struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; + static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); + struct ncclSocketListenComm* 
comm; + NCCLCHECK(ncclSocketNewListenComm(&comm)); + NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr)); + NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); + handle->nSocks = comm->nSocks; + handle->nThreads = comm->nThreads; + *listenComm = comm; + return ncclSuccess; +} + +ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { + if (dev < 0) { // data transfer socket is based on specified dev + return ncclInternalError; + } + struct ncclSocketComm* comm; + NCCLCHECK(ncclSocketNewComm(&comm)); + struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; + comm->nSocks = handle->nSocks; + comm->nThreads = handle->nThreads; + for (int i=0; inSocks+1; i++) { + int tmpFd, offset=0; + NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr)); + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &i, sizeof(int), &offset)); + if (i == comm->nSocks) comm->ctrlFd = tmpFd; + else comm->fds[i] = tmpFd; + } + *sendComm = comm; + return ncclSuccess; +} + +ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { + struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm; + struct ncclSocketComm* rComm; + NCCLCHECK(ncclSocketNewComm(&rComm)); + rComm->nSocks = lComm->nSocks; + rComm->nThreads = lComm->nThreads; + for (int i=0; inSocks+1; i++) { + int tmpFd, sendSockIdx, offset=0; + struct sockaddr_in sockaddr; + socklen_t socklen = sizeof(struct sockaddr_in); + SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", tmpFd); + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &sendSockIdx, sizeof(int), &offset)); + if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd; + else rComm->fds[sendSockIdx] = tmpFd; + } + *recvComm = rComm; + return ncclSuccess; +} + +ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) { + for (int i=0; 
irequests+i; + if (r->used == 0) { + r->op = op; + r->data = data; + r->size = size; + r->ctrlFd = comm->ctrlFd; + r->used = 1; + r->comm = comm; + r->nSubs = 0; + *req = r; + return ncclSuccess; + } + } + WARN("NET/Socket : unable to allocate requests"); + return ncclInternalError; +} + +ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) { + int tid = comm->nextFd % comm->nThreads; + struct ncclSocketThreadResources* res = comm->threadResources+tid; + struct ncclSocketTaskQueue* queue = &res->threadTaskQueue; + // create helper threads and prepare per-thread task queue + if (queue->tasks == NULL) { + NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN)); + queue->next = 0; + res->comm = comm; + pthread_mutex_init(&res->threadLock, NULL); + pthread_cond_init(&res->threadCond, NULL); + pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); + } + struct ncclSocketTask* r = queue->tasks+queue->next; + if (r->used == 0) { + r->op = op; + r->data = data; + r->size = size; + r->fd = comm->fds[comm->nextFd]; + r->offset = 0; + r->result = ncclSuccess; + comm->nextFd = (comm->nextFd + 1) % comm->nSocks; + r->used = 1; + *req = r; + pthread_mutex_lock(&res->threadLock); + queue->next = (queue->next+1)%MAX_QUEUE_LEN; + res->state = start; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + return ncclSuccess; + } + WARN("NET/Socket : unable to allocate subtasks"); + return ncclInternalError; +} + +ncclResult_t ncclSocketTest(void* request, int* done, int* size) { + *done = 0; + struct ncclSocketRequest *r = (struct ncclSocketRequest*)request; + if (r == NULL) { + WARN("NET/Socket : test called with NULL request"); + return ncclInternalError; + } + if (r->used == 1) { /* try to send/recv size */ + int data = r->size; + int offset = 0; + NCCLCHECK(socketProgress(r->op, r->ctrlFd, &data, sizeof(int), &offset)); + + if (offset == 0) return ncclSuccess; /* Not 
ready -- retry later */ + + // Not sure we could ever receive less than 4 bytes, but just in case ... + if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, &data, sizeof(int), &offset)); + + // Check size is less or equal to the size provided by the user + if (r->op == NCCL_SOCKET_RECV && data > r->size) { + WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size); + return ncclInternalError; + } + r->size = data; + r->used = 2; // done exchanging size + // divide into subtasks + int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); + int chunkOffset = 0, i = 0; + while (chunkOffset < r->size) { + int chunkSize = std::min(taskSize, r->size-chunkOffset); + NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + chunkOffset += chunkSize; + } + r->nSubs = i; + } + if (r->used == 2) { // already exchanged size + int nCompleted = 0; + for (int i=0; inSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + if (sub->result != ncclSuccess) return sub->result; + if (sub->offset == sub->size) nCompleted++; + } + if (nCompleted == r->nSubs) { + if (size) *size = r->size; + *done = 1; + r->used = 0; + for (int i=0; inSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + sub->used = 0; + } + } + } + return ncclSuccess; +} + +ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { + return (type != NCCL_PTR_HOST) ? 
ncclInternalError : ncclSuccess; +} +ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } + +ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { + struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request)); + return ncclSuccess; +} + +ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { + struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request)); + return ncclSuccess; +} + +ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) { + // We don't support CUDA pointers, so we don't need a flush operation + return ncclInternalError; +} + +ncclResult_t ncclSocketCloseListen(void* opaqueComm) { + struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm; + if (comm) { + if (comm->fd != -1) close(comm->fd); + free(comm); + } + return ncclSuccess; +} + +ncclResult_t ncclSocketClose(void* opaqueComm) { + struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm; + if (comm) { + for (int i=0; inThreads; i++) { + struct ncclSocketThreadResources* res = comm->threadResources+i; + if (comm->helperThread[i]) { + pthread_mutex_lock(&res->threadLock); + res->state = stop; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + pthread_join(comm->helperThread[i], NULL); + } + free(res->threadTaskQueue.tasks); + } + if (comm->ctrlFd != -1) close(comm->ctrlFd); + for (int i=0; inSocks; i++) { + if (comm->fds[i] != -1) close(comm->fds[i]); + } + free(comm); + } + return ncclSuccess; +} + +ncclNet_t ncclNetSocket = { + "Socket", + ncclSocketInit, + ncclSocketDevices, + ncclSocketPciPath, + ncclSocketPtrSupport, + ncclSocketListen, + ncclSocketConnect, + 
ncclSocketAccept, + ncclSocketRegMr, + ncclSocketDeregMr, + ncclSocketIsend, + ncclSocketIrecv, + ncclSocketFlush, + ncclSocketTest, + ncclSocketClose, + ncclSocketClose, + ncclSocketCloseListen +}; diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cu deleted file mode 100644 index b09e2e7234..0000000000 --- a/projects/rccl/src/transport/net_socket.cu +++ /dev/null @@ -1,259 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "nccl.h" -#include "core.h" -#include "socket.h" -#include "net.h" - -#include -#include -#include -#include -#include -#include - -/* Init functions */ -static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; -static union socketAddress ncclNetIfAddrs[MAX_IFS]; -static int ncclNetIfs = -1; -pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; - -ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { - if (ncclNetIfs == -1) { - pthread_mutex_lock(&ncclSocketLock); - if (ncclNetIfs == -1) { - ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); - INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs); - if (ncclNetIfs <= 0) { - WARN("NET/Socket : no interface found"); - return ncclInternalError; - } - } - pthread_mutex_unlock(&ncclSocketLock); - } - return ncclSuccess; -} - -ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) { - *supportedTypes = NCCL_PTR_HOST; - return ncclSuccess; -} - -ncclResult_t ncclSocketDevices(int* ndev) { - *ndev = ncclNetIfs; - return ncclSuccess; -} - -ncclResult_t ncclSocketPciPath(int dev, char** path) { - char devicepath[PATH_MAX]; - snprintf(devicepath, PATH_MAX, 
"/sys/class/net/%s", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); - *path = realpath(devicepath, NULL); - const char* string_virual_network_device_path="/sys/devices/virtual/net/"; - if (*path && !strncmp(*path, string_virual_network_device_path, strlen(string_virual_network_device_path))) - return ncclSuccess; - free(*path); - *path = realpath(devicepath, NULL); - if (*path == NULL) { - INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath); - return ncclSystemError; - } - return ncclSuccess; -} - -static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { - if (dev >= ncclNetIfs) return ncclInternalError; - memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); - return ncclSuccess; -} - -/* Communication functions */ - -struct ncclSocketHandle { - union socketAddress connectAddr; -}; - -struct ncclSocketRequest { - int op; - void* data; - int size; - int fd; - int offset; - int used; -}; - -struct ncclSocketReqs { - struct ncclSocketRequest* requests; -}; - -struct ncclSocketComm { - int fd; - struct ncclSocketReqs reqs; -}; - -ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { - NCCLCHECK(ncclCalloc(comm, 1)); - (*comm)->fd = -1; - return ncclSuccess; -} - -ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) { - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str)); - return ncclSuccess; -} - -ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); - // if dev >= 0, listen based on dev - if (dev >= 0) { - NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr))); - } else if (dev == findSubnetIf) { - // handle stores a remote address - // need to find a local addr that is in the same network as the remote addr - union 
socketAddress localAddr; - char ifName[MAX_IF_NAME_SIZE]; - if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { - WARN("No usable listening interface found"); - return ncclSystemError; - } - // pass the local address back - memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr)); - } // Otherwise, handle stores a local address - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); - *listenComm = comm; - return ncclSuccess; -} - -ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); - *sendComm = comm; - return ncclSuccess; -} - -ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { - struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm; - struct ncclSocketComm* rComm; - NCCLCHECK(ncclSocketNewComm(&rComm)); - struct sockaddr_in sockaddr; - socklen_t socklen = sizeof(struct sockaddr_in); - SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); - *recvComm = rComm; - return ncclSuccess; -} - -#define MAX_REQUESTS 128 - -ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, int op, void* data, int size, int fd, struct ncclSocketRequest** req) { - if (reqs->requests == NULL) { - NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS)); - } - for (int i=0; irequests+i; - if (r->used == 0) { - r->op = op; - r->data = data; - r->size = size; - r->fd = fd; - r->offset = -1; - r->used = 1; - *req = r; - return ncclSuccess; - } - } - WARN("Socket : unable to allocate requests"); - return ncclInternalError; -} - -ncclResult_t ncclSocketTest(void* request, int* done, int* size) { - *done = 0; - struct ncclSocketRequest *r = (struct 
ncclSocketRequest*)request; - if (r == NULL) { - WARN("NET/Socket : test called with NULL request"); - return ncclInternalError; - } - if (r->offset == -1) { /* try to send/recv size */ - int data = r->size; - int offset = 0; - NCCLCHECK(socketProgress(r->op, r->fd, &data, sizeof(int), &offset)); - - if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ - - // Not sure we could ever receive less than 4 bytes, but just in case ... - if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->fd, &data, sizeof(int), &offset)); - - // Check size is less or equal to the size provided by the user - if (r->op == NCCL_SOCKET_RECV && data > r->size) { - WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size); - return ncclInternalError; - } - r->size = data; - r->offset = 0; - } - if (r->offset < r->size) { - NCCLCHECK(socketProgress(r->op, r->fd, r->data, r->size, &r->offset)); - } - if (r->offset == r->size) { - if (size) *size = r->size; - *done = 1; - r->used = 0; - } - return ncclSuccess; -} - -ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) { - if (type != NCCL_PTR_HOST) return ncclInternalError; - struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; - NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request)); - return ncclSuccess; -} - -ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) { - if (type != NCCL_PTR_HOST) return ncclInternalError; - struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; - NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request)); - return ncclSuccess; -} - -ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) { - // We don't support CUDA pointers, so we don't need a flush operation - return ncclInternalError; -} - -ncclResult_t ncclSocketClose(void* 
opaqueComm) { - struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm; - if (comm) { - free(comm->reqs.requests); - close(comm->fd); - free(comm); - } - return ncclSuccess; -} - -ncclNet_t ncclNetSocket = { - "Socket", - ncclSocketInit, - ncclSocketDevices, - ncclSocketPciPath, - ncclSocketPtrSupport, - ncclSocketListen, - ncclSocketConnect, - ncclSocketAccept, - ncclSocketIsend, - ncclSocketIrecv, - ncclSocketFlush, - ncclSocketTest, - ncclSocketClose, - ncclSocketClose, - ncclSocketClose -}; diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cc similarity index 68% rename from projects/rccl/src/transport/p2p.cu rename to projects/rccl/src/transport/p2p.cc index f5ea1f1cbb..61874c9d42 100644 --- a/projects/rccl/src/transport/p2p.cu +++ b/projects/rccl/src/transport/p2p.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information @@ -11,26 +11,16 @@ #include "transport.h" #include "param.h" #include -#include -#include -#include "nvmlwrap.h" +#include #include -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #include "nvlink_stub.h" +#include +#include #else #include "nvlink.h" #endif -extern bool useFineGrainVramPcie; - -struct p2pInfo { - int rank; - int cudaDev; - uint64_t hostHash; - uint64_t pidHash; - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; -}; - struct p2pConnectInfo { int direct; union { @@ -39,41 +29,45 @@ struct p2pConnectInfo { }; }; +struct p2pSendResources { + struct ncclSendMem* devMem; + void* ipcPtr; + uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) +}; + +struct p2pRecvResources { + struct ncclRecvMem* devMem; + void* ipcPtr; +}; + #include -/* Fill information necessary to exchange between ranks to choose whether or not - * to use this transport */ -ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) { - struct p2pInfo* info = (struct p2pInfo*)opaqueInfo; - static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large"); - info->rank = rank; - CUDACHECK(hipGetDevice(&info->cudaDev)); - info->hostHash=getHostHash()+commHash; - info->pidHash=getPidHash()+commHash; - - // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the - // cudaDev is a CUDA runtime dev number which could be different from the - // NVML device number. Then we get the busID from NVML to be sure it is - // consistent with NVML remote PCI bus Ids. 
- CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) -#else - nvmlDevice_t nvmlDevice; - NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); - nvmlPciInfo_t pciInfo; - NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); - strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); -#endif - return ncclSuccess; -} - NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2); NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2); +extern bool useFineGrainVramPcie; + +/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ +static int busIdToCudaDev(const char* busId) { + int ndev; + if (hipGetDeviceCount(&ndev) != hipSuccess) + return -1; + for (int i = 0; i < ndev; i++) { + char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + if (hipDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != hipSuccess) + return -1; + if (strcmp(busId, devBusId) == 0) { + return i; + } + } + // BusId was not found in our locally visible CUDA devices + return -1; +} + /* Determine if we can communicate with the peer through p2p */ -ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { +ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { // Do not use P2P across root complexes by default (provided CUDA permits it) - int p2pLevel = PATH_SOC; + int p2pLevel = PATH_NODE; if (ncclParamP2pDisable() == 1) p2pLevel = 0; if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); @@ -81,29 +75,44 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin if (p2pLevel == 0) return ncclSuccess; - struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo; - struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo; - // Rule out different nodes if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess; + // 
Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); + if (peerCudaDev == -1) { + // Peer's CUDA device is not visible in this process +#if CUDART_VERSION >= 10010 + // But in CUDA 10.1 we can still communicate with 'invisible' devices + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId); + // Check for NVLink/NVswitch including P2P access + int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); + if (nvlinkp2p > 0) { + *ret = nvlinkp2p; + return ncclSuccess; + } +#endif + return ncclSuccess; + } + + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); + // Do not detect topology if we're on the same GPU. Note this is not really supported. - if (myInfo->cudaDev == peerInfo->cudaDev) { - *ret = 1 + PATH_SOC; + if (myInfo->cudaDev == peerCudaDev) { + *ret = 1 + PATH_SYS; return ncclSuccess; } // See if CUDA can do P2P int p2p; - if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != hipSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d", - myInfo->cudaDev, peerInfo->cudaDev); + if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != hipSuccess) { + INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)", + myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); return ncclSuccess; } - if (p2p == 0) return ncclSuccess; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) uint32_t link_type, hops; if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) { p2p = 0; @@ -124,9 +133,8 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin if 
(!useFineGrainVramPcie) return ncclSuccess; } - #else - // Check for NVLink/NVswitch +// Check for NVLink/NVswitch int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); #endif if (nvlinkp2p > 0) { @@ -138,11 +146,11 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin char* myPath; char* peerPath; ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath); - ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath); + ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath); if (err1 == ncclSuccess && err2 == ncclSuccess) { int distance = pciDistance(myPath, peerPath); if (distance < p2pLevel) { - *ret = 1 + PATH_SOC - distance; + *ret = 1 + PATH_SYS - distance; } } if (err1 == ncclSuccess) free(myPath); @@ -150,6 +158,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin return ncclSuccess; } +#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway +#define MAXGPUS_PCI 64 + static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) { int nrings = 0; ncclTvalue_t* line = matrix+current*n; @@ -177,7 +188,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR } } } else { - int ringsSave[nRingsMax*n]; + int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P]; int maxStep = 0; for (int i=0; i 0) { @@ -210,8 +221,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) { if (nrings == 0) return 0; // Copy rings by dup times - if (newNrings > MAXRINGS) { - newNrings = MAXRINGS; + if (newNrings > MAXCHANNELS) { + newNrings = MAXCHANNELS; } for (int r=nrings; r MAXRINGS) { - WARN("Max rings reached, limiting to %d", MAXRINGS); - nrings = MAXRINGS; + if (nrings > MAXCHANNELS) { + WARN("Max rings reached, limiting to %d", MAXCHANNELS); + nrings = MAXCHANNELS; } 
// Find existing constraints / connections int connect = 0; @@ -275,9 +285,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin if (compNrings && compNrings < nrings && nranks <= 4) { // Try to oversubscribe to get a better result - int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks); - if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; } - for (int i=0; i compNrings*2) { @@ -289,13 +299,12 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin } // Duplicate the rings for direct NVLink -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) compNrings = copyRings(nranks, rings, compNrings, compNrings*3); #else compNrings = copyRings(nranks, rings, compNrings, compNrings*2); #endif - if (ncclCudaCompCap() == 6) *nthreads /= 2; return compNrings; } @@ -341,9 +350,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin } static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) { - for (int score = PATH_SOC+1; score >= minScore; score--) { + for (int score = PATH_SYS+1; score >= minScore; score--) { int best = -1; - int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end + int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end for (int n = 0; n < nranks; n++) { if (inRing[n]) continue; if (values[rank*nranks+n] == score) { @@ -365,7 +374,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int start = findConnect(nranks, prev+r*nranks); int end = findConnect(nranks, next+r*nranks); - int inRing[nranks]; + int inRing[MAXGPUS_PCI]; for (int i=0; i 0) { // NVLink : Connect rings or create new ones + if (nranks > MAXGPUS_NVLINKP2P) { + WARN("Recursive P2P computation cannot work for >8 GPUs"); + return ncclInternalError; + } 
nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads); goto end; } @@ -486,48 +499,59 @@ end: } while (0) /* Send: Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo; - struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo; - struct p2pConnectInfo info; +ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + struct p2pSendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + send->transportResources = resources; + int sendSize = sizeof(struct ncclSendMem); + ALIGN_SIZE(sendSize, CUDA_IPC_MIN); + NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true)); + uint32_t linktype, hops; if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank); + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank); return ncclInternalError; } if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) { - CUDACHECK(hipDeviceGetAttribute((int*)&ring->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->next_hdp_reg); + CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg); } + else + resources->next_hdp_reg = 0; + + struct p2pConnectInfo info; if 
(myInfo->pidHash == peerInfo->pidHash) { info.direct = 1; - info.directPtr = ring->devMemSend; + info.directPtr = resources->devMem; if (myInfo->cudaDev == peerInfo->cudaDev) { - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank); + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank); } else { // Enable P2P access hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0); if (err == hipErrorPeerAccessAlreadyEnabled) { hipGetLastError(); } else if (err != hipSuccess) { - WARN("failed to peer with device %d: %d %s", - peerInfo->cudaDev, err, hipGetErrorString(err)); + WARN("failed to peer with device %d(=%d): %d %s", + peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err)); return ncclInternalError; } INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", - ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); } } else { + // Convert the peer's busId into a local cudaDev index (cf. 
CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); info.direct = 0; // Map IPC and enable P2P access - hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend); + hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); if (err != hipSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s", - myInfo->rank, peerInfo->cudaDev, err, hipGetErrorString(err)); + WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err)); return ncclInternalError; } INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", - ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -536,13 +560,20 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo } /* Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo; - struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo; +ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) { + + struct p2pRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + ALIGN_SIZE(recvSize, CUDA_IPC_MIN); + NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true)); + struct p2pConnectInfo info; if (myInfo->pidHash == peerInfo->pidHash) { info.direct = 1; - 
info.directPtr = ring->devMemRecv; + info.directPtr = resources->devMem; if (myInfo->cudaDev == peerInfo->cudaDev) { TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); } else { @@ -551,22 +582,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo if (err == hipErrorPeerAccessAlreadyEnabled) { hipGetLastError(); } else if (err != hipSuccess) { - WARN("failed to peer with device %d: %d %s", - peerInfo->cudaDev, err, hipGetErrorString(err)); + WARN("failed to peer with device %d(=%d): %d %s", + peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err)); return ncclInternalError; } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); } } else { + // Convert the peer's busId into a local cudaDev index (cf. 
CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); info.direct = 0; // Map IPC and enable P2P access - hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv); + hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); if (err != hipSuccess) { - WARN("rank %d failed to get HIP IPC handle to device %d : %d %s", - myInfo->rank, peerInfo->cudaDev, err, hipGetErrorString(err)); + WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err)); return ncclInternalError; } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -576,22 +609,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo /* Connect/Send to this peer */ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { - void** resources = &send->transportResources; + struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; struct ncclRecvMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; if (info->direct) { remDevMem = (struct ncclRecvMem*)(info->directPtr); send->conn.direct = 1; - *resources = NULL; } else { - void* remPtr = NULL; //TRACE_DUMP_IPC(&info->devIpc); - hipError_t err = hipIpcOpenMemHandle(&remPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess); - void** ipcPtrSave; - NCCLCHECK(ncclCalloc(&ipcPtrSave, 1)); - *resources = ipcPtrSave; - *ipcPtrSave = remPtr; - remDevMem = (struct ncclRecvMem*)remPtr; + hipError_t err = 
hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess); + remDevMem = (struct ncclRecvMem*)resources->ipcPtr; if (err != hipSuccess) { WARN("failed to open CUDA IPC handle : %d %s", err, hipGetErrorString(err)); @@ -602,30 +629,27 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC send->conn.buff = remDevMem->buff; send->conn.llBuff = remDevMem->llBuff; send->conn.tail = &remDevMem->tail; - send->conn.opCount = &remDevMem->opCount; - // send->conn->head should have been set to devMemSend already + send->conn.opCountRem = &remDevMem->opCount; + send->conn.head = &resources->devMem->head; + send->conn.ptrExchange = &resources->devMem->ptrExchange; + send->conn.opCountLoc = &resources->devMem->opCount; + send->conn.next_hdp_reg = resources->next_hdp_reg; return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { - void** resources = &recv->transportResources; + struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; struct ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; if (info->direct) { remDevMem = (struct ncclSendMem*)(info->directPtr); recv->conn.direct = 1; recv->conn.ptrExchange = &remDevMem->ptrExchange; - *resources = NULL; } else { - void* remPtr = NULL; //TRACE_DUMP_IPC(&info->devIpc); - hipError_t err = hipIpcOpenMemHandle(&remPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess); - void** ipcPtrSave; - NCCLCHECK(ncclCalloc(&ipcPtrSave, 1)); - *resources = ipcPtrSave; - *ipcPtrSave = remPtr; - remDevMem = (struct ncclSendMem*)remPtr; + hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess); + remDevMem = (struct ncclSendMem*)resources->ipcPtr; if (err != hipSuccess) { WARN("failed to open CUDA IPC handle : %d %s", err, hipGetErrorString(err)); @@ -633,28 +657,37 @@ ncclResult_t 
p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto } } - // recv->conn->buff should have been set to devMemRecv already - // recv->conn->tail should have been set to devMemRecv already - // recv->conn->opCount should have been set to devMemRecv already + recv->conn.buff = resources->devMem->buff; + recv->conn.llBuff = resources->devMem->llBuff; + recv->conn.tail = &resources->devMem->tail; + recv->conn.opCountLoc = &resources->devMem->opCount; recv->conn.head = &remDevMem->head; - recv->conn.llHead = &remDevMem->llHead; + recv->conn.opCountRem = &remDevMem->opCount; return ncclSuccess; } -ncclResult_t p2pFree(void* resources) { - if (resources != NULL) { - void** ipcPtrSave = (void**) resources; - CUDACHECK(hipIpcCloseMemHandle(*ipcPtrSave)); - free(resources); - } +ncclResult_t p2pSendFree(void* resources) { + struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; + if (sendRes->ipcPtr) + CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr)); + CUDACHECK(hipFree(sendRes->devMem)); + free(sendRes); + return ncclSuccess; +} + +ncclResult_t p2pRecvFree(void* resources) { + struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; + if (recvRes->ipcPtr) + CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr)); + CUDACHECK(hipFree(recvRes->devMem)); + free(recvRes); return ncclSuccess; } struct ncclTransport p2pTransport = { "P2P", - p2pFillInfo, p2pCanConnect, p2pGetRings, - { p2pSendSetup, p2pSendConnect, p2pFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } }; diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cc similarity index 69% rename from projects/rccl/src/transport/shm.cu rename to projects/rccl/src/transport/shm.cc index 0ba168b2bf..730a8604b8 100644 --- a/projects/rccl/src/transport/shm.cu +++ b/projects/rccl/src/transport/shm.cc @@ -1,5 +1,5 @@ 
/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -11,26 +11,13 @@ #include "param.h" #include "shm.h" #include -#include +#include -struct shmInfo { - int rank; - int cudaDev; - uint64_t hostHash; - uint64_t pidHash; -}; - -struct shmSendConnectInfo { +struct shmConnectInfo { uint64_t pidHash; int id; - int rank; - int shmSize; -}; - -struct shmRecvConnectInfo { - uint64_t pidHash; - int id; - int rank; + int sendRank; + int recvRank; int shmSize; }; @@ -52,24 +39,10 @@ struct shmRecvResources { struct ncclRecvMem* devHostMem; }; -/* Fill information necessary to exchange between ranks to choose whether or not - * to use this transport */ -ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) { - struct shmInfo* info = (struct shmInfo*)opaqueInfo; - static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large"); - info->rank = rank; - CUDACHECK(hipGetDevice(&info->cudaDev)); - info->hostHash=getHostHash()+commHash; - info->pidHash=getPidHash()+commHash; - return ncclSuccess; -} - NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); /* Determine if we can communicate with the peer */ -ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { - struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo; - struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo; +ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 
0 : 1; return ncclSuccess; } @@ -88,11 +61,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) return -1; } +#define MAXGROUPS 16 + ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - if (*nringsRet == MAXRINGS) *nringsRet = 1; + if (*nringsRet == MAXCHANNELS) *nringsRet = 1; int nGroups = groups[nranks-1] + 1; - int starts[nGroups]; - int ends[nGroups]; + int starts[MAXGROUPS]; + int ends[MAXGROUPS]; for (int ring = 0; ring<*nringsRet; ring++) { int startGroup = -1, endGroup = -1; for (int group = 0; groupsend.transportResources = resources; + send->transportResources = resources; + + struct shmConnectInfo info; + info.id = channelId; + info.pidHash = myInfo->pidHash; + info.sendRank = myInfo->rank; + info.recvRank = peerInfo->rank; - struct shmRecvConnectInfo info; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); + sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); - info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; - static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo)); + INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + 
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); + memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } -ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo; +ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); - ring->recv.transportResources = resources; + recv->transportResources = resources; - struct shmSendConnectInfo info; + struct shmConnectInfo info; + info.id = channelId; + info.pidHash = myInfo->pidHash; + info.sendRank = peerInfo->rank; + info.recvRank = myInfo->rank; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); - info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; + sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); + info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; - static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo)); + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); + memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } /* Connect to this peer */ ncclResult_t 
shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { // Setup device pointers - struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank); + sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); resources->remShmSize = info->shmSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); @@ -217,31 +196,31 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto send->conn.buff = resources->devRemHostMem->buff; send->conn.llBuff = resources->devRemHostMem->llBuff; send->conn.tail = &resources->devRemHostMem->tail; - send->conn.opCount = &resources->devRemHostMem->opCount; + send->conn.opCountRem = &resources->devRemHostMem->opCount; send->conn.head = &resources->devHostMem->head; - send->conn.llHead = &resources->devHostMem->llHead; + send->conn.opCountLoc = &resources->devHostMem->opCount; return ncclSuccess; } ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; - struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank); + sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); resources->remShmSize = info->shmSize; TRACE(NCCL_SHM,"Open shmName %s 
shmSize %d", shmName, info->shmSize); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(shmUnlink(shmName)); recv->conn.head = &resources->devRemHostMem->head; - recv->conn.llHead = &resources->devRemHostMem->llHead; + recv->conn.opCountRem = &resources->devRemHostMem->opCount; recv->conn.buff = resources->devHostMem->buff; recv->conn.llBuff = resources->devHostMem->llBuff; recv->conn.tail = &resources->devHostMem->tail; - recv->conn.opCount = &resources->devHostMem->opCount; + recv->conn.opCountLoc = &resources->devHostMem->opCount; return ncclSuccess; } @@ -263,7 +242,6 @@ ncclResult_t shmRecvFree(void* transportResources) { struct ncclTransport shmTransport = { "SHM", - shmFillInfo, shmCanConnect, shmGetRings, { shmSendSetup, shmSendConnect, shmSendFree, NULL }, diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index 86709ee761..d8fef68c17 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -51,6 +51,8 @@ if(BUILD_TESTS) test_ReduceScatter.cpp test_GroupCalls.cpp test_CombinedCalls.cpp + test_AllReduceAbort.cpp + test_BroadcastAbort.cpp ) add_executable(UnitTests ${TEST_SOURCES}) diff --git a/projects/rccl/test/test_AllReduceAbort.cpp b/projects/rccl/test/test_AllReduceAbort.cpp new file mode 100644 index 0000000000..9400bd84fc --- /dev/null +++ b/projects/rccl/test/test_AllReduceAbort.cpp @@ -0,0 +1,150 @@ +/************************************************************************* + * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "test_AllReduceAbort.hpp" +#include "../include/core.h" +#include + +#define NUM_ITER 8 +#define FAKE_OP_COUNT NUM_ITER+1 + +namespace CorrectnessTests +{ + #define HIPCHECK(cmd) \ + do { \ + hipError_t error = (cmd); \ + if (error != hipSuccess) { \ + std::cerr << "Encountered HIP error (" << error << ") at line " \ + << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while (0) + + #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) + #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) + + TEST_P(AllReduceAbortTest, Correctness) { + if (numDevices > numDevicesAvailable) return; + + // Prepare input / output / expected results + Dataset dataset; + dataset.Initialize(numDevices, numElements, dataType, inPlace); + FillDatasetWithPattern(dataset); + + int gpu = 0; // GPU number to trigger abort + ncclComm_t comm = comms[gpu]; + + HIPCHECK(hipSetDevice(gpu)); + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + struct ncclChannel* channel = comm->channels; + struct ncclRing *ring = &channel->ring; + struct ncclConnector* send = &channel->peers[ring->next].send; + size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers; + size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers; + uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset; + uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset; + uint64_t *real_opCount, *fake_opCount, *fake_o; + uint64_t *real_head, *fake_head, *fake_h; + + // get original opCount and head + HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipStreamSynchronize(stream)); + // allocate 
and install fakes + HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t*), hipHostMallocMapped)); + HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream)); + *fake_opCount = FAKE_OP_COUNT; + HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped)); + HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream)); + *fake_head = 0; + HIPCHECK(hipStreamSynchronize(stream)); + // read back fakes to confirm + HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipStreamSynchronize(stream)); + //std::cerr << "[ ] replaced gpu " << gpu << " real_opCount = " << real_opCount << " to fake_opCount = " << fake_o << std::endl; + //std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl; + + // Perform a number of iterations and introduce abort + for (int j = 0; j < NUM_ITER; j++) { + //std::cerr << "[ ] iter = " << j << std::endl; + // Start a group call + ncclGroupStart(); + for (int i = 0; i < numDevices; i++) { + ncclAllReduce(dataset.inputs[i], dataset.outputs[i], + numElements, dataType, op, comms[i], streams[i]); + } + // Signal end of group call + ncclGroupEnd(); + } + + // Wait for reduction to complete + auto start = std::chrono::high_resolution_clock::now(); + hipError_t hipErr; + int remaining = numDevices; + int* done = (int*)malloc(sizeof(int)*numDevices); + memset(done, 0, sizeof(int)*numDevices); + bool timeout = false, abort_called = false; + while (remaining) { + int idle = 1; + for (int i=0; i= 2 + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + if (deltaSec > 10.0 && !timeout) { + std::cerr << "[ ] timeout 
condition, calling ncclCommAbort ... " << std::endl; + timeout = true; + } + ncclResult_t ncclAsyncErr; + ncclCommGetAsyncError(comms[i], &ncclAsyncErr); + if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl; + for (int i=0; i + +#define NUM_ITER 8 +#define FAKE_OP_COUNT NUM_ITER+1 + +namespace CorrectnessTests +{ + #define HIPCHECK(cmd) \ + do { \ + hipError_t error = (cmd); \ + if (error != hipSuccess) { \ + std::cerr << "Encountered HIP error (" << error << ") at line " \ + << __LINE__ << " in file " << __FILE__ << "\n"; \ + exit(-1); \ + } \ + } while (0) + + #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) + #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) + + TEST_P(BroadcastAbortTest, Correctness) { + if (numDevices > numDevicesAvailable) return; + + // Prepare input / output / expected results + Dataset dataset; + dataset.Initialize(numDevices, numElements, dataType, inPlace); + FillDatasetWithPattern(dataset); + + int root = 0; + int gpu = 0; // GPU number to trigger abort + ncclComm_t comm = comms[gpu]; + + HIPCHECK(hipSetDevice(gpu)); + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + struct ncclChannel* channel = comm->channels; + struct ncclRing *ring = &channel->ring; + struct ncclConnector* send = &channel->peers[ring->next].send; + size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers; + size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers; + uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset; + uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset; + uint64_t *real_opCount, *fake_opCount, *fake_o; + uint64_t *real_head, *fake_head, *fake_h; + + // get original opCount and head + HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, 
sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipStreamSynchronize(stream)); + // allocate and install fakes + HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t*), hipHostMallocMapped)); + HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream)); + *fake_opCount = FAKE_OP_COUNT; + HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped)); + HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream)); + *fake_head = 0; + HIPCHECK(hipStreamSynchronize(stream)); + // read back fakes to confirm + HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream)); + HIPCHECK(hipStreamSynchronize(stream)); + //std::cerr << "[ ] replaced gpu " << gpu << " real_opCount = " << real_opCount << " to fake_opCount = " << fake_o << std::endl; + //std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl; + + // Perform a number of iterations and introduce abort + for (int j = 0; j < NUM_ITER; j++) { + //std::cerr << "[ ] iter = " << j << std::endl; + // Start a group call + ncclGroupStart(); + for (int i = 0; i < numDevices; i++) { + ncclBroadcast(dataset.inputs[i], + dataset.outputs[i], + numElements, dataType, + root, comms[i], streams[i]); + } + // Signal end of group call + ncclGroupEnd(); + } + + // Wait for reduction to complete + auto start = std::chrono::high_resolution_clock::now(); + hipError_t hipErr; + int remaining = numDevices; + int* done = (int*)malloc(sizeof(int)*numDevices); + memset(done, 0, sizeof(int)*numDevices); + bool timeout = false, abort_called = false; + while (remaining) { + int idle = 1; + for (int i=0; i= 2 + #if NCCL_VERSION_CODE >= 
NCCL_VERSION(2,4,0) + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + if (deltaSec > 10.0 && !timeout) { + std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl; + timeout = true; + } + ncclResult_t ncclAsyncErr; + ncclCommGetAsyncError(comms[i], &ncclAsyncErr); + if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl; + for (int i=0; i -struct MULTI128 { - __device__ void operator()(Pack128& x, Pack128& y) { - x.x = MULTI()(x.x, y.x); - x.y = MULTI()(x.y, y.y); - } -}; - -inline __device__ void Fetch128(Pack128& v, Pack128* p) { - v.x = p->x; - v.y = p->y; -} - -inline __device__ void Store128(Pack128* p, Pack128& v) { - p->x = v.x; - p->y = v.y; -} - -#define WARP_SIZE 32 -template -__attribute__((noinline)) -__device__ inline void ReduceCopy128b( const int w, const int nw, const int t, - Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1, - const int N) { - Pack128 t0[UNROLL]; - Pack128 t1[UNROLL]; - const Pack128* src0_end = src0 + N; - const int inc = nw * UNROLL * WARP_SIZE; - const int offset = w * UNROLL * WARP_SIZE + t; - src0 += offset; if (TWO_INPUTS) src1 += offset; - dest0 += offset; if (TWO_OUTPUTS) dest1 += offset; - - while (src0 < src0_end) { -#pragma unroll - for (int u = 0; u < UNROLL; ++u) { - Fetch128(t0[u], src0+u*WARP_SIZE); - if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE); - } -#pragma unroll - for (int u = 0; u < UNROLL; ++u) { - if (TWO_INPUTS) MULTI128()(t0[u], t1[u]); - Store128(dest0+u*WARP_SIZE, t0[u]); - if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]); - } - src0 += inc; if (TWO_INPUTS) src1 += inc; - dest0 += inc; if (TWO_OUTPUTS) dest1 += inc; - } -} - -template -__attribute__((noinline)) -__device__ inline void ReduceOrCopy(const int tid, const 
int nthreads, - volatile T * __restrict__ dest0, volatile T * __restrict__ dest1, - const volatile T * __restrict__ src0, const volatile T * __restrict__ src1, - int N) { - int Nrem = N; - if (Nrem <= 0) return; - - int Npreamble = (Nrem(tid, nthreads, src0, src1, dest0, dest1, Npreamble); - - Nrem -= Npreamble; - if (Nrem == 0) return; - - dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; } - src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; } - - // stage 2: fast path: use 128b loads/stores to do the bulk of the work, - // assuming the pointers we have are all 128-bit alignable. - int w = tid / WARP_SIZE; // Warp number - int nw = nthreads / WARP_SIZE; // Number of warps - int t = tid % WARP_SIZE; // Thread (inside the warp) - - const int PackFactor = sizeof(Pack128) / sizeof(T); - - // stage 2a: main loop - int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads)) - * (UNROLL * nthreads); // round down - - ReduceCopy128b(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a); - - int Ndone2a = Nalign2a * PackFactor; - Nrem -= Ndone2a; - if (Nrem == 0) return; - dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; } - src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; } - - // stage 2b: slightly less optimized for section when we don't have full - // UNROLLs - - int Nalign2b = Nrem / PackFactor; - - ReduceCopy128b(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b); - - int Ndone2b = Nalign2b * PackFactor; - Nrem -= Ndone2b; - if (Nrem == 0) return; - dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; } - src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; } - - // stage 2c: tail - ReduceCopy(tid, nthreads, src0, src1, dest0, dest1, Nrem); -} - template struct FuncPassA { __device__ T operator()(const T x, const T y) const { @@ -217,6 +98,160 @@ struct MULTI { } }; + +typedef ulong2 Pack128; + +template +struct MULTI128 { + __device__ void operator()(Pack128& x, Pack128& y) { + x.x = 
MULTI()(x.x, y.x); + x.y = MULTI()(x.y, y.y); + } +}; + +inline __device__ void Fetch128(Pack128& v, const Pack128* p) { + v.x = p->x; + v.y = p->y; +} +inline __device__ void Store128(Pack128* p, Pack128& v) { + p->x = v.x; + p->y = v.y; +} + +template +__device__ void ReduceCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], + const int offset, const int N) { + for (int idx = offset+tid; idx < offset+N; idx += nthreads) { + T val = vFetch(srcs[0]+idx); + #pragma unroll + for (int i=1; i +__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t, + int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], + const int elemOffset, const int Npack) { + const int inc = nw * UNROLL * WARP_SIZE; + int offset = w * UNROLL * WARP_SIZE + t; + + const Pack128* srcs[MAXSRCS]; + for (int i=0; i()(vals[u], vals2[u]); + } + #pragma unroll 1 + for (int i=MINSRCS; i()(vals[u], vals2[u]); + } + + // Store + for (int i = 0; i < MINDSTS; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + #pragma unroll 1 + for (int i=MINDSTS; i +__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } + +// Try to limit consecutive load/stores to 8. +// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise +#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) + +template +__device__ void ReduceOrCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], + int N) { + int Nrem = N; + if (Nrem <= 0) return; + + int alignDiff = 0; + int align = ptrAlign128(srcs[0]); + #pragma unroll + for (int i=1; i(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble); + Nrem -= Npreamble; + if (Nrem == 0) return; + } + int offset = Npreamble; + + // stage 2: fast path: use 128b loads/stores to do the bulk of the work, + // assuming the pointers we have are all 128-bit alignable. 
+ int w = tid / WARP_SIZE; // Warp number + int nw = nthreads / WARP_SIZE; // Number of warps + int t = tid % WARP_SIZE; // Thread (inside the warp) + + const int packFactor = sizeof(Pack128) / sizeof(T); + + // stage 2a: main loop + int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE)) + * (AUTOUNROLL * WARP_SIZE); // round down + int Nelem2a = Npack2a * packFactor; + + ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a); + + Nrem -= Nelem2a; + if (Nrem == 0) return; + offset += Nelem2a; + + // stage 2b: slightly less optimized for section when we don't have full + // unrolling + + int Npack2b = Nrem / packFactor; + int Nelem2b = Npack2b * packFactor; + + ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b); + + Nrem -= Nelem2b; + if (Nrem == 0) return; + offset += Nelem2b; + + // stage 2c: tail + ReduceCopyMulti(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem); +} + // Assumptions: // - there is exactly 1 block // - THREADS is the number of producer threads @@ -224,24 +259,38 @@ struct MULTI { template __device__ void Copy(volatile T * __restrict__ const dest, const volatile T * __restrict__ const src, const int N) { - ReduceOrCopy, T, false, false>(threadIdx.x, THREADS, - dest, nullptr, src, nullptr, N); + const T* srcs[2]; + T* dsts[2]; + srcs[0] = (const T*)src; + dsts[0] = (T*)dest; + ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, + 1, srcs, 1, dsts, N); } template __device__ void DoubleCopy(volatile T * __restrict__ const dest0, volatile T * __restrict__ const dest1, const volatile T * __restrict__ const src, const int N) { - ReduceOrCopy, T, true, false>(threadIdx.x, THREADS, - dest0, dest1, src, nullptr, N); + const T* srcs[2]; + T* dsts[2]; + srcs[0] = (const T*)src; + dsts[0] = (T*)dest0; + dsts[1] = (T*)dest1; + ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, + 1, srcs, 2, dsts, N); } template __device__ void Reduce(volatile T * __restrict__ const dest, const volatile T * 
__restrict__ const src0, const volatile T * __restrict__ const src1, const int N) { - ReduceOrCopy, T, false, true>(threadIdx.x, THREADS, - dest, nullptr, src0, src1, N); + const T* srcs[2]; + T* dsts[2]; + srcs[0] = (const T*)src0; + srcs[1] = (const T*)src1; + dsts[0] = (T*)dest; + ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, + 2, srcs, 1, dsts, N); } template @@ -249,7 +298,13 @@ __device__ void ReduceCopy(volatile T * __restrict__ const dest0, volatile T * __restrict__ const dest1, const volatile T * __restrict__ const src0, const volatile T * __restrict__ const src1, const int N) { - ReduceOrCopy, T, true, true>(threadIdx.x, THREADS, - dest0, dest1, src0, src1, N); + const T* srcs[2]; + T* dsts[2]; + srcs[0] = (const T*)src0; + srcs[1] = (const T*)src1; + dsts[0] = (T*)dest0; + dsts[1] = (T*)dest1; + ReduceOrCopyMulti, T, 1, 2, 1, 2>(threadIdx.x, THREADS, + 2, srcs, 2, dsts, N); } #endif // COPY_KERNEL_H_ diff --git a/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp b/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp index 57d3bc6bfd..0e0fe3c63c 100644 --- a/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp +++ b/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp @@ -427,10 +427,12 @@ int main(int argc,char* argv[]) sizeof(struct profiling_data_t), hipMemcpyDeviceToHost, stream[i])); HIPCHECK(hipStreamSynchronize(stream[i])); + int next_gpu = findNextGpu(ring_0, i, nGpu); uint32_t linktype; uint32_t hopcount; HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu , &linktype, &hopcount)); + hipDeviceProp_t prop; HIPCHECK(hipGetDeviceProperties(&prop, i)); if(prop.gcnArch == 906 ) { @@ -441,11 +443,11 @@ int main(int argc,char* argv[]) double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_MI100)/(double)workgroups; fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n", i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, 
(double)profiling_data[i]->bytes_transferred/(t0*1.0E9)); - } else { + } else { double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_DEFAULT)/(double)workgroups; fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n", i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9)); - } + } } std::cout<<"***Application Level Transfer Profiling Data***"<bytes_transferred) / (deltaSec*1.0E9);