Merge branch 'master' into xgmi_bench

[ROCm/rccl commit: deea20d49c]
2019-08-16 10:56:56 +05:30
@@ -55,8 +55,16 @@ else()
 endif()

 # Setup VERSION
-set(VERSION_STRING "2.6.0")
-rocm_setup_version(VERSION ${VERSION_STRING})
+set(VERSION_STRING "2.6.0.")
+
+# Check if BUILD_NUMBER is defined in a Jenkins environment
+if($ENV{BUILD_NUMBER})
+  string(CONCAT BUILD_VERSION ${VERSION_STRING} $ENV{BUILD_NUMBER})
+else()
+  string(CONCAT BUILD_VERSION ${VERSION_STRING} "0")
+endif()
+
+rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION)

 list(APPEND CMAKE_PREFIX_PATH
            /opt/rocm
@@ -79,27 +87,12 @@ include_directories(src/collectives)
 include_directories(src/collectives/device)

 set(CU_SOURCES
-    src/bootstrap.cu
-    src/collectives/all_gather.cu
-    src/collectives/all_reduce.cu
-    src/collectives/broadcast.cu
-    src/collectives/reduce.cu
-    src/collectives/reduce_scatter.cu
-    src/collectives/device/functions.cu
-    src/init.cu
-    src/misc/enqueue.cu
-    src/misc/group.cu
-    src/misc/ibvwrap.cu
-    src/misc/nvmlwrap_stub.cu
-    src/misc/rings.cu
-    src/misc/utils.cu
-    src/ring.cu
-    src/transport.cu
-    src/transport/net.cu
-    src/transport/net_ib.cu
-    src/transport/net_socket.cu
-    src/transport/p2p.cu
-    src/transport/shm.cu)
+    src/collectives/device/all_reduce.cu
+    src/collectives/device/all_gather.cu
+    src/collectives/device/reduce.cu
+    src/collectives/device/broadcast.cu
+    src/collectives/device/reduce_scatter.cu
+    src/collectives/device/functions.cu)

 set(CPP_SOURCES)
 foreach(filename ${CU_SOURCES})
@@ -111,20 +104,34 @@ foreach(filename ${CU_SOURCES})
  list(APPEND CPP_SOURCES ${cpp_filename})
 endforeach(filename)

-list(APPEND CPP_SOURCES src/collectives/device/all_gather_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_1.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_2.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_3.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/broadcast_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_1.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_2.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_3.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_1.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_2.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_3.cpp)
+set(CC_SOURCES
+    src/init.cc
+    src/collectives/all_reduce.cc
+    src/collectives/all_gather.cc
+    src/collectives/reduce.cc
+    src/collectives/broadcast.cc
+    src/collectives/reduce_scatter.cc
+    src/channel.cc
+    src/misc/trees.cc
+    src/misc/rings.cc
+    src/misc/argcheck.cc
+    src/misc/group.cc
+    src/misc/utils.cc
+    src/misc/ibvwrap.cc
+    src/misc/nvmlwrap_stub.cc
+    src/misc/topo.cc
+    src/transport/net.cc
+    src/transport/net_ib.cc
+    src/transport/net_socket.cc
+    src/transport/p2p.cc
+    src/transport/shm.cc
+    src/transport.cc
+    src/bootstrap.cc
+    src/enqueue.cc)
+
+foreach(filename ${CC_SOURCES})
+  list(APPEND CPP_SOURCES ${filename})
+endforeach(filename)

 add_library(rccl ${CPP_SOURCES})

@@ -132,18 +139,20 @@ if(TRACE)
  add_definitions(-DENABLE_TRACE)
 endif()

+if(PROFILE)
+  add_definitions(-DENABLE_PROFILING)
+endif()
+
 target_link_libraries(rccl
  PRIVATE --amdgpu-target=gfx803
  PRIVATE --amdgpu-target=gfx900
-  PRIVATE --amdgpu-target=gfx906
-  PRIVATE --amdgpu-target=gfx908)
+  PRIVATE --amdgpu-target=gfx906)

 if("${HIP_COMPILER}" MATCHES "clang")
  target_compile_options(rccl
    PRIVATE --amdgpu-target=gfx803
    PRIVATE --amdgpu-target=gfx900
    PRIVATE --amdgpu-target=gfx906
-    PRIVATE --amdgpu-target=gfx908
    PRIVATE -fgpu-rdc)
  target_link_libraries(rccl PRIVATE -fgpu-rdc)
  target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
@@ -80,7 +80,7 @@ rcclCI:
                      sudo dpkg -i package/*.deb
                      """

-        
+
        //platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
    }

@@ -1,5 +1,5 @@

- Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.

 Redistribution and use in source and binary forms, with or without
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -162,7 +162,7 @@ FULL_PATH_NAMES        = YES
 # will be relative from the directory where doxygen is started.
 # This tag requires that the tag FULL_PATH_NAMES is set to YES.

-STRIP_FROM_PATH        = 
+STRIP_FROM_PATH        =

 # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
 # path mentioned in the documentation of a class, which tells the reader which
@@ -171,7 +171,7 @@ STRIP_FROM_PATH        =
 # specify the list of include paths that are normally passed to the compiler
 # using the -I flag.

-STRIP_FROM_INC_PATH    = 
+STRIP_FROM_INC_PATH    =

 # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
 # less readable) file names. This can be useful is your file systems doesn't
@@ -238,13 +238,13 @@ TAB_SIZE               = 4
 # "Side Effects:". You can put \n's in the value part of an alias to insert
 # newlines.

-ALIASES                = 
+ALIASES                =

 # This tag can be used to specify a number of word-keyword mappings (TCL only).
 # A mapping has the form "name=value". For example adding "class=itcl::class"
 # will allow you to use the command class in the itcl::class meaning.

-TCL_SUBST              = 
+TCL_SUBST              =

 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
@@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL   = NO
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
 # the files are not read by doxygen.

-EXTENSION_MAPPING      = 
+EXTENSION_MAPPING      =

 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
@@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES
 # sections, marked by \if <section_label> ... \endif and \cond <section_label>
 # ... \endcond blocks.

-ENABLED_SECTIONS       = 
+ENABLED_SECTIONS       =

 # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
 # initial value of a variable or macro / define can have for it to appear in the
@@ -683,7 +683,7 @@ SHOW_NAMESPACES        = YES
 # by doxygen. Whatever the program writes to standard output is used as the file
 # version. For an example see the documentation.

-FILE_VERSION_FILTER    = 
+FILE_VERSION_FILTER    =

 # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
 # by doxygen. The layout file controls the global structure of the generated
@@ -696,7 +696,7 @@ FILE_VERSION_FILTER    =
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
 # tag is left empty.

-LAYOUT_FILE            = 
+LAYOUT_FILE            =

 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
@@ -706,7 +706,7 @@ LAYOUT_FILE            =
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.

-CITE_BIB_FILES         = 
+CITE_BIB_FILES         =

 #---------------------------------------------------------------------------
 # Configuration options related to warning and progress messages
@@ -765,7 +765,7 @@ WARN_FORMAT            = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).

-WARN_LOGFILE           = 
+WARN_LOGFILE           =

 #---------------------------------------------------------------------------
 # Configuration options related to the input files
@@ -858,7 +858,7 @@ RECURSIVE              = NO
 # Note that relative paths are relative to the directory from which doxygen is
 # run.

-EXCLUDE                = 
+EXCLUDE                =

 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS       = NO
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*

-EXCLUDE_PATTERNS       = 
+EXCLUDE_PATTERNS       =

 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
 # (namespaces, classes, functions, etc.) that should be excluded from the
@@ -885,13 +885,13 @@ EXCLUDE_PATTERNS       =
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories use the pattern */test/*

-EXCLUDE_SYMBOLS        = 
+EXCLUDE_SYMBOLS        =

 # The EXAMPLE_PATH tag can be used to specify one or more files or directories
 # that contain example code fragments that are included (see the \include
 # command).

-EXAMPLE_PATH           = 
+EXAMPLE_PATH           =

 # If the value of the EXAMPLE_PATH tag contains directories, you can use the
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
@@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE      = NO
 # that contain images that are to be included in the documentation (see the
 # \image command).

-IMAGE_PATH             = 
+IMAGE_PATH             =

 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program
@@ -928,7 +928,7 @@ IMAGE_PATH             =
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.

-INPUT_FILTER           = 
+INPUT_FILTER           =

 # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
 # basis. Doxygen will compare the file name with each pattern and apply the
@@ -937,7 +937,7 @@ INPUT_FILTER           =
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.

-FILTER_PATTERNS        = 
+FILTER_PATTERNS        =

 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
 # INPUT_FILTER) will also be used to filter the input files that are used for
@@ -952,7 +952,7 @@ FILTER_SOURCE_FILES    = NO
 # *.ext= (so without naming a filter).
 # This tag requires that the tag FILTER_SOURCE_FILES is set to YES.

-FILTER_SOURCE_PATTERNS = 
+FILTER_SOURCE_PATTERNS =

 # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
 # is part of the input, its contents will be placed on the main page
@@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO
 # specified with INPUT and INCLUDE_PATH.
 # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.

-CLANG_OPTIONS          = 
+CLANG_OPTIONS          =

 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
@@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX    = 5
 # while generating the index headers.
 # This tag requires that the tag ALPHABETICAL_INDEX is set to YES.

-IGNORE_PREFIX          = 
+IGNORE_PREFIX          =

 #---------------------------------------------------------------------------
 # Configuration options related to the HTML output
@@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION    = .html
 # of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.

-HTML_HEADER            = 
+HTML_HEADER            =

 # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
 # generated HTML page. If the tag is left blank doxygen will generate a standard
@@ -1144,7 +1144,7 @@ HTML_HEADER            =
 # that doxygen normally uses.
 # This tag requires that the tag GENERATE_HTML is set to YES.

-HTML_FOOTER            = 
+HTML_FOOTER            =

 # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
 # sheet that is used by each HTML page. It can be used to fine-tune the look of
@@ -1156,7 +1156,7 @@ HTML_FOOTER            =
 # obsolete.
 # This tag requires that the tag GENERATE_HTML is set to YES.

-HTML_STYLESHEET        = 
+HTML_STYLESHEET        =

 # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
 # cascading style sheets that are included after the standard style sheets
@@ -1169,7 +1169,7 @@ HTML_STYLESHEET        =
 # list). For an example see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.

-HTML_EXTRA_STYLESHEET  = 
+HTML_EXTRA_STYLESHEET  =

 # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the HTML output directory. Note
@@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET  =
 # files will be copied as-is; there are no commands or markers available.
 # This tag requires that the tag GENERATE_HTML is set to YES.

-HTML_EXTRA_FILES       = 
+HTML_EXTRA_FILES       =

 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
@@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP      = NO
 # written to the html output directory.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.

-CHM_FILE               = 
+CHM_FILE               =

 # The HHC_LOCATION tag can be used to specify the location (absolute path
 # including file name) of the HTML help compiler (hhc.exe). If non-empty,
@@ -1316,7 +1316,7 @@ CHM_FILE               =
 # The file has to be specified with full path.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.

-HHC_LOCATION           = 
+HHC_LOCATION           =

 # The GENERATE_CHI flag controls if a separate .chi index file is generated
 # (YES) or that it should be included in the master .chm file (NO).
@@ -1329,7 +1329,7 @@ GENERATE_CHI           = NO
 # and project file content.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.

-CHM_INDEX_ENCODING     = 
+CHM_INDEX_ENCODING     =

 # The BINARY_TOC flag controls whether a binary table of contents is generated
 # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
@@ -1360,7 +1360,7 @@ GENERATE_QHP           = NO
 # the HTML output folder.
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QCH_FILE               = 
+QCH_FILE               =

 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
@@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER     = doc
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QHP_CUST_FILTER_NAME   = 
+QHP_CUST_FILTER_NAME   =

 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
@@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME   =
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QHP_CUST_FILTER_ATTRS  = 
+QHP_CUST_FILTER_ATTRS  =

 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
 # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QHP_SECT_FILTER_ATTRS  = 
+QHP_SECT_FILTER_ATTRS  =

 # The QHG_LOCATION tag can be used to specify the location of Qt's
 # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
 # generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.

-QHG_LOCATION           = 
+QHG_LOCATION           =

 # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
 # generated, together with the HTML files, they form an Eclipse help plugin. To
@@ -1540,7 +1540,7 @@ MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
 # This tag requires that the tag USE_MATHJAX is set to YES.

-MATHJAX_EXTENSIONS     = 
+MATHJAX_EXTENSIONS     =

 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
@@ -1548,7 +1548,7 @@ MATHJAX_EXTENSIONS     =
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.

-MATHJAX_CODEFILE       = 
+MATHJAX_CODEFILE       =

 # When the SEARCHENGINE tag is enabled doxygen will generate a search box for
 # the HTML output. The underlying search engine uses javascript and DHTML and
@@ -1608,7 +1608,7 @@ EXTERNAL_SEARCH        = NO
 # Searching" for details.
 # This tag requires that the tag SEARCHENGINE is set to YES.

-SEARCHENGINE_URL       = 
+SEARCHENGINE_URL       =

 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
 # search data is written to a file for indexing by an external tool. With the
@@ -1624,7 +1624,7 @@ SEARCHDATA_FILE        = searchdata.xml
 # projects and redirect the results back to the right project.
 # This tag requires that the tag SEARCHENGINE is set to YES.

-EXTERNAL_SEARCH_ID     = 
+EXTERNAL_SEARCH_ID     =

 # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
 # projects other than the one defined by this configuration file, but that are
@@ -1634,7 +1634,7 @@ EXTERNAL_SEARCH_ID     =
 # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
 # This tag requires that the tag SEARCHENGINE is set to YES.

-EXTRA_SEARCH_MAPPINGS  = 
+EXTRA_SEARCH_MAPPINGS  =

 #---------------------------------------------------------------------------
 # Configuration options related to the LaTeX output
@@ -1698,7 +1698,7 @@ PAPER_TYPE             = a4
 # If left blank no extra packages will be included.
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-EXTRA_PACKAGES         = 
+EXTRA_PACKAGES         =

 # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
 # generated LaTeX document. The header should contain everything until the first
@@ -1714,7 +1714,7 @@ EXTRA_PACKAGES         =
 # to HTML_HEADER.
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_HEADER           = 
+LATEX_HEADER           =

 # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
 # generated LaTeX document. The footer should contain everything after the last
@@ -1725,7 +1725,7 @@ LATEX_HEADER           =
 # Note: Only use a user-defined footer if you know what you are doing!
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_FOOTER           = 
+LATEX_FOOTER           =

 # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
 # LaTeX style sheets that are included after the standard style sheets created
@@ -1736,7 +1736,7 @@ LATEX_FOOTER           =
 # list).
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_EXTRA_STYLESHEET = 
+LATEX_EXTRA_STYLESHEET =

 # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the LATEX_OUTPUT output
@@ -1744,7 +1744,7 @@ LATEX_EXTRA_STYLESHEET =
 # markers available.
 # This tag requires that the tag GENERATE_LATEX is set to YES.

-LATEX_EXTRA_FILES      = 
+LATEX_EXTRA_FILES      =

 # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
 # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
@@ -1844,14 +1844,14 @@ RTF_HYPERLINKS         = NO
 # default style sheet that doxygen normally uses.
 # This tag requires that the tag GENERATE_RTF is set to YES.

-RTF_STYLESHEET_FILE    = 
+RTF_STYLESHEET_FILE    =

 # Set optional variables used in the generation of an RTF document. Syntax is
 # similar to doxygen's config file. A template extensions file can be generated
 # using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.

-RTF_EXTENSIONS_FILE    = 
+RTF_EXTENSIONS_FILE    =

 # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
 # with syntax highlighting in the RTF output.
@@ -1896,7 +1896,7 @@ MAN_EXTENSION          = .3
 # MAN_EXTENSION with the initial . removed.
 # This tag requires that the tag GENERATE_MAN is set to YES.

-MAN_SUBDIR             = 
+MAN_SUBDIR             =

 # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
 # will generate one additional man file for each entity documented in the real
@@ -1915,7 +1915,7 @@ MAN_LINKS              = NO
 # captures the structure of the code including all documentation.
 # The default value is: NO.

-GENERATE_XML           = YES 
+GENERATE_XML           = YES

 # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
 # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -2009,7 +2009,7 @@ PERLMOD_PRETTY         = YES
 # overwrite each other's variables.
 # This tag requires that the tag GENERATE_PERLMOD is set to YES.

-PERLMOD_MAKEVAR_PREFIX = 
+PERLMOD_MAKEVAR_PREFIX =

 #---------------------------------------------------------------------------
 # Configuration options related to the preprocessor
@@ -2050,7 +2050,7 @@ SEARCH_INCLUDES        = YES
 # preprocessor.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.

-INCLUDE_PATH           = 
+INCLUDE_PATH           =

 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
@@ -2058,7 +2058,7 @@ INCLUDE_PATH           =
 # used.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

-INCLUDE_FILE_PATTERNS  = 
+INCLUDE_FILE_PATTERNS  =

 # The PREDEFINED tag can be used to specify one or more macro names that are
 # defined before the preprocessor is started (similar to the -D option of e.g.
@@ -2068,7 +2068,7 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

-PREDEFINED             = 
+PREDEFINED             =

 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
@@ -2077,7 +2077,7 @@ PREDEFINED             =
 # definition found in the source code.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

-EXPAND_AS_DEFINED      = 
+EXPAND_AS_DEFINED      =

 # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
 # remove all references to function-like macros that are alone on a line, have
@@ -2106,13 +2106,13 @@ SKIP_FUNCTION_MACROS   = YES
 # the path). If a tag file is not located in the directory in which doxygen is
 # run, you must also specify the path to the tagfile here.

-TAGFILES               = 
+TAGFILES               =

 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a
 # tag file that is based on the input files it reads. See section "Linking to
 # external documentation" for more information about the usage of tag files.

-GENERATE_TAGFILE       = 
+GENERATE_TAGFILE       =

 # If the ALLEXTERNALS tag is set to YES, all external class will be listed in
 # the class index. If set to NO, only the inherited external classes will be
@@ -2161,14 +2161,14 @@ CLASS_DIAGRAMS         = NO
 # the mscgen tool resides. If left empty the tool is assumed to be found in the
 # default search path.

-MSCGEN_PATH            = 
+MSCGEN_PATH            =

 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
 # If left empty dia is assumed to be found in the default search path.

-DIA_PATH               = 
+DIA_PATH               =

 # If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
@@ -2217,7 +2217,7 @@ DOT_FONTSIZE           = 10
 # the path where dot can find it using this tag.
 # This tag requires that the tag HAVE_DOT is set to YES.

-DOT_FONTPATH           = 
+DOT_FONTPATH           =

 # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
 # each documented class showing the direct and indirect inheritance relations.
@@ -2361,26 +2361,26 @@ INTERACTIVE_SVG        = NO
 # found. If left blank, it is assumed the dot tool can be found in the path.
 # This tag requires that the tag HAVE_DOT is set to YES.

-DOT_PATH               = 
+DOT_PATH               =

 # The DOTFILE_DIRS tag can be used to specify one or more directories that
 # contain dot files that are included in the documentation (see the \dotfile
 # command).
 # This tag requires that the tag HAVE_DOT is set to YES.

-DOTFILE_DIRS           = 
+DOTFILE_DIRS           =

 # The MSCFILE_DIRS tag can be used to specify one or more directories that
 # contain msc files that are included in the documentation (see the \mscfile
 # command).

-MSCFILE_DIRS           = 
+MSCFILE_DIRS           =

 # The DIAFILE_DIRS tag can be used to specify one or more directories that
 # contain dia files that are included in the documentation (see the \diafile
 # command).

-DIAFILE_DIRS           = 
+DIAFILE_DIRS           =

 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
 # path where java can find the plantuml.jar file. If left blank, it is assumed
@@ -2388,12 +2388,12 @@ DIAFILE_DIRS           =
 # generate a warning when it encounters a \startuml command in this case and
 # will not generate output for the diagram.

-PLANTUML_JAR_PATH      = 
+PLANTUML_JAR_PATH      =

 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.

-PLANTUML_INCLUDE_PATH  = 
+PLANTUML_INCLUDE_PATH  =

 # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
 # that will be shown in the graph. If the number of nodes in a graph becomes
@@ -1,5 +1,5 @@
 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
   :caption: Contents:

 =======
@@ -8,4 +8,4 @@ All API

 .. doxygenindex::

- 
+
@@ -1,5 +1,5 @@
 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
   :caption: Contents:

 ===
@@ -7,10 +7,10 @@ Welcome to RCCL's documentation!
 ==================================

 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
   :caption: Contents:

-   library 
+   library
   api
   allapi

@@ -1,6 +1,6 @@

 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
   :caption: Contents:

 ======
@@ -10,4 +10,4 @@ RCCL
 Introduction
 ------------

-The RCCL is an AMD port of NCCL. 
+The RCCL is an AMD port of NCCL.
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1,112 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
-
-FILES="
-./src/nccl.h.in
-./src/bootstrap.cu
-./src/collectives/all_gather.cu
-./src/collectives/all_reduce.cu
-./src/collectives/broadcast.cu
-./src/collectives/collectives.h
-./src/collectives/device/all_gather.cu
-./src/collectives/device/all_gather.h
-./src/collectives/device/all_reduce.cu
-./src/collectives/device/all_reduce.h
-./src/collectives/device/broadcast.cu
-./src/collectives/device/broadcast.h
-./src/collectives/device/common.h
-./src/collectives/device/common_kernel.h
-./src/collectives/device/functions.cu
-./src/collectives/device/ll_kernel.h
-./src/collectives/device/primitives.h
-./src/collectives/device/reduce.cu
-./src/collectives/device/reduce.h
-./src/collectives/device/reduce_kernel.h
-./src/collectives/device/reduce_scatter.cu
-./src/collectives/device/reduce_scatter.h
-./src/collectives/reduce.cu
-./src/collectives/reduce_scatter.cu
-./src/include/bootstrap.h
-./src/include/common_coll.h
-./src/include/core.h
-./src/include/debug.h
-./src/include/enqueue.h
-./src/include/group.h
-./src/include/ibvwrap.h
-./src/include/nccl_net.h
-./src/include/net.h
-./src/include/nvlink.h
-./src/include/nvmlwrap.h
-./src/include/param.h
-./src/include/ring.h
-./src/include/rings.h
-./src/include/shm.h
-./src/include/socket.h
-./src/include/topo.h
-./src/include/transport.h
-./src/include/utils.h
-./src/init.cu
-./src/misc/enqueue.cu
-./src/misc/group.cu
-./src/misc/ibvwrap.cu
-./src/misc/nvmlwrap.cu
-./src/misc/rings.cu
-./src/misc/utils.cu
-./src/ring.cu
-./src/transport.cu
-./src/transport/net.cu
-./src/transport/net_ib.cu
-./src/transport/net_socket.cu
-./src/transport/p2p.cu
-./src/transport/shm.cu
-"
-
-for f in $FILES
-do
-    sed -i \
-        -e 's@cuda_runtime.h@hip/hip_runtime_api.h@g' \
-        -e 's@cuda_fp16.h@hip/hip_fp16.h@g' \
-        -e 's/cudaDeviceCanAccessPeer/hipDeviceCanAccessPeer/g' \
-        -e 's/cudaDeviceEnablePeerAccess/hipDeviceEnablePeerAccess/g' \
-        -e 's/cudaDeviceGetPCIBusId/hipDeviceGetPCIBusId/g' \
-        -e 's/cudaErrorPeerAccessAlreadyEnabled/hipErrorPeerAccessAlreadyEnabled/g' \
-        -e 's/cudaError_t/hipError_t/g' \
-        -e 's/cudaEventCreateWithFlags/hipEventCreateWithFlags/g' \
-        -e 's/cudaEventDestroy/hipEventDestroy/g' \
-        -e 's/cudaEventDisableTiming/hipEventDisableTiming/g' \
-        -e 's/cudaEventRecord/hipEventRecord/g' \
-        -e 's/cudaEvent_t/hipEvent_t/g' \
-        -e 's/cudaFree/hipFree/g' \
-        -e 's/cudaFreeHost/hipHostFree/g' \
-        -e 's/cudaGetDevice/hipGetDevice/g' \
-        -e 's/cudaGetErrorString/hipGetErrorString/g' \
-        -e 's/cudaGetLastError/hipGetLastError/g' \
-        -e 's/cudaHostAlloc/hipHostMalloc/g' \
-        -e 's/cudaHostAllocMapped/hipHostMallocMapped/g' \
-        -e 's/cudaHostGetDevicePointer/hipHostGetDevicePointer/g' \
-        -e 's/cudaHostRegister/hipHostRegister/g' \
-        -e 's/cudaHostRegisterMapped/hipHostRegisterMapped/g' \
-        -e 's/cudaHostUnregister/hipHostUnregister/g' \
-        -e 's/cudaIpcCloseMemHandle/hipIpcCloseMemHandle/g' \
-        -e 's/cudaIpcGetMemHandle/hipIpcGetMemHandle/g' \
-        -e 's/cudaIpcMemHandle_t/hipIpcMemHandle_t/g' \
-        -e 's/cudaIpcMemLazyEnablePeerAccess/hipIpcMemLazyEnablePeerAccess/g' \
-        -e 's/cudaIpcOpenMemHandle/hipIpcOpenMemHandle/g' \
-        -e 's/cudaMalloc/hipMalloc/g' \
-        -e 's/cudaMemcpy/hipMemcpy/g' \
-        -e 's/cudaMemcpyAsync/hipMemcpyAsync/g' \
-        -e 's/cudaMemcpyDefault/hipMemcpyDefault/g' \
-        -e 's/cudaMemcpyDeviceToDevice/hipMemcpyDeviceToDevice/g' \
-        -e 's/cudaMemoryTypeDevice/hipMemoryTypeDevice/g' \
-        -e 's/cudaMemset/hipMemset/g' \
-        -e 's/cudaPointerAttributes/hipPointerAttribute_t/g' \
-        -e 's/cudaPointerGetAttributes/hipPointerGetAttributes/g' \
-        -e 's/cudaSetDevice/hipSetDevice/g' \
-        -e 's/cudaStreamCreateWithFlags/hipStreamCreateWithFlags/g' \
-        -e 's/cudaStreamDestroy/hipStreamDestroy/g' \
-        -e 's/cudaStreamNonBlocking/hipStreamNonBlocking/g' \
-        -e 's/cudaStreamWaitEvent/hipStreamWaitEvent/g' \
-        -e 's/cudaStream_t/hipStream_t/g' \
-        -e 's/cudaSuccess/hipSuccess/g' \
-        $f
-done
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -16,7 +16,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc

 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
-CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,15 +36,16 @@ CUDA8_PTX     = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX     = -gencode=arch=compute_70,code=compute_70

 # Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
 else
  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
 endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})

-CXXFLAGS   := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS   += -Wall -Wno-sign-compare
+CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS   += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS   += -I $(CUDA_INC)
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
@@ -68,7 +69,7 @@ CXXFLAGS  += -O0 -g -ggdb3
 endif

 ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
 else
 .SILENT:
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 3
-NCCL_PATCH   := 7
+NCCL_MINOR   := 4
+NCCL_PATCH   := 8
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 Name:           libnccl
-Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
-Release:        ${pkg:Revision}
+Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
+Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
 Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime

 Group:          Development/Libraries
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
 	    $< > $@
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
 NCCL_MINOR=${nccl:Minor}
 NCCL_PATCH=${nccl:Patch}
 NCCL_SUFFIX=${nccl:Suffix}
+NCCL_BUILD=${pkg:Revision}

-NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"

 tar --exclude build \
    --exclude ".git*" \
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,41 +9,48 @@ include ../makefiles/version.mk

 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
-		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
-		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
-                collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
+                misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
+		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc

 ##### lib files
 LIBNAME     := libnccl.so
 STATICLIBNAME := libnccl_static.a
+##### pkgconfig files
+PKGCONFIGFILE := nccl.pc
 ##### dirs
 BUILDDIR ?= $(abspath ../build)
 INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
+PKGDIR := $(BUILDDIR)/lib/pkgconfig
 ##### target files
+CUDARTLIB  ?= cudart_static
 INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
 LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
 LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+PKGTARGET  := $(PKGCONFIGFILE)
+LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
-LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lrt
+LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl

 DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a

-
 ##### rules
 build : lib staticlib

-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)

 staticlib : $(LIBDIR)/$(STATICLIBTARGET)

-devicelib: $(INCDIR)/nccl.h
+$(DEVICELIB): ALWAYS_REBUILD
 	$(MAKE) -C collectives/device

+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
 -include $(DEPFILES)
 $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)

@@ -51,7 +58,7 @@ $(INCDIR)/nccl.h : nccl.h.in
 # NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
 	@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
 	mkdir -p $(INCDIR)
-	printf "Generating %-35s > %s\n" $< $@
+	@printf "Generating %-35s > %s\n" $< $@
 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -59,14 +66,14 @@ $(INCDIR)/nccl.h : nccl.h.in
 	    -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
 	    $< > $@

-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Linking    %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)

-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Archiving  %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(eval TMP := $(shell mktemp -d))
@@ -75,6 +82,15 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
 	ar cr $@ $(LIBOBJ) $(TMP)/*.o
 	rm -Rf $(TMP)

+$(PKGDIR)/nccl.pc : nccl.pc.in
+	mkdir -p $(PKGDIR)
+	@printf "Generating %-35s > %s\n" $< $@
+	sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
+	    -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    $< > $@
+
 $(INCDIR)/%.h : %.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
@@ -85,27 +101,34 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
 	mkdir -p $(INCDIR)
 	cp -f $< $@

-$(OBJDIR)/%.o : %.cu
+$(PKGDIR)/%.pc : %.pc
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(PKGDIR)
+	cp -f $< $@
+
+$(OBJDIR)/%.o : %.cc
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
 	@rm -f $(@:%.o=%.d.tmp)

 clean :
-	rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
+	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 	$(MAKE) -C collectives/device clean

 install : lib
 	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/lib/pkgconfig
 	mkdir -p $(PREFIX)/include
-	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/

-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
 # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
 # as the BUILDDIR variable.
@@ -0,0 +1,467 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "utils.h"
+#include "bootstrap.h"
+#include "net.h"
+#include "socket.h"
+#include <unistd.h>
+#include <sys/types.h>
+
+// Always use sockets for bootstrap
+struct bootstrapNetHandle {
+  union socketAddress connectAddr;
+};
+
+struct bootstrapNetComm {
+  int fd;
+};
+
+/* Init functions */
+static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
+static int bootstrapNetIfs = -1;
+pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
+
+ncclResult_t bootstrapNetInit() {
+  if (bootstrapNetIfs == -1) {
+    pthread_mutex_lock(&bootstrapNetLock);
+    if (bootstrapNetIfs == -1) {
+      bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      if (bootstrapNetIfs <= 0) {
+        WARN("Bootstrap : no socket interface found");
+        return ncclInternalError;
+      } else {
+        char line[1024];
+        char addrline[1024];
+        line[0] = '\0';
+        for (int i=0; i<bootstrapNetIfs; i++) {
+          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
+              socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
+        }
+        line[1023] = '\0';
+        INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+      }
+    }
+    pthread_mutex_unlock(&bootstrapNetLock);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
+  NCCLCHECK(ncclCalloc(comm, 1));
+  (*comm)->fd = -1;
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
+  if (dev >= bootstrapNetIfs) return ncclInternalError;
+  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
+  return ncclSuccess;
+}
+
+/* Socket Interface Selection type */
+enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
+
+static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
+  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
+  static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
+  // if dev >= 0, listen based on dev
+  if (dev >= 0) {
+    NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
+  } else if (dev == findSubnetIf) {
+    // handle stores a remote address
+    // need to find a local addr that is in the same network as the remote addr
+    union socketAddress localAddr;
+    char ifName[MAX_IF_NAME_SIZE];
+    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+      WARN("NET/Socket : No usable listening interface found");
+      return ncclSystemError;
+    }
+    // pass the local address back
+    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
+  } // Otherwise, handle stores a local address
+  struct bootstrapNetComm* comm;
+  NCCLCHECK(bootstrapNetNewComm(&comm));
+  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+  *listenComm = comm;
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
+  struct bootstrapNetComm* comm;
+  NCCLCHECK(bootstrapNetNewComm(&comm));
+  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
+  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
+  *sendComm = comm;
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
+  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
+  struct bootstrapNetComm* rComm;
+  NCCLCHECK(bootstrapNetNewComm(&rComm));
+  struct sockaddr_in sockaddr;
+  socklen_t socklen = sizeof(struct sockaddr_in);
+  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
+  *recvComm = rComm;
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetClose(void* opaqueComm) {
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
+  if (comm) {
+    close(comm->fd);
+    free(comm);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
+
+// Additional sync functions
+static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
+  NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
+  NCCLCHECK(socketSend(comm->fd, data, size));
+  return ncclSuccess;
+}
+static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
+  int recvSize;
+  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+  if (recvSize > size) {
+    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    return ncclInternalError;
+  }
+  NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
+  return ncclSuccess;
+}
+
+ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
+  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
+  NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str));
+  return ncclSuccess;
+}
+
+struct extId {
+  ncclNetHandle_t extHandleRoot;
+  void* extListenComm;
+  uint64_t hostHash;
+  pid_t pid;
+  int fd;
+  pthread_t boostrapThread;
+};
+
+struct extInfo {
+  int rank;
+  int nranks;
+  ncclNetHandle_t extHandleListenRoot;
+  ncclNetHandle_t extHandleListen;
+};
+
+#include <sys/resource.h>
+
+static ncclResult_t setFilesLimit() {
+  struct rlimit filesLimit;
+  SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
+  filesLimit.rlim_cur = filesLimit.rlim_max;
+  SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
+  return ncclSuccess;
+}
+
+static void *bootstrapRoot(void* commId) {
+  struct extInfo info;
+  struct extId* id = (struct extId*)commId;
+  ncclNetHandle_t *rankHandles = NULL;
+  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
+  ncclNetHandle_t zero = { 0 }; // for sanity checking
+  void* tmpComm;
+  ncclResult_t res;
+  setFilesLimit();
+
+  TRACE(NCCL_INIT, "BEGIN");
+  /* Receive addresses from all ranks */
+  int nranks = 0, c = 0;
+  do {
+    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
+
+    if (c == 0) {
+      nranks = info.nranks;
+      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
+    }
+
+    if (nranks != info.nranks) {
+      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
+      goto out;
+    }
+
+    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
+      goto out;
+    }
+
+    // Save the connection handle for that rank
+    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
+    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+
+    ++c;
+  } while (c < nranks);
+  TRACE(NCCL_INIT, "COLLECTED HANDLES");
+
+  // Send the connect handle for the next rank in the AllGather ring
+  for (int r=0; r<nranks; ++r) {
+    int next = (r+1) % nranks;
+    void *tmpSendComm;
+    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
+  }
+  TRACE(NCCL_INIT, "SENT OUT HANDLES");
+
+out:
+  bootstrapNetCloseListen(id->extListenComm);
+  free(commId);
+  if (rankHandles) free(rankHandles);
+  if (rankHandlesRoot) free(rankHandlesRoot);
+
+  TRACE(NCCL_INIT, "DONE");
+  return NULL;
+}
+
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
+  struct extId* id = (struct extId*)commId;
+  id->hostHash = getHostHash();
+  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+  ncclUniqueId* threadIdCopy;
+  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
+  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
+  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
+  return ncclSuccess;
+}
+
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
+  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  extId* id = (extId*)out;
+
+  char* env = getenv("NCCL_COMM_ID");
+  if (env) {
+    if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) {
+      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+      return ncclInvalidArgument;
+    }
+    id->pid = -1;
+  } else {
+    id->pid = getpid();
+    NCCLCHECK(bootstrapCreateRoot(out, false));
+  }
+
+  return ncclSuccess;
+}
+
+struct unexConn {
+  int peer;
+  void* comm;
+  struct unexConn* next;
+};
+
+struct extState {
+  void* extBstrapListenComm;
+  void* extBstrapRingRecvComm;
+  void* extBstrapRingSendComm;
+  ncclNetHandle_t* peerBstrapHandles;
+  struct unexConn* unexpectedConnections;
+  int rank;
+  int nranks;
+  int dev;
+};
+
+ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
+  struct extId* id = (struct extId*)commId;
+  bool idFromEnv = id->pid < 0;
+  struct extState* state;
+  NCCLCHECK(ncclCalloc(&state, 1));
+  state->rank = rank;
+  state->nranks = nranks;
+  *commState = state;
+
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+
+  struct extInfo info = { 0 };
+  info.rank = rank;
+  info.nranks = nranks;
+  void *tmpSendComm, *tmpRecvComm;
+  // Pass the remote address to listen via info
+  if (idFromEnv) {
+    memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+  }
+  // listen will return the local address via info (specify interface type 'findSubnetIf')
+  state->dev = idFromEnv ? findSubnetIf : 0;
+  void* extBstrapListenCommRoot;
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+
+  // stagger connection times to avoid an overload of the root at very high rank counts
+  if (nranks > 128) {
+    long msec = rank;
+    struct timespec tv;
+    tv.tv_sec = msec / 1000;
+    tv.tv_nsec = 1000000 * (msec % 1000);
+    TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
+    (void) nanosleep(&tv, NULL);
+  }
+
+  // send info on my listening socket to root
+  NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+
+  // get info on my "next" rank in the bootstrap ring from root
+  ncclNetHandle_t extHandleNext;
+  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
+  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
+
+  NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  // Accept the connect request from the previous rank in the AllGather ring
+  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+
+  // AllGather all listen handlers
+  NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
+  memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+  NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+  return ncclSuccess;
+}
+
+ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+  struct extState* state = (struct extState*)commState;
+  char* data = (char*)allData;
+  int rank = state->rank;
+  int nranks = state->nranks;
+
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
+
+  /* Simple ring based AllGather
+   * At each step i receive data from (rank-i-1) from left
+   * and send previous step's data from (rank-i) to right
+   */
+  for (int i=0; i<nranks-1; i++) {
+    size_t rslice = (rank - i - 1 + nranks) % nranks;
+    size_t sslice = (rank - i + nranks) % nranks;
+
+    // Send slice to the right
+    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    // Recv slice from the left
+    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
+  return ncclSuccess;
+}
+
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
+  struct extState* state = (struct extState*)commState;
+  void* tmpSendComm;
+  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  return ncclSuccess;
+}
+
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+  // New unex
+  struct unexConn* unex;
+  NCCLCHECK(ncclCalloc(&unex, 1));
+  unex->peer = peer;
+  unex->comm = comm;
+
+  // Enqueue
+  struct unexConn* list = state->unexpectedConnections;
+  if (list == NULL) {
+    state->unexpectedConnections = unex;
+    return ncclSuccess;
+  }
+  while (list->next) list = list->next;
+  list->next = unex;
+  return ncclSuccess;
+}
+
+void* unexpectedDequeue(struct extState* state, int peer) {
+  struct unexConn* elem = state->unexpectedConnections;
+  struct unexConn* prev = NULL;
+  while (elem) {
+    if (elem->peer == peer) {
+      if (prev == NULL) {
+        state->unexpectedConnections = elem->next;
+      } else {
+        prev->next = elem->next;
+      }
+      void* comm = elem->comm;
+      free(elem);
+      return comm;
+    }
+    prev = elem;
+    elem = elem->next;
+  }
+  return NULL;
+}
+
+// We can't know who we'll receive from, so we need to receive everything at once
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
+  struct extState* state = (struct extState*)commState;
+
+  void* tmpRecvComm;
+
+  // Search unexpected connections first
+  if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+    NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+    return ncclSuccess;
+  }
+
+  // Then look for new connections
+  while (1) {
+    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+    int newPeer;
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+    if (newPeer == peer) {
+      NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+      NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+      return ncclSuccess;
+    }
+    // Unexpected connection. Save for later.
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+  }
+}
+
+ncclResult_t bootstrapClose(void* commState) {
+  struct extState* state = (struct extState*)commState;
+  if (state->unexpectedConnections != NULL) {
+    WARN("Unexpected connections are not empty.\n");
+    return ncclInternalError;
+  }
+  NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+
+  free(state->peerBstrapHandles);
+  free(state);
+
+  return ncclSuccess;
+}
@@ -1,249 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "utils.h"
-#include "bootstrap.h"
-#include "net.h"
-#include <unistd.h>
-#include <sys/types.h>
-
-// Always use sockets for bootstrap
-ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
-
-static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
-
-// Additional sync functions based on async + test for bootstrap, using host ptrs.
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
-  int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
-  return ncclSuccess;
-}
-static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
-  int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
-  return ncclSuccess;
-}
-
-struct extId {
-  ncclNetHandle_t extHandleRoot;
-  void* extListenComm;
-  uint64_t hostHash;
-  pid_t pid;
-  int fd;
-  pthread_t boostrapThread;
-};
-
-struct extInfo {
-  int rank;
-  int nranks;
-  ncclNetHandle_t extHandleListenFromRoot;
-  ncclNetHandle_t extHandleRing;
-};
-
-#include <sys/resource.h>
-
-static ncclResult_t setFilesLimit() {
-  struct rlimit filesLimit;
-  SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
-  filesLimit.rlim_cur = filesLimit.rlim_max;
-  SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
-  return ncclSuccess;
-}
-
-static void *bootstrapRoot(void* commId) {
-  struct extInfo info;
-  struct extId* id = (struct extId*)commId;
-  ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
-  ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
-  ncclNetHandle_t zero = { 0 }; // for sanity checking
-  void* tmpComm;
-  ncclResult_t res;
-  setFilesLimit();
-
-  /* Receive addresses from all ranks */
-  int nranks = 0, c = 0;
-  do {
-    NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
-    NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
-
-    if (c == 0) {
-      extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      if (extHandleBstrap == NULL || extHandleRing == NULL) {
-        WARN("Bootstrap thread : failed to allocate memory");
-        goto out;
-      }
-      nranks = info.nranks;
-    }
-
-    if (nranks != info.nranks) {
-      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
-      goto out;
-    }
-
-    if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
-      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
-      goto out;
-    }
-
-    // Save the connection handle for connecting back to the ranks
-    memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
-    // Save the connection handle for the AllGather ring
-    memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
-
-    ++c;
-  } while (c < nranks);
-
-  // Send the connect handle for the next rank in the AllGather ring
-  for (int r=0; r<nranks; ++r) {
-    int next = (r+1) % nranks;
-    void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
-    NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
-  }
-
-out:
-  bootstrapCloseListen(id->extListenComm);
-  free(commId);
-  free(extHandleBstrap);
-  free(extHandleRing);
-  return NULL;
-}
-
-ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
-  struct extId* id = (struct extId*)commId;
-  id->hostHash = getHostHash();
-  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
-  ncclUniqueId* threadIdCopy;
-  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
-  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
-  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
-  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
-  extId* id = (extId*)out;
-
-  char* env = getenv("NCCL_COMM_ID");
-  if (env) {
-    if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) {
-      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
-      return ncclInvalidArgument;
-    }
-    id->pid = -1;
-  } else {
-    id->pid = getpid();
-    NCCLCHECK(bootstrapCreateRoot(out, false));
-  }
-
-  return ncclSuccess;
-}
-
-struct extState {
-  void* extBstrapRingRecvComm;
-  void* extBstrapRingSendComm;
-  ncclNetHandle_t extBstrapRootHandle;
-  int rank;
-  int nranks;
-  int dev;
-};
-
-ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
-  struct extId* id = (struct extId*)commId;
-  bool idFromEnv = id->pid < 0;
-  struct extState* state;
-  NCCLCHECK(ncclCalloc(&state, 1));
-  state->rank = rank;
-  state->nranks = nranks;
-  *commState = state;
-  void* extBstrapRootListenComm; // comm on which we accept root's connections
-
-  struct extInfo info = { 0 };
-  info.rank = rank;
-  info.nranks = nranks;
-  void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
-  // Pass the remote address to listen via info
-  if (idFromEnv) {
-    memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-  }
-  // listen will return the local address via info (specify interface type 'findSubnetIf')
-  state->dev = idFromEnv ? findSubnetIf : 0;
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
-
-  memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-  // send info on my listening sockets to root
-  NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
-  NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
-  NCCLCHECK(bootstrapCloseSend(tmpSendComm));
-
-  // get info on my "next" rank in the bootstrap ring from root
-  ncclNetHandle_t extHandleNext;
-  NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
-  NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
-  NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
-
-  NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
-  // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
-
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
-  struct extState* state = (struct extState*)commState;
-  char* data = (char*)allData;
-  int rank = state->rank;
-  int nranks = state->nranks;
-
-  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
-
-  /* Simple ring based AllGather
-   * At each step i receive data from (rank-i-1) from left
-   * and send previous step's data from (rank-i) to right
-   */
-  for (int i=0; i<nranks-1; i++) {
-    int rslice = (rank - i - 1 + nranks) % nranks;
-    int sslice = (rank - i + nranks) % nranks;
-
-    // Send slice to the right
-    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
-    // Recv slice from the left
-    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
-  }
-
-  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapClose(void* commState) {
-  struct extState* state = (struct extState*)commState;
-
-  NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm));
-  NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm));
-
-  free(state);
-
-  return ncclSuccess;
-}
@@ -0,0 +1,57 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+  struct ncclChannel* channel = comm->channels+channelid;
+  channel->id = channelid;
+
+  // Setup intermediate buffering
+  channel->buffSize = ncclParamBuffsize();
+
+  // Ring index to user rank table.
+  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+
+  // Communication structures with peers.
+  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
+  for (size_t i=0; i<comm->nRanks; ++i) {
+    channel->peers[i].send.comm = comm;
+    channel->peers[i].recv.comm = comm;
+  }
+
+  // Per-channel operation list.
+  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+  return ncclSuccess;
+}
+
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+  // Operation list
+  NCCLCHECK(ncclCudaHostFree(channel->collectives));
+
+  // Free Ring index to rank tables
+  free(channel->ring.userRanks);
+  CUDACHECK(hipFree(channel->ring.devUserRanks));
+
+  // Free transport proxy resources
+  for (int r=0; r<nRanks; r++) {
+    struct ncclPeer* peer = channel->peers+r;
+    if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+    if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+  }
+
+  // Free the peer structures.
+  CUDACHECK(hipFree(channel->devPeers));
+  free(channel->peers);
+
+  return ncclSuccess;
+}
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
+  struct ncclInfo info = { ncclCollAllGather, "AllGather",
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
-          ncclSum, 0, comm, stream);
-}
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
+  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
-          op, 0, comm, stream);
-}
@@ -0,0 +1,27 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream) {
+  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
+/* Deprecated original "in place" function, similar to MPI */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream) {
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
+}
+
@@ -1,43 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
-    NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
-  }
-
-  return ncclSuccess;
-}
-
-/* Deprecated original "in place" function, similar to MPI */
-NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
-          ncclSum, root, comm, stream);
-}
-
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
-          ncclSum, root, comm, stream);
-}
@@ -1,5 +1,6 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -8,9 +9,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)

 #define NCCL_COLL_NAME(coll, op, dtype) \
  coll##_##op##_##dtype
@@ -19,13 +18,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
  coll##Kernel_##op##_##dtype

 /* Declare all collective operations */
-#define DECL_COLL4(coll, op, dtype) \
+#define DECL_COLL5(coll, op, dtype) \
  extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
-  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+
+#define DECL_COLL4(coll, op, dtype) \
+  DECL_COLL5(coll, op, dtype) \
+  DECL_COLL5(coll##LL, op, dtype)

 #define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##LL, op, dtype) \
-  DECL_COLL4(coll, op, dtype)
+  DECL_COLL4(coll##Ring, op, dtype)

 #define DECL_COLL2(coll, op) \
  DECL_COLL3(coll, op, i8) \
@@ -53,15 +55,22 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed

 DECL_ALL_COLLS

-#define ALLREDUCE_SUBSTEPS 2
-#define ALLREDUCE_BUFCHUNKS 2
-#define ALLGATHER_SUBSTEPS 2
-#define ALLGATHER_BUFCHUNKS 2
-#define REDUCESCATTER_SUBSTEPS 2
-#define REDUCESCATTER_BUFCHUNKS 2
-#define BROADCAST_SUBSTEPS 8
-#define BROADCAST_BUFCHUNKS 2
-#define REDUCE_SUBSTEPS 8
-#define REDUCE_BUFCHUNKS 2
+// CHUNKSIZE must be a multiple of SLICESIZE
+//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLREDUCE_SLICESTEPS 4
+#define ALLREDUCE_CHUNKSTEPS 4
+#define ALLGATHER_SLICESTEPS 4
+#define ALLGATHER_CHUNKSTEPS 4
+#define REDUCESCATTER_SLICESTEPS 4
+#define REDUCESCATTER_CHUNKSTEPS 4
+#define BROADCAST_SLICESTEPS 1
+#define BROADCAST_CHUNKSTEPS 1
+#define REDUCE_SLICESTEPS 1
+#define REDUCE_CHUNKSTEPS 1

 #endif
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device

 LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu

-LIBOBJ     := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
-              $(OBJDIR)/functions.o
-
 LIBSRCFILES += functions.cu

 DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES := $(DEPFILES:%.d=%.dep)
+DEPENDFILES:= $(DEPFILES:%.d=%.dep)
 STATICLIB  := $(OBJDIR)/colldevice.a
 DEVOBJ     := $(OBJDIR)/devlink.o
+RULESFILE  := $(OBJDIR)/Makefile.rules

 NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"

@@ -33,6 +28,16 @@ all: $(STATICLIB)
 # Dummy rule so that the extra dependency (%.dep) files are preserved by make
 all_deps: $(DEPENDFILES)

+# Auto-generating the rules per op/reduction/datatype/algorithm
+$(RULESFILE) :
+	@printf "Generating %-35s > %s\n" rules $@
+	@mkdir -p $(OBJDIR)
+	@./gen_rules.sh $(OBJDIR) > $@
+
+-include $(RULESFILE)
+
+LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o
+
 -include $(DEPFILES)

 $(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
 	mkdir -p `dirname $@`
 	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

-$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
-
 # ... and create the device-side linked object with all those.
 $(DEVOBJ) : $(LIBOBJ)
 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,6 +11,4 @@

 #define UNROLL 4

-#if NCCL_OP == 0
 IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
-#endif
@@ -1,81 +1,44 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = 0;
-  int nextdirect = 0;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
  const ssize_t size = args->N;
  const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (LOAD(ptr) == nullptr);
-      sharedNextOutput = (T*)LOAD(ptr);
-      STORE(ptr, nullptr);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*realChunkSize;

    /////////////// begin AllGather steps ///////////////
    ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(realChunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
@@ -83,130 +46,53 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
    offset = chunkOffset + rankDest * size;

    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      Prims::Copy(tid, nthreads,
-          thisInput  + chunkOffset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directSend(thisInput+chunkOffset, offset, nelem);
    } else {
-      Prims::DoubleCopy(tid, nthreads,
-          thisInput  + chunkOffset,
-          thisOutput + offset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
    }

-    NEXT_STEP; // Increases step, poffset, noffset
-
    // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ring->devUserRanks[1];
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
    }
-  }

-  if (tid == 0) {
-    waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
-    STORE(ring->send.conn.head, 0ULL);
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
+    // Make final copy from buffer to dest.
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    // Final wait/copy.
+    prims.directRecv(thisOutput+offset, offset, nelem);
  }
 }

-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;

-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

  const ssize_t size = args->N;
  //const int rank = comm->rank;
  const int nranks = comm->nRanks;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    if (size-gridOffset < loopSize) {
@@ -216,57 +102,35 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {

    /////////////// begin AllGather steps ///////////////
    ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(chunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
    rankDest = ring->devUserRanks[0];
    offset = chunkOffset + rankDest * size;

-    WAIT_NEXT;
    if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LL::ReduceCopy(
-          thisInput  + chunkOffset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.send(thisInput+chunkOffset, nelem);
    } else {
-      LL::ReduceCopy(
-          thisInput  + chunkOffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
    }
-    POST_SIZE;
-
-    NEXT_STEP_LL;

    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput  + poffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvCopySend(thisOutput+offset, nelem);
    }

    // step k-1: final store
    rankDest = ring->devUserRanks[1];
    offset = chunkOffset + rankDest * size;

-    LL::ReduceCopy(
-        prevInput  + poffset,
-        thisOutput + offset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recv(thisOutput+offset, nelem);
  }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,12 +11,7 @@

 #define UNROLL 4

-#if NCCL_OP == 0
 IMPL_COLL2(ncclAllReduce, sum,  FuncSum,  ncclCollAllReduce, ncclSum);
-#elif NCCL_OP == 1
 IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
-#elif NCCL_OP == 2
 IMPL_COLL2(ncclAllReduce, min,  FuncMin,  ncclCollAllReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclAllReduce, max,  FuncMax,  ncclCollAllReduce, ncclMax);
-#endif
+IMPL_COLL2(ncclAllReduce, max,  FuncMax,  ncclCollAllReduce, ncclMax);
@@ -1,243 +1,181 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = 0;
-  int nextdirect = 0;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
  const ssize_t size = args->N;
-  //const int rank = comm->rank;
  const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (LOAD(ptr) == nullptr);
-      sharedNextOutput = (T*)LOAD(ptr);
-      STORE(ptr, nullptr);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+#ifdef ENABLE_PROFILING
+  auto devProf = comm->devProf;
+  uint64_t clk, t0 = 0ULL, ws, wr;
+  if (tid == 0) clk = clock64();
+#endif

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;

    /////////////// begin AllReduce steps ///////////////
    ssize_t offset;
-    int maxOffset;
+    int nelem;
    int slice;

    // step 0: push data to next GPU
    slice = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);

-    Prims::Copy(tid, nthreads,
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext,
-        postReadyToNext);
-
-    NEXT_STEP; // Increases step, poffset, noffset
+    INIT_COUNTER;
+    prims.send(thisInput+offset, nelem);
+    ACCUMULATE_COUNTER(send);

    // k-2 steps: reduce and copy to next GPU
    for (int j=2; j<nranks; ++j) {
      slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);

-      Prims::Reduce(tid, nthreads,
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
+      INIT_COUNTER;
+      prims.recvReduceSend(thisInput+offset, nelem);
+      ACCUMULATE_COUNTER(recvReduceSend);
    }

    // step k-1: reduce this buffer and data, which will produce the final
    // result that we store in this data and push to the next GPU
    slice = ring->devUserRanks[0];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);

-    Prims::ReduceCopy(tid, nthreads,
-        prevInput  + poffset,
-        thisInput  + offset,
-        nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-        thisOutput + offset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
+    INIT_COUNTER;
+    prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
+    ACCUMULATE_COUNTER(directRecvReduceCopySend);

    // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);

-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      INIT_COUNTER;
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+      ACCUMULATE_COUNTER(directRecvCopySend);
    }
-  }

-  if (tid == 0) {
-    // Wait for next to have consumed all data before we reset the flag
-    waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
-    STORE(ring->send.conn.head, 0ULL);
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);
+
+    // Final wait/copy.
+    INIT_COUNTER;
+    prims.directRecv(thisOutput+offset, offset, nelem);
+    ACCUMULATE_COUNTER(directRecv);
  }
+#ifdef ENABLE_PROFILING
+  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
+#endif
 }

-#include "ll_kernel.h"
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclTree* tree = &channel->tree;
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = args->lastChunkSize;
+  const ssize_t loopSize = args->nChannels*chunkSize;

-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  do {
+    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        prims.send(thisInput+offset, nelem);
+      } else {
+        prims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  } while(0);
+
+  do {
+    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        prims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        prims.recv(thisOutput+offset, nelem);
+      } else {
+        prims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  } while(0);
+}

 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;

-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

  const ssize_t size = args->N;
  //const int rank = comm->rank;
  const int nranks = comm->nRanks;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*nranks*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*nranks*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    if (size-gridOffset < loopSize) {
@@ -247,89 +185,100 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {

    /////////////// begin AllReduce steps ///////////////
    ssize_t offset;
-    int maxOffset;
+    int nelem;
    int slice;

    // step 0: push data to next GPU
    slice = ring->devUserRanks[nranks-1];
    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    nelem = min(chunkSize, size-offset);

-    WAIT_NEXT;
-    LL::ReduceCopy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        maxOffset, nflag, llNthreads);
-    POST_SIZE;
-
-    NEXT_STEP_LL;
+    LLprims.send(thisInput+offset, nelem);

    // k-2 steps: reduce and copy to next GPU
    for (int j=2; j<nranks; ++j) {
      slice = ring->devUserRanks[nranks-j];
      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
+      nelem = min(chunkSize, size-offset);

-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput  + offset,
-          prevInput  + poffset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvReduceSend(thisInput+offset, nelem);
    }

    // step k-1: reduce this buffer and data, which will produce the final
    // result that we store in this data and push to the next GPU
    slice = ring->devUserRanks[0];
    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    nelem = min(chunkSize, size-offset);

-    WAIT_NEXT;
-    LL::ReduceCopy(
-        thisInput  + offset,
-        prevInput  + poffset,
-        thisOutput + offset,
-        nextOutput + noffset,
-        maxOffset, pflag, nflag, llNthreads);
-    POST_SIZE;
-    ACK_PREV;
-
-    NEXT_STEP_LL;
+    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);

    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
-      slice = ring->devUserRanks[nranks - j];
+      slice = ring->devUserRanks[nranks-j];
      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
+      nelem = min(chunkSize, size-offset);

-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput + poffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvCopySend(thisOutput+offset, nelem);
    }

    // Make final copy from buffer to dest.
    slice = ring->devUserRanks[1];
    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    nelem = min(chunkSize, size-offset);

    // Here we need to copy from buffer to this output.
-    LL::ReduceCopy(
-        prevInput + poffset,
-        thisOutput + offset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recv(thisOutput+offset, nelem);
  }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
+}
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = args->nThreads;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclTree* tree = &channel->tree;
+  const ssize_t size = args->N;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t loopSize = args->nChannels*chunkSize;
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  do {
+    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        LLprims.send(thisInput+offset, nelem);
+      } else {
+        LLprims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  } while(0);
+
+  do {
+    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        LLprims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        LLprims.recv(thisOutput+offset, nelem);
+      } else {
+        LLprims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  } while(0);
 }
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 1
-#include "device/all_reduce.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 2
-#include "device/all_reduce.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 3
-#include "device/all_reduce.cu"
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,6 +11,4 @@

 #define UNROLL 4

-#if NCCL_OP == 0
-IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
-#endif
+IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
@@ -1,184 +1,101 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
+__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = 0;
-  int nextdirect = 0;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+  const int rank = ring->devUserRanks[0];
+  const int nextRank = ring->devUserRanks[1];
+  const int root = args->root;
+#ifdef ENABLE_PROFILING
+  auto devProf = comm->devProf;
+  uint64_t clk, t0 = 0ULL, ws, wr;
+  if (tid == 0) clk = clock64();
+#endif

-  WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
-  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->next_hdp_reg);
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;

-  typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
+  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t offset = gridOffset + bid*realChunkSize;
+    int nelem = min(realChunkSize, size-offset);
+
+    if (rank == root) {
+      if (thisInput == thisOutput) {
+        INIT_COUNTER;
+        prims.send(thisInput+offset, nelem);
+        ACCUMULATE_COUNTER(send);
+      } else {
+        INIT_COUNTER;
+        prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+        ACCUMULATE_COUNTER(copySend);
+      }
+    } else if (nextRank == root) {
+      INIT_COUNTER;
+      prims.recv(thisOutput+offset, nelem);
+      ACCUMULATE_COUNTER(recv);
+    } else {
+      INIT_COUNTER;
+      prims.recvCopySend(thisOutput+offset, nelem);
+      ACCUMULATE_COUNTER(recvCopySend);
+    }
+  }
+#ifdef ENABLE_PROFILING
+  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
+#endif
+}
+
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

  const ssize_t size = args->N;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
  const int rank = ring->devUserRanks[0];
  const int nextRank = ring->devUserRanks[1];
  const int root = args->root;

-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    if (nextRank != root) {
-      // Wait for next to be ready
-      WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-      waitOpCountNext.wait(args->opCount);
-    }
-    if (rank != root && prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextRank != root && nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (LOAD(ptr) == nullptr);
-      sharedNextOutput = (T*)LOAD(ptr);
-      STORE(ptr, nullptr);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int boffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*chunkSize;
-    int maxOffset = min(chunkSize, size-offset);
-
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        Prims::Copy(tid, nthreads,
-            thisInput  + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext,
-            postReadyToNext);
-      } else {
-        Prims::DoubleCopy(tid, nthreads,
-            thisInput  + offset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext,
-            postReadyToNext);
-      }
-    } else if (nextRank == root) {
-      if (prevdirect) maxOffset = 0; // Only wait for signals
-      Prims::Copy(tid, nthreads,
-          prevInput  + boffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      if (prevdirect) {
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      } else {
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + boffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      }
-    }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  if (tid == 0) {
-    if (nextRank != root) {
-      // Wait for next to have consumed data before resetting the flag
-      waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
-      STORE(ring->send.conn.head, 0ULL);
-    }
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
-  }
-}
-
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  boffset += NCCL_LL_SLICE_LINES; \
-  if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
-  flag++; \
-  step++;
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
-  const int rank = comm->rank;
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->root;
-
-  typedef LLPrimitives<T, FUNC> LL;
-
-  const ssize_t size = args->N;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t flag = step + 1;
-  int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    if (size-gridOffset < loopSize) {
@@ -186,46 +103,21 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
    }
    ssize_t offset = gridOffset + bid*chunkSize;

-    int maxOffset = min(chunkSize, size-offset);
+    int nelem = min(chunkSize, size-offset);
    if (rank == root) {
-      WAIT_NEXT;
      if (thisInput == thisOutput) {
-        LL::ReduceCopy(
-            thisInput + offset,
-            nextOutput + boffset,
-            maxOffset, flag, llNthreads);
+        LLprims.send(thisInput+offset, nelem);
      } else {
-        LL::ReduceCopy(
-            thisInput + offset,
-            thisOutput + offset,
-            nextOutput + boffset,
-            maxOffset, flag, llNthreads);
+        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
      }
-      POST_SIZE;
-      NEXT_STEP_LL;
    } else if (nextRank == root) {
-      LL::ReduceCopy(
-          prevInput + boffset,
-          thisOutput + offset,
-          maxOffset, flag, llNthreads);
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recv(thisOutput + offset, nelem);
    } else {
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput + boffset,
-          thisOutput + offset,
-          nextOutput + boffset,
-          maxOffset, flag, flag, llNthreads);
-      POST_SIZE;
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recvCopySend(thisOutput + offset, nelem);
    }
  }
-
-  // We need everyone to acknowledge data even if they didn't receive anything
-  // so that the next collective can start right away.
-  ACK_PREV;
-
-  FIFO_CLEANING_AND_SAVE_STEP(flag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 0
-#include "device/broadcast.cu"
@@ -1,5 +1,6 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -8,18 +9,38 @@
 #ifndef NCCL_DEVICE_COMMON_H_
 #define NCCL_DEVICE_COMMON_H_

-#include <hip/hip_runtime.h>
-
 #include "../collectives.h"
-#include "core.h"
+#include "devcomm.h"
 #include "nccl.h"
-
 #include <type_traits>

-typedef void(*ncclKern_t)(struct CollectiveArgs* args);
-#define NCCL_FUNC4(coll, op, dtype) \
+// Exit If Abort Barrier across CTA: make sure all threads exit consistently
+// Each thread sets a predicate to true if abort == 1
+// all CTA's threads enter the barrier and do a popc on their predicates being True
+// If any of the thread's predicate was True, all the threads call exit()
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define exitIfAbortBarrier(abort, abortCount) \
+  if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
+  __syncthreads(); \
+  if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
+#else
+static inline __device__ void exitIfAbortBarrier(int abort) {
+  uint32_t popc;
+  asm ("{");
+  asm volatile ("   .reg .pred barr_pred;");
+  asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+  asm volatile ("   bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
+  asm ("}");
+  if (popc) { asm volatile ("exit;"); }
+}
+#endif
+
+#define NCCL_FUNC5(coll, op, dtype) \
  NCCL_COLL_NAME(coll, op, dtype), \
-  NCCL_COLL_NAME(coll##LL, op, dtype)  \
+  NCCL_COLL_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+  NCCL_FUNC5(coll##Ring, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -64,20 +85,13 @@ typedef void(*ncclKern_t)(struct CollectiveArgs* args);
  NCCL_FUNCS2A(ncclAllReduce) }

 // Must be consistent with the ncclFuncSet enum
-using ncclKern_t = void (*)(struct CollectiveArgs*);
+using ncclFunc_t = void (*)(struct CollectiveArgs*);

-static const __device__ constexpr ncclKern_t ncclFuncs[]{
-#if defined(__HIP_DEVICE_COMPILE__)
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
-#endif
+static const __device__ constexpr ncclFunc_t ncclFuncs[]{
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
-#if __CUDA_ARCH__
+#if defined(__HIP_DEVICE_COMPILE__)
  NCCL_FUNCS2B(ncclBroadcast),
  NCCL_FUNCS2A(ncclReduce),
  NCCL_FUNCS2B(ncclAllGather),
@@ -88,82 +102,89 @@ static const __device__ constexpr ncclKern_t ncclFuncs[]{

 template<unsigned short f, unsigned short l>
 struct Caller {
-  static
-  __device__ void call(ncclColl* const c) noexcept
+  static __device__ __host__
+  void call(ncclColl* const c) noexcept
  {
    constexpr unsigned short m = f + (l - f) / 2;

-    return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
+     return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
  }
 };

 template<unsigned short f>
 struct Caller<f, f + 1>{
-  static
-  __device__ void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
+  static __device__ __host__
+  void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
 };

 inline
 __device__
-void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept
-{
+void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
  if (c->funcIndex < 72) {
-    if (c->funcIndex % 2) ncclBroadcastLL_copy_i8(&c->args);
-    else ncclBroadcast_copy_i8(&c->args);
+    if (c->funcIndex % 2) ncclBroadcastRingLL_copy_i8(&c->args);
+    else ncclBroadcastRing_copy_i8(&c->args);
  }
  else if (c->funcIndex < 144) Caller<72, 144>::call(c);
  else if (c->funcIndex < 216) {
-    if (c->funcIndex % 2) ncclAllGatherLL_copy_i8(&c->args);
-    else ncclAllGather_copy_i8(&c->args);
+    if (c->funcIndex % 2) ncclAllGatherRingLL_copy_i8(&c->args);
+    else ncclAllGatherRing_copy_i8(&c->args);
  }
  else Caller<216, 360>::call(c);
 }

-static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
+static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
  int* d = (int*)dst;
  int* s = (int*)src;
-  __syncthreads();
+  // When aggregation is effective, if some threads have aborted inside the LL kernel,
+  // make sure the rest of the threads abort as well
+  exitIfAbortBarrier(0, abortCount);
  for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
  __syncthreads();
 }
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
-  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
+  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
  if (tid == 0) hostColl->active = 0;
 }

 /* Functions for aggregation case */
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
 __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
-  coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
+  coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
 }
+
+#if NCCL_OP == 0
 /* Kernels with the first operation inlined */
-#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
 __launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
 __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  int tid = threadIdx.x; \
  int bid = blockIdx.x; \
  __shared__ struct ncclColl localColl; \
+  __shared__ uint32_t abortCount; \
+  if (tid == 0) abortCount = 0; \
+  __syncthreads(); \
 \
-  struct ncclComm* comm = firstColl.args.comm; \
-  struct ncclRing* ring = comm->rings+bid; \
+  struct ncclDevComm* comm = firstColl.args.comm; \
+  struct ncclChannel* channel = comm->channels+bid; \
  struct ncclColl* c; \
+  channel->abortCount = &abortCount; \
  if (bid == 0) { \
    /* To optimize for latency, (only) the first operation is passed as argument.*/ \
    c = &firstColl; \
  } else { \
    c = &localColl; \
-    load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
+    load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
  } \
  while (1) { \
-    if (tid < c->nThreads) { \
+    if (tid < c->args.nThreads) { \
      if (c->funcIndex == fIndex) { \
-        coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
+        coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
      } else { \
        NCCL_CALL_FUNCTIONS(c); \
      } \
    } \
    int nextIndex = c->nextIndex; \
-    if (tid == 0) ring->collFifoHead = nextIndex; \
+    if (tid == 0) channel->collFifoHead = nextIndex; \
 \
    if (c->active == 2) { \
      return; \
@@ -171,15 +192,21 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
 \
    /* Load next collective operation*/ \
    c = &localColl; /* for bid 0 */ \
-    load_coll(c, ring->devCollectives+nextIndex, tid); \
+    load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
  } \
 }
+#else
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
+#endif
+
+// Only generate inline kernels for LL
+#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
+  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \

 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
-  IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
+  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0)

 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8) \
@@ -192,4 +219,6 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32) \
  IMPL_COLL3(coll, op, ncclFunc, f64, double,   ncclColl, ncclOp, ncclFloat64)

+#define COLL_UNROLL 2
+
 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -8,25 +8,25 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_

-#include "core.h"
+#include "devcomm.h"
 #include <cstdio>
 #include <cstdint>

-#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>

 // Define min for ssize_t
 static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }

 typedef uint64_t PackType;

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 template<class FUNC, typename T>
 struct MULTI {
-    __device__ PackType operator()(const PackType x, const PackType y) const
-    {
-        return FUNC()(x, y);
-    }
+  __device__ PackType operator()(const PackType x, const PackType y) const
+  {
+    return FUNC()(x, y);
+  }
 };

 #else
@@ -205,15 +205,7 @@ struct MULTI<FUNC, int64_t> {
  }
 };

-#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-
-#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
-  size_t ptrval = reinterpret_cast<size_t>(ptr);
-  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
+#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 template<typename T> inline __device__
 T vFetch(const volatile T* ptr) {
@@ -225,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
  *ptr = val;
 }

-#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
+#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
 template<> inline __device__
 half vFetch<half>(const volatile half* ptr) {
  half r;
@@ -251,26 +243,6 @@ void vStore<half>(volatile half* ptr, const half val) {
 }
 #endif

-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__attribute__((noinline))
-__device__ inline void ReduceCopy(
-    const int tid, const int nthreads,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1,
-    volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1, const int N) {
-  for (int idx = tid; idx < N; idx += nthreads) {
-    T val = vFetch(src0+idx);
-    if (TWO_INPUTS) {
-      val = FUNC()(val, vFetch(src1+idx));
-    }
-    vStore(dest0+idx, val);
-    if (TWO_OUTPUTS) {
-      vStore(dest1+idx, val);
-    }
-  }
-}
-
 typedef ulong2 Pack128;

 template<class FUNC, typename T>
@@ -281,8 +253,8 @@ struct MULTI128 {
  }
 };

-inline __device__ void Fetch128(Pack128& v, Pack128* p) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  v.x = p->x;
  v.y = p->y;
 #else
@@ -290,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, Pack128* p) {
 #endif
 }
 inline __device__ void Store128(Pack128* p, Pack128& v) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  p->x = v.x;
  p->y = v.y;
 #else
@@ -298,67 +270,104 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
 #endif
 }

-#define WARP_SIZE 32
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
-__attribute__((noinline))
-__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
-    Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
-    const int N) {
-  Pack128 t0[UNROLL];
-  Pack128 t1[UNROLL];
-  const Pack128* src0_end = src0 + N;
-  const int inc = nw * UNROLL * WARP_SIZE;
-  const int offset = w * UNROLL * WARP_SIZE + t;
-  src0 += offset;  if (TWO_INPUTS)  src1 += offset;
-  dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    const int offset, const int N) {
+  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+    T val = vFetch(srcs[0]+idx);
+    #pragma unroll
+    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));

-  while (src0 < src0_end) {
-#pragma unroll
-    for (int u = 0; u < UNROLL; ++u) {
-      Fetch128(t0[u], src0+u*WARP_SIZE);
-      if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
-    }
-#pragma unroll
-    for (int u = 0; u < UNROLL; ++u) {
-      if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
-      Store128(dest0+u*WARP_SIZE, t0[u]);
-      if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
-    }
-    src0 += inc;  if (TWO_INPUTS)  src1 += inc;
-    dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
+    #pragma unroll
+    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
  }
 }

-template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
-__attribute__((noinline))
-__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
-    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
-    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
+#define WARP_SIZE 64
+
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+    const int elemOffset, const int Npack) {
+  const int inc = nw * UNROLL * WARP_SIZE;
+  int offset = w * UNROLL * WARP_SIZE + t;
+
+  const Pack128* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
+  Pack128* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
+
+  while (offset < Npack) {
+    Pack128 vals[UNROLL];
+    // Load and reduce
+    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+    for (int i=1; i<MINSRCS; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+
+    // Store
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
+  }
+}
+
+template <typename T>
+__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
+
+// Try to limit consecutive load/stores to 8.
+// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
    int N) {
  int Nrem = N;
  if (Nrem <= 0) return;

-  int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
+  int alignDiff = 0;
+  int align = ptrAlign128(srcs[0]);
+  #pragma unroll
+  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));

-  // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
-  // If not, we'll just use the slow preamble path for the whole operation
-  bool alignable = (((AlignUp(src0,  alignof(Pack128)) == src0  + Npreamble)) &&
-          (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
-          (!HAS_SRC1  || (AlignUp(src1,  alignof(Pack128)) == src1  + Npreamble)));
-
-  if (!alignable) {
-    Npreamble = Nrem;
-  }
+  int Npreamble = alignDiff ? Nrem :
+    N < alignof(Pack128) ? N :
+    (alignof(Pack128) - align) % alignof(Pack128);

  // stage 1: preamble: handle any elements up to the point of everything coming
  // into alignment
-  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
-
-  Nrem -= Npreamble;
-  if (Nrem == 0) return;
-
-  dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
-  src0  += Npreamble; if (HAS_SRC1)  { src1  += Npreamble; }
+  if (Npreamble) {
+    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+    Nrem -= Npreamble;
+    if (Nrem == 0) return;
+  }
+  int offset = Npreamble;

  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
  // assuming the pointers we have are all 128-bit alignable.
@@ -366,35 +375,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
  int nw = nthreads / WARP_SIZE; // Number of warps
  int t = tid % WARP_SIZE;       // Thread (inside the warp)

-  const int PackFactor = sizeof(Pack128) / sizeof(T);
+  const int packFactor = sizeof(Pack128) / sizeof(T);

  // stage 2a: main loop
-  int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
-      * (UNROLL * nthreads); // round down
+  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+      * (AUTOUNROLL * WARP_SIZE); // round down
+  int Nelem2a = Npack2a * packFactor;

-  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
+  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);

-  int Ndone2a = Nalign2a * PackFactor;
-  Nrem -= Ndone2a;
+  Nrem -= Nelem2a;
  if (Nrem == 0) return;
-  dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
-  src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
+  offset += Nelem2a;

  // stage 2b: slightly less optimized for section when we don't have full
-  // UNROLLs
+  // unrolling

-  int Nalign2b = Nrem / PackFactor;
+  int Npack2b = Nrem / packFactor;
+  int Nelem2b = Npack2b * packFactor;

-  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
+  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);

-  int Ndone2b = Nalign2b * PackFactor;
-  Nrem -= Ndone2b;
+  Nrem -= Nelem2b;
  if (Nrem == 0) return;
-  dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
-  src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
+  offset += Nelem2b;

  // stage 2c: tail
-  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
+  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
 }

 #endif // COMMON_KERNEL_H_
@@ -1,15 +1,13 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "devcomm.h"
 #include "collectives.h"
 #include "common.h"

-
-
 // Workaround for https://reviews.llvm.org/D55580
 __device__ void ncclWorkaroundClangD55580() {}
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+dir=$1
+
+targets="GENOBJS := \\\\\n"
+
+for base in all_reduce all_gather broadcast reduce reduce_scatter; do
+  opn=0
+  for op in sum prod min max; do
+    dtn=0
+    for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do
+      echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
+      echo "	@printf \"Compiling  %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
+      echo "	mkdir -p ${dir}"
+      echo "	\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
+      echo ""
+      targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
+      dtn=$(($dtn + 1))
+    done
+    opn=$(($opn + 1))
+  done
+done
+echo -e "$targets"
@@ -1,186 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_LL_KERNEL_H_
-#define NCCL_LL_KERNEL_H_
-
-static __device__ __attribute__((noinline)) uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  using Vec = uint32_t __attribute__((ext_vector_type(4)));
-  Vec i4;
-  do {
-    asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
-      "s_waitcnt vmcnt(0)\n"
-      "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
-  } while (i4[1] != flag || i4[3] != flag);
-  uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
-  return val64;
-#else
-  uint32_t data1, flag1, data2, flag2;
-  do {
-    asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
-  } while ((flag1 != flag) || (flag2 != flag));
-  uint64_t val64 = data1 + (((uint64_t)data2) << 32);
-  return val64;
-#endif
-}
-
-static __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  using Vec = uint32_t __attribute__((ext_vector_type(4)));
-  Vec i4;
-  i4[0] = val & 0xffffffff;
-  i4[1] = flag;
-  i4[2] = (val >> 32);
-  i4[3] = flag;
-  asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
-    "s_waitcnt vmcnt(0)\n"
-    "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
-#else
-  asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
-#endif
-}
-
-// Using memcpy handles misaligned pointers.
-static __device__ uint64_t readAL(uint64_t* src) {
-  uint64_t val;
-  memcpy((char*)&val, (char*)src, sizeof(uint64_t));
-  return val;
-}
-static __device__ void storeAL(uint64_t* dst, uint64_t val) {
-  memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
-}
-
-template <typename T, class FUNC>
-class LLPrimitives {
- private:
-  template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
-  __attribute__((noinline))
-  static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    if (size <= 0) return;
-    size_t size64 = size * sizeof(T) / sizeof(uint64_t);
-    uint64_t* src1A = (uint64_t*)src1;
-    uint64_t* dst1A = (uint64_t*)dst1;
-    int offset = threadIdx.x;
-    // Do multiples of 64 bits
-#pragma unroll 1
-    for (; offset < size64; offset += nthreads) {
-      uint64_t val;
-      if (HAS_SRC1) {
-        val = readAL(src1A+offset);
-        if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
-      } else if (HAS_SRC2) {
-        val = readLL(src2+offset, iflag);
-      }
-      if (HAS_DST1) storeAL(dst1A+offset, val);
-      if (HAS_DST2) storeLL(dst2+offset, val, oflag);
-    }
-    // Finish last word
-    int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
-    int sizeRem = size - sizeDone;
-    if (threadIdx.x == 0 && sizeRem) {
-      const T* src1B = src1 + sizeDone;
-      T* dst1B = dst1 + sizeDone;
-
-      uint64_t lastVal;
-      T* vals = (T*)&lastVal;
-
-      if (HAS_SRC2) {
-        uint64_t lastVal2 = readLL(src2+size64, iflag);
-        T* src2B = (T*)&lastVal2;
-        for (int offset = 0; offset < sizeRem; offset++) {
-          vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
-        }
-      } else if (HAS_SRC1) {
-        for (int offset = 0; offset < sizeRem; offset++) {
-          vals[offset] = src1B[offset];
-        }
-      }
-      if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
-      if (HAS_DST1) {
-        for (int offset = 0; offset < sizeRem; offset++) {
-          dst1B[offset] = vals[offset];
-        }
-      }
-    }
-  }
- public:
-  static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
-    return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
-    return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
-  }
-};
-
-// Common macros
-
-#define STEP_TO_SLOT(step) \
-  (step % NCCL_LL_CHUNKS)
-
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define SYNC __syncthreads()
-#else
-#define SYNC asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads))
-#endif
-
-#define WAIT_NEXT \
-  if (tid == 0) { \
-    while (sendHead + NCCL_LL_CHUNKS <= step) { \
-      sendHead = LOAD(sendHeadPtr); \
-    } \
-  } \
-  SYNC;
-
-#define POST_SIZE \
-  if (tid == 0 && sizesFifo) { STORE(sizesFifo + step % NCCL_LL_CHUNKS, (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T))); }
-
-#define ACK_PREV \
-  SYNC; \
-  if (tid == 0) STORE(recvHeadPtr,step);
-
-#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
-  if (step > LOAD(&ring->send.conn.llLastCleaning) + NCCL_LL_CLEAN_FREQ) { \
-    /* Reset all flags */ \
-    static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
-    static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
-    const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
-    for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
-      prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
-    } \
-    __threadfence_system(); \
-    /* Restart from the same slot, only make sure sender waits for data to be reset */ \
-    step += NCCL_LL_CHUNKS; \
-    ACK_PREV; \
-    while (LOAD(sendHeadPtr) < step); \
-    { if (tid == 0) STORE(&ring->send.conn.llLastCleaning, step); }\
-  } \
-  STORE(&ring->send.conn.llStep, step); \
-} while (0);
-
-#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -10,229 +10,635 @@

 #include <type_traits>
 #include "reduce_kernel.h" // for reduction funcs
+#include "common.h"

+#define SPINS_BEFORE_CHECK_ABORT 1000000

-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the reptetion of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
- * corresponding substep by previous step) before executing the transfer.
- * After each substep is transfered, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
+// Unroll unconditionally the first send/recv since nsend/nrecv should be at
+// least 1 if SEND/RECV is set.
+#define FOR_SEND(func, ...) do { \
+  if (SEND) { \
+    /* Send to far first, then close */ \
+    for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \
+    func(0, ##__VA_ARGS__); \
+  } \
+} while (0)

-
-class WaitFlag {
-  volatile uint64_t * const flag;
-  const int shift;
- public:
-  __device__
-  WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__
-  void wait(uint64_t val) { while ((LOAD(flag) + shift) < val) /*SPIN*/; }
-};
-
-
-class PostFlag {
-  volatile uint64_t * const flag;
-  const int shift;
-  volatile int * const fifo;
-  const int fifo_size;
-  uint32_t * hdp_reg;
- public:
-  __device__
-  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
-    : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
-  // remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
-  __device__
-  void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
-  __device__
-  void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
-};
-
-
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__
-bool AnyAre() { return false; }
-
-template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
-  return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
-
-
-// Wait on all WaitFlags, ignore PostFlags
-__device__
-static void WaitOnFlags(uint64_t val) { }
-
-template <typename... TAIL_Ts> __device__
-static void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
-  flag.wait(val);
-  WaitOnFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__
-static void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
-  WaitOnFlags(val, tail...);
-}
-
-
-// Post all PostFlags, ignore WaitFlags
-__device__
-static void PostToFlags(uint64_t val) { }
-
-template <typename... TAIL_Ts> __device__
-static void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
-  PostToFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__
-static void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
-  flag.post(val);
-  PostToFlags(val, tail...);
-}
-
-
-// Post sizes for PostFlags, ignore WaitFlags
-__device__
-static void PostSizeToFlags(uint64_t step, int size) { }
-
-template <typename... TAIL_Ts> __device__
-static void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
-  PostSizeToFlags(step, size, tail...);
-}
-
-template <typename... TAIL_Ts> __device__
-static void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
-  flag.postSize(step, size);
-  PostSizeToFlags(step, size, tail...);
-}
-
-
-// Create pointer arithmetic syntax that doesn't break for std::nullptr_t
-template <typename Tptr> __device__
-static Tptr ptradd(Tptr ptr, int i) {
-  return ptr + i;
-}
-
-__device__
-static std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
-  return nullptr;
-}
-
-// use different unroll numbers for all primitives for best throughput
-#define COPY_UNROLL       4
-#define REDUCE_UNROLL     2
-#define DOUBLECOPY_UNROLL 2
-#define REDUCECOPY_UNROLL 2
+#define FOR_RECV(func, ...) do { \
+  if (RECV) { \
+    /* Recv from close first, then far */ \
+    func(0, ##__VA_ARGS__); \
+    for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \
+  } \
+} while (0)

 // Implementation of primitive types
-template <int, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
-class Primitives {
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+class ncclPrimitives {
 private:
-  template <int UNROLL,
-      typename SRC2_T, // either T* or std::nullptr_t
-      typename DST2_T, // either T* or std::nullptr_t
-      typename... SYNC_Ts> // either WaitFunc or PostFunc
-  static __device__ __attribute__((noinline)) void
-  GenericOp(const int tid, const int nthreads,
-      const T*     src1,
-      const SRC2_T src2,
-      T*     dst1,
-      DST2_T dst2,
-      int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
+  const int tid;
+  const int nthreads;
+  int nrecv = 0;
+  int nsend = 0;
+  const int stepSize;
+  struct ncclConnInfo* recvConn[NRECV];
+  struct ncclConnInfo* sendConn[NSEND];
+  volatile uint64_t* waitPtr;
+  uint64_t recvStep[NRECV];
+  uint64_t sendStep[NSEND];
+  uint64_t sendConnHead[NSEND];
+  const T* recvDirectBuff[NRECV];
+  T* sendDirectBuff[NSEND];
+  const T* recvBuff[NRECV];
+  T* sendBuff[NSEND];
+  struct ncclDevComm* comm;
+  uint32_t* abortCount;

-    enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
-    enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
-    static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
-        "src2 must be of type T* or std::nullptr_t");
-    static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
-        "dst2 must be of type T* or std::nullptr_t");
+  __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
+  __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
+  __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
+  __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }

-    using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
-
-    int sliceSize = len / SUBSTEPS;
-    int sliceOffset = 0;
-
-#pragma unroll 1
-    for (int sub=0; sub<SUBSTEPS; ++sub) {
-      int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
-      if (tid < nthreads) {
-        if (AnyAre<WaitFlag>(flags...)) {
-          if (tid == 0) {
-            WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
-          }
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-          __syncthreads();
+  __device__ void barrier() {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    __syncthreads();
 #else
-          asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
 #endif
-        }
-        ReduceOrCopy
-        <
-        UNROLL,
-        OpType,
-        T,
-        !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
-        !std::is_same<SRC2_T, std::nullptr_t>::value  // HAS_SRC1
-        >
-        (
-            tid, nthreads,
-            ptradd(dst1, sliceOffset),
-            ptradd(dst2, sliceOffset),
-            ptradd(src1, sliceOffset),
-            ptradd(src2, sliceOffset),
-            realSize
-        );
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-          if(tid == 0)
-            PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
-          __threadfence_system();
-          if(tid == 0)
-            PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+  }
+
+  uint32_t mismatch = 0;
+  const uint64_t opCount;
+
+  __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+    if (mismatch) {
+      // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
+      STORE(comm->fatalDevError, ncclDevAssertedMismatch);
+    } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0;
+  uint32_t abort = 0;
+
+  __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+    spins++;
+    if (spins == SPINS_BEFORE_CHECK_ABORT) {
+      abort = LOAD(comm->abortFlag);
+      checkMismatch(remoteOpCount);
+      spins = 0;
+    }
+    return abort;
+  }
+
+  __device__ void waitRecv(int i) {
+    spins = 0;
+    mismatch = 0;
+    recvStep[i] += SLICESTEPS;
+    if (tid == i) {
+#ifdef ENABLE_PROFILING
+      auto devProf = comm->devProf;
+      uint64_t t0 = clock64();
+#endif
+      while (LOAD(waitPtr) < recvStep[i]) {
+        if (checkAbort(recvConn[i]->opCountRem)) break;
+      }
+#ifdef ENABLE_PROFILING
+      __atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
+#endif
+    }
+  }
+
+  __device__ void waitSend(int i) {
+    spins = 0;
+    mismatch = 0;
+    sendStep[i] += SLICESTEPS;
+    if (tid == WARP_SIZE+i) {
+#ifdef ENABLE_PROFILING
+      auto devProf = comm->devProf;
+      uint64_t t0 = clock64();
+#endif
+      while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
+        sendConnHead[i] = LOAD(waitPtr);
+        if (checkAbort(sendConn[i]->opCountRem)) break;
+      }
+#ifdef ENABLE_PROFILING
+      __atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
+#endif
+    }
+  }
+
+  inline __device__ void postRecv(int i) {
+    STORE(recvConn[i]->head, recvStep[i]);
+  }
+
+  inline __device__ void postSend(int i) {
+    if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1);
+    STORE(sendConn[i]->tail, sendStep[i]);
+  }
+
+  __device__ void postSendSize(int i, int size) {
+    if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size);
+  }
+
+  template <int DIRECTRECV>
+  __device__ const T* directRecvPtr(int i, int directOffset) {
+    return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
+  }
+
+  template <int DIRECTSEND>
+  __device__ T* directSendPtr(int i, int directOffset) {
+    return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
+  }
+
+  template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
+  __device__ void
+  GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
+    int offset = 0;
+    int sliceSize = stepSize * SLICESTEPS;
+
+    const T* srcs[RECV*NRECV+SRC];
+    srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
+    if (RECV) {
+      if (SRC) srcs[1] = recvPtr(0);
+      for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
+    }
+
+    T* dsts[SEND*NSEND+DST];
+    dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
+    if (SEND) {
+      if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
+      for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
+    }
+
+    #pragma unroll 1
+    for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
+      int realSize = max(0, min(sliceSize, nelem-offset));
+      FOR_SEND(waitSend);
+      FOR_RECV(waitRecv);
+      if (realSize > 0) {
+        barrier();
+        if (DIRECTRECV && recvDirectBuff[0]) {
+          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
+          if (SEND) {
+            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+          }
+        } else {
+          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
        }
      }
-      sliceOffset += sliceSize;
+      exitIfAbortBarrier(abort, abortCount);
+      if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T));
+      if (SEND) __threadfence_system();
+      if (tid == 0) FOR_SEND(postSend);
+      if (tid == 0) FOR_RECV(postRecv);
+    }
+    for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
+    for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
+    offset += sliceSize;
+  }
+
+  __device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+    recvConn[i] = conn;
+    recvBuff[i] = (const T*)LOAD(&recvConn[i]->buff);
+    recvStep[i] = LOAD(&recvConn[i]->step);
+    recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
+    // Return credits in case we rounded up.
+    if (tid == 0) STORE(recvConn[i]->head, recvStep[i]);
+    if (tid == i) {
+      waitPtr = LOAD(&recvConn[i]->tail);
+      STORE(recvConn[i]->opCountLoc, opCount);
+    }
+    recvDirectBuff[i] = NULL;
+    if (directBuff && recvConn[i]->direct) {
+      recvDirectBuff[i] = directBuff;
+      if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff);
+    }
+    nrecv++;
+  }
+
+  __device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+    sendConn[i] = conn;
+    sendBuff[i] = (T*)LOAD(&sendConn[i]->buff);
+    sendStep[i] = LOAD(&sendConn[i]->step);
+    sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
+    if (tid == WARP_SIZE+i) {
+      waitPtr = LOAD(&sendConn[i]->head);
+      sendConnHead[i] = LOAD(waitPtr);
+      STORE(sendConn[i]->opCountLoc, opCount);
+    }
+    sendDirectBuff[i] = NULL;
+    if (directBuff && sendConn[i]->direct) {
+      void* volatile* ptr = sendConn[i]->ptrExchange;
+      while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
+      __syncthreads();
+      if (tid == 0) STORE(ptr, NULL);
+    }
+    nsend++;
+  }
+
+  __device__ void saveRecvConn(int i) {
+    if (tid == i) {
+      STORE(&recvConn[i]->step, recvStep[i]);
+      __threadfence_system();
+      __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
+    }
+  }
+
+  __device__ void saveSendConn(int i) {
+    if (tid == WARP_SIZE+i) {
+      STORE(&sendConn[i]->step, sendStep[i]);
+      __threadfence_system();
+      __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
    }
  }

 public:
-  template <typename... SYNC_Ts>
-  static __device__ void
-  Copy(const int tid, const int nthreads, const T* src, T* dst,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<COPY_UNROLL>(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
+  __device__
+  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
+    // Make sure step is updated before we read it
+    abortCount = channel->abortCount;
+    __syncthreads();
+
+    // disable directBuff
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
  }

-  template <typename... SYNC_Ts>
-  static __device__ void
-  DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<DOUBLECOPY_UNROLL>(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
+  __device__ void
+  send(const T* src, int nelem) {
+    GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
+  }
+  __device__ void
+  directSend(const T* src, int directOffset, int nelem) {
+    GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
  }

-  template <typename... SYNC_Ts>
-  static __device__ void
-  Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<REDUCE_UNROLL>(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
+  __device__ void
+  recv(T* dst, int nelem) {
+    GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
+  }
+  __device__ void
+  directRecv(T* dst, int directOffset, int nelem) {
+    GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
  }

-  template <typename... SYNC_Ts>
-  static __device__ void
-  ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<REDUCECOPY_UNROLL>(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
+  __device__ void
+  copySend(const T* src, T* dst, int nelem) {
+    GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
+  }
+  __device__ void
+  directCopySend(const T* src, T* dst, int directOffset, int nelem) {
+    GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
+  }
+
+  __device__ void
+  recvCopySend(T* dst, int nelem) {
+    GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
+  }
+  __device__ void
+  directRecvCopySend(T* dst, int directOffset, int nelem) {
+    GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
+  }
+
+  __device__ void
+  recvReduceCopy(const T* src, T* dst, int nelem) {
+    GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
+  }
+
+  __device__ void
+  recvReduceSend(const T* src, int nelem) {
+    GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
+  }
+
+  __device__ void
+  recvReduceCopySend(const T* src, T* dst, int nelem) {
+    GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
+  }
+  __device__ void
+  directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
+    // Direct is only for the send part
+    GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
+  }
+
+  __device__ ~ncclPrimitives() {
+    // Save steps for next collective. Have thread 0 do it to be compatible
+    // with the way LL works.
+    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
  }
 };

-#endif // end include guard
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLLPrimitives {
+ private:
+  const int tid;
+  const int nthreads;
+  int nrecv = 0;
+  int nsend = 0;
+  struct ncclConnInfo* recvConn[NRECV];
+  struct ncclConnInfo* sendConn[NSEND];
+  volatile uint64_t* waitPtr;
+  volatile uint64_t* postPtr;
+  volatile int* fifoPtr;
+  uint64_t recvStep[NRECV];
+  uint64_t sendStep[NSEND];
+  uint64_t sendConnHead;
+  union ncclLLFifoLine* recvBuff[NRECV];
+  union ncclLLFifoLine* sendBuff[NSEND];
+  struct ncclDevComm* comm;
+  uint32_t* abortCount;
+
+  __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+  __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+  __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+  __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
+
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
+  // Exit If Abort Barrier : make sure all threads exit consistently
+  // Each thread sets a predicate to true if val == 1
+  // all CTA's threads enter the barrier and do a popc on their predicates being True
+  // If any of the thread's predicate was True, all the threads call exit()
+  __device__ void exitIfAbortLocalBarrier() {
+    uint32_t popc;
+    asm ("{");
+    asm volatile ("   .reg .pred barr_pred;");
+    asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+    asm volatile ("   bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
+    asm ("}");
+    if (popc) {
+      // Make sure threads not participating in the operation get the abort and all threads exit
+      exitIfAbortBarrier(1);
+    }
+  }
+#endif
+
+  __device__ void barrier() {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    __syncthreads();
+#else
+    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+#endif
+  }
+
+  uint32_t mismatch = 0;
+  const uint64_t opCount;
+
+  __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+    if (mismatch > 20) {
+      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
+      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
+      STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
+    } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0;
+  uint32_t abort = 0;
+
+  __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+    spins++;
+    if (spins == SPINS_BEFORE_CHECK_ABORT) {
+      abort = LOAD(comm->abortFlag);
+      checkMismatch(remoteOpCount);
+      spins = 0;
+    }
+    return abort;
+  }
+
+  __device__ void waitSend(int i, int nbytes) {
+    spins = 0;
+    mismatch = 0;
+    if (tid == WARP_SIZE+i) {
+      while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
+        sendConnHead = LOAD(waitPtr);
+        if (checkAbort(sendConn[i]->opCountRem)) break;
+      }
+      if (fifoPtr) {
+        int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+        STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);
+      }
+    }
+  }
+
+  __device__ void postRecv(int i) {
+    recvStep[i]++;
+    if (tid == i) STORE(postPtr, recvStep[i]);
+  }
+
+  __device__ void postSend(int i, int offset) {
+    // LL Cleanup : write all flags in the slice to make sure we don't have
+    // data corruption when flag loops over.
+    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+    }
+    sendStep[i]++;
+  }
+
+  __device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
+    union ncclLLFifoLine* src = recvPtr(i) + offset;
+    uint32_t flag = recvFlag(i);
+    uint32_t data1, flag1, data2, flag2;
+    spins = 0;
+    mismatch = 0;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    using Vec = uint32_t __attribute__((ext_vector_type(4)));
+    Vec i4;
+    do {
+      asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
+        "s_waitcnt vmcnt(0)\n"
+        "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
+      if (i4[1] == flag && i4[3] == flag) break;
+    } while (!checkAbort(recvConn[i]->opCountRem));
+    uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
+#else
+    do {
+      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+      if (checkAbort(recvConn[i]->opCountRem)) break;
+    } while ((flag1 != flag) || (flag2 != flag));
+    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+#endif
+    return val64;
+  }
+
+  __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  using Vec = uint32_t __attribute__((ext_vector_type(4)));
+  Vec i4;
+  i4[0] = val & 0xffffffff;
+  i4[1] = flag;
+  i4[2] = (val >> 32);
+  i4[3] = flag;
+  asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
+    "s_waitcnt vmcnt(0)\n"
+    "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
+#else
+    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+#endif
+  }
+
+  // Using memcpy handles misaligned pointers.
+  __device__ uint64_t readAL(uint64_t* src) {
+    uint64_t val;
+    memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+    return val;
+  }
+
+  __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
+    memcpy((char*)dst, (char*)&val, nbytes);
+  }
+
+  template <int RECV, int SEND, int SRC, int DST>
+  __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+    uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
+    FOR_SEND(waitSend, nbytes*2);
+    barrier();
+    uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
+    uint64_t* srcPack = (uint64_t*)srcPtr;
+    uint64_t* dstPack = (uint64_t*)dstPtr;
+    int offset = tid;
+    // Do multiples of 64 bits
+    #pragma unroll 1
+    for (; offset<npack; offset+=nthreads) {
+      // Recv : local, then intra-node, then inter-node
+      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
+      if (RECV) {
+        if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
+        for (int i=1; i<NRECV && i<nrecv; i++) {
+          val = MULTI<FUNC, T>()(readLL(i, offset), val);
+        }
+      }
+
+      // Send : inter-node, then intra-node, then local
+      if (SEND) {
+        for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+        storeLL(sendPtr(0)+offset, val, sendFlag(0));
+      }
+      if (DST) {
+        if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
+          // Last incomplete word
+          storeAL(dstPack+offset, val, nbytes & 0x7);
+        } else {
+          storeAL(dstPack+offset, val, sizeof(uint64_t));
+        }
+      }
+    }
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    exitIfAbortBarrier(abort, abortCount);
+#else
+    exitIfAbortLocalBarrier();
+#endif
+    FOR_RECV(postRecv);
+    FOR_SEND(postSend, offset);
+  }
+
+  __device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+    recvConn[i] = conn;
+    recvBuff[i] = recvConn[i]->llBuff;
+    recvStep[i] = recvConn[i]->step;
+    if (tid == i) {
+      postPtr = recvConn[i]->head;
+      STORE(recvConn[i]->opCountLoc, opCount);
+    }
+    nrecv++;
+  }
+
+  __device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+    sendConn[i] = conn;
+    sendBuff[i] = sendConn[i]->llBuff;
+    sendStep[i] = sendConn[i]->step;
+    if (tid == WARP_SIZE+i) {
+      waitPtr = sendConn[i]->head;
+      fifoPtr = sendConn[i]->fifo;
+      sendConnHead = LOAD(waitPtr);
+      STORE(sendConn[i]->opCountLoc, opCount);
+    }
+    nsend++;
+  }
+
+  __device__ void saveRecvConn(int i) {
+    if (tid == i) {
+      recvConn[i]->step = recvStep[i];
+      __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
+      __threadfence_block();
+    }
+  }
+
+  __device__ void saveSendConn(int i) {
+    if (tid == WARP_SIZE+i) {
+      sendConn[i]->step = sendStep[i];
+      __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
+      __threadfence_block();
+    }
+  }
+
+ public:
+  __device__
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
+    // Make sure step is updated before we read it.
+    abortCount = channel->abortCount;
+    barrier();
+
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+  }
+
+  __device__ void send(const T* src, int nelem) {
+    return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recv(T* dst, int nelem) {
+    return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceSend(const T* src, int nelem) {
+    return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+    return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void copySend(const T* src, T* dst, int nelem) {
+    return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void recvCopySend(T* dst, int nelem) {
+    return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ ~ncclLLPrimitives() {
+    // Save steps for the next operation
+    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+  }
+};
+
+#ifdef ENABLE_PROFILING
+#define INIT_COUNTER \
+  if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
+    wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }
+
+#define ACCUMULATE_COUNTER(prim) \
+  if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
+    + ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
+    + wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
+    __ATOMIC_SEQ_CST); \
+    __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
+#else
+#define INIT_COUNTER
+#define ACCUMULATE_COUNTER(prim)
+#endif
+
+#endif
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,12 +11,7 @@

 #define UNROLL 4

-#if NCCL_OP == 0
 IMPL_COLL2(ncclReduce, sum,  FuncSum,  ncclCollReduce, ncclSum);
-#elif NCCL_OP == 1
 IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
-#elif NCCL_OP == 2
 IMPL_COLL2(ncclReduce, min,  FuncMin,  ncclCollReduce, ncclMin);
-#elif NCCL_OP == 3
 IMPL_COLL2(ncclReduce, max,  FuncMax,  ncclCollReduce, ncclMax);
-#endif
@@ -1,153 +1,82 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
-  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
  const ssize_t size = args->N;
  const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
  const int rank = ring->devUserRanks[0];
  const int prevRank = ring->devUserRanks[nranks-1];
  const int root = args->root;

-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-
-    if (rank != root) {
-      // Wait for next to be ready
-      WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-      waitOpCountNext.wait(args->opCount);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int boffset = 0;
-
  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*chunkSize;
-    int maxOffset = min(chunkSize, size-offset);
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t offset = gridOffset + bid*realChunkSize;
+    int nelem = min(realChunkSize, size-offset);
    if (prevRank == root) {
-      Prims::Copy(tid, nthreads,
-          thisInput + offset,
-          nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.send(thisInput+offset, nelem);
    } else if (rank == root) {
-      Prims::Reduce(tid, nthreads,
-          prevInput  + boffset,
-          thisInput + offset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
    } else {
-      Prims::Reduce(tid, nthreads,
-          prevInput + boffset,
-          thisInput + offset,
-          nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
+      prims.recvReduceSend(thisInput+offset, nelem);
    }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  if (tid == 0) {
-    if (rank != root) {
-      // Wait for next to have consumed data before resetting the flag
-      waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
-      STORE(ring->send.conn.head, 0ULL);
-    }
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
  }
 }

-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  boffset += NCCL_LL_SLICE_LINES; \
-  if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
-  flag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
-  const int nranks = comm->nRanks;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t size = args->N;
  const int rank = comm->rank;
+  const int nranks = comm->nRanks;
  const int prevRank = ring->devUserRanks[nranks-1];
  const int root = args->root;

-  typedef LLPrimitives<T, FUNC> LL;
-
-  const ssize_t size = args->N;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t flag = step + 1;
-  int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    if (size-gridOffset < loopSize) {
@@ -155,39 +84,17 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
    }
    ssize_t offset = gridOffset + bid*chunkSize;

-    int maxOffset = min(chunkSize, size-offset);
+    int nelem = min(chunkSize, size-offset);
    if (prevRank == root) {
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput + offset,
-          nextOutput + boffset,
-          maxOffset, flag, llNthreads);
-      POST_SIZE;
-      NEXT_STEP_LL;
+      LLprims.send(thisInput+offset, nelem);
    } else if (rank == root) {
-      LL::ReduceCopy(
-          thisInput + offset,
-          prevInput  + boffset,
-          thisOutput + offset,
-          maxOffset, flag, llNthreads);
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
    } else {
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput + offset,
-          prevInput + boffset,
-          nextOutput + boffset,
-          maxOffset, flag, flag, llNthreads);
-      POST_SIZE;
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recvReduceSend(thisInput+offset, nelem);
    }
  }
-
-  // We need everyone to acknowledge data even if they didn't receive anything
-  // so that the next collective can start right away.
-  ACK_PREV;
-
-  FIFO_CLEANING_AND_SAVE_STEP(flag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 0
-#include "device/reduce.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 1
-#include "device/reduce.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 2
-#include "device/reduce.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 3
-#include "device/reduce.cu"
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -19,7 +19,7 @@ struct FuncNull {
  }
 };

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 //we really don't need any specializations and we don't need
 //to break things into uint32_t
@@ -164,30 +164,31 @@ struct FuncMin {
  }
 };

+#define MASK0 0x00ff00ff
+#define MASK1 0xff00ff00
+static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
+  /* This can be used both for signed and unsigned 8-bit addition */
+  const uint32_t x0 = x & MASK0;
+  const uint32_t x1 = x & MASK1;
+  const uint32_t y0 = y & MASK0;
+  const uint32_t y1 = y & MASK1;
+  const uint32_t r0 = (x0+y0);
+  const uint32_t r1 = (x1+y1);
+  return (r0 & MASK0) | (r1 & MASK1);
+}
+
 template<>
 struct FuncSum<int8_t> {
-  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vadd.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = cx.a.x + cy.a.x;
-    cr.a.y = cx.a.y + cy.a.y;
-    cr.a.z = cx.a.z + cy.a.z;
-    cr.a.w = cx.a.w + cy.a.w;
-    return cr.storage;
+    return addChar4(x, y);
+#endif
 #endif
  }
  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -196,28 +197,16 @@ struct FuncSum<int8_t> {
 };
 template<>
 struct FuncSum<uint8_t> {
-  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vadd.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-        "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = cx.a.x + cy.a.x;
-    cr.a.y = cx.a.y + cy.a.y;
-    cr.a.z = cx.a.z + cy.a.z;
-    cr.a.w = cx.a.w + cy.a.w;
-    return cr.storage;
+    return addChar4(x, y);
+#endif
 #endif
  }
  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -227,22 +216,6 @@ struct FuncSum<uint8_t> {

 static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
  /* This can be used both for signed and unsigned 8-bit multiplication */
-#if (__CUDA_ARCH__ >= 300)
-  uint32_t rv;
-  asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
-      " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
-      " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
-      " shl.b32          t3, t3, 16;\n\t"
-      " shl.b32          t2, t2, 16;\n\t"
-      " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
-      " shl.b32          t1, t1, 8;\n\t"
-      " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
-      " and.b32          t1, t1, 0xff00ff00;\n\t"
-      " and.b32          t0, t0, 0x00ff00ff;\n\t"
-      " or.b32           %0,  t0, t1;\n\t"
-      "}" : "=r"(rv) : "r"(x), "r"(y));
-  return rv;
-#else
  union converter { uint32_t storage; char4 a; };
  converter cx, cy, cr;
  cx.storage = x;
@@ -252,7 +225,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
  cr.a.z = cx.a.z * cy.a.z;
  cr.a.w = cx.a.w * cy.a.w;
  return cr.storage;
-#endif
 }

 template<>
@@ -278,17 +250,12 @@ template<>
 struct FuncMax<int8_t> {
  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmax.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
    converter cx, cy, cr;
    cx.storage = x;
@@ -298,6 +265,7 @@ struct FuncMax<int8_t> {
    cr.a.z = max(cx.a.z, cy.a.z);
    cr.a.w = max(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -308,17 +276,12 @@ template<>
 struct FuncMax<uint8_t> {
  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmax.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
    converter cx, cy, cr;
    cx.storage = x;
@@ -328,6 +291,7 @@ struct FuncMax<uint8_t> {
    cr.a.z = max(cx.a.z, cy.a.z);
    cr.a.w = max(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -339,17 +303,12 @@ template<>
 struct FuncMin<int8_t> {
  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmin.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
    converter cx, cy, cr;
    cx.storage = x;
@@ -359,6 +318,7 @@ struct FuncMin<int8_t> {
    cr.a.z = min(cx.a.z, cy.a.z);
    cr.a.w = min(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -369,17 +329,12 @@ template<>
 struct FuncMin<uint8_t> {
  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmin.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
    converter cx, cy, cr;
    cx.storage = x;
@@ -389,6 +344,7 @@ struct FuncMin<uint8_t> {
    cr.a.z = min(cx.a.z, cy.a.z);
    cr.a.w = min(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -480,6 +436,6 @@ struct FuncMin<half> {
  }
 };

-#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 #endif // REDUCE_KERNEL_H_
@@ -11,12 +11,7 @@

 #define UNROLL 4

-#if NCCL_OP == 0
 IMPL_COLL2(ncclReduceScatter, sum,  FuncSum,  ncclCollReduceScatter, ncclSum);
-#elif NCCL_OP == 1
 IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
-#elif NCCL_OP == 2
 IMPL_COLL2(ncclReduceScatter, min,  FuncMin,  ncclCollReduceScatter, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclReduceScatter, max,  FuncMax,  ncclCollReduceScatter, ncclMax);
-#endif
+IMPL_COLL2(ncclReduceScatter, max,  FuncMax,  ncclCollReduceScatter, ncclMax);
@@ -1,166 +1,93 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
  const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
  const ssize_t size = args->N;
  const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*realChunkSize;

    /////////////// begin ReduceScatter steps ///////////////
    ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(realChunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
    rankDest = ring->devUserRanks[nranks-1];
    offset = chunkOffset + rankDest * size;

-    Prims::Copy(tid, nthreads,
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext,
-        postReadyToNext);
-
-    NEXT_STEP; // Increases step, poffset, noffset
+    prims.send(thisInput+offset, nelem);

    // k-2 steps: reduce and copy to next GPU
    for (int j=2; j<nranks; ++j) {
      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

-      Prims::Reduce(tid, nthreads,
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
+      prims.recvReduceSend(thisInput+offset, nelem);
    }

-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
+    // step k-1: reduce this buffer and data, which will produce the final result
    rankDest = ring->devUserRanks[0];
    offset = chunkOffset + rankDest * size;

-    Prims::Reduce(tid, nthreads,
-        prevInput  + poffset,
-        thisInput  + offset,
-        thisOutput + chunkOffset,
-        sliceSize, maxOffset,
-        step,
-        waitReadyFromPrev,
-        postDoneToPrev);
-  }
-
-  if (tid == 0) {
-    waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
-    STORE(ring->send.conn.head, 0ULL);
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
+    prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
  }
 }

-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;

-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

  const ssize_t size = args->N;
  //const int rank = comm->rank;
  const int nranks = comm->nRanks;
  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
    if (size-gridOffset < loopSize) {
@@ -170,37 +97,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {

    /////////////// begin ReduceScatter steps ///////////////
    ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(chunkSize, size-chunkOffset);
    int rankDest;

    // step 0: push data to next GPU
    rankDest = ring->devUserRanks[nranks-1];
    offset = chunkOffset + rankDest * size;

-    WAIT_NEXT;
-    LL::ReduceCopy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        maxOffset, nflag, llNthreads);
-    POST_SIZE;
-
-    NEXT_STEP_LL;
+    LLprims.send(thisInput+offset, nelem);

    // k-2 steps: reduce and copy to next GPU
    for (int j=2; j<nranks; ++j) {
      rankDest = ring->devUserRanks[nranks-j];
      offset = chunkOffset + rankDest * size;

-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput  + offset,
-          prevInput  + poffset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvReduceSend(thisInput+offset, nelem);
    }

    // step k-1: reduce this buffer and data, which will produce the final
@@ -208,13 +119,10 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
    rankDest = ring->devUserRanks[0];
    offset = chunkOffset + rankDest * size;

-    LL::ReduceCopy(
-        thisInput  + offset,
-        prevInput  + poffset,
-        thisOutput + chunkOffset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
  }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 0
-#include "device/reduce_scatter.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 1
-#include "device/reduce_scatter.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 2
-#include "device/reduce_scatter.cu"
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 3
-#include "device/reduce_scatter.cu"
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+  struct ncclInfo info = { ncclCollReduce, "Reduce",
+    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
+    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
-    NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
-  }
-
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
-          op, root, comm, stream);
-}
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
+  struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
+    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
+    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
+}
@@ -1,32 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
-          op, 0, comm, stream);
-}
@@ -0,0 +1,441 @@
+/*************************************************************************
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "checks.h"
+#include "param.h"
+
+#include "collectives/collectives.h"
+
+// Only generate inline kernels for LL
+#define NCCL_FUNC5(coll, op, dtype) \
+  NCCL_KERN_NAME(coll##LL, op, dtype), \
+  NCCL_KERN_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+  NCCL_FUNC5(coll##Ring, op, dtype)
+
+// Must be consistent with ncclDataType_t
+#define NCCL_FUNCS3A(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  u8), \
+  NCCL_FUNC4(coll, op, i32), \
+  NCCL_FUNC4(coll, op, u32), \
+  NCCL_FUNC4(coll, op, i64), \
+  NCCL_FUNC4(coll, op, u64), \
+  NCCL_FUNC4(coll, op, f16), \
+  NCCL_FUNC4(coll, op, f32), \
+  NCCL_FUNC4(coll, op, f64)
+#define NCCL_FUNCS3B(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8)
+
+// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
+#define NCCL_FUNCS2A(coll) \
+  NCCL_FUNCS3A(coll, sum), \
+  NCCL_FUNCS3A(coll, sum), \
+  NCCL_FUNCS3A(coll, sum), \
+  NCCL_FUNCS3A(coll, sum)
+#define NCCL_FUNCS2B(coll) \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy)
+
+typedef void(*ncclKern_t)(struct ncclColl);
+// Must be consistent with the ncclFuncSet enum
+static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+  NCCL_FUNCS2B(ncclBroadcast),
+  NCCL_FUNCS2A(ncclReduce),
+  NCCL_FUNCS2B(ncclAllGather),
+  NCCL_FUNCS2A(ncclReduceScatter),
+  NCCL_FUNCS2A(ncclAllReduce)
+};
+
+/*****************************************************************************/
+/*       Launch system : synchronization and CUDA kernel launch              */
+/*****************************************************************************/
+
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
+  if (cgMode & 0x01) {
+    CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
+            // These flags are to reduce the latency of using this API
+            0));
+    return ncclSuccess;
+  }
+  int savedDev;
+  CUDACHECK(hipGetDevice(&savedDev));
+  for (int i = 0; i < numDevices; i++) {
+    hipLaunchParams* params = paramsList+i;
+    CUDACHECK(hipSetDevice(cudaDevs[i]));
+    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
+  }
+  CUDACHECK(hipSetDevice(savedDev));
+  return ncclSuccess;
+}
+
+ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
+  params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
+
+  // Set active = 2 for the last operation
+  for (int r=0; r<params->gridDim.x; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
+  }
+
+  // Find the first operation, choose the kernel accordingly and pass it
+  // as the first argument.
+  struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
+  memcpy(&comm->args, coll, sizeof(struct ncclColl));
+  // As we pass that coll directly, we can free it immediately.
+  STORE(&coll->active, 0);
+
+  params->func = (void *)ncclKerns[coll->funcIndex];
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  int val = LOAD(ptr);
+  bool done = false;
+  while (done == false) {
+    if (val >= comm->intraRanks) {
+      WARN("Trying to launch too many collectives");
+      return ncclInvalidUsage;
+    }
+    if (val+1 == comm->intraRanks) {
+      // Reset the barrier.
+      comm->intraBarrier[comm->intraPhase^1] = 0;
+      *isLast = 1;
+      return ncclSuccess;
+    }
+    done = __sync_bool_compare_and_swap(ptr, val, val+1);
+    val++;
+  }
+  *isLast = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  int val = LOAD(ptr);
+  if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
+    WARN("Trying to launch too many collectives");
+    return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  while (LOAD(ptr) < comm->intraRanks) pthread_yield();
+  comm->intraPhase ^= 1;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
+  if (comm->nRanks == 1) return ncclSuccess;
+  hipLaunchParams* params = comm->myParams;
+
+  NCCLCHECK(setupLaunch(comm, params));
+
+  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+    // Enqueue event in user stream
+    CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
+    // Create dependency between user stream and internal NCCL stream
+    CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+    params->stream = comm->groupStream;
+  } else {
+    if (comm->userStream != params->stream) {
+      // Stream changed from last call, create dependency against last NCCL kernel launch
+      CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+    }
+    params->stream = comm->userStream;
+  }
+
+  int isLast = 0;
+  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+
+  if (isLast) {
+    if (comm->launchMode == ncclComm::GROUP) {
+      // I'm the last. Launch all operations.
+      NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+    }
+    NCCLCHECK(ncclCpuBarrierLast(comm));
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
+  if (comm->nRanks == 1) return ncclSuccess;
+  // We can't print the CG mode before the first barrier happened.
+  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
+    *comm->intraCGMode ^= 0x10;
+    INFO(NCCL_INIT,"Launch mode %s%s%s",
+        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
+        *comm->intraCGMode ? "/CGMD" : "",
+        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
+  }
+
+  NCCLCHECK(ncclCpuBarrierOut(comm));
+
+  hipLaunchParams *params = comm->myParams;
+  if (comm->launchMode == ncclComm::PARALLEL) {
+    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
+  }
+  // Start the network proxies as soon as the kernel has been launched. We can't
+  // perform any CUDA call between the two or having a hipFree between the CUDA
+  // launch and the transportStartProxy call could cause a deadlock.
+  // Also, starting the proxies after the CUDA launch seems to be better for
+  // performance (latency).
+  for (int r=0; r<params->gridDim.x; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    channel->collStart = channel->collFifoTail;
+    channel->collCount = 0;
+  }
+  params->gridDim.x = params->blockDim.x = 0;
+  NCCLCHECK(transportStartProxy(comm));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
+  hipLaunchParams *params = comm->myParams;
+  // Enqueue event after NCCL kernel
+  CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
+  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+    // Create dependency between NCCL internal stream and user stream
+    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+  }
+  comm->userStreamSet = false;
+  return ncclSuccess;
+}
+
+/*****************************************************************************/
+/* Enqueueing system : computation of kernel and proxy operations parameters */
+/*****************************************************************************/
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) {
+  if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
+  else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
+  else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
+  else if (info->coll == ncclCollAllReduce) {
+    if (info->nBytes <= info->comm->treeThreshold)
+      info->pattern = ncclPatternTreeUpDown;
+    else
+      info->pattern = ncclPatternRingTwice;
+  }
+  else {
+    WARN("Unknown collective %d", info->coll);
+    return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t getLoopInfo(struct ncclInfo* info) {
+  switch (info->pattern) {
+    case ncclPatternTreeUp:
+    case ncclPatternTreeDown:
+    case ncclPatternTreeUpDown:
+    case ncclPatternPipelineFrom:
+    case ncclPatternPipelineTo:
+      info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
+    case ncclPatternRing:
+      info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
+    case ncclPatternRingTwice:
+      info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
+    default:
+      WARN("Unknown pattern %d\n", info->pattern);
+      return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
+  // Compute thresholds and limits that users can override
+  ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
+  int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
+
+  // First compute nThreads
+  int nt = NCCL_LL_MIN_NTHREADS;
+  while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
+
+  // Then compute nChannels
+  int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
+  if (nc == 0) nc = 1;
+  if (nc > info->comm->nChannels) nc = info->comm->nChannels;
+
+  // Check if we have a fixed LL threshold, otherwise compute it.
+  int perThreadThreshold = info->comm->threadThreshold;
+  if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
+  ssize_t llThreshold = info->comm->llThreshold >= 0 ?
+    info->comm->llThreshold :
+    nc*nt*info->nchunksPerLoop*perThreadThreshold;
+
+  if (info->nBytes <= llThreshold) {
+    *llMode = 1;
+    *nChannels = nc;
+    *nThreads = nt;
+  } else {
+    *llMode = 0;
+    *nChannels = info->comm->nChannels;
+    *nThreads = info->comm->nThreads;
+  }
+}
+
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
+  // Set nstepsPerLoop and nchunksPerLoop
+  NCCLCHECK(getPatternInfo(info));
+  NCCLCHECK(getLoopInfo(info));
+
+  coll->args.root = info->root;
+  coll->args.N = info->count;
+  coll->args.ThisInput = info->sendbuff;
+  coll->args.ThisOutput = info->recvbuff;
+  coll->args.comm = info->comm->devComm;
+  coll->args.opCount = info->comm->opCount;
+
+  // Compute llMode, nChannels, nThreads
+  int llMode;
+  getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
+
+  int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
+  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
+
+  int stepSize   = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+  int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
+  int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
+  int chunkSize  = stepSize*chunkSteps;
+
+  // Compute lastChunkSize
+  if (treeMode == 1 && llMode == 0) {
+    if (info->pattern == ncclPatternTreeUpDown) {
+      // Optimize chunkSize / nSteps
+      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
+      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+    }
+    // Use lastChunkSize as chunkSize
+    coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+  } else if (llMode == 1) {
+    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
+    const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
+    ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+    coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+  }
+
+  // Compute nSteps for proxies
+  size_t nBytes  = llMode ? info->nBytes*2 : info->nBytes;
+
+  int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+  proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+  proxyArgs->sliceSteps = sliceSteps;
+  proxyArgs->chunkSteps = chunkSteps;
+  proxyArgs->llMode = llMode;
+  proxyArgs->opCount = info->comm->opCount;
+  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+      nLoops, proxyArgs->nsteps, info->comm);
+  return ncclSuccess;
+}
+
+static ncclResult_t saveKernel(struct ncclInfo* info) {
+  if (info->comm->nRanks == 1) {
+    if (info->sendbuff != info->recvbuff)
+      CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
+    return ncclSuccess;
+  }
+
+  struct ncclColl coll;
+  struct ncclProxyArgs proxyArgs;
+  memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
+  NCCLCHECK(computeColl(info, &coll, &proxyArgs));
+
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
+  if (info->comm->userStreamSet == false) {
+    info->comm->userStream = info->stream;
+    info->comm->userStreamSet = true;
+  } else if (info->stream != info->comm->userStream) {
+    WARN("Error : mixing different streams within a group call is not supported.");
+    return ncclInvalidUsage;
+  }
+  for (int bid=0; bid<coll.args.nChannels; bid++) {
+    struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+
+    if (channel->collCount == NCCL_MAX_OPS) {
+      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+      return ncclInvalidUsage;
+    }
+
+    // Proxy
+    proxyArgs.channel = channel;
+    NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+
+    info->comm->myParams->gridDim.x++;
+
+    int opIndex = channel->collFifoTail;
+    struct ncclColl* c = channel->collectives+opIndex;
+    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+    while (LOAD(activePtr) != 0) sched_yield();
+
+    memcpy(c, &coll, sizeof(struct ncclColl));
+
+    c->args.bid = bid;
+    STORE(&c->active, 1);
+    opIndex = (opIndex+1)%NCCL_MAX_OPS;
+    c->nextIndex = opIndex;
+    channel->collFifoTail = opIndex;
+    channel->collCount++;
+  }
+  /*if (llMode == 0)*/ info->comm->opCount++;
+  return ncclSuccess;
+}
+
+
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
+  if (info->comm == NULL) return ncclInvalidArgument;
+
+  INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+       info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
+       info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+
+  // Launch asynchronously if needed
+  if (ncclAsyncMode()) {
+    ncclResult_t ret = ncclSuccess;
+    int savedDev = -1;
+    if (info->comm->checkPointers) {
+      CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
+      CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
+    }
+    // Check arguments
+    NCCLCHECKGOTO(ArgsCheck(info), ret, end);
+    // Always register comm even in case of error to make sure ncclGroupEnd
+    // cleans it up.
+    NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
+    NCCLCHECKGOTO(saveKernel(info), ret, end);
+end:
+    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
+    ncclAsyncErrCheck(ret);
+    return ret;
+  } else {
+    NCCLCHECK(ArgsCheck(info));
+    NCCLCHECK(saveKernel(info));
+    NCCLCHECK(ncclBarrierEnqueue(info->comm));
+    NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
+    NCCLCHECK(ncclEnqueueEvents(info->comm));
+    return ncclSuccess;
+  }
+}
@@ -0,0 +1,54 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
+  memset(*ptr, 0, size);
+  *devPtr = *ptr;
+  return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+  CUDACHECK(hipHostFree(ptr));
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+  void* p = malloc(nelem*sizeof(T));
+  if (p == NULL) {
+    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    return ncclSystemError;
+  }
+  memset(p, 0, nelem*sizeof(T));
+  *ptr = (T*)p;
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
+  if (isFineGrain)
+    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
+  else
+    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
+  return ncclSuccess;
+}
+
+#endif
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,9 +9,12 @@

 #include "nccl.h"

+ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
 ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapClose(void* commState);
 #endif
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CHANNEL_H_
+#define NCCL_CHANNEL_H_
+#include "core.h"
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid);
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+
+#endif
@@ -0,0 +1,73 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
+
+#include "debug.h"
+
+// Check CUDA calls
+#define CUDACHECK(cmd) do {                                 \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
+        return ncclUnhandledCudaError;                      \
+    }                                                       \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do {                 \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
+        res = ncclUnhandledCudaError;                       \
+        goto label;                                         \
+    }                                                       \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+#define SYSCHECK(call, name) do { \
+  int retval; \
+  SYSCHECKVAL(call, name, retval); \
+} while (false)
+
+#define SYSCHECKVAL(call, name, retval) do { \
+  SYSCHECKSYNC(call, name, retval); \
+  if (retval == -1) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (false)
+
+#define SYSCHECKSYNC(call, name, retval) do { \
+  retval = call; \
+  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+  } else { \
+    break; \
+  } \
+} while(true)
+
+// Propagate errors up
+#define NCCLCHECK(call) do { \
+  ncclResult_t res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    return res; \
+  } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+  res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#endif
@@ -0,0 +1,117 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#define MAXCHANNELS 16
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 64
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL
+
+struct ncclSendMem {
+  union {
+    struct {
+      uint64_t head;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      void* ptrExchange;
+      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+      uint64_t opCount;
+    };
+    char pad3[MEM_ALIGN];
+  };
+};
+
+struct ncclRecvMem {
+  union {
+    struct {
+      uint64_t tail;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      uint64_t opCount;
+      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      int sizesFifo[NCCL_STEPS];
+    };
+    char pad4[MEM_ALIGN];
+  };
+  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {
+  struct ncclChannel channels[MAXCHANNELS];
+
+  struct ncclPeerInfo* peerInfo;
+
+  void* bootstrap;
+
+  int rank;    // my rank in the communicator
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; // my cuda device index
+  int nvmlDev; // my NVML device number
+
+  enum { GROUP, PARALLEL } launchMode;
+  hipStream_t userStream;
+  bool userStreamSet;
+  hipEvent_t doneEvent;
+  bool checkPointers;
+
+  // Counter to make sure collectives match (needed for bcast/reduce
+  // where syncs are not symmetric).
+  uint64_t opCount;
+
+  // Channels for collectives
+  int nChannels;
+  int nThreads;
+
+  // Low-latency algorithm threshold
+  ssize_t llThreshold;
+  ssize_t threadThreshold;
+
+  // Tree algorithm threshold
+  ssize_t treeThreshold;
+
+  // An internal CUDA stream for NCCL kernel CGMD launches
+  int groupCudaStream;
+  hipStream_t groupStream;
+
+  // Whether there has been a fatal error in this communicator.
+  ncclResult_t fatalError;
+
+  // Error reported by GPU
+  volatile ncclDevError_t* fatalDevError;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+
+  // Device side of the communicator
+  struct ncclDevComm *devComm;
+  // Host copy of the devComm (to free CUDA allocs)
+  struct ncclDevComm hostDevComm;
+
+  // Intra-process sync
+  int intraRank;
+  int intraRanks;
+  int* intraBarrier;
+  int intraPhase;
+
+  // Storage for deferred intra-process launch
+  hipLaunchParams * intraParams;
+  hipLaunchParams *myParams;
+  int* intraCudaDevs;
+  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+  struct ncclColl args;
+  struct ncclColl* argsptr;
+
+  // Global proxy thread
+  pthread_t proxyThread;
+  struct ncclProxyState proxyState;
+};
+
+#endif
@@ -1,196 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef COMMON_COLL_H_
-#define COMMON_COLL_H_
-
-#include "core.h"
-#include "enqueue.h"
-#include "collectives/collectives.h"
-
-static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
-  hipPointerAttribute_t attr;
-  hipError_t err = hipPointerGetAttributes(&attr, pointer);
-  if (err != hipSuccess || attr.devicePointer == NULL) {
-    WARN("%s : %s is not a valid pointer", opname, ptrname);
-    return ncclInvalidArgument;
-  }
-#if CUDART_VERSION >= 10000
-  if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
-#else
-  if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
-#endif
-    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
-    return ncclInvalidArgument;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
-  if (ptr == NULL) {
-    WARN("%s : %s argument is NULL", opname, ptrname);
-    return ncclInvalidArgument;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
-  NCCLCHECK(PtrCheck(comm, opname, "comm"));
-  // First, the easy ones
-  if (root < 0 || root >= comm->nRanks) {
-    WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
-    return ncclInvalidArgument;
-  }
-  if (type < 0 || type >= ncclNumTypes) {
-    WARN("%s : invalid type %d", opname, type);
-    return ncclInvalidArgument;
-  }
-  if (op < 0 || op >= ncclNumOps) {
-    WARN("%s : invalid reduction operation %d", opname, op);
-    return ncclInvalidArgument;
-  }
-
-  if (comm->checkPointers) {
-    // Check CUDA device pointers
-    if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
-      NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
-    }
-    if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
-      NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
-    }
-  }
-  return ncclSuccess;
-}
-
-static __inline__ int ncclTypeSize(ncclDataType_t type) {
-  switch (type) {
-    case ncclInt8:
-    case ncclUint8:
-      return 1;
-    case ncclFloat16:
-      return 2;
-    case ncclInt32:
-    case ncclUint32:
-    case ncclFloat32:
-      return 4;
-    case ncclInt64:
-    case ncclUint64:
-    case ncclFloat64:
-      return 8;
-    default:
-      return -1;
-  }
-}
-
-// In : comm, nbytes ; Out : nrings, nthreads, ll
-// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
-//   If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
-// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
-//   This ensures we don't use a large number of rings with a small number of threads
-// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
-//   we use NCCL_THREAD_THRESHOLD when we reach the max
-// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
-// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
-static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
-  *ll = 0;
-  int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
-  if (comm->llThreshold >= 0) { /* user sets total LL threshold */
-    if (nbytes > comm->llThreshold) { /* non-LL */
-      *nthreads = comm->nThreads;
-      *nrings = comm->nRings;
-      return;
-    } else {
-      llEnforced = 1; /* user wants to use LL */
-    }
-  }
-  int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
-  size_t nr;
-  int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
-  int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
-  ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
-  while (nt < ll_max_nthreads && *ll == 0) {
-    nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
-    if (nr <= maxRings) { /* avoid using few threads but many rings */
-      nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
-      *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
-    }
-    if (*ll == 0) {
-      nt = nt << 1;
-    }
-  }
-  if (*ll == 1) {
-    *nthreads = nt;
-    *nrings = (int)nr;
-    return; /* we can use smaller number of threads to make LL work, stop here */
-  }
-  nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
-  nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
-  *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
-  *nthreads = *ll ? ll_max_nthreads : comm->nThreads;
-  *nrings = *ll ? (int)nr : comm->nRings;
-}
-
-static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream, size_t nbytes, int loopFactor) {
-  int llMode, nBlocks, nThreads;
-  ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
-  comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
-  if (comm->userStreamSet == false) {
-    comm->userStream = stream;
-    comm->userStreamSet = true;
-  } else if (stream != comm->userStream) {
-    WARN("Error : mixing different streams within a group call is not supported.");
-    return ncclInvalidUsage;
-  }
-  int lastChunkSize = 0;
-  if (llMode == 1) {
-    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
-    const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
-    lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
-    ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
-  }
-  for (int bid=0; bid<nBlocks; bid++) {
-    struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
-    if (ring->collCount == NCCL_MAX_OPS) {
-      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
-      return ncclInvalidUsage;
-    }
-
-    comm->myParams->gridDim.x++;
-
-    int opIndex = ring->collFifoTail;
-    struct ncclColl* c = ring->collectives+opIndex;
-    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-    while (LOAD(activePtr) != 0) sched_yield();
-
-    struct CollectiveArgs* args = &c->args;
-    args->root = root;
-    args->N = count;
-    args->ThisInput = sendbuff;
-    args->ThisOutput = recvbuff;
-    args->comm = comm->devComm;
-    args->opCount = comm->opCount;
-    args->bid = bid;
-    args->nRings = nBlocks;
-    args->nThreads = nThreads;
-    args->lastChunkSize = lastChunkSize;
-
-    c->nThreads = nThreads;
-    c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
-    STORE(&c->active, 1);
-    opIndex = (opIndex+1)%NCCL_MAX_OPS;
-    c->nextIndex = opIndex;
-    ring->collFifoTail = opIndex;
-    ring->collCount++;
-  }
-  /*if (llMode == 0)*/ comm->opCount++;
-  return ncclSuccess;
-}
-
-extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
-
-#endif
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,313 +7,20 @@
 #ifndef NCCL_CORE_H_
 #define NCCL_CORE_H_

-#define NCCL_MAX_OPS 2048
-
+#include <pthread.h>
+#include <algorithm>
 #include "nccl.h"
-#include "transport.h"
 #include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "transport.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "info.h"
+#include "argcheck.h"
 #include <cstdio>
-#include <algorithm> // std::min/std::max
 #include <unistd.h>
 #include <stdlib.h>
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_runtime.h>
-
-#define MAXRINGS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Rings / LL tuning
-#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL for Volta and above
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS 256
-#define NCCL_LL_MIN_NTHREADS 256
-
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
-  /* Flags have to be *after* data, because otherwise, an incomplete receive
-     from the network may receive the flag but not the data.
-     Note this is assuming that either we receive contiguous chunks of data
-     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
-  struct {
-    uint32_t data1;
-    uint32_t flag1;
-    uint32_t data2;
-    uint32_t flag2;
-  };
-  uint64_t v[2];
-  int4 i4;
-};
-
-struct ncclConnInfo {
-  // Regular comm mechanism
-  char *buff;         // Local for recv, remote for send
-  uint64_t *tail;     // Local for recv, remote for send
-  uint64_t *head;     // Local for send, remote for recv
-  uint64_t *opCount;  // Local for recv, remote for send
-
-  int direct;         // Direct communication
-  void **ptrExchange; // Pointer exchange for direct communication
-
-  int *fifo;          // Size fifo for proxy
-
-  // Low latency mechanism
-  char *llBuff;       // Local for recv, remote for send
-  uint64_t *llHead;   // Local for send, remote for recv
-  int *llFifo;        // LL Size fifo for proxy
-  uint64_t llStep;    // Keep where we are
-  uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
-  struct transportProxyInfo* proxyInfo;
-  struct ncclTransport* transport;
-  void* transportResources; // Host-side resources
-  struct ncclConnInfo conn;
-};
-
-#define CACHE_LINE_SIZE 64
-#define MEM_ALIGN 4096
-#define SIZES_FIFO_SIZE 16
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NCCL_LL_CHUNKS 8
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
-#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
-#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
-  union {
-    struct {
-      uint64_t head;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      void* ptrExchange;
-      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
-      uint64_t llHead;
-    };
-    char pad3[MEM_ALIGN];
-  };
-};
-
-struct ncclRecvMem {
-  union {
-    struct {
-      uint64_t tail;
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      uint64_t opCount;
-      char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      int sizesFifo[SIZES_FIFO_SIZE];
-      int llSizesFifo[SIZES_FIFO_SIZE];
-    };
-    char pad5[MEM_ALIGN];
-  };
-  char llBuff[NCCL_LL_BUFF_SIZE];
-  char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
-  union {
-    struct {
-      int id;
-      int nthreads;
-      // Per ring resources
-      struct ncclSendMem* devMemSend;   // CUDA-size resources
-      struct ncclRecvMem* devMemRecv;   // CUDA-size resources
-      int buffSize;
-      int devMemSendSize;    // Keep the size for IPCs
-      int devMemRecvSize;    // Keep the size for IPCs
-      struct ncclConnector send;
-      struct ncclConnector recv;
-
-      // Maps an internal nccl index to user-specified rank order. This is necessary
-      // since we need to know how the user expects data to be ordered across
-      // devices. Ordered from current device.
-      int* userRanks;
-      int* devUserRanks;
-
-      // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
-      // allows software to explicitly initiate a flush read to HDP memory. See more
-      // descriptions in primitives.h.
-      uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-      uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
-      
-      // Operation list for aggregation
-      struct ncclColl* collectives;
-      struct ncclColl* devCollectives;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
-    };
-    int data[0x80];
-  };
-};
-static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
-
-#pragma pack(push)  /* push current alignment to stack */
-#pragma pack(4)     /* set alignment to 4 bytes boundary */
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclComm* comm;
-  uint64_t opCount;
-
-  // local and remote input, output, and buffer
-  const void * ThisInput;
-  void * ThisOutput;
-
-  // general parameters
-  size_t N;
-  uint32_t root;
-  uint8_t bid;
-  uint8_t nRings;
-  uint16_t nThreads;
-
-  int lastChunkSize;
-};
-struct ncclColl {
-  union {
-    struct {
-      struct CollectiveArgs args;
-      uint16_t nThreads;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
-  };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-#pragma pack(pop)   /* restore original alignment from stack */
-
-struct ncclComm {
-  struct ncclRing rings[MAXRINGS];
-
-  int rank;    // my rank in the communicator
-  int nRanks;  // number of GPUs in communicator
-  int cudaDev; // my cuda device index
-
-  enum { GROUP, PARALLEL } launchMode;
-  hipStream_t userStream;
-  bool userStreamSet;
-  hipEvent_t doneEvent;
-  bool checkPointers;
-
-  // Counter to make sure collectives match (needed for bcast/reduce
-  // where syncs are not symmetric).
-  uint64_t opCount;
-
-  // Rings for collectives
-  int nRings;
-  int nThreads;
-
-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
-
-  // An internal CUDA stream for NCCL kernel CGMD launches
-  int groupCudaStream;
-  hipStream_t groupStream;
-
-  // Device copy of the communicator
-  struct ncclComm *devComm;
-
-  // Intra-process sync
-  int intraRank;
-  int intraRanks;
-  int* intraBarrier;
-  int intraPhase;
-
-  // Storage for deferred intra-process launch
-  hipLaunchParams* intraParams;
-  hipLaunchParams* myParams;
-  int* intraCudaDevs;
-  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
-  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
-  struct ncclColl* argsptr;
-};
-
-// Convert volatile access to atomic
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
-#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
-#else
-#define LOAD(VAR) *(VAR)
-#define STORE(DST, SRC) *(DST) = (SRC)
-#endif
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do {                                 \
-    hipError_t e = cmd;                                    \
-    if( e != hipSuccess ) {                                \
-        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
-        return ncclUnhandledCudaError;                      \
-    }                                                       \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do {                 \
-    hipError_t e = cmd;                                    \
-    if( e != hipSuccess ) {                                \
-        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
-        res = ncclUnhandledCudaError;                       \
-        goto label;                                         \
-    }                                                       \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
-  int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
-  if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
-    return ncclSystemError; \
-  } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
-  } else { \
-    break; \
-  } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);

 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...)        \
@@ -333,51 +39,27 @@ struct ncclComm {
 #endif // end PROFAPI

 int ncclCudaCompCap();
+ncclResult_t ncclNvlinkGpu(int* nvlink);
+int64_t ncclTreeThreshold();

-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
-  memset(*ptr, 0, size);
-  *devPtr = *ptr;
-  return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(hipHostFree(ptr));
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
-  void* p = malloc(nelem*sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
-    return ncclSystemError;
+static __inline__ int ncclTypeSize(ncclDataType_t type) {
+  switch (type) {
+    case ncclInt8:
+    case ncclUint8:
+      return 1;
+    case ncclFloat16:
+      return 2;
+    case ncclInt32:
+    case ncclUint32:
+    case ncclFloat32:
+      return 4;
+    case ncclInt64:
+    case ncclUint64:
+    case ncclFloat64:
+      return 8;
+    default:
+      return -1;
  }
-  memset(p, 0, nelem*sizeof(T));
-  *ptr = (T*)p;
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
-  if (isFineGrain) {
-    hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained);
-    if (e != hipSuccess) {
-      *ptr = 0;
-      return ncclInvalidUsage;
-    }
-  }
-  else
-    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
-  return ncclSuccess;
 }

 #endif // end include guard
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CPUSET_H_
+#define NCCL_CPUSET_H_
+
+// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
+
+static int hexToInt(char c) {
+  int v = c - '0';
+  if (v < 0) return -1;
+  if (v > 9) v = 10 + c - 'a';
+  if ((v < 0) || (v > 15)) return -1;
+  return v;
+}
+
+#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
+
+ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+  uint32_t cpumasks[CPU_SET_N_U32];
+  int m = CPU_SET_N_U32-1;
+  cpumasks[m] = 0;
+  for (int o=0; o<strlen(str); o++) {
+    char c = str[o];
+    if (c == ',') {
+      m--;
+      cpumasks[m] = 0;
+    } else {
+      int v = hexToInt(c);
+      if (v == -1) break;
+      cpumasks[m] <<= 4;
+      cpumasks[m] += v;
+    }
+  }
+  // Copy cpumasks to mask
+  for (int a=0; m<CPU_SET_N_U32; a++,m++) {
+    memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
+  int c = 0;
+  uint8_t* m8 = (uint8_t*)mask;
+  for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
+    if (c == 0 && m8[o] == 0) continue;
+    sprintf(str+c, "%02x", m8[o]);
+    c+=2;
+    if (o && o%4 == 0) {
+      sprintf(str+c, ",");
+      c++;
+    }
+  }
+  str[c] = '\0';
+  return ncclSuccess;
+}
+
+#endif
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -25,7 +24,8 @@ extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen);
+extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);

 extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);

@@ -108,7 +108,7 @@ static inline void initDebug() {
          break;
        case 'h': // %h = hostname
          char hostname[1024];
-          getHostName(hostname, 1024);
+          getHostName(hostname, 1024, '.');
          dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
          break;
        case 'p': // %p = pid
@@ -0,0 +1,259 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_H_
+#define NCCL_DEVICE_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+// Convert volatile access to atomic
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+#else
+#define LOAD(VAR) *(VAR)
+#define STORE(DST, SRC) *(DST) = (SRC)
+#endif
+
+#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
+
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+    (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+  /* Flags have to be *after* data, because otherwise, an incomplete receive
+     from the network may receive the flag but not the data.
+     Note this is assuming that either we receive contiguous chunks of data
+     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+  struct {
+    uint32_t data1;
+    uint32_t flag1;
+    uint32_t data2;
+    uint32_t flag2;
+  };
+  uint64_t v[2];
+  int4 i4;
+};
+
+#define MAXTHREADS 256
+#define NCCL_LL_MAX_NTHREADS MAXTHREADS
+#define NUM_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
+#ifdef DEBUG_LL
+#define NCCL_LL_CLEAN_MASK 0x00000ff8
+#define NCCL_LL_FLAG_MAX   0x00001000
+#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
+#else
+#define NCCL_LL_CLEAN_MASK 0x7ffffff8
+#define NCCL_LL_FLAG(a) ((uint32_t)(a))
+#endif
+// Make sure the clean mask will last for at least NCCL_NSTEPS
+static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
+
+struct ncclConnInfo {
+  // Regular comm mechanism
+  char *buff;         // Local for recv, remote for send
+  uint64_t *tail;     // Local for recv, remote for send
+  uint64_t *head;     // Local for send, remote for recv
+  uint64_t *opCountLoc; // opCount of local rank
+  uint64_t *opCountRem; // opCount of remote rank
+
+  int direct;         // Direct communication
+  void **ptrExchange; // Pointer exchange for direct communication
+
+  int *fifo;          // Size fifo for proxy
+
+  uint64_t step;      // Keep where we are
+
+  // Low latency mechanism
+  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
+  uint64_t llLastCleaning;
+
+  // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
+  // allows software to explicitly initiate a flush read to HDP memory. See more
+  // descriptions in primitives.h.
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
+};
+
+struct ncclConnector {
+  int connected;
+  struct ncclProxyArgs *proxyAppend;
+  struct ncclTransportComm* transportComm;
+  void* transportResources; // Host-side resources
+  struct ncclConnInfo conn;
+  struct ncclComm *comm;
+};
+
+struct ncclRing {
+  // Shortcuts for userRanks[1] and userRanks[n-1]
+  int prev;
+  int next;
+
+  // Maps an internal nccl index to user-specified rank order. This is necessary
+  // since we need to know how the user expects data to be ordered across
+  // devices. Ordered from current device.
+  int* userRanks;
+  int* devUserRanks;
+};
+
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+  int depth;
+  int up;
+  int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+  struct ncclConnector send;
+  struct ncclConnector recv;
+};
+
+struct ncclDevComm;
+
+#pragma pack(push)  /* push current alignment to stack */
+#pragma pack(4)     /* set alignment to 4 bytes boundary */
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+  struct ncclDevComm* comm;
+  uint64_t opCount;
+
+  // local and remote input, output, and buffer
+  const void * ThisInput;
+  void * ThisOutput;
+
+  // general parameters
+  size_t N;
+  uint32_t root;
+  uint8_t bid;
+  uint8_t nChannels;
+  uint16_t nThreads;
+
+  int lastChunkSize;
+};
+struct ncclColl {
+  union {
+    struct {
+      struct CollectiveArgs args;
+      uint16_t funcIndex;
+      uint16_t nextIndex;
+      uint8_t  active;
+    };
+    int data[0x10];
+  };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclChannel {
+  union {
+    struct {
+      struct ncclRing ring;
+      struct ncclTree tree;
+
+      int id;
+      int nthreads;
+      int buffSize;
+
+      // Communication structures
+      struct ncclPeer* peers;
+      struct ncclPeer* devPeers;
+
+      // Operation list for aggregation
+      struct ncclColl* collectives;
+      struct ncclColl* devCollectives;
+      int collStart;
+      int collCount;
+      int collFifoHead; // Only used by GPU
+      int collFifoTail; // Only used by CPU
+
+      uint32_t* abortCount;
+    };
+    int data[0x80];
+  };
+};
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+#pragma pack(pop)   /* restore original alignment from stack */
+
+#define MAXCHANNELS 16
+
+#ifdef ENABLE_PROFILING
+struct ncclProf {
+  union {
+    struct {
+      uint64_t total_cycle;
+      uint64_t wait_send_cycle[MAXCHANNELS];
+      uint64_t wait_recv_cycle[MAXCHANNELS];
+      // primtive cycles
+      uint64_t send_cycle;
+      uint64_t directSend_cycle;
+      uint64_t recv_cycle;
+      uint64_t directRecv_cycle;
+      uint64_t copySend_cycle;
+      uint64_t directCopySend_cycle;
+      uint64_t recvCopySend_cycle;
+      uint64_t directRecvCopySend_cycle;
+      uint64_t recvReduceCopy_cycle;
+      uint64_t recvReduceSend_cycle;
+      uint64_t recvReduceCopySend_cycle;
+      uint64_t directRecvReduceCopySend_cycle;
+      // primitive bytes
+      uint64_t send_byte;
+      uint64_t directSend_byte;
+      uint64_t recv_byte;
+      uint64_t directRecv_byte;
+      uint64_t copySend_byte;
+      uint64_t directCopySend_byte;
+      uint64_t recvCopySend_byte;
+      uint64_t directRecvCopySend_byte;
+      uint64_t recvReduceCopy_byte;
+      uint64_t recvReduceSend_byte;
+      uint64_t recvReduceCopySend_byte;
+      uint64_t directRecvReduceCopySend_byte;
+    };
+    int data[0x80];
+  };
+};
+#endif
+
+typedef enum {
+  ncclDevSuccess,
+  ncclDevAssertedMismatch,
+  ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+struct ncclDevComm {
+  int rank;
+  int nRanks;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+  volatile ncclDevError_t *fatalDevError;
+
+  // Channels, device side
+  struct ncclChannel* channels;
+
+#ifdef ENABLE_PROFILING
+  // Profiling counters
+  struct ncclProf* devProf;
+#endif
+};
+
+#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -11,12 +11,14 @@
 #include "core.h"
 #include "group.h"

-typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+// Channels / LL tuning
+#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
+#define NCCL_THREAD_THRESHOLD 256  // Per thread size before we switch to non-LL
+#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
+#define NCCL_THREAD_THRESHOLD_VEGA 8 // Per thread size before we switch to non-LL for VEGA
+#define NCCL_LL_MIN_NTHREADS 256

-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
-    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
 ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
 ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
@@ -4,7 +4,7 @@
 * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
 *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -0,0 +1,45 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+
+typedef enum {
+  ncclPatternRing,
+  ncclPatternRingTwice,
+  ncclPatternPipelineFrom,
+  ncclPatternPipelineTo,
+  ncclPatternTreeUp,
+  ncclPatternTreeDown,
+  ncclPatternTreeUpDown
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+  ncclColl_t coll;
+  const char* opName;
+  // NCCL Coll Args
+  const void* sendbuff;
+  void* recvbuff;
+  size_t count;
+  ncclDataType_t datatype;
+  ncclRedOp_t op;
+  int root;
+  ncclComm_t comm;
+  hipStream_t stream;
+  // Algorithm details
+  int chunkSteps;
+  int sliceSteps;
+  // Computed later
+  ncclPattern_t pattern;
+  size_t nBytes;
+  int nstepsPerLoop;
+  int nchunksPerLoop;
+};
+
+#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -58,8 +58,51 @@ typedef struct {
  ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v1_t;

-typedef ncclNet_v1_t ncclNet_t;
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Return the device path in /sys. NCCL will call free on this path.
+  ncclResult_t (*pciPath)(int dev, char** path);
+  // Return whether this device supports host pointers and/or CUDA pointers
+  // as data from the current GPU. Supported types should be composed with
+  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connectHandle
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v2_t;

-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
+typedef ncclNet_v2_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2

 #endif // end include guard
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -13,11 +13,6 @@
 extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

-/* Socket Interface Selection type */
-typedef enum { findSubnetIf   = -1,
-    dontCareIf     = -2
-} ncclSocketIfSl_t;
-
 // Translation to external API
 static const char* ncclNetName() { return ncclNet->name; }
 static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
@@ -26,15 +21,16 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
+static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
 static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }

-extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;

@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -19,6 +19,7 @@
 enum ncclNvLinkDeviceType {
  ncclNvLinkDeviceGpu,
  ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
 };

 static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@@ -26,7 +27,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
  char* rPath = realpath(classPath, NULL);
  int fd;
-  SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
+  if ((fd = open(rPath, O_RDONLY)) == -1) {
+    // Could not find device. It might be because we're in a VM and
+    // we don't see the whole machine. This is handled silently so
+    // we don't want to print an INFO error.
+    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
+    return ncclSystemError;
+  }
  free(rPath);
  char pciClass[9];
  strncpy(pciClass, "0x000000", 9);
@@ -36,6 +43,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
  if (strcmp(pciClass, "0x068000") == 0) {
    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
    *type = ncclNvLinkDeviceSwitch;
+  } else if (strcmp(pciClass, "0x068001") == 0) {
+    // PCI device is of type "Bridge: IBM Device 04ea"
+    *type = ncclNvLinkDeviceBridge;
  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
    *type = ncclNvLinkDeviceGpu;
@@ -49,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
 /* Get the maximum number of NVLinks based on the GPU generation */
 static ncclResult_t getMaxNvlinks(int* maxLinks) {
  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  int ccMajor;
-  CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
+  CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
  // 6 for Volta, 4 for Pascal
  *maxLinks = (ccMajor > 6) ? 6 : 4;
  // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
@@ -68,18 +78,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
  if (res != ncclSuccess) return 0;

  for(int l=0; l<maxNvLinks; ++l) {
-    // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
-    // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
-    // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
-    // the POWER CPU case, so it seems best to check this as well.
+    // Check whether we can use this NVLink for P2P
    unsigned canP2P;
    if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;

-    // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
-    // if the links don't exist, or are disabled. So checking for that return
-    // here would probably make the nvmlDeviceGetNvLinkCapability check above
-    // redundant. Presumably, we still need to check the P2P capability above,
-    // since even non-GPUs would possess PCI info.
+    // Make sure the Nvlink is up. The previous call should have trained the link.
+    nvmlEnableState_t isActive;
+    if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+    // Try to figure out what's on the other side of the NVLink
    nvmlPciInfo_t remoteProc;
    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;

@@ -90,7 +97,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
      p[c] = toupper(p[c]);
    }

-    if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
+    if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
      links++;
    } else {
      // Make a lower case copy of the bus ID for calling ncclDeviceType
@@ -102,11 +109,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
        lowerId[c] = tolower(p[c]);
      }

-      // Determine if the remote side is NVswitch
+      // Determine if the remote side is NVswitch or a GPU
      enum ncclNvLinkDeviceType type;
-      if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
-        //TODO: we are making an assumption that all GPUs are connected to this switch
-        //This assumption may change for future architectures
+      ncclResult_t ret = ncclDeviceType(lowerId, &type);
+      if (ret == ncclSuccess) {
+        if (type == ncclNvLinkDeviceSwitch) {
+          //TODO: we are making an assumption that all GPUs are connected to this switch
+          //This assumption may change for future architectures
+          nvswitch_links++;
+        } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
+          links++;
+        }
+      } else {
+        // The NVLink is up but we couldn't find the PCI device on the other
+        // side. Assume it's an NVswitch outside a VM.
+        if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
        nvswitch_links++;
      }
    }
@@ -114,43 +131,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
 }

-static int getNumNvlinks(const char* busId) {
-  nvmlDevice_t nvmlDev;
-  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
-  if (res != ncclSuccess) return 0;
-
-  int nvlinks = 0, nvswitch_links = 0;
-  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
-  for(int l=0; l<maxNvLinks; ++l) {
-    unsigned canP2P;
-    nvmlEnableState_t isActive;
-    if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
-        wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
-      nvlinks++;
-    } else {
-      continue;
-    }
-
-    nvmlPciInfo_t remoteProc;
-    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
-    // Make a lower case copy of the bus ID for calling ncclDeviceType
-    // PCI system path is in lower case
-    char* p = remoteProc.busId;
-    char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-      if (p[c] == 0) break;
-      lowerId[c] = tolower(p[c]);
-    }
-
-    // Determine if the remote side is NVswitch
-    enum ncclNvLinkDeviceType type;
-    if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
-      //TODO: we are making an assumption that all GPUs are connected to this switch
-      //This assumption may change for future architectures
-      nvswitch_links++;
-    }
-  }
-  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
-}
 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -8,13 +8,23 @@
 #ifndef NCCL_NVLINK_H_
 #define NCCL_NVLINK_H_

+#include <sys/stat.h>
+#include <fcntl.h>
+#include "nvmlwrap.h"
 #include "topo.h"

 #define CONNECT_NVLINK 0x10
 #define CONNECT_NVSWITCH 0x100

-static int getNumNvlinks(const char* busId) {
-  return 0;
+enum ncclNvLinkDeviceType {
+  ncclNvLinkDeviceGpu,
+  ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+static int getNvlinkGpu(const char* busId1, const char* busId2) {
+  int links = 0;
+  return CONNECT_NVLINK*links;
 }

 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_NVMLWRAP_H_
 #define NCCL_NVMLWRAP_H_

-#include "core.h"
+#include "nccl.h"

 //#define NVML_DIRECT 1
 #ifdef NVML_DIRECT
@@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
  NVMLCHECK(nvmlDeviceGetIndex(device, index));
  return ncclSuccess;
 }
-static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
-  return ncclSuccess;
-}
 static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
  NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
  return ncclSuccess;
@@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
  NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
  return ncclSuccess;
 }
+static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+  NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
+  return ncclSuccess;
+}
 #else
 // Dynamically handle dependencies on NVML

@@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void);
 ncclResult_t wrapNvmlShutdown(void);
 ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
 ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
 ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+
 #endif // NVML_DIRECT

 #endif // End include guard
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
    s++;
    strncpy(envValue, line+s, 1024);
    setenv(envVar, envValue, 0);
-    char *str = getenv(envVar);
  }
  if (line) free(line);
  fclose(file);
@@ -1,14 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RING_H_
-#define NCCL_RING_H_
-#include "core.h"
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid);
-ncclResult_t freeRing(struct ncclRing* ring);
-
-#endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -9,14 +9,13 @@
 #define NCCL_RINGS_H_

 static int getDefaultThreads() {
-  // On Kepler, rings are doubled later.
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  return 256;
-#else
+#else  // On Kepler, rings are doubled later.
  return ncclCudaCompCap() == 3 ? 128 : 256;
 #endif
 }

-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);

 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -18,8 +18,9 @@

 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT     1000  // sleep interval in usec
-#define RETRY_TIMES   2e4   // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT            1000 // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)

 /* Common socket address storage structure for IPv4/IPv6 */
 union socketAddress {
@@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
  return buf;
 }

-static inline short socketToPort(struct sockaddr *saddr) {
+static inline uint16_t socketToPort(struct sockaddr *saddr) {
  return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
 }

@@ -60,9 +61,12 @@ static inline int envSocketFamily(void) {
 }

 static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+#ifdef ENABLE_TRACE
  char line[1024];
+#endif
  struct netIf userIfs[MAX_IFS];
  bool searchNot = prefixList && prefixList[0] == '^';
+  bool searchExact = prefixList && prefixList[0] == '=';
  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);

  int found = 0;
@@ -89,7 +93,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
    }

    // check against user specified interfaces
-    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
+    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
      continue;
    }

@@ -106,7 +110,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
      // Store the IP address
      int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
      memcpy(addrs+found, interface->ifa_addr, salen);
-      INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
      found++;
    }
  }
@@ -159,7 +162,10 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
 }

 static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
-  char line[1024], line_a[1024];
+#ifdef ENABLE_TRACE
+  char line[1024];
+#endif
+  char line_a[1024];
  int found = 0;
  struct ifaddrs *interfaces, *interface;
  getifaddrs(&interfaces);
@@ -183,7 +189,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
    // Store the interface name
    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);

-    INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
    found++;
    if (found == maxIfs) break;
  }
@@ -336,8 +342,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
 #endif

-  /* Put the socket in listen mode */
-  SYSCHECK(listen(sockfd, 128), "listen");
+  /* Put the socket in listen mode
+   * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+   */
+  SYSCHECK(listen(sockfd, 16384), "listen");
  *fd = sockfd;
  return ncclSuccess;
 }
@@ -367,14 +375,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
 #endif

  int ret;
-  int retries = 0;
+  int timedout_retries = 0;
+  int refused_retries = 0;
 retry:
  SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
  if (ret == 0) return ncclSuccess;
-  if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
-    INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
-    usleep(SLEEP_INT);
-    goto retry;
+  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+      INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+      usleep(SLEEP_INT);
+      goto retry;
+    }
  }
  WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
  return ncclSystemError;
@@ -382,12 +394,12 @@ retry:

 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
  int bytes = 0;
  char* data = (char*)ptr;
  do {
-    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
    if (op == NCCL_SOCKET_RECV && bytes == 0) {
      WARN("Net : Connection closed by remote peer");
      return ncclSystemError;
@@ -405,9 +417,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off
  return ncclSuccess;
 }

+static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+  return socketProgressOpt(op, fd, ptr, size, offset, 0);
+}
+
 static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) {
  while (*offset < size)
-    NCCLCHECK(socketProgress(op, fd, ptr, size, offset));
+    NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1));
  return ncclSuccess;
 }

@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -12,78 +11,35 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
-#include <iostream>
-#include <fstream>
-#include <string>
+#include <stdio.h>

-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+ncclResult_t getCudaPath(int cudaDev, char** path);

-static bool isEPYC() {
-  std::ifstream cpuinfo("/proc/cpuinfo");
-  std::string line;
-  int needed = 2;
-  static bool vendor_id = true, cpu_family = false, initialized = false;
-  if (initialized) return (vendor_id && cpu_family);
-  while (std::getline(cpuinfo, line)) {
-    if (line.compare(0, 9, "vendor_id") == 0) {
-      if(line.find("AuthenticAMD") == std::string::npos)
-        vendor_id = false;
-      needed --;
-    }
-    if (line.compare(0, 10, "cpu family") == 0) {
-      std::string family_str = line.substr(line.find(": ") + 2);
-      if (std::stoi(family_str) >= 23)
-        cpu_family = true;
-      needed --;
-    }
-    if (!needed)
-      break;
-  }
-  initialized = true;
-  return (vendor_id && cpu_family);
-}
+static int getNumaId(char *path) {
+  char npath[PATH_MAX];
+  snprintf(npath, PATH_MAX, "%s/numa_node", path);
+  npath[PATH_MAX-1] = '\0';

-static ncclResult_t getCudaPath(int cudaDev, char** path) {
-  char busId[BUSID_SIZE];
-  CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-  *path = realpath(busPath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", busPath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
+  int numaId = -1;
+  FILE *file = fopen(npath, "r");
+  if (file == NULL) return -1;
+  if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+  fclose(file);
+
+  return numaId;
 }

 enum ncclPathDist {
-  PATH_PIX = 0,
-  PATH_PXB = 1,
-  PATH_PHB = 2,
-  PATH_SOC = 3
+  PATH_PIX  = 0,
+  PATH_PXB  = 1,
+  PATH_PHB  = 2,
+  PATH_NODE = 3,
+  PATH_SYS  = 4,
+  PATH_ARRAY_SIZE = 5
 };

-static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
+extern const char* pathDists[PATH_ARRAY_SIZE];

-static int pciDistance(char* path1, char* path2) {
-  int score = 0;
-  int depth = 0;
-  int same = 1;
-  for (int i=0; i<strlen(path1); i++) {
-    if (path1[i] != path2[i]) same = 0;
-    if (path1[i] == '/') {
-      depth++;
-      if (same == 1) score++;
-    }
-  }
-  if (isEPYC() && score <= 3) return PATH_PHB;
-  if (score <= 3) return PATH_SOC;
-  if (score == 4) return PATH_PHB;
-  if (score == depth-1) return PATH_PIX;
-  return PATH_PXB;
-}
+int pciDistance(char* path1, char* path2);

 #endif
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,7 +8,9 @@
 #define NCCL_TRANSPORT_H_

 #include "nccl.h"
+#include "devcomm.h"
 #include <stdint.h>
+#include "nvmlwrap.h"

 #define NTRANSPORTS 3

@@ -19,11 +21,13 @@ struct ncclRing;
 struct ncclConnector;
 struct ncclComm;

-#define RANK_INFO_SIZE 64
-typedef char ncclTinfo_t[RANK_INFO_SIZE];
-
-struct ncclInfo {
-  ncclTinfo_t tinfo[NTRANSPORTS];
+struct ncclPeerInfo {
+  int rank;
+  int cudaDev;
+  int nvmlDev;
+  uint64_t hostHash;
+  uint64_t pidHash;
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
 };

 // Used to hold the transport connection values
@@ -34,18 +38,47 @@ struct ncclConnect {
  char data[CONNECT_SIZE];
 };

+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
+
+struct ncclProxyArgs;
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+
 struct ncclProxyArgs {
-  struct ncclRing* ring;
-  int substeps;
+  proxyProgressFunc_t progress;
+  struct ncclChannel* channel;
+  struct ncclConnector* connector;
+  int sliceSteps;
+  int chunkSteps;
  int nsteps;
  uint64_t opCount;
  int llMode;
-  bool needProxy;
-  int active;   // add component before this line -- it is left out during initialization
+  int state;   // add component before this line -- it is left out during initialization
+
+  // Internal state
+  uint64_t head;
+  uint64_t tail;
+  uint64_t end;
+  void* requests[NCCL_STEPS];
+  int idle;
+
+  // Element linking
+  pthread_mutex_t mutex;
+  struct ncclProxyArgs* next;
+  struct ncclProxyArgs* nextPeer;
+};
+
+struct ncclProxyPool;
+struct ncclProxyState {
+  pthread_cond_t cond;
+  pthread_mutex_t mutex;
+  bool stop;
+  struct ncclProxyArgs* ops;
+  struct ncclProxyArgs* pool;
+  struct ncclProxyPool* pools;
 };

 struct ncclTransportComm {
-  ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
+  ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
  ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
  ncclResult_t (*free)(void*);
  ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -53,8 +86,7 @@ struct ncclTransportComm {

 struct ncclTransport {
  const char name[4];
-  ncclResult_t (*fillInfo)(ncclTinfo_t*, int, uint64_t);
-  ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
+  ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
  ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
  struct ncclTransportComm send;
  struct ncclTransportComm recv;
@@ -64,37 +96,17 @@ struct ncclTransport {

 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);

-#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
-
-struct transportProxyInfo {
-  struct ncclComm* comm;
-  pthread_t thread;
-  threadFunc_t func;
-  volatile int proxyReady;
-  struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
-  volatile uint64_t argsFifoHead;
-  volatile uint64_t argsFifoTail;
-  pthread_cond_t cond;
-  pthread_mutex_t mutex;
-};
-
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
-
 enum proxyMode {
  proxyRing = 0,
  proxyFrom = 1,
  proxyTo = 2
 };

-static int proxyPatternRing = proxyRing;
-static inline int proxyPatternFrom(int root) { return 1+root; }
-static inline int proxyPatternTo(int root) { return -1-root; }
-static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
-static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
-
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
-ncclResult_t transportStartProxies(struct ncclComm* comm);
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
+ncclResult_t transportStartProxy(struct ncclComm* comm);
+ncclResult_t transportCreateProxy(struct ncclComm* comm);
+ncclResult_t transportDestroyProxy(struct ncclComm* comm);

 #include <unistd.h>

@@ -106,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
  }
 }

-inline void transportProxyIdle(int idle) {
-  sched_yield();
-}
-
 #endif
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TREES_H_
+#define NCCL_TREES_H_
+
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
+ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
+
+#endif
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,7 +11,7 @@
 #include "nccl.h"
 #include <stdint.h>

-ncclResult_t getHostName(char* hostname, int maxlen);
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
@@ -21,6 +22,6 @@ struct netIf {
 };

 int parseStringList(const char* string, struct netIf* ifList, int maxList);
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

 #endif
@@ -1,970 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "ring.h"
-#include "param.h"
-#include "nvmlwrap.h"
-#include "rings.h"
-#include "bootstrap.h"
-#include "transport.h"
-#include "common_coll.h"
-#include "group.h"
-#include "utils.h"
-#include "net.h"
-#include "topo.h"
-#include <numa.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <hip/hip_runtime_api.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-#include <dlfcn.h>
-
-#define STR2(v) #v
-#define STR(v) STR2(v)
-
-int ncclDebugLevel;
-uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
-pthread_mutex_t ncclDebugOutputLock;
-FILE *ncclDebugFile = stdout;
-
-#ifdef ENABLE_TRACE
-std::chrono::high_resolution_clock::time_point ncclEpoch;
-#endif
-
-#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
-#else
-#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
-#endif
-
-NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
-
-NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-
-ncclNet_t* ncclNet = NULL;
-
-// We define this as weak to let tests redefine their own
-#pragma weak ncclCudaCompCap
-int ncclCudaCompCap() {
-  int cudaDev;
-  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
-  int ccMajor;
-  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
-  return ccMajor;
-}
-int ncclCudaFullCompCap() {
-  int cudaDev;
-  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
-  int ccMajor, ccMinor;
-  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
-  if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
-  return ccMajor*10+ccMinor;
-}
-
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
-  int ndev;
-  if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
-  if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
-  if (ndev <= 0) {
-    INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net) {
-  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
-  if (netPluginLib == NULL) {
-    // dlopen does not guarantee to set errno, but dlerror only gives us a
-    // string, so checking errno doesn't hurt to try to provide a better
-    // error message
-    if (errno == ENOENT) {
-      INFO(NCCL_INIT|NCCL_NET, "No network plugin found.");
-    } else {
-      INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror());
-    }
-    return ncclSuccess;
-  }
-  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
-  if (extNet == NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
-    goto cleanup;
-  }
-  if (initNet(extNet) == ncclSuccess) {
-    *net = extNet;
-    return ncclSuccess;
-  }
-cleanup:
-  if (netPluginLib != NULL) dlclose(netPluginLib);
-  return ncclSuccess;
-}
-
-ncclResult_t initNet() {
-  // Always initialize sockets as we use it for bootstrap
-  NCCLCHECK(initNet(&ncclNetSocket));
-
-  NCCLCHECK(initNetPlugin(&ncclNet));
-  if (ncclNet != NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName());
-    return ncclSuccess;
-  }
-  if (initNet(&ncclNetIb) == ncclSuccess) {
-    ncclNet = &ncclNetIb;
-  } else {
-    ncclNet = &ncclNetSocket;
-  }
-  INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName());
-  return ncclSuccess;
-}
-
-NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
-NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
-
-int ncclThreadThreshold(int minCompCap, int multiNode) {
-  int threshold = ncclParamThreadThreshold();
-  if (threshold == -2) { // user has not set this env variable
-    threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
-    // multiply by 2 if running on multiple nodes
-    if (multiNode) {
-      threshold *= 2;
-    }
-  }
-  return threshold;
-}
-
-bool useFineGrainVramPcie = false;
-
-void parseHsaForceFineGrainVramPcie() {
-  char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
-  if (str && strlen(str) > 0) {
-    errno = 0;
-    int64_t v = strtoll(str, NULL, 0);
-    if (errno || (v != 0 && v != 1)) {
-      INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \
-    } else {
-      useFineGrainVramPcie = v;
-      INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);  \
-    }
-  }
-}
-
-pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
-static bool initialized = false;
-static ncclResult_t ncclInit() {
-  if (initialized) return ncclSuccess;
-  pthread_mutex_lock(&initLock);
-  if (!initialized) {
-    initEnv();
-    initDebug();
-    initNet();
-    // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
-    parseHsaForceFineGrainVramPcie();
-    initialized = true;
-  }
-  pthread_mutex_unlock(&initLock);
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetVersion, int* version);
-ncclResult_t ncclGetVersion(int* version) {
-  if (version == NULL) return ncclInvalidArgument;
-  *version = NCCL_VERSION_CODE;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
-ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
-  NCCLCHECK(ncclInit());
-  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
-  return bootstrapGetUniqueId(out);
-}
-
-static ncclResult_t commFree(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
-
-  CUDACHECK(hipFree(comm->devComm));
-
-  for (int ring=0; ring<comm->nRings; ring++)
-    NCCLCHECK(freeRing(comm->rings+ring));
-
-  if (comm->doneEvent != NULL)
-    CUDACHECK(hipEventDestroy(comm->doneEvent));
-
-  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(hipStreamDestroy(comm->groupStream));
-  }
-
-  // Last rank frees shared resources between threads
-  int isLast;
-  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-  if (isLast) {
-    free(comm->intraBarrier);
-    free(comm->intraParams);
-    free(comm->intraCudaDevs);
-    free(comm->intraCGMode);
-    free(comm->intraCC);
-  }
-
-  free(comm);
-  return ncclSuccess;
-}
-
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
-  if (ndev < 1) {
-    WARN("invalid device count (%d) requested", ndev);
-    return ncclInvalidArgument;
-  }
-  if (rank >= ndev || rank < 0) {
-    WARN("rank %d exceeds ndev=%d", rank, ndev);
-    return ncclInvalidArgument;
-  }
-
-  // Try to create a CUDA object right away. If there is something wrong with
-  // the device we're on (failure cause #1) , better know it early.
-  hipEvent_t doneEvent;
-  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
-
-  struct ncclComm* comm;
-  NCCLCHECK(ncclCalloc(&comm, 1));
-
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
-  comm->rank = rank;
-  comm->nRanks = ndev;
-  hipGetDevice(&comm->cudaDev);
-  comm->doneEvent = doneEvent;
-  comm->llThreshold = ncclParamLlThreshold();
-  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  comm->groupCudaStream = ncclParamGroupCudaStream();
-#else
-  // Don't allow the user to overload the default setting in older CUDA builds
-  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
-#endif
-
-  comm->argsptr = &comm->args;
-
-  *comret = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t devCommSetup(ncclComm_t comm) {
-  // Fully duplicate the comm on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
-  // Copy the comm on the device
-  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
-  // Copy userRanks
-  for (int r=0; r<comm->nRings; r++) {
-    NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
-// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
-#else
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
-#endif
-static void showVersion() {
-  static int shown = 0;
-  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
-    printf("%s\n", VERSION_STRING);
-    fflush(stdout);
-    if (ncclDebugFile != stdout)
-      INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
-    shown = 1;
-  }
-}
-
-static ncclResult_t fillInfo(struct ncclInfo* info, int rank, uint64_t commHash) {
-  for (int t=0; t<NTRANSPORTS; t++) {
-    NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank, commHash));
-  }
-  return ncclSuccess;
-}
-
-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice);
-
-template <int type>
-static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
-  for (int t=0; t<NTRANSPORTS; t++) {
-    struct ncclTransport *transport = ncclTransports+t;
-    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
-    ncclTvalue_t ret = 0;
-    NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
-    if (ret > 0) {
-      cpu_set_t affinitySave;
-      nvmlDevice_t nvmlDevice;
-      int cudaDev;
-      CUDACHECK(hipGetDevice(&cudaDev));
-      sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      SetCpuAffinity(cudaDev, &nvmlDevice);
-      NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
-      *transportRet = transport;
-      sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      return ncclSuccess;
-    }
-  }
-  WARN("No transport found !");
-  *transportRet = NULL;
-  return ncclInternalError;
-}
-
-static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
-  NCCLCHECK(initRing(comm, ringid));
-
-  struct ncclRing* ring = comm->rings+ringid;
-  // Reorganize ranks to start with rank.
-  int shift;
-  for (shift = 0; shift<nranks; shift++) {
-    if (ringRanks[shift] == rank) {
-      break;
-    }
-  }
-  for (int i=0; i<nranks; i++) {
-    ring->userRanks[i] = ringRanks[(i+shift)%nranks];
-  }
-  int prev = ring->userRanks[nranks-1];
-  int next = ring->userRanks[1];
-
-  NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
-  NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
-  NCCLCHECK(transportCreateProxy(0, ring, comm));
-  NCCLCHECK(transportCreateProxy(1, ring, comm));
-  return ncclSuccess;
-}
-
-static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
-  for (int r=0; r<nranks; r++) {
-    connectTransport[r] = -1;
-    for (int t=0; t<NTRANSPORTS; t++) {
-      NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
-      if (connectValue[r] > 0) {
-        connectTransport[r] = t;
-        break;
-      }
-    }
-  }
-  return ncclSuccess;
-}
-
-static void swap(void* mem1, void* mem2, int size) {
-  char tmp[size];
-  memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
-}
-
-#define MAXWIDTH 64
-#define PREFIXLEN 15
-#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
-void dumpMatrix(int* connectMatrix, int nranks) {
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
-  INFO(NCCL_INIT,"%s", line);
-  for (int i=0; i<nranks; i++) {
-    memset(line, ' ', STRLENGTH);
-    sprintf(line, "%3d ", i);
-    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
-    INFO(NCCL_INIT,"%s", line);
-  }
-}
-
-void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
-  INFO(NCCL_INIT,"%s", line);
-  for (int i=0; i<nranks; i++) {
-    memset(line, ' ', STRLENGTH);
-    sprintf(line, "%3d ", i);
-    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
-    INFO(NCCL_INIT,"%s", line);
-  }
-}
-
-
-void dumpLine(int* values, int nranks, const char* prefix) {
-  int prefixlen = strlen(prefix);
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  strncpy(line, prefix, PREFIXLEN);
-  for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
-  INFO(NCCL_INIT,"%s", line);
-}
-
-static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
-  for (int r=0; r<nrings; r++) {
-    char prefix[30];
-    /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
-    dumpLine(prev+r*nranks, nranks, prefix);
-    sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
-    dumpLine(next+r*nranks, nranks, prefix);*/
-
-    int current = rank;
-    for (int i=0; i<nranks; i++) {
-      rings[r*nranks+i] = current;
-      current = next[r*nranks+current];
-    }
-    sprintf(prefix, "Ring %02d : ", r);
-    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
-    if (current != rank) {
-      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
-      return ncclInternalError;
-    }
-    // Check that all ranks are there
-    for (int i=0; i<nranks; i++) {
-      int found = 0;
-      for (int j=0; j<nranks; j++) {
-        if (rings[r*nranks+j] == i) {
-          found = 1;
-          break;
-        }
-      }
-      if (found == 0) {
-        WARN("Error : ring %d does not contain rank %d", r, i);
-        return ncclInternalError;
-      }
-    }
-  }
-  return ncclSuccess;
-}
-
-void* waitForNonNullPtr(void* p) {
-  volatile void** ptr = (volatile void**) p;
-  while (LOAD(ptr) == NULL) sched_yield();
-  return (void*)LOAD(ptr);
-}
-
-ncclResult_t initParams(struct ncclComm* comm) {
-  hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
-  params->args = (void **)&comm->argsptr;
-  params->stream = NULL;
-  params->sharedMem = 0;
-  params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
-  params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
-  return ncclSuccess;
-}
-
-// Allocate/Set Intra Process Structures and set CG options
-ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
-  comm->intraRank = rank;
-  comm->intraRanks = ranks;
-  comm->intraPhase = 0;
-
-  // Alloc shared structures
-  if (rank == 0) {
-    assert(comm == comm0);
-    int* bar;
-    NCCLCHECK(ncclCalloc(&bar, 2));
-    bar[0] = bar[1] = 0;
-    comm->intraBarrier = bar;
-    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
-    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
-    int* CGMode;
-    NCCLCHECK(ncclCalloc(&CGMode, 1));
-    *CGMode = 0x11;
-    comm->intraCGMode = CGMode;
-    int* CC;
-    NCCLCHECK(ncclCalloc(&CC, 1));
-    *CC = ncclCudaFullCompCap();
-    comm->intraCC = CC;
-  } else {
-    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
-    comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
-    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
-    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
-    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
-  }
-  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
-  NCCLCHECK(initParams(comm));
-
-  int cgMdLaunch = 1;
-
-  // Set CG Mode
-  comm->launchMode = ncclComm::GROUP;
-  char* str = getenv("NCCL_LAUNCH_MODE");
-  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
-    comm->launchMode = ncclComm::PARALLEL;
-  }
-  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
-#if CUDART_VERSION >= 9000
-    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
-      // Check whether the GPU supports Cooperative Group Multi Device Launch
-      (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
-    }
-#endif
-  }
-
-  // Disable cgMdLaunch if any rank does not support it
-  if (cgMdLaunch == 0) {
-    *comm->intraCGMode = 0x10;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
-  int rank = comm->rank;
-  int nranks = comm->nRanks;
-  void* commState;
-  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
-  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
-
-  struct ncclInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  NCCLCHECK(fillInfo(allInfo+rank, rank, commHash));
-  NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
-
-  int* connectTransport;
-  ncclTvalue_t* connectValue;
-  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
-  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
-
-  NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-  NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
-  NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
-  //if (rank == 0) dumpMatrix(connectTransport, nranks);
-  //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
-
-  // Get my rings
-  int nrings;
-  int* prev, *next;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
-  comm->nThreads = getDefaultThreads();
-  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
-  free(connectTransport);
-  free(connectValue);
-
-  // Find max nThreads
-  int allData[nranks];
-  allData[rank] = comm->nThreads;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    comm->nThreads = std::max(allData[i], comm->nThreads);
-  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
-
-  // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = ncclCudaCompCap();
-  int minCompCap = myCompCap;
-  allData[rank] = myCompCap;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    minCompCap = std::min(allData[i], minCompCap);
-  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
-
-  // Find min nrings across ranks
-  allData[rank] = nrings;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    nrings = std::min(allData[i], nrings);
-
-  // Exchange data with others to build complete rings
-  comm->nRings = nrings;
-  for (int r=0; r<nrings; r++) {
-    NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
-    NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
-  }
-  int *rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
-  NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
-  free(prev);
-  free(next);
-
-  // Connect with prev/next for each ring
-  struct ncclConnect *connectData;
-  NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
-  for (int r=0; r<nrings; r++) {
-    int* ringRanks = rings+r*nranks;
-    struct ncclRing *ring = comm->rings+r;
-    NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
-    int prev_offset = ring->userRanks[nranks-1]*2+1;
-    int next_offset = ring->userRanks[1]*2;
-    NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
-    NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
-    NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
-  }
-  free(connectData);
-  free(rings);
-  free(allInfo);
-
-  // Intra-process barrier setup
-  struct rankInfo {
-    uint64_t hostHash;
-    uint64_t pidHash;
-    struct ncclComm* comm;
-  } rankInfos[nranks];
-  rankInfos[rank].hostHash = getHostHash();
-  rankInfos[rank].pidHash = getPidHash();
-  rankInfos[rank].comm = comm;
-  NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
-
-  // Compute intra ranks
-  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-  int multiNode = 0;
-  for (int r=0; r<nranks; r++) {
-    if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
-        (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
-      if (intraRanks == 0) intraRank0 = r;
-      if (r == rank) intraRank = intraRanks;
-      intraRanks++;
-    } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
-      multiNode = 1;
-    }
-  }
-  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-      rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
-  if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
-    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
-    return ncclInternalError;
-  }
-  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
-
-  // Determine thread threshold across all GPUs
-  comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
-
-  // Barrier
-  bootstrapClose(commState);
-  return ncclSuccess;
-}
-
-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  if (numa_available() < 0) {
-    WARN("System does not support NUMA API!");
-    return false;
-  }
-  char* cudaPath;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
-  strcat(cudaPath, "/numa_node");
-  int fd;
-  SYSCHECKVAL(open(cudaPath, O_RDONLY), "open", fd);
-  char numa_node[5];
-  int len;
-  SYSCHECKVAL(read(fd, numa_node, 4), "read", len);
-  SYSCHECK(close(fd), "close");
-  errno = 0;
-  long node = strtol(numa_node, NULL, 10);
-  if (errno == ERANGE || errno == EINVAL) {
-    INFO(NCCL_ALL,"%s: Call to strtol returned %s", __func__, strerror(errno));
-    free(cudaPath);
-    return false;
-  }
-  numa_run_on_node(node);
-  numa_set_preferred(node);
-  free(cudaPath);
-  return true;
-#else
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  if (hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != hipSuccess) return false;
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
-  if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
-    WARN("Failed to set CPU affinity");
-    return false;
-  }
-  return true;
-#endif
-}
-
-ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-
-  // Make sure all host memory allocation are close to the GPU
-  int cudaDev;
-  nvmlDevice_t nvmlDevice;
-  CUDACHECK(hipGetDevice(&cudaDev));
-  SetCpuAffinity(cudaDev, &nvmlDevice);
-  ncclResult_t res;
-
-  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
-  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
-  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
-
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
-
-  return ncclSuccess;
-cleanup:
-  *newcomm = NULL;
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
-ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
-  char* env = getenv("NCCL_COMM_ID");
-  if (env && myrank == 0) {
-    NCCLCHECK(bootstrapCreateRoot(&commId, true));
-  }
-
-  NCCLCHECK(ncclInit());
-  if (myrank == 0) showVersion();
-
-  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
-
-  // Make sure the CUDA runtime is initialized.
-  CUDACHECK(hipFree(NULL));
-
-  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
-  if (nranks < 1 || myrank < 0 || myrank >= nranks) {
-    WARN("Invalid rank requested : %d/%d", myrank, nranks);
-    return ncclInvalidArgument;
-  }
-
-  if (ncclAsyncMode()) {
-    int cudaDev;
-    CUDACHECK(hipGetDevice(&cudaDev));
-    return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
-  } else {
-    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
-  }
-}
-
-static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
-  struct ncclInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(hipSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
-  }
-
-  int* connectTransport;
-  ncclTvalue_t* connectValue;
-  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
-  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
-  for (int rank=0; rank<nranks; rank++)
-    NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-
-  int* prev, *prevFinal, *next, *nextFinal;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
-  int nrings = MAXRINGS;
-  int nthreads=0;
-  int myCompCap = ncclCudaCompCap();
-  int minCompCap = myCompCap;
-  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(hipSetDevice(devs[rank]));
-    int nringsRank;
-    int nthreadsRank = getDefaultThreads();
-    myCompCap = ncclCudaCompCap();
-    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
-    nrings = std::min(nrings, nringsRank);
-    nthreads = std::max(nthreads, nthreadsRank);
-    minCompCap = std::min(minCompCap, myCompCap);
-    for (int ring=0; ring<nrings; ring++) {
-      int index = ring*nranks+rank;
-      prevFinal[index] = prev[index];
-      nextFinal[index] = next[index];
-    }
-  }
-  free(connectTransport);
-  free(connectValue);
-  free(prev);
-  free(next);
-
-  INFO(NCCL_INIT,"Using %d threads", nthreads);
-  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
-
-  int* rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
-  NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
-  free(prevFinal);
-  free(nextFinal);
-
-  // Determine thread threshold across all GPUs
-  int threadThreshold = ncclThreadThreshold(minCompCap, 0);
-
-  for (int rank=0; rank<nranks; rank++) {
-    comms[rank]->nRings = nrings;
-    comms[rank]->nThreads = nthreads;
-    comms[rank]->threadThreshold = threadThreshold;
-  }
-
-  for (int r=0; r<nrings; r++) {
-    struct ncclConnect connect[2*nranks];
-    int* ringRanks = rings+r*nranks;
-    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(hipSetDevice(devs[rank]));
-      NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
-    }
-    // RingExchange connect information
-    for (int rank=0; rank<nranks; rank++) {
-      // Swap rank->prev and prevRank->next
-      struct ncclRing *ring = comms[rank]->rings+r;
-      int prevRank = ring->userRanks[nranks-1];
-      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
-      struct ncclConnect* rankPrevConnect = connect+2*rank;
-      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
-    }
-    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(hipSetDevice(devs[rank]));
-      struct ncclRing *ring = comms[rank]->rings+r;
-      NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
-      NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
-    }
-  }
-  free(rings);
-  free(allInfo);
-  return ncclSuccess;
-}
-
-
-NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
-ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
-  NCCLCHECK(ncclInit());
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-  showVersion();
-
-  INFO(NCCL_INIT,"nranks %d", ndev);
-
-  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
-  if (ndev < 1) {
-    WARN("Invalid device count requested : %d", ndev);
-    return ncclInvalidArgument;
-  }
-
-  ncclResult_t res;
-  int savedDevice;
-  int rank, cudaDev;
-  ncclComm_t comm = NULL;
-  nvmlDevice_t nvmlDevice;
-  int ncclDevList[ndev];
-  for (int i=0; i<ndev; i++) {
-    ncclDevList[i] = devlist ? devlist[i] : i;
-  }
-
-  hipGetDevice(&savedDevice);
-
-  for(rank=0; rank<ndev; ++rank)
-    comms[rank] = NULL;
-
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  for (rank=0; rank<ndev; ++rank) {
-    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
-
-    SetCpuAffinity(cudaDev, &nvmlDevice);
-
-    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
-    comms[rank] = comm;
-
-    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
-  }
-
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
-
-  for(rank=0; rank<ndev; ++rank) {
-    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
-    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
-  }
-
-  res = ncclSuccess;
-  goto final;
-
-cleanup:
-  for(rank=0; rank<ndev; ++rank) {
-    if(comms[rank] != NULL) {
-      commFree(comms[rank]);
-    }
-  }
-
-final:
-  if(wrapNvmlShutdown() != ncclSuccess)
-    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
-  hipSetDevice(savedDevice);
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
-
-  if (comm == NULL)
-    return ncclSuccess;
-  int savedDevice;
-  CUDACHECK(hipGetDevice(&savedDevice));
-  int commDevice = comm->cudaDev;
-
-  if (savedDevice != commDevice) {
-    CUDACHECK(hipSetDevice(commDevice));
-  }
-
-  NCCLCHECK(commFree(comm));
-
-  if (savedDevice != commDevice)
-    CUDACHECK(hipSetDevice(savedDevice));
-
-  return ncclSuccess;
-}
-
-NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
-const char* ncclGetErrorString(ncclResult_t code) {
-  switch (code) {
-    case ncclSuccess                : return "no error";
-    case ncclUnhandledCudaError     : return "unhandled cuda error";
-    case ncclSystemError            : return "unhandled system error";
-    case ncclInternalError          : return "internal error";
-    case ncclInvalidArgument        : return "invalid argument";
-    case ncclInvalidUsage           : return "invalid usage";
-    default                         : return "unknown result code";
-  }
-}
-
-NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
-  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
-  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
-  *count = comm->nRanks;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
-  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
-  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
-  *devid = comm->cudaDev;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
-  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
-  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
-  *rank = comm->rank;
-  return ncclSuccess;
-}
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h"
+
+static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+  hipPointerAttribute_t attr;
+  hipError_t err = hipPointerGetAttributes(&attr, pointer);
+  if (err != hipSuccess || attr.devicePointer == NULL) {
+    WARN("%s : %s is not a valid pointer", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 10000
+  if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+  if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+  if (ptr == NULL) {
+    WARN("%s : %s argument is NULL", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ArgsCheck(struct ncclInfo* info) {
+  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
+  // First, the easy ones
+  if (info->root < 0 || info->root >= info->comm->nRanks) {
+    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
+    return ncclInvalidArgument;
+  }
+  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
+    WARN("%s : invalid type %d", info->opName, info->datatype);
+    return ncclInvalidArgument;
+  }
+  // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
+  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+    info->count = info->nBytes;
+    info->datatype = ncclInt8;
+  }
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+
+  if (info->op < 0 || info->op >= ncclNumOps) {
+    WARN("%s : invalid reduction operation %d", info->opName, info->op);
+    return ncclInvalidArgument;
+  }
+
+  if (info->comm->checkPointers) {
+    // Check CUDA device pointers
+    if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+    }
+    if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+    }
+  }
+  return ncclSuccess;
+}
--- a/Показать больше
+++ b/Показать больше