diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
index 237e9242b3..c33228b186 100644
--- a/projects/rccl/CMakeLists.txt
+++ b/projects/rccl/CMakeLists.txt
@@ -55,8 +55,16 @@ else()
 endif()
 
 # Setup VERSION
-set(VERSION_STRING "2.6.0")
-rocm_setup_version(VERSION ${VERSION_STRING})
+set(VERSION_STRING "2.6.0.")
+
+# Use a numeric Jenkins BUILD_NUMBER as the last version field; otherwise 0
+if("$ENV{BUILD_NUMBER}" MATCHES "^[0-9]+$")
+  string(CONCAT BUILD_VERSION "${VERSION_STRING}" "$ENV{BUILD_NUMBER}")
+else()
+  string(CONCAT BUILD_VERSION "${VERSION_STRING}" "0")
+endif()
+
+rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION)
 
 list(APPEND CMAKE_PREFIX_PATH
             /opt/rocm
@@ -79,27 +87,12 @@ include_directories(src/collectives)
 include_directories(src/collectives/device)
 
 set(CU_SOURCES
-    src/bootstrap.cu
-    src/collectives/all_gather.cu
-    src/collectives/all_reduce.cu
-    src/collectives/broadcast.cu
-    src/collectives/reduce.cu
-    src/collectives/reduce_scatter.cu
-    src/collectives/device/functions.cu
-    src/init.cu
-    src/misc/enqueue.cu
-    src/misc/group.cu
-    src/misc/ibvwrap.cu
-    src/misc/nvmlwrap_stub.cu
-    src/misc/rings.cu
-    src/misc/utils.cu
-    src/ring.cu
-    src/transport.cu
-    src/transport/net.cu
-    src/transport/net_ib.cu
-    src/transport/net_socket.cu
-    src/transport/p2p.cu
-    src/transport/shm.cu)
+    src/collectives/device/all_reduce.cu
+    src/collectives/device/all_gather.cu
+    src/collectives/device/reduce.cu
+    src/collectives/device/broadcast.cu
+    src/collectives/device/reduce_scatter.cu
+    src/collectives/device/functions.cu)
 
 set(CPP_SOURCES)
 foreach(filename ${CU_SOURCES})
@@ -111,20 +104,34 @@ foreach(filename ${CU_SOURCES})
   list(APPEND CPP_SOURCES ${cpp_filename})
 endforeach(filename)
 
-list(APPEND CPP_SOURCES src/collectives/device/all_gather_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_1.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_2.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/all_reduce_3.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/broadcast_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_1.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_2.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_3.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_0.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_1.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_2.cpp)
-list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_3.cpp)
+set(CC_SOURCES
+    src/init.cc
+    src/collectives/all_reduce.cc
+    src/collectives/all_gather.cc
+    src/collectives/reduce.cc
+    src/collectives/broadcast.cc
+    src/collectives/reduce_scatter.cc
+    src/channel.cc
+    src/misc/trees.cc
+    src/misc/rings.cc
+    src/misc/argcheck.cc
+    src/misc/group.cc
+    src/misc/utils.cc
+    src/misc/ibvwrap.cc
+    src/misc/nvmlwrap_stub.cc
+    src/misc/topo.cc
+    src/transport/net.cc
+    src/transport/net_ib.cc
+    src/transport/net_socket.cc
+    src/transport/p2p.cc
+    src/transport/shm.cc
+    src/transport.cc
+    src/bootstrap.cc
+    src/enqueue.cc)
+
+# Unlike CU_SOURCES, which are appended under a derived cpp_filename, the
+# CC_SOURCES entries are added to CPP_SOURCES unchanged, in a single call.
+list(APPEND CPP_SOURCES ${CC_SOURCES})
 
 add_library(rccl ${CPP_SOURCES})
 
@@ -132,18 +139,20 @@ if(TRACE)
   add_definitions(-DENABLE_TRACE)
 endif()
 
+if(PROFILE)
+  add_definitions(-DENABLE_PROFILING)
+endif()
+
 target_link_libraries(rccl
   PRIVATE --amdgpu-target=gfx803
   PRIVATE --amdgpu-target=gfx900
-  PRIVATE --amdgpu-target=gfx906
-  PRIVATE --amdgpu-target=gfx908)
+  PRIVATE --amdgpu-target=gfx906)
 
 if("${HIP_COMPILER}" MATCHES "clang")
   target_compile_options(rccl
     PRIVATE --amdgpu-target=gfx803
     PRIVATE --amdgpu-target=gfx900
     PRIVATE --amdgpu-target=gfx906
-    PRIVATE --amdgpu-target=gfx908
     PRIVATE -fgpu-rdc)
   target_link_libraries(rccl PRIVATE -fgpu-rdc)
   target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
diff --git a/projects/rccl/Jenkinsfile b/projects/rccl/Jenkinsfile
index c53a81da98..d07a0b8140 100644
--- a/projects/rccl/Jenkinsfile
+++ b/projects/rccl/Jenkinsfile
@@ -80,7 +80,7 @@ rcclCI:
                       sudo dpkg -i package/*.deb
                       """
 
-        
+
         //platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
     }
 
diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt
index 6b9c6a3138..60db84a684 100644
--- a/projects/rccl/LICENSE.txt
+++ b/projects/rccl/LICENSE.txt
@@ -1,5 +1,5 @@
 
- Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
diff --git a/projects/rccl/Makefile b/projects/rccl/Makefile
index 605e3bfaad..caed3d42ac 100644
--- a/projects/rccl/Makefile
+++ b/projects/rccl/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/docs/Doxyfile b/projects/rccl/docs/Doxyfile
index 3d28cf5388..42dae7cc30 100644
--- a/projects/rccl/docs/Doxyfile
+++ b/projects/rccl/docs/Doxyfile
@@ -162,7 +162,7 @@ FULL_PATH_NAMES        = YES
 # will be relative from the directory where doxygen is started.
 # This tag requires that the tag FULL_PATH_NAMES is set to YES.
 
-STRIP_FROM_PATH        = 
+STRIP_FROM_PATH        =
 
 # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
 # path mentioned in the documentation of a class, which tells the reader which
@@ -171,7 +171,7 @@ STRIP_FROM_PATH        =
 # specify the list of include paths that are normally passed to the compiler
 # using the -I flag.
 
-STRIP_FROM_INC_PATH    = 
+STRIP_FROM_INC_PATH    =
 
 # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
 # less readable) file names. This can be useful is your file systems doesn't
@@ -238,13 +238,13 @@ TAB_SIZE               = 4
 # "Side Effects:". You can put \n's in the value part of an alias to insert
 # newlines.
 
-ALIASES                = 
+ALIASES                =
 
 # This tag can be used to specify a number of word-keyword mappings (TCL only).
 # A mapping has the form "name=value". For example adding "class=itcl::class"
 # will allow you to use the command class in the itcl::class meaning.
 
-TCL_SUBST              = 
+TCL_SUBST              =
 
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
@@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL   = NO
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
 # the files are not read by doxygen.
 
-EXTENSION_MAPPING      = 
+EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
@@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES
 # sections, marked by \if <section_label> ... \endif and \cond <section_label>
 # ... \endcond blocks.
 
-ENABLED_SECTIONS       = 
+ENABLED_SECTIONS       =
 
 # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
 # initial value of a variable or macro / define can have for it to appear in the
@@ -683,7 +683,7 @@ SHOW_NAMESPACES        = YES
 # by doxygen. Whatever the program writes to standard output is used as the file
 # version. For an example see the documentation.
 
-FILE_VERSION_FILTER    = 
+FILE_VERSION_FILTER    =
 
 # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
 # by doxygen. The layout file controls the global structure of the generated
@@ -696,7 +696,7 @@ FILE_VERSION_FILTER    =
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
 # tag is left empty.
 
-LAYOUT_FILE            = 
+LAYOUT_FILE            =
 
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
@@ -706,7 +706,7 @@ LAYOUT_FILE            =
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
 
-CITE_BIB_FILES         = 
+CITE_BIB_FILES         =
 
 #---------------------------------------------------------------------------
 # Configuration options related to warning and progress messages
@@ -765,7 +765,7 @@ WARN_FORMAT            = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).
 
-WARN_LOGFILE           = 
+WARN_LOGFILE           =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the input files
@@ -858,7 +858,7 @@ RECURSIVE              = NO
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = 
+EXCLUDE                =
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS       = NO
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*
 
-EXCLUDE_PATTERNS       = 
+EXCLUDE_PATTERNS       =
 
 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
 # (namespaces, classes, functions, etc.) that should be excluded from the
@@ -885,13 +885,13 @@ EXCLUDE_PATTERNS       =
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories use the pattern */test/*
 
-EXCLUDE_SYMBOLS        = 
+EXCLUDE_SYMBOLS        =
 
 # The EXAMPLE_PATH tag can be used to specify one or more files or directories
 # that contain example code fragments that are included (see the \include
 # command).
 
-EXAMPLE_PATH           = 
+EXAMPLE_PATH           =
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
@@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE      = NO
 # that contain images that are to be included in the documentation (see the
 # \image command).
 
-IMAGE_PATH             = 
+IMAGE_PATH             =
 
 # The INPUT_FILTER tag can be used to specify a program that doxygen should
 # invoke to filter for each input file. Doxygen will invoke the filter program
@@ -928,7 +928,7 @@ IMAGE_PATH             =
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
 
-INPUT_FILTER           = 
+INPUT_FILTER           =
 
 # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
 # basis. Doxygen will compare the file name with each pattern and apply the
@@ -937,7 +937,7 @@ INPUT_FILTER           =
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
 
-FILTER_PATTERNS        = 
+FILTER_PATTERNS        =
 
 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
 # INPUT_FILTER) will also be used to filter the input files that are used for
@@ -952,7 +952,7 @@ FILTER_SOURCE_FILES    = NO
 # *.ext= (so without naming a filter).
 # This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
 
-FILTER_SOURCE_PATTERNS = 
+FILTER_SOURCE_PATTERNS =
 
 # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
 # is part of the input, its contents will be placed on the main page
@@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO
 # specified with INPUT and INCLUDE_PATH.
 # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
 
-CLANG_OPTIONS          = 
+CLANG_OPTIONS          =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
@@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX    = 5
 # while generating the index headers.
 # This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
 
-IGNORE_PREFIX          = 
+IGNORE_PREFIX          =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the HTML output
@@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION    = .html
 # of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_HEADER            = 
+HTML_HEADER            =
 
 # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
 # generated HTML page. If the tag is left blank doxygen will generate a standard
@@ -1144,7 +1144,7 @@ HTML_HEADER            =
 # that doxygen normally uses.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_FOOTER            = 
+HTML_FOOTER            =
 
 # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
 # sheet that is used by each HTML page. It can be used to fine-tune the look of
@@ -1156,7 +1156,7 @@ HTML_FOOTER            =
 # obsolete.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_STYLESHEET        = 
+HTML_STYLESHEET        =
 
 # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
 # cascading style sheets that are included after the standard style sheets
@@ -1169,7 +1169,7 @@ HTML_STYLESHEET        =
 # list). For an example see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_EXTRA_STYLESHEET  = 
+HTML_EXTRA_STYLESHEET  =
 
 # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the HTML output directory. Note
@@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET  =
 # files will be copied as-is; there are no commands or markers available.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_EXTRA_FILES       = 
+HTML_EXTRA_FILES       =
 
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
@@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP      = NO
 # written to the html output directory.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
-CHM_FILE               = 
+CHM_FILE               =
 
 # The HHC_LOCATION tag can be used to specify the location (absolute path
 # including file name) of the HTML help compiler (hhc.exe). If non-empty,
@@ -1316,7 +1316,7 @@ CHM_FILE               =
 # The file has to be specified with full path.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
-HHC_LOCATION           = 
+HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
 # (YES) or that it should be included in the master .chm file (NO).
@@ -1329,7 +1329,7 @@ GENERATE_CHI           = NO
 # and project file content.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
-CHM_INDEX_ENCODING     = 
+CHM_INDEX_ENCODING     =
 
 # The BINARY_TOC flag controls whether a binary table of contents is generated
 # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
@@ -1360,7 +1360,7 @@ GENERATE_QHP           = NO
 # the HTML output folder.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
-QCH_FILE               = 
+QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
@@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER     = doc
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
-QHP_CUST_FILTER_NAME   = 
+QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
@@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME   =
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
-QHP_CUST_FILTER_ATTRS  = 
+QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
 # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
-QHP_SECT_FILTER_ATTRS  = 
+QHP_SECT_FILTER_ATTRS  =
 
 # The QHG_LOCATION tag can be used to specify the location of Qt's
 # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
 # generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
-QHG_LOCATION           = 
+QHG_LOCATION           =
 
 # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
 # generated, together with the HTML files, they form an Eclipse help plugin. To
@@ -1540,7 +1540,7 @@ MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_EXTENSIONS     = 
+MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
@@ -1548,7 +1548,7 @@ MATHJAX_EXTENSIONS     =
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_CODEFILE       = 
+MATHJAX_CODEFILE       =
 
 # When the SEARCHENGINE tag is enabled doxygen will generate a search box for
 # the HTML output. The underlying search engine uses javascript and DHTML and
@@ -1608,7 +1608,7 @@ EXTERNAL_SEARCH        = NO
 # Searching" for details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-SEARCHENGINE_URL       = 
+SEARCHENGINE_URL       =
 
 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
 # search data is written to a file for indexing by an external tool. With the
@@ -1624,7 +1624,7 @@ SEARCHDATA_FILE        = searchdata.xml
 # projects and redirect the results back to the right project.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-EXTERNAL_SEARCH_ID     = 
+EXTERNAL_SEARCH_ID     =
 
 # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
 # projects other than the one defined by this configuration file, but that are
@@ -1634,7 +1634,7 @@ EXTERNAL_SEARCH_ID     =
 # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-EXTRA_SEARCH_MAPPINGS  = 
+EXTRA_SEARCH_MAPPINGS  =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the LaTeX output
@@ -1698,7 +1698,7 @@ PAPER_TYPE             = a4
 # If left blank no extra packages will be included.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-EXTRA_PACKAGES         = 
+EXTRA_PACKAGES         =
 
 # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
 # generated LaTeX document. The header should contain everything until the first
@@ -1714,7 +1714,7 @@ EXTRA_PACKAGES         =
 # to HTML_HEADER.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-LATEX_HEADER           = 
+LATEX_HEADER           =
 
 # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
 # generated LaTeX document. The footer should contain everything after the last
@@ -1725,7 +1725,7 @@ LATEX_HEADER           =
 # Note: Only use a user-defined footer if you know what you are doing!
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-LATEX_FOOTER           = 
+LATEX_FOOTER           =
 
 # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
 # LaTeX style sheets that are included after the standard style sheets created
@@ -1736,7 +1736,7 @@ LATEX_FOOTER           =
 # list).
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-LATEX_EXTRA_STYLESHEET = 
+LATEX_EXTRA_STYLESHEET =
 
 # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the LATEX_OUTPUT output
@@ -1744,7 +1744,7 @@ LATEX_EXTRA_STYLESHEET =
 # markers available.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-LATEX_EXTRA_FILES      = 
+LATEX_EXTRA_FILES      =
 
 # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
 # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
@@ -1844,14 +1844,14 @@ RTF_HYPERLINKS         = NO
 # default style sheet that doxygen normally uses.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
-RTF_STYLESHEET_FILE    = 
+RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
 # similar to doxygen's config file. A template extensions file can be generated
 # using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
-RTF_EXTENSIONS_FILE    = 
+RTF_EXTENSIONS_FILE    =
 
 # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
 # with syntax highlighting in the RTF output.
@@ -1896,7 +1896,7 @@ MAN_EXTENSION          = .3
 # MAN_EXTENSION with the initial . removed.
 # This tag requires that the tag GENERATE_MAN is set to YES.
 
-MAN_SUBDIR             = 
+MAN_SUBDIR             =
 
 # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
 # will generate one additional man file for each entity documented in the real
@@ -1915,7 +1915,7 @@ MAN_LINKS              = NO
 # captures the structure of the code including all documentation.
 # The default value is: NO.
 
-GENERATE_XML           = YES 
+GENERATE_XML           = YES
 
 # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
 # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -2009,7 +2009,7 @@ PERLMOD_PRETTY         = YES
 # overwrite each other's variables.
 # This tag requires that the tag GENERATE_PERLMOD is set to YES.
 
-PERLMOD_MAKEVAR_PREFIX = 
+PERLMOD_MAKEVAR_PREFIX =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the preprocessor
@@ -2050,7 +2050,7 @@ SEARCH_INCLUDES        = YES
 # preprocessor.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
-INCLUDE_PATH           = 
+INCLUDE_PATH           =
 
 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
@@ -2058,7 +2058,7 @@ INCLUDE_PATH           =
 # used.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-INCLUDE_FILE_PATTERNS  = 
+INCLUDE_FILE_PATTERNS  =
 
 # The PREDEFINED tag can be used to specify one or more macro names that are
 # defined before the preprocessor is started (similar to the -D option of e.g.
@@ -2068,7 +2068,7 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED             = 
+PREDEFINED             =
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
@@ -2077,7 +2077,7 @@ PREDEFINED             =
 # definition found in the source code.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-EXPAND_AS_DEFINED      = 
+EXPAND_AS_DEFINED      =
 
 # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
 # remove all references to function-like macros that are alone on a line, have
@@ -2106,13 +2106,13 @@ SKIP_FUNCTION_MACROS   = YES
 # the path). If a tag file is not located in the directory in which doxygen is
 # run, you must also specify the path to the tagfile here.
 
-TAGFILES               = 
+TAGFILES               =
 
 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a
 # tag file that is based on the input files it reads. See section "Linking to
 # external documentation" for more information about the usage of tag files.
 
-GENERATE_TAGFILE       = 
+GENERATE_TAGFILE       =
 
 # If the ALLEXTERNALS tag is set to YES, all external class will be listed in
 # the class index. If set to NO, only the inherited external classes will be
@@ -2161,14 +2161,14 @@ CLASS_DIAGRAMS         = NO
 # the mscgen tool resides. If left empty the tool is assumed to be found in the
 # default search path.
 
-MSCGEN_PATH            = 
+MSCGEN_PATH            =
 
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
 # If left empty dia is assumed to be found in the default search path.
 
-DIA_PATH               = 
+DIA_PATH               =
 
 # If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
@@ -2217,7 +2217,7 @@ DOT_FONTSIZE           = 10
 # the path where dot can find it using this tag.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_FONTPATH           = 
+DOT_FONTPATH           =
 
 # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
 # each documented class showing the direct and indirect inheritance relations.
@@ -2361,26 +2361,26 @@ INTERACTIVE_SVG        = NO
 # found. If left blank, it is assumed the dot tool can be found in the path.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOT_PATH               = 
+DOT_PATH               =
 
 # The DOTFILE_DIRS tag can be used to specify one or more directories that
 # contain dot files that are included in the documentation (see the \dotfile
 # command).
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DOTFILE_DIRS           = 
+DOTFILE_DIRS           =
 
 # The MSCFILE_DIRS tag can be used to specify one or more directories that
 # contain msc files that are included in the documentation (see the \mscfile
 # command).
 
-MSCFILE_DIRS           = 
+MSCFILE_DIRS           =
 
 # The DIAFILE_DIRS tag can be used to specify one or more directories that
 # contain dia files that are included in the documentation (see the \diafile
 # command).
 
-DIAFILE_DIRS           = 
+DIAFILE_DIRS           =
 
 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
 # path where java can find the plantuml.jar file. If left blank, it is assumed
@@ -2388,12 +2388,12 @@ DIAFILE_DIRS           =
 # generate a warning when it encounters a \startuml command in this case and
 # will not generate output for the diagram.
 
-PLANTUML_JAR_PATH      = 
+PLANTUML_JAR_PATH      =
 
 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.
 
-PLANTUML_INCLUDE_PATH  = 
+PLANTUML_INCLUDE_PATH  =
 
 # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
 # that will be shown in the graph. If the number of nodes in a graph becomes
diff --git a/projects/rccl/docs/source/allapi.rst b/projects/rccl/docs/source/allapi.rst
index 364cb40d6b..cc54d2419c 100644
--- a/projects/rccl/docs/source/allapi.rst
+++ b/projects/rccl/docs/source/allapi.rst
@@ -1,5 +1,5 @@
 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
    :caption: Contents:
 
 =======
@@ -8,4 +8,4 @@ All API
 
 .. doxygenindex::
 
- 
+
diff --git a/projects/rccl/docs/source/api.rst b/projects/rccl/docs/source/api.rst
index b0b44bb5b9..8e316acda1 100644
--- a/projects/rccl/docs/source/api.rst
+++ b/projects/rccl/docs/source/api.rst
@@ -1,5 +1,5 @@
 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
    :caption: Contents:
 
 ===
diff --git a/projects/rccl/docs/source/index.rst b/projects/rccl/docs/source/index.rst
index 3d9b62aa4a..04943ca350 100644
--- a/projects/rccl/docs/source/index.rst
+++ b/projects/rccl/docs/source/index.rst
@@ -7,10 +7,10 @@ Welcome to RCCL's documentation!
 ==================================
 
 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
    :caption: Contents:
 
-   library 
+   library
    api
    allapi
 
diff --git a/projects/rccl/docs/source/library.rst b/projects/rccl/docs/source/library.rst
index cbb0b95048..a7fae1dafc 100644
--- a/projects/rccl/docs/source/library.rst
+++ b/projects/rccl/docs/source/library.rst
@@ -1,6 +1,6 @@
 
 .. toctree::
-   :maxdepth: 4 
+   :maxdepth: 4
    :caption: Contents:
 
 ======
@@ -10,4 +10,4 @@ RCCL
 Introduction
 ------------
 
-The RCCL is an AMD port of NCCL. 
+The RCCL is an AMD port of NCCL.
diff --git a/projects/rccl/ext-net/dummy/Makefile b/projects/rccl/ext-net/dummy/Makefile
index d1eb4c5a62..efa841c53c 100644
--- a/projects/rccl/ext-net/dummy/Makefile
+++ b/projects/rccl/ext-net/dummy/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/ext-net/dummy/plugin.c b/projects/rccl/ext-net/dummy/plugin.c
index f11b36590d..67d7d88411 100644
--- a/projects/rccl/ext-net/dummy/plugin.c
+++ b/projects/rccl/ext-net/dummy/plugin.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/hipify.sh b/projects/rccl/hipify.sh
deleted file mode 100755
index e389fb519e..0000000000
--- a/projects/rccl/hipify.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
-
-FILES="
-./src/nccl.h.in
-./src/bootstrap.cu
-./src/collectives/all_gather.cu
-./src/collectives/all_reduce.cu
-./src/collectives/broadcast.cu
-./src/collectives/collectives.h
-./src/collectives/device/all_gather.cu
-./src/collectives/device/all_gather.h
-./src/collectives/device/all_reduce.cu
-./src/collectives/device/all_reduce.h
-./src/collectives/device/broadcast.cu
-./src/collectives/device/broadcast.h
-./src/collectives/device/common.h
-./src/collectives/device/common_kernel.h
-./src/collectives/device/functions.cu
-./src/collectives/device/ll_kernel.h
-./src/collectives/device/primitives.h
-./src/collectives/device/reduce.cu
-./src/collectives/device/reduce.h
-./src/collectives/device/reduce_kernel.h
-./src/collectives/device/reduce_scatter.cu
-./src/collectives/device/reduce_scatter.h
-./src/collectives/reduce.cu
-./src/collectives/reduce_scatter.cu
-./src/include/bootstrap.h
-./src/include/common_coll.h
-./src/include/core.h
-./src/include/debug.h
-./src/include/enqueue.h
-./src/include/group.h
-./src/include/ibvwrap.h
-./src/include/nccl_net.h
-./src/include/net.h
-./src/include/nvlink.h
-./src/include/nvmlwrap.h
-./src/include/param.h
-./src/include/ring.h
-./src/include/rings.h
-./src/include/shm.h
-./src/include/socket.h
-./src/include/topo.h
-./src/include/transport.h
-./src/include/utils.h
-./src/init.cu
-./src/misc/enqueue.cu
-./src/misc/group.cu
-./src/misc/ibvwrap.cu
-./src/misc/nvmlwrap.cu
-./src/misc/rings.cu
-./src/misc/utils.cu
-./src/ring.cu
-./src/transport.cu
-./src/transport/net.cu
-./src/transport/net_ib.cu
-./src/transport/net_socket.cu
-./src/transport/p2p.cu
-./src/transport/shm.cu
-"
-
-for f in $FILES
-do
-    sed -i \
-        -e 's@cuda_runtime.h@hip/hip_runtime_api.h@g' \
-        -e 's@cuda_fp16.h@hip/hip_fp16.h@g' \
-        -e 's/cudaDeviceCanAccessPeer/hipDeviceCanAccessPeer/g' \
-        -e 's/cudaDeviceEnablePeerAccess/hipDeviceEnablePeerAccess/g' \
-        -e 's/cudaDeviceGetPCIBusId/hipDeviceGetPCIBusId/g' \
-        -e 's/cudaErrorPeerAccessAlreadyEnabled/hipErrorPeerAccessAlreadyEnabled/g' \
-        -e 's/cudaError_t/hipError_t/g' \
-        -e 's/cudaEventCreateWithFlags/hipEventCreateWithFlags/g' \
-        -e 's/cudaEventDestroy/hipEventDestroy/g' \
-        -e 's/cudaEventDisableTiming/hipEventDisableTiming/g' \
-        -e 's/cudaEventRecord/hipEventRecord/g' \
-        -e 's/cudaEvent_t/hipEvent_t/g' \
-        -e 's/cudaFree/hipFree/g' \
-        -e 's/cudaFreeHost/hipHostFree/g' \
-        -e 's/cudaGetDevice/hipGetDevice/g' \
-        -e 's/cudaGetErrorString/hipGetErrorString/g' \
-        -e 's/cudaGetLastError/hipGetLastError/g' \
-        -e 's/cudaHostAlloc/hipHostMalloc/g' \
-        -e 's/cudaHostAllocMapped/hipHostMallocMapped/g' \
-        -e 's/cudaHostGetDevicePointer/hipHostGetDevicePointer/g' \
-        -e 's/cudaHostRegister/hipHostRegister/g' \
-        -e 's/cudaHostRegisterMapped/hipHostRegisterMapped/g' \
-        -e 's/cudaHostUnregister/hipHostUnregister/g' \
-        -e 's/cudaIpcCloseMemHandle/hipIpcCloseMemHandle/g' \
-        -e 's/cudaIpcGetMemHandle/hipIpcGetMemHandle/g' \
-        -e 's/cudaIpcMemHandle_t/hipIpcMemHandle_t/g' \
-        -e 's/cudaIpcMemLazyEnablePeerAccess/hipIpcMemLazyEnablePeerAccess/g' \
-        -e 's/cudaIpcOpenMemHandle/hipIpcOpenMemHandle/g' \
-        -e 's/cudaMalloc/hipMalloc/g' \
-        -e 's/cudaMemcpy/hipMemcpy/g' \
-        -e 's/cudaMemcpyAsync/hipMemcpyAsync/g' \
-        -e 's/cudaMemcpyDefault/hipMemcpyDefault/g' \
-        -e 's/cudaMemcpyDeviceToDevice/hipMemcpyDeviceToDevice/g' \
-        -e 's/cudaMemoryTypeDevice/hipMemoryTypeDevice/g' \
-        -e 's/cudaMemset/hipMemset/g' \
-        -e 's/cudaPointerAttributes/hipPointerAttribute_t/g' \
-        -e 's/cudaPointerGetAttributes/hipPointerGetAttributes/g' \
-        -e 's/cudaSetDevice/hipSetDevice/g' \
-        -e 's/cudaStreamCreateWithFlags/hipStreamCreateWithFlags/g' \
-        -e 's/cudaStreamDestroy/hipStreamDestroy/g' \
-        -e 's/cudaStreamNonBlocking/hipStreamNonBlocking/g' \
-        -e 's/cudaStreamWaitEvent/hipStreamWaitEvent/g' \
-        -e 's/cudaStream_t/hipStream_t/g' \
-        -e 's/cudaSuccess/hipSuccess/g' \
-        $f
-done
diff --git a/projects/rccl/makefiles/common.mk b/projects/rccl/makefiles/common.mk
index 83a2a3951a..2ad5c73200 100644
--- a/projects/rccl/makefiles/common.mk
+++ b/projects/rccl/makefiles/common.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -16,7 +16,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
-CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,15 +36,16 @@ CUDA8_PTX     = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX     = -gencode=arch=compute_70,code=compute_70
 
 # Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
 else
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
 endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})
 
-CXXFLAGS   := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS   += -Wall -Wno-sign-compare
+CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS   += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS   += -I $(CUDA_INC)
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
@@ -68,7 +69,7 @@ CXXFLAGS  += -O0 -g -ggdb3
 endif
 
 ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
 else
 .SILENT:
diff --git a/projects/rccl/makefiles/formatting.mk b/projects/rccl/makefiles/formatting.mk
index 4a4ab885cf..a543131d59 100644
--- a/projects/rccl/makefiles/formatting.mk
+++ b/projects/rccl/makefiles/formatting.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk
index f9cee6a5a8..bab58ec0bf 100644
--- a/projects/rccl/makefiles/version.mk
+++ b/projects/rccl/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 3
-NCCL_PATCH   := 7
+NCCL_MINOR   := 4
+NCCL_PATCH   := 8
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/projects/rccl/pkg/Makefile b/projects/rccl/pkg/Makefile
index 04b23da70e..ab6487be9b 100644
--- a/projects/rccl/pkg/Makefile
+++ b/projects/rccl/pkg/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/pkg/debian/Makefile b/projects/rccl/pkg/debian/Makefile
index 439635f948..7884cf2545 100644
--- a/projects/rccl/pkg/debian/Makefile
+++ b/projects/rccl/pkg/debian/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/pkg/redhat/Makefile b/projects/rccl/pkg/redhat/Makefile
index ffcc973bcd..0808478624 100644
--- a/projects/rccl/pkg/redhat/Makefile
+++ b/projects/rccl/pkg/redhat/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/pkg/redhat/nccl.spec.in b/projects/rccl/pkg/redhat/nccl.spec.in
index 65a2c60154..f9d83a30df 100644
--- a/projects/rccl/pkg/redhat/nccl.spec.in
+++ b/projects/rccl/pkg/redhat/nccl.spec.in
@@ -1,6 +1,6 @@
 Name:           libnccl
-Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
-Release:        ${pkg:Revision}
+Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
+Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
 Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
 
 Group:          Development/Libraries
diff --git a/projects/rccl/pkg/srctxz/Makefile b/projects/rccl/pkg/srctxz/Makefile
index 1cb7c06a99..01cab95a43 100644
--- a/projects/rccl/pkg/srctxz/Makefile
+++ b/projects/rccl/pkg/srctxz/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
 	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
 	    $< > $@
diff --git a/projects/rccl/pkg/srctxz/create_srctxz.sh.in b/projects/rccl/pkg/srctxz/create_srctxz.sh.in
index 0b8e6d2b4c..11bdd52db7 100644
--- a/projects/rccl/pkg/srctxz/create_srctxz.sh.in
+++ b/projects/rccl/pkg/srctxz/create_srctxz.sh.in
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
 NCCL_MINOR=${nccl:Minor}
 NCCL_PATCH=${nccl:Patch}
 NCCL_SUFFIX=${nccl:Suffix}
+NCCL_BUILD=${pkg:Revision}
 
-NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
 
 tar --exclude build \
     --exclude ".git*" \
diff --git a/projects/rccl/pkg/txz/Makefile b/projects/rccl/pkg/txz/Makefile
index fa587ef186..b7d9aa53c8 100644
--- a/projects/rccl/pkg/txz/Makefile
+++ b/projects/rccl/pkg/txz/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/pkg/txz/create_txz.sh.in b/projects/rccl/pkg/txz/create_txz.sh.in
index 73922e0929..deae854830 100644
--- a/projects/rccl/pkg/txz/create_txz.sh.in
+++ b/projects/rccl/pkg/txz/create_txz.sh.in
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile
index 481000ad16..452adf52ae 100644
--- a/projects/rccl/src/Makefile
+++ b/projects/rccl/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,41 +9,48 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
-		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
-		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
-                collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
+                misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
+		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
 
 ##### lib files
 LIBNAME     := libnccl.so
 STATICLIBNAME := libnccl_static.a
+##### pkgconfig files
+PKGCONFIGFILE := nccl.pc
 ##### dirs
 BUILDDIR ?= $(abspath ../build)
 INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
+PKGDIR := $(BUILDDIR)/lib/pkgconfig
 ##### target files
+CUDARTLIB  ?= cudart_static
 INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
 LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
 LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+PKGTARGET  := $(PKGCONFIGFILE)
+LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
-LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lrt
+LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
 
 DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
 
-
 ##### rules
 build : lib staticlib
 
-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
 
 staticlib : $(LIBDIR)/$(STATICLIBTARGET)
 
-devicelib: $(INCDIR)/nccl.h
+$(DEVICELIB): ALWAYS_REBUILD
 	$(MAKE) -C collectives/device
 
+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
 -include $(DEPFILES)
 $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
 
@@ -51,7 +58,7 @@ $(INCDIR)/nccl.h : nccl.h.in
 # NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
 	@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
 	mkdir -p $(INCDIR)
-	printf "Generating %-35s > %s\n" $< $@
+	@printf "Generating %-35s > %s\n" $< $@
 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -59,14 +66,14 @@ $(INCDIR)/nccl.h : nccl.h.in
 	    -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
 	    $< > $@
 
-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Linking    %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Archiving  %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(eval TMP := $(shell mktemp -d))
@@ -75,6 +82,15 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
 	ar cr $@ $(LIBOBJ) $(TMP)/*.o
 	rm -Rf $(TMP)
 
+$(PKGDIR)/nccl.pc : nccl.pc.in
+	mkdir -p $(PKGDIR)
+	@printf "Generating %-35s > %s\n" $< $@
+	sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
+	    -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    $< > $@
+
 $(INCDIR)/%.h : %.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
@@ -85,27 +101,34 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 
-$(OBJDIR)/%.o : %.cu
+$(PKGDIR)/%.pc : %.pc
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(PKGDIR)
+	cp -f $< $@
+
+$(OBJDIR)/%.o : %.cc
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
 	@rm -f $(@:%.o=%.d.tmp)
 
 clean :
-	rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
+	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 	$(MAKE) -C collectives/device clean
 
 install : lib
 	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/lib/pkgconfig
 	mkdir -p $(PREFIX)/include
-	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
 
-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
 # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
 # as the BUILDDIR variable.
diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc
new file mode 100644
index 0000000000..d7c2ac6760
--- /dev/null
+++ b/projects/rccl/src/bootstrap.cc
@@ -0,0 +1,467 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "utils.h"
+#include "bootstrap.h"
+#include "net.h"
+#include "socket.h"
+#include <unistd.h>
+#include <sys/types.h>
+
+// Always use sockets for bootstrap
+struct bootstrapNetHandle {
+  union socketAddress connectAddr; // address to listen on or connect to (see bootstrapNetListen / bootstrapNetConnect)
+};
+
+struct bootstrapNetComm {
+  int fd; // socket descriptor; bootstrapNetNewComm() initializes it to -1
+};
+
+/* Init functions */
+static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; // interface names, MAX_IF_NAME_SIZE bytes per slot
+static union socketAddress bootstrapNetIfAddrs[MAX_IFS]; // address of each interface, same index as the names
+static int bootstrapNetIfs = -1; // number of interfaces found; -1 means discovery has not run yet
+pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; // serializes the discovery in bootstrapNetInit
+// Discover the socket interfaces usable for bootstrap (first call only) and log them.
+ncclResult_t bootstrapNetInit() {
+  if (bootstrapNetIfs == -1) {
+    pthread_mutex_lock(&bootstrapNetLock);
+    if (bootstrapNetIfs == -1) { // check again now that the lock is held
+      bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      if (bootstrapNetIfs <= 0) {
+        WARN("Bootstrap : no socket interface found");
+        pthread_mutex_unlock(&bootstrapNetLock); // never return with the lock held
+        return ncclInternalError;
+      } else {
+        char line[1024] = ""; // accumulates " [i]name:addr" for every interface
+        char addrline[1024];
+        for (int i=0; i<bootstrapNetIfs; i++) {
+          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
+              socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
+        }
+        line[1023] = '\0';
+        INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+      }
+    }
+    pthread_mutex_unlock(&bootstrapNetLock);
+  }
+  return ncclSuccess;
+}
+// Allocate a bootstrapNetComm with ncclCalloc and mark its socket as not opened yet (fd = -1).
+static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
+  NCCLCHECK(ncclCalloc(comm, 1));
+  (*comm)->fd = -1;
+  return ncclSuccess;
+}
+// Copy the address of discovered interface `dev` into *addr; any index outside [0, bootstrapNetIfs) is an error.
+static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
+  if (dev < 0 || dev >= bootstrapNetIfs) return ncclInternalError; // a negative dev would read before the array
+  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
+  return ncclSuccess;
+}
+
+/* Socket Interface Selection type: negative values passed as `dev` to bootstrapNetListen instead of an interface index */
+enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
+// Open a listening socket and return its comm in *listenComm; for dev >= 0 and findSubnetIf the chosen local address is written into the handle.
+static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) {
+  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
+  static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large");
+  // if dev >= 0, listen based on dev
+  if (dev >= 0) {
+    NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr)));
+  } else if (dev == findSubnetIf) {
+    // handle stores a remote address
+    // need to find a local addr that is in the same network as the remote addr
+    union socketAddress localAddr;
+    char ifName[MAX_IF_NAME_SIZE]; // receives the matching interface name; not used afterwards
+    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+      WARN("NET/Socket : No usable listening interface found");
+      return ncclSystemError;
+    }
+    // pass the local address back
+    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
+  } // Otherwise, handle stores a local address
+  struct bootstrapNetComm* comm;
+  NCCLCHECK(bootstrapNetNewComm(&comm));
+  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+  *listenComm = comm; // ownership passes to the caller, released with bootstrapNetCloseListen
+  return ncclSuccess;
+}
+// Connect to the address stored in the handle and return the new comm in *sendComm; `dev` is not read here.
+static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) {
+  struct bootstrapNetComm* comm;
+  NCCLCHECK(bootstrapNetNewComm(&comm));
+  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
+  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); // NOTE(review): comm looks leaked if this fails and NCCLCHECK returns early - confirm
+  *sendComm = comm;
+  return ncclSuccess;
+}
+// Accept one incoming connection on a listen comm and return it as a new comm in *recvComm.
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
+  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
+  struct bootstrapNetComm* rComm;
+  NCCLCHECK(bootstrapNetNewComm(&rComm));
+  struct sockaddr_in sockaddr; // peer address filled in by accept(); not read afterwards
+  socklen_t socklen = sizeof(struct sockaddr_in);
+  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); // NOTE(review): presumably returns on failure, which would leak rComm - confirm
+  *recvComm = rComm;
+  return ncclSuccess;
+}
+// Close the socket held by a bootstrap comm and free the comm; a NULL comm is a no-op.
+static ncclResult_t bootstrapNetClose(void* opaqueComm) {
+  struct bootstrapNetComm* netComm = (struct bootstrapNetComm*)opaqueComm;
+  if (netComm == NULL) return ncclSuccess; // nothing to release
+  // Release the descriptor first, then the wrapper that held it
+  close(netComm->fd);
+  free(netComm);
+  return ncclSuccess;
+}
+// Send-, receive- and listen-side wrappers; all three forward to bootstrapNetClose.
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
+
+// Additional sync functions: every message is an int byte count followed by that many payload bytes
+static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
+  NCCLCHECK(socketSend(comm->fd, &size, sizeof(int))); // length prefix
+  NCCLCHECK(socketSend(comm->fd, data, size)); // payload
+  return ncclSuccess;
+}
+static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) { // receive one length-prefixed message; `size` is the capacity of data
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
+  int recvSize;
+  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int))); // length prefix written by bootstrapNetSend
+  if (recvSize < 0 || recvSize > size) { // the length comes from the peer: reject negative or oversized values
+    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    return ncclInternalError;
+  }
+  NCCLCHECK(socketReceive(comm->fd, data, recvSize)); // recvSize is within [0, size] at this point
+  return ncclSuccess;
+}
+// Fill the handle's connectAddr from a textual address by calling GetSocketAddrFromString.
+ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) {
+  struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle;
+  NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str));
+  return ncclSuccess;
+}
+// Layout stored inside a ncclUniqueId (bootstrapGetUniqueId and bootstrapInit cast the id to this type).
+struct extId {
+  ncclNetHandle_t extHandleRoot; // address of the root's listening socket
+  void* extListenComm; // root's listen comm, set by bootstrapCreateRoot
+  uint64_t hostHash; // set from getHostHash() in bootstrapCreateRoot
+  pid_t pid; // getpid() of the creator, or -1 when the id comes from NCCL_COMM_ID
+  int fd;
+  pthread_t boostrapThread; // thread running bootstrapRoot
+};
+// Message every rank sends to the root in bootstrapInit.
+struct extInfo {
+  int rank;
+  int nranks;
+  ncclNetHandle_t extHandleListenRoot; // where the root connects to answer this rank
+  ncclNetHandle_t extHandleListen; // where peers connect to reach this rank
+};
+
+#include <sys/resource.h>
+// Raise the soft RLIMIT_NOFILE limit of the process to its hard limit.
+static ncclResult_t setFilesLimit() {
+  struct rlimit filesLimit;
+  SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
+  filesLimit.rlim_cur = filesLimit.rlim_max;
+  SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
+  return ncclSuccess;
+}
+// Root thread: collect the listen handles of all ranks, then send each rank the handle of its successor in the ring. Frees commId.
+static void *bootstrapRoot(void* commId) {
+  struct extInfo info;
+  struct extId* id = (struct extId*)commId;
+  ncclNetHandle_t *rankHandles = NULL;
+  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
+  ncclNetHandle_t zero = { 0 }; // for sanity checking
+  void* tmpComm;
+  ncclResult_t res;
+  setFilesLimit(); // result deliberately ignored
+
+  TRACE(NCCL_INIT, "BEGIN");
+  /* Receive addresses from all ranks */
+  int nranks = 0, c = 0;
+  do {
+    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
+
+    if (c == 0) { // the first rank to check in fixes the expected rank count
+      nranks = info.nranks;
+      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
+    }
+
+    if (nranks != info.nranks) {
+      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
+      goto out;
+    }
+    if (info.rank < 0 || info.rank >= nranks) { WARN("Bootstrap Root : rank %d is out of range for %d ranks", info.rank, nranks); goto out; } // info.rank is peer-supplied and indexes the arrays below
+    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
+      goto out;
+    }
+
+    // Save the connection handle for that rank
+    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
+    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+
+    ++c;
+  } while (c < nranks);
+  TRACE(NCCL_INIT, "COLLECTED HANDLES");
+
+  // Send the connect handle for the next rank in the AllGather ring
+  for (int r=0; r<nranks; ++r) {
+    int next = (r+1) % nranks;
+    void *tmpSendComm;
+    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
+  }
+  TRACE(NCCL_INIT, "SENT OUT HANDLES");
+
+out: // reached on success and on every failure above
+  bootstrapNetCloseListen(id->extListenComm);
+  free(commId);
+  if (rankHandles) free(rankHandles);
+  if (rankHandlesRoot) free(rankHandlesRoot);
+
+  TRACE(NCCL_INIT, "DONE");
+  return NULL;
+}
+// Open the root's listening socket and start the bootstrapRoot thread on a heap copy of the id.
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
+  struct extId* id = (struct extId*)commId;
+  id->hostHash = getHostHash();
+  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+  ncclUniqueId* threadIdCopy; // freed by bootstrapRoot via free(commId)
+  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
+  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
+  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy); // NOTE(review): return value is not checked; on failure the copy leaks and success is still reported
+  return ncclSuccess;
+}
+// Fill *out with a bootstrap id: from NCCL_COMM_ID when that variable is set, otherwise by starting a root in this process.
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
+  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  extId* id = (extId*)out;
+
+  char* env = getenv("NCCL_COMM_ID");
+  if (env) {
+    if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) {
+      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+      return ncclInvalidArgument;
+    }
+    id->pid = -1; // negative pid marks an id taken from the environment (bootstrapInit tests pid < 0)
+  } else {
+    id->pid = getpid();
+    NCCLCHECK(bootstrapCreateRoot(out, false));
+  }
+
+  return ncclSuccess;
+}
+// Node of the singly linked list of connections accepted before their data was requested (see bootstrapRecv).
+struct unexConn {
+  int peer; // rank announced by the sender
+  void* comm; // accepted comm, still holding that sender's payload
+  struct unexConn* next;
+};
+// Per-communicator bootstrap state created by bootstrapInit and released by bootstrapClose.
+struct extState {
+  void* extBstrapListenComm; // listen comm that peers connect to
+  void* extBstrapRingRecvComm; // ring connection accepted from the previous rank
+  void* extBstrapRingSendComm; // ring connection to the next rank
+  ncclNetHandle_t* peerBstrapHandles; // listen handle of every rank, indexed by rank
+  struct unexConn* unexpectedConnections; // head of the unexConn list
+  int rank;
+  int nranks;
+  int dev; // interface selector handed to bootstrapNetListen / bootstrapNetConnect
+};
+// Join the bootstrap network: register with the root, connect to ring neighbours and gather every rank's listen handle. State is returned in *commState.
+ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
+  struct extId* id = (struct extId*)commId;
+  bool idFromEnv = id->pid < 0; // bootstrapGetUniqueId stores pid = -1 for ids built from NCCL_COMM_ID
+  struct extState* state;
+  NCCLCHECK(ncclCalloc(&state, 1));
+  state->rank = rank;
+  state->nranks = nranks;
+  *commState = state; // published before the handshake; NOTE(review): not freed here if a later step fails
+
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+
+  struct extInfo info = { 0 };
+  info.rank = rank;
+  info.nranks = nranks;
+  void *tmpSendComm, *tmpRecvComm;
+  // Pass the remote address to listen via info
+  if (idFromEnv) {
+    memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+  }
+  // listen will return the local address via info (specify interface type 'findSubnetIf')
+  state->dev = idFromEnv ? findSubnetIf : 0;
+  void* extBstrapListenCommRoot; // temporary listener used only for the root's reply
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+
+  // stagger connection times to avoid an overload of the root at very high rank counts
+  if (nranks > 128) {
+    long msec = rank; // one millisecond of delay per rank index
+    struct timespec tv;
+    tv.tv_sec = msec / 1000;
+    tv.tv_nsec = 1000000 * (msec % 1000);
+    TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
+    (void) nanosleep(&tv, NULL);
+  }
+
+  // send info on my listening socket to root
+  NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+
+  // get info on my "next" rank in the bootstrap ring from root
+  ncclNetHandle_t extHandleNext;
+  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
+  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
+
+  NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  // Accept the connect request from the previous rank in the AllGather ring
+  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+
+  // AllGather all listen handlers
+  NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
+  memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t)); // seed our own slot before the exchange
+  NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+
+  return ncclSuccess;
+}
+// Ring all-gather over the bootstrap sockets: allData holds nranks slices of `size` bytes, and the caller fills slice `rank` beforehand.
+ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+  struct extState* state = (struct extState*)commState;
+  char* data = (char*)allData;
+  int rank = state->rank;
+  int nranks = state->nranks;
+
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
+
+  /* Simple ring based AllGather
+   * At each step i receive data from (rank-i-1) from left
+   * and send previous step's data from (rank-i) to right
+   */
+  for (int i=0; i<nranks-1; i++) {
+    size_t rslice = (rank - i - 1 + nranks) % nranks; // adding nranks keeps the value non-negative before the modulo
+    size_t sslice = (rank - i + nranks) % nranks;
+
+    // Send slice to the right
+    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    // Recv slice from the left
+    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
+  return ncclSuccess;
+}
+// Send `size` bytes to `peer` over a fresh connection; our own rank is sent first so the receiver can identify the sender.
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
+  struct extState* state = (struct extState*)commState;
+  void* tmpSendComm;
+  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int))); // read by bootstrapRecv as newPeer
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  return ncclSuccess;
+}
+// Append a (peer, comm) record at the tail of state->unexpectedConnections.
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+  // Build the record for this connection (next is left as allocated by ncclCalloc)
+  struct unexConn* node;
+  NCCLCHECK(ncclCalloc(&node, 1));
+  node->comm = comm;
+  node->peer = peer;
+
+  // Follow the chain of next-pointers to the empty slot that ends the list;
+  // for an empty list that slot is the head pointer itself.
+  struct unexConn** slot = &state->unexpectedConnections;
+  while (*slot != NULL) {
+    slot = &(*slot)->next;
+  }
+  // Hook the new record into that slot
+  *slot = node;
+  return ncclSuccess;
+}
+// Remove the first queued connection from `peer` and return its comm, or NULL
+// when none is queued. The list record itself is freed here.
+void* unexpectedDequeue(struct extState* state, int peer) {
+  // link always points at the pointer that references the record under test
+  struct unexConn** link = &state->unexpectedConnections;
+  for (struct unexConn* node = *link; node != NULL; node = *link) {
+    if (node->peer != peer) {
+      // Not the requested peer: step over it
+      link = &node->next;
+      continue;
+    }
+    // Match: splice the record out of the list, then release it
+    *link = node->next;
+    void* found = node->comm;
+    free(node);
+    return found;
+  }
+  // Reached the end without a match
+  return NULL;
+}
+// Receive `size` bytes sent by `peer` with bootstrapSend. Connections from other peers are queued, not dropped.
+// We can't know who we'll receive from, so we need to receive everything at once
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
+  struct extState* state = (struct extState*)commState;
+
+  void* tmpRecvComm;
+
+  // Search unexpected connections first
+  if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size)); // the rank header of a queued connection was already consumed below
+    NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+    return ncclSuccess;
+  }
+
+  // Then look for new connections
+  while (1) {
+    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+    int newPeer; // sender's rank, the first message written by bootstrapSend
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+    if (newPeer == peer) {
+      NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+      NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+      return ncclSuccess;
+    }
+    // Unexpected connection. Save for later.
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+  }
+}
+// Close the listen and ring comms and free the state; fails without releasing anything if unexpected connections are still queued.
+ncclResult_t bootstrapClose(void* commState) {
+  struct extState* state = (struct extState*)commState;
+  if (state->unexpectedConnections != NULL) {
+    WARN("Unexpected connections are not empty.\n");
+    return ncclInternalError;
+  }
+  NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+
+  free(state->peerBstrapHandles);
+  free(state);
+
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/bootstrap.cu b/projects/rccl/src/bootstrap.cu
deleted file mode 100644
index 13c6e922b1..0000000000
--- a/projects/rccl/src/bootstrap.cu
+++ /dev/null
@@ -1,249 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "utils.h"
-#include "bootstrap.h"
-#include "net.h"
-#include <unistd.h>
-#include <sys/types.h>
-
-// Always use sockets for bootstrap
-ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
-
-static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
-
-// Additional sync functions based on async + test for bootstrap, using host ptrs.
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
-  int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
-  return ncclSuccess;
-}
-static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
-  int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
-  return ncclSuccess;
-}
-
-struct extId {
-  ncclNetHandle_t extHandleRoot;
-  void* extListenComm;
-  uint64_t hostHash;
-  pid_t pid;
-  int fd;
-  pthread_t boostrapThread;
-};
-
-struct extInfo {
-  int rank;
-  int nranks;
-  ncclNetHandle_t extHandleListenFromRoot;
-  ncclNetHandle_t extHandleRing;
-};
-
-#include <sys/resource.h>
-
-static ncclResult_t setFilesLimit() {
-  struct rlimit filesLimit;
-  SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
-  filesLimit.rlim_cur = filesLimit.rlim_max;
-  SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
-  return ncclSuccess;
-}
-
-static void *bootstrapRoot(void* commId) {
-  struct extInfo info;
-  struct extId* id = (struct extId*)commId;
-  ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
-  ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
-  ncclNetHandle_t zero = { 0 }; // for sanity checking
-  void* tmpComm;
-  ncclResult_t res;
-  setFilesLimit();
-
-  /* Receive addresses from all ranks */
-  int nranks = 0, c = 0;
-  do {
-    NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
-    NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
-
-    if (c == 0) {
-      extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      if (extHandleBstrap == NULL || extHandleRing == NULL) {
-        WARN("Bootstrap thread : failed to allocate memory");
-        goto out;
-      }
-      nranks = info.nranks;
-    }
-
-    if (nranks != info.nranks) {
-      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
-      goto out;
-    }
-
-    if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
-      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
-      goto out;
-    }
-
-    // Save the connection handle for connecting back to the ranks
-    memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
-    // Save the connection handle for the AllGather ring
-    memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
-
-    ++c;
-  } while (c < nranks);
-
-  // Send the connect handle for the next rank in the AllGather ring
-  for (int r=0; r<nranks; ++r) {
-    int next = (r+1) % nranks;
-    void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
-    NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
-  }
-
-out:
-  bootstrapCloseListen(id->extListenComm);
-  free(commId);
-  free(extHandleBstrap);
-  free(extHandleRing);
-  return NULL;
-}
-
-ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
-  struct extId* id = (struct extId*)commId;
-  id->hostHash = getHostHash();
-  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
-  ncclUniqueId* threadIdCopy;
-  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
-  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
-  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
-  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
-  extId* id = (extId*)out;
-
-  char* env = getenv("NCCL_COMM_ID");
-  if (env) {
-    if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) {
-      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
-      return ncclInvalidArgument;
-    }
-    id->pid = -1;
-  } else {
-    id->pid = getpid();
-    NCCLCHECK(bootstrapCreateRoot(out, false));
-  }
-
-  return ncclSuccess;
-}
-
-struct extState {
-  void* extBstrapRingRecvComm;
-  void* extBstrapRingSendComm;
-  ncclNetHandle_t extBstrapRootHandle;
-  int rank;
-  int nranks;
-  int dev;
-};
-
-ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
-  struct extId* id = (struct extId*)commId;
-  bool idFromEnv = id->pid < 0;
-  struct extState* state;
-  NCCLCHECK(ncclCalloc(&state, 1));
-  state->rank = rank;
-  state->nranks = nranks;
-  *commState = state;
-  void* extBstrapRootListenComm; // comm on which we accept root's connections
-
-  struct extInfo info = { 0 };
-  info.rank = rank;
-  info.nranks = nranks;
-  void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
-  // Pass the remote address to listen via info
-  if (idFromEnv) {
-    memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-  }
-  // listen will return the local address via info (specify interface type 'findSubnetIf')
-  state->dev = idFromEnv ? findSubnetIf : 0;
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
-
-  memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-  // send info on my listening sockets to root
-  NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
-  NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
-  NCCLCHECK(bootstrapCloseSend(tmpSendComm));
-
-  // get info on my "next" rank in the bootstrap ring from root
-  ncclNetHandle_t extHandleNext;
-  NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
-  NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
-  NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
-
-  NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
-  // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
-
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
-  struct extState* state = (struct extState*)commState;
-  char* data = (char*)allData;
-  int rank = state->rank;
-  int nranks = state->nranks;
-
-  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
-
-  /* Simple ring based AllGather
-   * At each step i receive data from (rank-i-1) from left
-   * and send previous step's data from (rank-i) to right
-   */
-  for (int i=0; i<nranks-1; i++) {
-    int rslice = (rank - i - 1 + nranks) % nranks;
-    int sslice = (rank - i + nranks) % nranks;
-
-    // Send slice to the right
-    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
-    // Recv slice from the left
-    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
-  }
-
-  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
-  return ncclSuccess;
-}
-
-ncclResult_t bootstrapClose(void* commState) {
-  struct extState* state = (struct extState*)commState;
-
-  NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm));
-  NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm));
-
-  free(state);
-
-  return ncclSuccess;
-}
diff --git a/projects/rccl/src/channel.cc b/projects/rccl/src/channel.cc
new file mode 100644
index 0000000000..5a5903d3c8
--- /dev/null
+++ b/projects/rccl/src/channel.cc
@@ -0,0 +1,57 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+// Initialize channel `channelid` of `comm`: buffer size, ring rank tables, per-peer state and the operation list.
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+  struct ncclChannel* channel = comm->channels+channelid;
+  channel->id = channelid;
+
+  // Setup intermediate buffering; the size comes from the Buffsize parameter declared above.
+  channel->buffSize = ncclParamBuffsize();
+
+  // Ring index to user rank table, one entry per rank (devUserRanks via ncclCudaCalloc, userRanks via ncclCalloc).
+  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+
+  // Communication structures with peers, one per rank; each send/recv side points back to its communicator.
+  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
+  for (int i=0; i<comm->nRanks; ++i) {  // int index, as in freeChannel's loop; avoids a signed/unsigned comparison
+    channel->peers[i].send.comm = comm;
+    channel->peers[i].recv.comm = comm;
+  }
+  // Per-channel operation list: room for NCCL_MAX_OPS ncclColl entries.
+  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+  return ncclSuccess;
+}
+
+// Release a channel's operation list, ring rank tables, per-peer transport resources and peer arrays.
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+  // Operation list
+  NCCLCHECK(ncclCudaHostFree(channel->collectives));  // NOTE(review): presumably returns early on failure, skipping the frees below - confirm
+
+  // Free Ring index to rank tables
+  free(channel->ring.userRanks);
+  CUDACHECK(hipFree(channel->ring.devUserRanks));
+
+  // Free transport proxy resources; a side is released only when its transportResources pointer is set
+  for (int r=0; r<nRanks; r++) {
+    struct ncclPeer* peer = channel->peers+r;
+    if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+    if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+  }
+
+  // Free the peer structures (after the loop above, which still reads channel->peers).
+  CUDACHECK(hipFree(channel->devPeers));
+  free(channel->peers);
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/collectives/all_gather.cc b/projects/rccl/src/collectives/all_gather.cc
new file mode 100644
index 0000000000..1959420e36
--- /dev/null
+++ b/projects/rccl/src/collectives/all_gather.cc
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {  // public AllGather entry point
+  struct ncclInfo info = { ncclCollAllGather, "AllGather",  // collective id and its name string
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args; ncclSum and 0 sit in the slots AllReduce/Broadcast fill with op and root */
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };  // step constants defined in collectives.h
+  return ncclEnqueueCheck(&info);  // the call is fully described by info; the result of the check/enqueue is returned as is
+}
diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cu
deleted file mode 100644
index 7ad36c777b..0000000000
--- a/projects/rccl/src/collectives/all_gather.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
-          ncclSum, 0, comm, stream);
-}
diff --git a/projects/rccl/src/collectives/all_reduce.cc b/projects/rccl/src/collectives/all_reduce.cc
new file mode 100644
index 0000000000..4051da8b59
--- /dev/null
+++ b/projects/rccl/src/collectives/all_reduce.cc
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {  // public AllReduce entry point
+  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",  // collective id and its name string
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args; 0 sits in the slot Broadcast fills with root */
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };  // step constants defined in collectives.h
+  return ncclEnqueueCheck(&info);  // the call is fully described by info; the result of the check/enqueue is returned as is
+}
diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cu
deleted file mode 100644
index 234af2c898..0000000000
--- a/projects/rccl/src/collectives/all_reduce.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
-          op, 0, comm, stream);
-}
diff --git a/projects/rccl/src/collectives/broadcast.cc b/projects/rccl/src/collectives/broadcast.cc
new file mode 100644
index 0000000000..f096ac1f72
--- /dev/null
+++ b/projects/rccl/src/collectives/broadcast.cc
@@ -0,0 +1,27 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream) {  // public Broadcast entry point
+  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",  // collective id and its name string
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args; ncclSum sits in the slot AllReduce fills with its op */
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };  // step constants defined in collectives.h
+  return ncclEnqueueCheck(&info);  // the call is fully described by info; the result of the check/enqueue is returned as is
+}
+/* Deprecated original "in place" function, similar to MPI: forwards to ncclBroadcast with buff as both send and receive buffer */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, hipStream_t stream) {
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);  // same pointer for sendbuff and recvbuff
+}
+
diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cu
deleted file mode 100644
index a2b65995f8..0000000000
--- a/projects/rccl/src/collectives/broadcast.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
-    NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
-  }
-
-  return ncclSuccess;
-}
-
-/* Deprecated original "in place" function, similar to MPI */
-NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
-          ncclSum, root, comm, stream);
-}
-
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
-          ncclSum, root, comm, stream);
-}
diff --git a/projects/rccl/src/collectives/collectives.h b/projects/rccl/src/collectives/collectives.h
index 5b2f0f13f4..c56d90888e 100644
--- a/projects/rccl/src/collectives/collectives.h
+++ b/projects/rccl/src/collectives/collectives.h
@@ -1,5 +1,6 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -8,9 +9,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_
 
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)  // NOTE(review): 'al' is accepted but does not appear in the index expression
 
 #define NCCL_COLL_NAME(coll, op, dtype) \
   coll##_##op##_##dtype
@@ -19,13 +18,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
   coll##Kernel_##op##_##dtype
 
 /* Declare all collective operations */
-#define DECL_COLL4(coll, op, dtype) \
+#define DECL_COLL5(coll, op, dtype) \
   extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
-  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+
+#define DECL_COLL4(coll, op, dtype) \
+  DECL_COLL5(coll, op, dtype) \
+  DECL_COLL5(coll##LL, op, dtype)
 
 #define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##LL, op, dtype) \
-  DECL_COLL4(coll, op, dtype)
+  DECL_COLL4(coll##Ring, op, dtype)
 
 #define DECL_COLL2(coll, op) \
   DECL_COLL3(coll, op, i8) \
@@ -53,15 +55,22 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
 
 DECL_ALL_COLLS
 
-#define ALLREDUCE_SUBSTEPS 2
-#define ALLREDUCE_BUFCHUNKS 2
-#define ALLGATHER_SUBSTEPS 2
-#define ALLGATHER_BUFCHUNKS 2
-#define REDUCESCATTER_SUBSTEPS 2
-#define REDUCESCATTER_BUFCHUNKS 2
-#define BROADCAST_SUBSTEPS 8
-#define BROADCAST_BUFCHUNKS 2
-#define REDUCE_SUBSTEPS 8
-#define REDUCE_BUFCHUNKS 2
+// For each collective, CHUNKSTEPS must be a multiple of SLICESTEPS
+//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLREDUCE_SLICESTEPS 4
+#define ALLREDUCE_CHUNKSTEPS 4
+#define ALLGATHER_SLICESTEPS 4
+#define ALLGATHER_CHUNKSTEPS 4
+#define REDUCESCATTER_SLICESTEPS 4
+#define REDUCESCATTER_CHUNKSTEPS 4
+#define BROADCAST_SLICESTEPS 1
+#define BROADCAST_CHUNKSTEPS 1
+#define REDUCE_SLICESTEPS 1
+#define REDUCE_CHUNKSTEPS 1
 
 #endif
diff --git a/projects/rccl/src/collectives/device/Makefile b/projects/rccl/src/collectives/device/Makefile
index e2bcd49007..0ee587bd9a 100644
--- a/projects/rccl/src/collectives/device/Makefile
+++ b/projects/rccl/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device
 
 LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
 
-LIBOBJ     := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
-              $(OBJDIR)/functions.o
-
 LIBSRCFILES += functions.cu
 
 DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES := $(DEPFILES:%.d=%.dep)
+DEPENDFILES:= $(DEPFILES:%.d=%.dep)
 STATICLIB  := $(OBJDIR)/colldevice.a
 DEVOBJ     := $(OBJDIR)/devlink.o
+RULESFILE  := $(OBJDIR)/Makefile.rules
 
 NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
 
@@ -33,6 +28,16 @@ all: $(STATICLIB)
 # Dummy rule so that the extra dependency (%.dep) files are preserved by make
 all_deps: $(DEPENDFILES)
 
+# Auto-generate the per-op/reduction/datatype/algorithm build rules by running gen_rules.sh into $(RULESFILE)
+$(RULESFILE) :
+	@printf "Generating %-35s > %s\n" rules $@
+	@mkdir -p $(OBJDIR)
+	@./gen_rules.sh $(OBJDIR) > $@
+
+-include $(RULESFILE)
+
+LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o
+
 -include $(DEPFILES)
 
 $(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
 	mkdir -p `dirname $@`
 	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
 
-$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling  %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
-
 # ... and create the device-side linked object with all those.
 $(DEVOBJ) : $(LIBOBJ)
 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
diff --git a/projects/rccl/src/collectives/device/all_gather.cu b/projects/rccl/src/collectives/device/all_gather.cu
index 0f572ce7cb..3fd3e0c63e 100644
--- a/projects/rccl/src/collectives/device/all_gather.cu
+++ b/projects/rccl/src/collectives/device/all_gather.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,6 +11,4 @@
 
 #define UNROLL 4
 
-#if NCCL_OP == 0
 IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
-#endif
diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h
index 1e086c8b64..0b89d3a1f8 100644
--- a/projects/rccl/src/collectives/device/all_gather.h
+++ b/projects/rccl/src/collectives/device/all_gather.h
@@ -1,81 +1,44 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = 0;
-  int nextdirect = 0;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (LOAD(ptr) == nullptr);
-      sharedNextOutput = (T*)LOAD(ptr);
-      STORE(ptr, nullptr);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*realChunkSize;
 
     /////////////// begin AllGather steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(realChunkSize, size-chunkOffset);
     int rankDest;
 
     // step 0: push data to next GPU
@@ -83,130 +46,53 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
     offset = chunkOffset + rankDest * size;
 
     if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      Prims::Copy(tid, nthreads,
-          thisInput  + chunkOffset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directSend(thisInput+chunkOffset, offset, nelem);
     } else {
-      Prims::DoubleCopy(tid, nthreads,
-          thisInput  + chunkOffset,
-          thisOutput + offset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
     }
 
-    NEXT_STEP; // Increases step, poffset, noffset
-
     // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ring->devUserRanks[1];
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;
 
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
     }
-  }
 
-  if (tid == 0) {
-    waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
-    STORE(ring->send.conn.head, 0ULL);
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
+    // Make final copy from buffer to dest.
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    // Final wait/copy.
+    prims.directRecv(thisOutput+offset, offset, nelem);
   }
 }
 
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { /* Empty body: calling this kernel does no work. NOTE(review): presumably a placeholder for the tree variant of all-gather - confirm. */ }
 
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
 
-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
 
   const ssize_t size = args->N;
   //const int rank = comm->rank;
   const int nranks = comm->nRanks;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -216,57 +102,35 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
 
     /////////////// begin AllGather steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(chunkSize, size-chunkOffset);
     int rankDest;
 
     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;
 
-    WAIT_NEXT;
     if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LL::ReduceCopy(
-          thisInput  + chunkOffset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.send(thisInput+chunkOffset, nelem);
     } else {
-      LL::ReduceCopy(
-          thisInput  + chunkOffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
     }
-    POST_SIZE;
-
-    NEXT_STEP_LL;
 
     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
       rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;
 
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput  + poffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvCopySend(thisOutput+offset, nelem);
     }
 
     // step k-1: final store
     rankDest = ring->devUserRanks[1];
     offset = chunkOffset + rankDest * size;
 
-    LL::ReduceCopy(
-        prevInput  + poffset,
-        thisOutput + offset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recv(thisOutput+offset, nelem);
   }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { /* Empty body: calling this kernel does no work. NOTE(review): presumably a placeholder for the tree LL variant of all-gather - confirm. */ }
diff --git a/projects/rccl/src/collectives/device/all_reduce.cu b/projects/rccl/src/collectives/device/all_reduce.cu
index caa1479c12..704197160e 100644
--- a/projects/rccl/src/collectives/device/all_reduce.cu
+++ b/projects/rccl/src/collectives/device/all_reduce.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,12 +11,7 @@
 
 #define UNROLL 4
 
-#if NCCL_OP == 0
 IMPL_COLL2(ncclAllReduce, sum,  FuncSum,  ncclCollAllReduce, ncclSum);
-#elif NCCL_OP == 1
 IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
-#elif NCCL_OP == 2
 IMPL_COLL2(ncclAllReduce, min,  FuncMin,  ncclCollAllReduce, ncclMin);
-#elif NCCL_OP == 3
 IMPL_COLL2(ncclAllReduce, max,  FuncMax,  ncclCollAllReduce, ncclMax);
-#endif
diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h
index b75223b13e..f319b4333e 100644
--- a/projects/rccl/src/collectives/device/all_reduce.h
+++ b/projects/rccl/src/collectives/device/all_reduce.h
@@ -1,243 +1,181 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {  // Ring all-reduce of args->N elements: slices are reduced around the ring (send / recvReduceSend), finalized (directRecvReduceCopySend), then propagated (directRecvCopySend / directRecv).
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = 0;
-  int nextdirect = 0;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x
+  struct ncclRing* ring = &channel->ring;  // provides prev/next peers and the devUserRanks slice order used below
   const ssize_t size = args->N;
-  //const int rank = comm->rank;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (LOAD(ptr) == nullptr);
-      sharedNextOutput = (T*)LOAD(ptr);
-      STORE(ptr, nullptr);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);  // elements of T per step; NOTE(review): assumes buffSize is a byte count - confirm
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;  // one chunk spans ALLREDUCE_CHUNKSTEPS steps
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;  // widened to ssize_t before multiplying; the grid loop advances by nranks*loopSize
+#ifdef ENABLE_PROFILING
+  auto devProf = comm->devProf;
+  uint64_t clk, t0 = 0ULL, ws, wr;  // clk holds the start timestamp; NOTE(review): t0/ws/wr are presumably consumed by INIT_COUNTER/ACCUMULATE_COUNTER - confirm
+  if (tid == 0) clk = clock64();  // only thread 0 records the start time
+#endif // ENABLE_PROFILING
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);  // one peer on each side: ring->prev and ring->next
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));  // use a smaller chunk when little data remains
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));  // NOTE(review): presumably rounds realChunkSize to a multiple of the second argument - confirm against ALIGN_SIZE
+    ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;  // base of the nranks slices handled by this bid
 
     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
-    int maxOffset;
+    int nelem;  // element count for the current slice: min(realChunkSize, size-offset)
     int slice;
 
     // step 0: push data to next GPU
     slice = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);
 
-    Prims::Copy(tid, nthreads,
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext,
-        postReadyToNext);
-
-    NEXT_STEP; // Increases step, poffset, noffset
+    INIT_COUNTER;  // profiling hook defined elsewhere; NOTE(review): presumably a no-op unless ENABLE_PROFILING is defined - confirm
+    prims.send(thisInput+offset, nelem);
+    ACCUMULATE_COUNTER(send);
 
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);
 
-      Prims::Reduce(tid, nthreads,
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
+      INIT_COUNTER;
+      prims.recvReduceSend(thisInput+offset, nelem);
+      ACCUMULATE_COUNTER(recvReduceSend);
     }
 
     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     slice = ring->devUserRanks[0];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);
 
-    Prims::ReduceCopy(tid, nthreads,
-        prevInput  + poffset,
-        thisInput  + offset,
-        nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-        thisOutput + offset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
+    INIT_COUNTER;
+    prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);  // the fully reduced slice is written to thisOutput and forwarded to the next peer
+    ACCUMULATE_COUNTER(directRecvReduceCopySend);
 
     // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);
 
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      INIT_COUNTER;
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+      ACCUMULATE_COUNTER(directRecvCopySend);
     }
-  }
 
-  if (tid == 0) {
-    // Wait for next to have consumed all data before we reset the flag
-    waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
-    STORE(ring->send.conn.head, 0ULL);
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);
+
+    // Final wait/copy.
+    INIT_COUNTER;
+    prims.directRecv(thisOutput+offset, offset, nelem);
+    ACCUMULATE_COUNTER(directRecv);
   }
+#ifdef ENABLE_PROFILING
+  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);  // thread 0 adds the elapsed clock64() ticks of this kernel to devProf->total_cycle
+#endif // ENABLE_PROFILING
 }
 
-#include "ll_kernel.h"
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {  // Tree all-reduce of args->N elements: a reduce pass toward the rank with no up peer, then a pass sending the result back down.
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x
+  struct ncclTree* tree = &channel->tree;  // up/down peers of this rank; -1 is tested below to mean no peer
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);  // elements of T per step; NOTE(review): assumes buffSize is a byte count - confirm
+  const int chunkSize = args->lastChunkSize;  // chunk size is supplied through args rather than derived from stepSize
+  const ssize_t loopSize = args->nChannels*chunkSize;  // elements covered by all channels per loop iteration
 
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  do {  // scoped block so each phase constructs and destroys its own prims object
+    // Reduce phase: receive from up to NCCL_MAX_TREE_ARITY down peers (3: binary tree + local), send to the single up peer
+    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {  // no up peer: reduce received data with the local input and store it in thisOutput
+        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {  // no first down peer: only send the local input up
+        prims.send(thisInput+offset, nelem);
+      } else {  // peers on both sides: reduce received data with the local input and pass it up
+        prims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  } while(0);
+
+  do {  // second scoped block: peers are swapped relative to the reduce phase
+    // Broadcast phase: receive from the single up peer, send to up to NCCL_MAX_TREE_ARITY down peers (3: binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {  // no up peer: send the reduced result held in thisOutput down
+        prims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {  // no first down peer: only receive the result into thisOutput
+        prims.recv(thisOutput+offset, nelem);
+      } else {  // peers on both sides: receive into thisOutput and forward down
+        prims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  } while(0);
+}
 
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
 
-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
 
   const ssize_t size = args->N;
   //const int rank = comm->rank;
   const int nranks = comm->nRanks;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*nranks*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*nranks*chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -247,89 +185,100 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
 
     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
-    int maxOffset;
+    int nelem;
     int slice;
 
     // step 0: push data to next GPU
     slice = ring->devUserRanks[nranks-1];
     offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    nelem = min(chunkSize, size-offset);
 
-    WAIT_NEXT;
-    LL::ReduceCopy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        maxOffset, nflag, llNthreads);
-    POST_SIZE;
-
-    NEXT_STEP_LL;
+    LLprims.send(thisInput+offset, nelem);
 
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       slice = ring->devUserRanks[nranks-j];
       offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
+      nelem = min(chunkSize, size-offset);
 
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput  + offset,
-          prevInput  + poffset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvReduceSend(thisInput+offset, nelem);
     }
 
     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     slice = ring->devUserRanks[0];
     offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    nelem = min(chunkSize, size-offset);
 
-    WAIT_NEXT;
-    LL::ReduceCopy(
-        thisInput  + offset,
-        prevInput  + poffset,
-        thisOutput + offset,
-        nextOutput + noffset,
-        maxOffset, pflag, nflag, llNthreads);
-    POST_SIZE;
-    ACK_PREV;
-
-    NEXT_STEP_LL;
+    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
 
     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
-      slice = ring->devUserRanks[nranks - j];
+      slice = ring->devUserRanks[nranks-j];
       offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
+      nelem = min(chunkSize, size-offset);
 
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput + poffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvCopySend(thisOutput+offset, nelem);
     }
 
     // Make final copy from buffer to dest.
     slice = ring->devUserRanks[1];
     offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    nelem = min(chunkSize, size-offset);
 
     // Here we need to copy from buffer to this output.
-    LL::ReduceCopy(
-        prevInput + poffset,
-        thisOutput + offset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recv(thisOutput+offset, nelem);
   }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
+}
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {  // Tree all-reduce of args->N elements built on ncclLLPrimitives: a reduce pass toward the rank with no up peer, then a pass sending the result back down.
+  const int tid = threadIdx.x;
+  const int nthreads = args->nThreads;  // thread count is read from args here, not from blockDim.x
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x
+  struct ncclTree* tree = &channel->tree;  // up/down peers of this rank; -1 is tested below to mean no peer
+  const ssize_t size = args->N;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);  // chunk size in elements of T, derived from NCCL_LL_SLICE_LINES * sizeof(uint64_t) bytes
+  const ssize_t loopSize = args->nChannels*chunkSize;  // elements covered by all channels per loop iteration
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  do {  // scoped block so each phase constructs and destroys its own LLprims object
+    // Reduce phase: receive from up to NCCL_MAX_TREE_ARITY down peers (3: binary tree + local), send to the single up peer
+    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {  // no up peer: reduce received data with the local input and store it in thisOutput
+        LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {  // no first down peer: only send the local input up
+        LLprims.send(thisInput+offset, nelem);
+      } else {  // peers on both sides: reduce received data with the local input and pass it up
+        LLprims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  } while(0);
+
+  do {  // second scoped block: peers are swapped relative to the reduce phase
+    // Broadcast phase: receive from the single up peer, send to up to NCCL_MAX_TREE_ARITY down peers (3: binary tree + local)
+    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {  // no up peer: send the reduced result held in thisOutput down
+        LLprims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {  // no first down peer: only receive the result into thisOutput
+        LLprims.recv(thisOutput+offset, nelem);
+      } else {  // peers on both sides: receive into thisOutput and forward down
+        LLprims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  } while(0);
 }
diff --git a/projects/rccl/src/collectives/device/all_reduce_1.cpp b/projects/rccl/src/collectives/device/all_reduce_1.cpp
deleted file mode 100644
index dda4b5d517..0000000000
--- a/projects/rccl/src/collectives/device/all_reduce_1.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 1
-#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/all_reduce_2.cpp b/projects/rccl/src/collectives/device/all_reduce_2.cpp
deleted file mode 100644
index 745435b60f..0000000000
--- a/projects/rccl/src/collectives/device/all_reduce_2.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 2
-#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/all_reduce_3.cpp b/projects/rccl/src/collectives/device/all_reduce_3.cpp
deleted file mode 100644
index d7f45f03dd..0000000000
--- a/projects/rccl/src/collectives/device/all_reduce_3.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 3
-#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/broadcast.cu b/projects/rccl/src/collectives/device/broadcast.cu
index 4125de41f9..c4b1cbc5e9 100644
--- a/projects/rccl/src/collectives/device/broadcast.cu
+++ b/projects/rccl/src/collectives/device/broadcast.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,6 +11,4 @@
 
 #define UNROLL 4
 
-#if NCCL_OP == 0
 IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
-#endif
diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h
index 5fafbaf6aa..3c54de9dd8 100644
--- a/projects/rccl/src/collectives/device/broadcast.h
+++ b/projects/rccl/src/collectives/device/broadcast.h
@@ -1,184 +1,101 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
+__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = 0;
-  int nextdirect = 0;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+  const int rank = ring->devUserRanks[0];
+  const int nextRank = ring->devUserRanks[1];
+  const int root = args->root;
+#ifdef ENABLE_PROFILING
+  auto devProf = comm->devProf;
+  uint64_t clk, t0 = 0ULL, ws, wr;
+  if (tid == 0) clk = clock64();
+#endif
 
-  WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
-  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->next_hdp_reg);
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
-  typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
+  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t offset = gridOffset + bid*realChunkSize;
+    int nelem = min(realChunkSize, size-offset);
+
+    if (rank == root) {
+      if (thisInput == thisOutput) {
+        INIT_COUNTER;
+        prims.send(thisInput+offset, nelem);
+        ACCUMULATE_COUNTER(send);
+      } else {
+        INIT_COUNTER;
+        prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+        ACCUMULATE_COUNTER(copySend);
+      }
+    } else if (nextRank == root) {
+      INIT_COUNTER;
+      prims.recv(thisOutput+offset, nelem);
+      ACCUMULATE_COUNTER(recv);
+    } else {
+      INIT_COUNTER;
+      prims.recvCopySend(thisOutput+offset, nelem);
+      ACCUMULATE_COUNTER(recvCopySend);
+    }
+  }
+#ifdef ENABLE_PROFILING
+  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
+#endif
+}
+
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }  // empty body: this tree variant does nothing
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
 
   const ssize_t size = args->N;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
   const int rank = ring->devUserRanks[0];
   const int nextRank = ring->devUserRanks[1];
   const int root = args->root;
 
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    if (nextRank != root) {
-      // Wait for next to be ready
-      WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-      waitOpCountNext.wait(args->opCount);
-    }
-    if (rank != root && prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextRank != root && nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (LOAD(ptr) == nullptr);
-      sharedNextOutput = (T*)LOAD(ptr);
-      STORE(ptr, nullptr);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int boffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*chunkSize;
-    int maxOffset = min(chunkSize, size-offset);
-
-    if (rank == root) {
-      if (thisInput == thisOutput) {
-        Prims::Copy(tid, nthreads,
-            thisInput  + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext,
-            postReadyToNext);
-      } else {
-        Prims::DoubleCopy(tid, nthreads,
-            thisInput  + offset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext,
-            postReadyToNext);
-      }
-    } else if (nextRank == root) {
-      if (prevdirect) maxOffset = 0; // Only wait for signals
-      Prims::Copy(tid, nthreads,
-          prevInput  + boffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      if (prevdirect) {
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      } else {
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + boffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      }
-    }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  if (tid == 0) {
-    if (nextRank != root) {
-      // Wait for next to have consumed data before resetting the flag
-      waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
-      STORE(ring->send.conn.head, 0ULL);
-    }
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
-  }
-}
-
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  boffset += NCCL_LL_SLICE_LINES; \
-  if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
-  flag++; \
-  step++;
-
-template<int UNUSED, class FUNC, typename T>
-__attribute__((noinline))
-__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
-  const int rank = comm->rank;
-  const int nextRank = ring->devUserRanks[1];
-  const int root = args->root;
-
-  typedef LLPrimitives<T, FUNC> LL;
-
-  const ssize_t size = args->N;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t flag = step + 1;
-  int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -186,46 +103,21 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
     }
     ssize_t offset = gridOffset + bid*chunkSize;
 
-    int maxOffset = min(chunkSize, size-offset);
+    int nelem = min(chunkSize, size-offset);
     if (rank == root) {
-      WAIT_NEXT;
       if (thisInput == thisOutput) {
-        LL::ReduceCopy(
-            thisInput + offset,
-            nextOutput + boffset,
-            maxOffset, flag, llNthreads);
+        LLprims.send(thisInput+offset, nelem);
       } else {
-        LL::ReduceCopy(
-            thisInput + offset,
-            thisOutput + offset,
-            nextOutput + boffset,
-            maxOffset, flag, llNthreads);
+        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
       }
-      POST_SIZE;
-      NEXT_STEP_LL;
     } else if (nextRank == root) {
-      LL::ReduceCopy(
-          prevInput + boffset,
-          thisOutput + offset,
-          maxOffset, flag, llNthreads);
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recv(thisOutput + offset, nelem);
     } else {
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput + boffset,
-          thisOutput + offset,
-          nextOutput + boffset,
-          maxOffset, flag, flag, llNthreads);
-      POST_SIZE;
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recvCopySend(thisOutput + offset, nelem);
     }
   }
-
-  // We need everyone to acknowledge data even if they didn't receive anything
-  // so that the next collective can start right away.
-  ACK_PREV;
-
-  FIFO_CLEANING_AND_SAVE_STEP(flag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }  // empty body: this tree LL variant does nothing
diff --git a/projects/rccl/src/collectives/device/broadcast_0.cpp b/projects/rccl/src/collectives/device/broadcast_0.cpp
deleted file mode 100644
index 75b75ad9cf..0000000000
--- a/projects/rccl/src/collectives/device/broadcast_0.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 0
-#include "device/broadcast.cu"
diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h
index 819f3a12ab..fd26814b0f 100644
--- a/projects/rccl/src/collectives/device/common.h
+++ b/projects/rccl/src/collectives/device/common.h
@@ -1,5 +1,6 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -8,18 +9,38 @@
 #ifndef NCCL_DEVICE_COMMON_H_
 #define NCCL_DEVICE_COMMON_H_
 
-#include <hip/hip_runtime.h>
-
 #include "../collectives.h"
-#include "core.h"
+#include "devcomm.h"
 #include "nccl.h"
-
 #include <type_traits>
 
-typedef void(*ncclKern_t)(struct CollectiveArgs* args);
-#define NCCL_FUNC4(coll, op, dtype) \
+// Exit-if-abort barrier across the CTA: make sure all threads exit consistently.
+// HIP: a thread with abort set atomically increments *abortCount; after __syncthreads(), any thread
+// that loads a non-zero count exits. The macro expands to several statements and contains a `return`.
+// CUDA: each thread sets a predicate if abort == 1; a barrier popc over the predicates decides the exit.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define exitIfAbortBarrier(abort, abortCount) \
+  if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
+  __syncthreads(); \
+  if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
+#else
+static inline __device__ void exitIfAbortBarrier(int abort) {
+  uint32_t popc;
+  asm ("{");
+  asm volatile ("   .reg .pred barr_pred;");
+  asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+  asm volatile ("   bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
+  asm ("}");
+  if (popc) { asm volatile ("exit;"); }
+}
+#endif
+
+#define NCCL_FUNC5(coll, op, dtype) \
   NCCL_COLL_NAME(coll, op, dtype), \
-  NCCL_COLL_NAME(coll##LL, op, dtype)  \
+  NCCL_COLL_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+  NCCL_FUNC5(coll##Ring, op, dtype)
 
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -64,20 +85,13 @@ typedef void(*ncclKern_t)(struct CollectiveArgs* args);
   NCCL_FUNCS2A(ncclAllReduce) }
 
 // Must be consistent with the ncclFuncSet enum
-using ncclKern_t = void (*)(struct CollectiveArgs*);
+using ncclFunc_t = void (*)(struct CollectiveArgs*);
 
-static const __device__ constexpr ncclKern_t ncclFuncs[]{
-#if defined(__HIP_DEVICE_COMPILE__)
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
-#endif
+static const __device__ constexpr ncclFunc_t ncclFuncs[]{
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
-#if __CUDA_ARCH__
+#if defined(__HIP_DEVICE_COMPILE__)
   NCCL_FUNCS2B(ncclBroadcast),
   NCCL_FUNCS2A(ncclReduce),
   NCCL_FUNCS2B(ncclAllGather),
@@ -88,82 +102,89 @@ static const __device__ constexpr ncclKern_t ncclFuncs[]{
 
 template<unsigned short f, unsigned short l>
 struct Caller {
-  static
-  __device__ void call(ncclColl* const c) noexcept
+  static __device__ __host__
+  void call(ncclColl* const c) noexcept
   {
     constexpr unsigned short m = f + (l - f) / 2;
 
-    return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
+     return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
   }
 };
 
 template<unsigned short f>
 struct Caller<f, f + 1>{
-  static
-  __device__ void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
+  static __device__ __host__
+  void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
 };
 
 inline
 __device__
-void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept
-{
+void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
   if (c->funcIndex < 72) {
-    if (c->funcIndex % 2) ncclBroadcastLL_copy_i8(&c->args);
-    else ncclBroadcast_copy_i8(&c->args);
+    if (c->funcIndex % 2) ncclBroadcastRingLL_copy_i8(&c->args);
+    else ncclBroadcastRing_copy_i8(&c->args);
   }
   else if (c->funcIndex < 144) Caller<72, 144>::call(c);
   else if (c->funcIndex < 216) {
-    if (c->funcIndex % 2) ncclAllGatherLL_copy_i8(&c->args);
-    else ncclAllGather_copy_i8(&c->args);
+    if (c->funcIndex % 2) ncclAllGatherRingLL_copy_i8(&c->args);
+    else ncclAllGatherRing_copy_i8(&c->args);
   }
   else Caller<216, 360>::call(c);
 }
 
-static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
+static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
   int* d = (int*)dst;
   int* s = (int*)src;
-  __syncthreads();
+  // When aggregation is effective, if some threads have aborted inside the LL kernel,
+  // make sure the rest of the threads abort as well
+  exitIfAbortBarrier(0, abortCount);
   for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
   __syncthreads();
 }
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
-  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
+  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
   if (tid == 0) hostColl->active = 0;
 }
 
 /* Functions for aggregation case */
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
 __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
-  coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
+  coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
 }
+
+#if NCCL_OP == 0
 /* Kernels with the first operation inlined */
-#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
 __launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
 __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
   int tid = threadIdx.x; \
   int bid = blockIdx.x; \
   __shared__ struct ncclColl localColl; \
+  __shared__ uint32_t abortCount; \
+  if (tid == 0) abortCount = 0; \
+  __syncthreads(); \
  \
-  struct ncclComm* comm = firstColl.args.comm; \
-  struct ncclRing* ring = comm->rings+bid; \
+  struct ncclDevComm* comm = firstColl.args.comm; \
+  struct ncclChannel* channel = comm->channels+bid; \
   struct ncclColl* c; \
+  channel->abortCount = &abortCount; \
   if (bid == 0) { \
     /* To optimize for latency, (only) the first operation is passed as argument.*/ \
     c = &firstColl; \
   } else { \
     c = &localColl; \
-    load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
+    load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
   } \
   while (1) { \
-    if (tid < c->nThreads) { \
+    if (tid < c->args.nThreads) { \
       if (c->funcIndex == fIndex) { \
-        coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
+        coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
       } else { \
         NCCL_CALL_FUNCTIONS(c); \
       } \
     } \
     int nextIndex = c->nextIndex; \
-    if (tid == 0) ring->collFifoHead = nextIndex; \
+    if (tid == 0) channel->collFifoHead = nextIndex; \
  \
     if (c->active == 2) { \
       return; \
@@ -171,15 +192,21 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  \
     /* Load next collective operation*/ \
     c = &localColl; /* for bid 0 */ \
-    load_coll(c, ring->devCollectives+nextIndex, tid); \
+    load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
   } \
 }
+#else
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
+#endif
+
+// Only generate inline kernels for LL
+#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
+  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
 
 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
-  IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
+  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0)
 
 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
   IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8) \
@@ -192,4 +219,6 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
   IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32) \
   IMPL_COLL3(coll, op, ncclFunc, f64, double,   ncclColl, ncclOp, ncclFloat64)
 
+#define COLL_UNROLL 2
+
 #endif
diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h
index e8194bf4e3..7cf85671a3 100644
--- a/projects/rccl/src/collectives/device/common_kernel.h
+++ b/projects/rccl/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -8,25 +8,25 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_
 
-#include "core.h"
+#include "devcomm.h"
 #include <cstdio>
 #include <cstdint>
 
-#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>
 
 // Define min for ssize_t
 static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
 
 typedef uint64_t PackType;
 
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 
 template<class FUNC, typename T>
 struct MULTI {
-    __device__ PackType operator()(const PackType x, const PackType y) const
-    {
-        return FUNC()(x, y);
-    }
+  __device__ PackType operator()(const PackType x, const PackType y) const
+  {
+    return FUNC()(x, y);
+  }
 };
 
 #else
@@ -205,15 +205,7 @@ struct MULTI<FUNC, int64_t> {
   }
 };
 
-#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-
-#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
-  size_t ptrval = reinterpret_cast<size_t>(ptr);
-  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
+#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 
 template<typename T> inline __device__
 T vFetch(const volatile T* ptr) {
@@ -225,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
   *ptr = val;
 }
 
-#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
+#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
 template<> inline __device__
 half vFetch<half>(const volatile half* ptr) {
   half r;
@@ -251,26 +243,6 @@ void vStore<half>(volatile half* ptr, const half val) {
 }
 #endif
 
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__attribute__((noinline))
-__device__ inline void ReduceCopy(
-    const int tid, const int nthreads,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1,
-    volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1, const int N) {
-  for (int idx = tid; idx < N; idx += nthreads) {
-    T val = vFetch(src0+idx);
-    if (TWO_INPUTS) {
-      val = FUNC()(val, vFetch(src1+idx));
-    }
-    vStore(dest0+idx, val);
-    if (TWO_OUTPUTS) {
-      vStore(dest1+idx, val);
-    }
-  }
-}
-
 typedef ulong2 Pack128;
 
 template<class FUNC, typename T>
@@ -281,8 +253,8 @@ struct MULTI128 {
   }
 };
 
-inline __device__ void Fetch128(Pack128& v, Pack128* p) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
   v.x = p->x;
   v.y = p->y;
 #else
@@ -290,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, Pack128* p) {
 #endif
 }
 inline __device__ void Store128(Pack128* p, Pack128& v) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
   p->x = v.x;
   p->y = v.y;
 #else
@@ -298,67 +270,104 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
 #endif
 }
 
-#define WARP_SIZE 32
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
-__attribute__((noinline))
-__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
-    Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
-    const int N) {
-  Pack128 t0[UNROLL];
-  Pack128 t1[UNROLL];
-  const Pack128* src0_end = src0 + N;
-  const int inc = nw * UNROLL * WARP_SIZE;
-  const int offset = w * UNROLL * WARP_SIZE + t;
-  src0 += offset;  if (TWO_INPUTS)  src1 += offset;
-  dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    const int offset, const int N) {
+  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+    T val = vFetch(srcs[0]+idx);
+    #pragma unroll
+    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
 
-  while (src0 < src0_end) {
-#pragma unroll
-    for (int u = 0; u < UNROLL; ++u) {
-      Fetch128(t0[u], src0+u*WARP_SIZE);
-      if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
-    }
-#pragma unroll
-    for (int u = 0; u < UNROLL; ++u) {
-      if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
-      Store128(dest0+u*WARP_SIZE, t0[u]);
-      if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
-    }
-    src0 += inc;  if (TWO_INPUTS)  src1 += inc;
-    dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
+    #pragma unroll
+    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
   }
 }
 
-template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
-__attribute__((noinline))
-__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
-    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
-    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
+#define WARP_SIZE 64
+
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+    const int elemOffset, const int Npack) {  // elemOffset counts T elements; Npack counts Pack128 items
+  const int inc = nw * UNROLL * WARP_SIZE;  // Pack128 stride per loop iteration: nw warps x UNROLL x WARP_SIZE
+  int offset = w * UNROLL * WARP_SIZE + t;  // first Pack128 index handled by thread t of warp w
+
+  const Pack128* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;  // formed for all MAXSRCS entries, regardless of nsrcs
+  Pack128* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;  // likewise for all MAXDSTS entries
+
+  while (offset < Npack) {
+    Pack128 vals[UNROLL];
+    // Load UNROLL packs from srcs[0], then fold in the other sources with MULTI128<FUNC, T>
+    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+    for (int i=1; i<MINSRCS; i++) {  // sources 1..MINSRCS-1 are used unconditionally
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {  // sources from MINSRCS up are used only while i < nsrcs
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+
+    // Store the result packs: the first MINDSTS destinations always, the rest only while i < ndsts
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
+  }
+}
+
+template <typename T>
+__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }  // ptr's address modulo alignof(Pack128): a byte count, not an element count
+
+// Try to limit consecutive load/stores to 8 (the figures in parentheses below assume UNROLL == 4).
+// Integer division: 2*UNROLL (8) for a single source and destination (MINSRCS+MINDSTS == 2), 1*UNROLL (4) for a sum of 3 or 4
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
     int N) {
   int Nrem = N;
   if (Nrem <= 0) return;
 
-  int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
+  int alignDiff = 0;
+  int align = ptrAlign128(srcs[0]);
+  #pragma unroll
+  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
 
-  // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
-  // If not, we'll just use the slow preamble path for the whole operation
-  bool alignable = (((AlignUp(src0,  alignof(Pack128)) == src0  + Npreamble)) &&
-          (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
-          (!HAS_SRC1  || (AlignUp(src1,  alignof(Pack128)) == src1  + Npreamble)));
-
-  if (!alignable) {
-    Npreamble = Nrem;
-  }
+  int Npreamble = alignDiff ? Nrem :
+    N < alignof(Pack128) ? N :
+    (alignof(Pack128) - align) % alignof(Pack128);
 
   // stage 1: preamble: handle any elements up to the point of everything coming
   // into alignment
-  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
-
-  Nrem -= Npreamble;
-  if (Nrem == 0) return;
-
-  dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
-  src0  += Npreamble; if (HAS_SRC1)  { src1  += Npreamble; }
+  if (Npreamble) {
+    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+    Nrem -= Npreamble;
+    if (Nrem == 0) return;
+  }
+  int offset = Npreamble;
 
   // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
   // assuming the pointers we have are all 128-bit alignable.
@@ -366,35 +375,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
   int nw = nthreads / WARP_SIZE; // Number of warps
   int t = tid % WARP_SIZE;       // Thread (inside the warp)
 
-  const int PackFactor = sizeof(Pack128) / sizeof(T);
+  const int packFactor = sizeof(Pack128) / sizeof(T);
 
   // stage 2a: main loop
-  int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
-      * (UNROLL * nthreads); // round down
+  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+      * (AUTOUNROLL * WARP_SIZE); // round down
+  int Nelem2a = Npack2a * packFactor;
 
-  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
+  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
 
-  int Ndone2a = Nalign2a * PackFactor;
-  Nrem -= Ndone2a;
+  Nrem -= Nelem2a;
   if (Nrem == 0) return;
-  dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
-  src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
+  offset += Nelem2a;
 
   // stage 2b: slightly less optimized for section when we don't have full
-  // UNROLLs
+  // unrolling
 
-  int Nalign2b = Nrem / PackFactor;
+  int Npack2b = Nrem / packFactor;
+  int Nelem2b = Npack2b * packFactor;
 
-  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
+  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
 
-  int Ndone2b = Nalign2b * PackFactor;
-  Nrem -= Ndone2b;
+  Nrem -= Nelem2b;
   if (Nrem == 0) return;
-  dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
-  src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
+  offset += Nelem2b;
 
   // stage 2c: tail
-  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
+  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
 }
 
 #endif // COMMON_KERNEL_H_
diff --git a/projects/rccl/src/collectives/device/functions.cu b/projects/rccl/src/collectives/device/functions.cu
index bc7c175fc5..ed67c1b9df 100644
--- a/projects/rccl/src/collectives/device/functions.cu
+++ b/projects/rccl/src/collectives/device/functions.cu
@@ -1,15 +1,13 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "collectives.h"
 #include "common.h"
 
-
-
 // Workaround for https://reviews.llvm.org/D55580
 __device__ void ncclWorkaroundClangD55580() {}
diff --git a/projects/rccl/src/collectives/device/gen_rules.sh b/projects/rccl/src/collectives/device/gen_rules.sh
new file mode 100755
index 0000000000..4413213e1e
--- /dev/null
+++ b/projects/rccl/src/collectives/device/gen_rules.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+dir=$1  # first positional argument: directory prefix used for every generated .o target
+
+targets="GENOBJS := \\\\\n"  # GENOBJS list accumulated below; the escapes become backslash + newline under the final echo -e
+
+for base in all_reduce all_gather broadcast reduce reduce_scatter; do  # one rule per (collective, op, datatype)
+  opn=0  # numeric index of $op, emitted as -DNCCL_OP
+  for op in sum prod min max; do
+    dtn=0  # numeric index of $dt, emitted as -DNCCL_TYPE
+    for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do
+      echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"  # rule header: object depends on the .cu and its .dep file
+      echo "	@printf \"Compiling  %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"  # recipe lines start with a literal tab
+      echo "	mkdir -p ${dir}"
+      echo "	\${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"  # \${...} is left for make to expand
+      echo ""  # blank line between rules
+      targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"  # append this object as one continuation line of GENOBJS
+      dtn=$(($dtn + 1))
+    done
+    opn=$(($opn + 1))
+  done
+done
+echo -e "$targets"  # print the GENOBJS list after all rules; -e expands the \t and \n escapes
diff --git a/projects/rccl/src/collectives/device/ll_kernel.h b/projects/rccl/src/collectives/device/ll_kernel.h
deleted file mode 100644
index ca7e4d63e5..0000000000
--- a/projects/rccl/src/collectives/device/ll_kernel.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_LL_KERNEL_H_
-#define NCCL_LL_KERNEL_H_
-
-static __device__ __attribute__((noinline)) uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  using Vec = uint32_t __attribute__((ext_vector_type(4)));
-  Vec i4;
-  do {
-    asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
-      "s_waitcnt vmcnt(0)\n"
-      "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
-  } while (i4[1] != flag || i4[3] != flag);
-  uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
-  return val64;
-#else
-  uint32_t data1, flag1, data2, flag2;
-  do {
-    asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
-  } while ((flag1 != flag) || (flag2 != flag));
-  uint64_t val64 = data1 + (((uint64_t)data2) << 32);
-  return val64;
-#endif
-}
-
-static __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  using Vec = uint32_t __attribute__((ext_vector_type(4)));
-  Vec i4;
-  i4[0] = val & 0xffffffff;
-  i4[1] = flag;
-  i4[2] = (val >> 32);
-  i4[3] = flag;
-  asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
-    "s_waitcnt vmcnt(0)\n"
-    "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
-#else
-  asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
-#endif
-}
-
-// Using memcpy handles misaligned pointers.
-static __device__ uint64_t readAL(uint64_t* src) {
-  uint64_t val;
-  memcpy((char*)&val, (char*)src, sizeof(uint64_t));
-  return val;
-}
-static __device__ void storeAL(uint64_t* dst, uint64_t val) {
-  memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
-}
-
-template <typename T, class FUNC>
-class LLPrimitives {
- private:
-  template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
-  __attribute__((noinline))
-  static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    if (size <= 0) return;
-    size_t size64 = size * sizeof(T) / sizeof(uint64_t);
-    uint64_t* src1A = (uint64_t*)src1;
-    uint64_t* dst1A = (uint64_t*)dst1;
-    int offset = threadIdx.x;
-    // Do multiples of 64 bits
-#pragma unroll 1
-    for (; offset < size64; offset += nthreads) {
-      uint64_t val;
-      if (HAS_SRC1) {
-        val = readAL(src1A+offset);
-        if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
-      } else if (HAS_SRC2) {
-        val = readLL(src2+offset, iflag);
-      }
-      if (HAS_DST1) storeAL(dst1A+offset, val);
-      if (HAS_DST2) storeLL(dst2+offset, val, oflag);
-    }
-    // Finish last word
-    int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
-    int sizeRem = size - sizeDone;
-    if (threadIdx.x == 0 && sizeRem) {
-      const T* src1B = src1 + sizeDone;
-      T* dst1B = dst1 + sizeDone;
-
-      uint64_t lastVal;
-      T* vals = (T*)&lastVal;
-
-      if (HAS_SRC2) {
-        uint64_t lastVal2 = readLL(src2+size64, iflag);
-        T* src2B = (T*)&lastVal2;
-        for (int offset = 0; offset < sizeRem; offset++) {
-          vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
-        }
-      } else if (HAS_SRC1) {
-        for (int offset = 0; offset < sizeRem; offset++) {
-          vals[offset] = src1B[offset];
-        }
-      }
-      if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
-      if (HAS_DST1) {
-        for (int offset = 0; offset < sizeRem; offset++) {
-          dst1B[offset] = vals[offset];
-        }
-      }
-    }
-  }
- public:
-  static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
-    return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
-    return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
-  }
-
-  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
-    return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
-  }
-};
-
-// Common macros
-
-#define STEP_TO_SLOT(step) \
-  (step % NCCL_LL_CHUNKS)
-
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define SYNC __syncthreads()
-#else
-#define SYNC asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads))
-#endif
-
-#define WAIT_NEXT \
-  if (tid == 0) { \
-    while (sendHead + NCCL_LL_CHUNKS <= step) { \
-      sendHead = LOAD(sendHeadPtr); \
-    } \
-  } \
-  SYNC;
-
-#define POST_SIZE \
-  if (tid == 0 && sizesFifo) { STORE(sizesFifo + step % NCCL_LL_CHUNKS, (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T))); }
-
-#define ACK_PREV \
-  SYNC; \
-  if (tid == 0) STORE(recvHeadPtr,step);
-
-#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
-  if (step > LOAD(&ring->send.conn.llLastCleaning) + NCCL_LL_CLEAN_FREQ) { \
-    /* Reset all flags */ \
-    static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
-    static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
-    const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
-    for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
-      prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
-    } \
-    __threadfence_system(); \
-    /* Restart from the same slot, only make sure sender waits for data to be reset */ \
-    step += NCCL_LL_CHUNKS; \
-    ACK_PREV; \
-    while (LOAD(sendHeadPtr) < step); \
-    { if (tid == 0) STORE(&ring->send.conn.llLastCleaning, step); }\
-  } \
-  STORE(&ring->send.conn.llStep, step); \
-} while (0);
-
-#endif
diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h
index 436063c1f9..81a4d4cb7f 100644
--- a/projects/rccl/src/collectives/device/primitives.h
+++ b/projects/rccl/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -10,229 +10,635 @@
 
 #include <type_traits>
 #include "reduce_kernel.h" // for reduction funcs
+#include "common.h"
 
+#define SPINS_BEFORE_CHECK_ABORT 1000000
 
-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the reptetion of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
- * corresponding substep by previous step) before executing the transfer.
- * After each substep is transfered, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
+// Call func(i, ...) for every active send peer. Peer 0 is called without an i<nsend
+// test since nsend/nrecv should be at least 1 if SEND/RECV is set.
+#define FOR_SEND(func, ...) do { \
+  if (SEND) { \
+    /* Send to far first, then close: peers 1..min(NSEND,nsend)-1, then peer 0 */ \
+    for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \
+    func(0, ##__VA_ARGS__); \
+  } \
+} while (0)
 
-
-class WaitFlag {
-  volatile uint64_t * const flag;
-  const int shift;
- public:
-  __device__
-  WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__
-  void wait(uint64_t val) { while ((LOAD(flag) + shift) < val) /*SPIN*/; }
-};
-
-
-class PostFlag {
-  volatile uint64_t * const flag;
-  const int shift;
-  volatile int * const fifo;
-  const int fifo_size;
-  uint32_t * hdp_reg;
- public:
-  __device__
-  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
-    : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
-  // remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
-  __device__
-  void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
-  __device__
-  void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
-};
-
-
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__
-bool AnyAre() { return false; }
-
-template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
-  return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
-
-
-// Wait on all WaitFlags, ignore PostFlags
-__device__
-static void WaitOnFlags(uint64_t val) { }
-
-template <typename... TAIL_Ts> __device__
-static void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
-  flag.wait(val);
-  WaitOnFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__
-static void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
-  WaitOnFlags(val, tail...);
-}
-
-
-// Post all PostFlags, ignore WaitFlags
-__device__
-static void PostToFlags(uint64_t val) { }
-
-template <typename... TAIL_Ts> __device__
-static void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
-  PostToFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__
-static void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
-  flag.post(val);
-  PostToFlags(val, tail...);
-}
-
-
-// Post sizes for PostFlags, ignore WaitFlags
-__device__
-static void PostSizeToFlags(uint64_t step, int size) { }
-
-template <typename... TAIL_Ts> __device__
-static void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
-  PostSizeToFlags(step, size, tail...);
-}
-
-template <typename... TAIL_Ts> __device__
-static void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
-  flag.postSize(step, size);
-  PostSizeToFlags(step, size, tail...);
-}
-
-
-// Create pointer arithmetic syntax that doesn't break for std::nullptr_t
-template <typename Tptr> __device__
-static Tptr ptradd(Tptr ptr, int i) {
-  return ptr + i;
-}
-
-__device__
-static std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
-  return nullptr;
-}
-
-// use different unroll numbers for all primitives for best throughput
-#define COPY_UNROLL       4
-#define REDUCE_UNROLL     2
-#define DOUBLECOPY_UNROLL 2
-#define REDUCECOPY_UNROLL 2
+#define FOR_RECV(func, ...) do { \
+  if (RECV) { \
+    /* Recv from close first, then far: peer 0 unconditionally, then peers 1..min(NRECV,nrecv)-1 */ \
+    func(0, ##__VA_ARGS__); \
+    for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \
+  } \
+} while (0)
 
 // Implementation of primitive types
-template <int, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
-class Primitives {
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+class ncclPrimitives {
  private:
-  template <int UNROLL,
-      typename SRC2_T, // either T* or std::nullptr_t
-      typename DST2_T, // either T* or std::nullptr_t
-      typename... SYNC_Ts> // either WaitFunc or PostFunc
-  static __device__ __attribute__((noinline)) void
-  GenericOp(const int tid, const int nthreads,
-      const T*     src1,
-      const SRC2_T src2,
-      T*     dst1,
-      DST2_T dst2,
-      int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
+  const int tid; // thread index given to the constructor
+  const int nthreads; // thread count given to the constructor; passed to ReduceOrCopyMulti and the non-HIP barrier
+  int nrecv = 0; // number of recv connections loaded (incremented by loadRecvConn)
+  int nsend = 0; // number of send connections loaded (incremented by loadSendConn)
+  const int stepSize; // elements of T per step; slot offset is (step % NCCL_STEPS) * stepSize
+  struct ncclConnInfo* recvConn[NRECV];
+  struct ncclConnInfo* sendConn[NSEND];
+  volatile uint64_t* waitPtr; // flag this thread polls: recv tail when tid == i, send head when tid == WARP_SIZE+i
+  uint64_t recvStep[NRECV]; // advanced by SLICESTEPS in waitRecv
+  uint64_t sendStep[NSEND]; // advanced by SLICESTEPS in waitSend
+  uint64_t sendConnHead[NSEND]; // last value read from the send connection's head flag
+  const T* recvDirectBuff[NRECV]; // NULL unless loadRecvConn was given a direct buffer and the connection is direct
+  T* sendDirectBuff[NSEND]; // NULL unless loadSendConn obtained a pointer through ptrExchange
+  const T* recvBuff[NRECV];
+  T* sendBuff[NSEND];
+  struct ncclDevComm* comm;
+  uint32_t* abortCount; // copied from channel->abortCount; handed to exitIfAbortBarrier
 
-    enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
-    enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
-    static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
-        "src2 must be of type T* or std::nullptr_t");
-    static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
-        "dst2 must be of type T* or std::nullptr_t");
+  __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } // element offset of the current recv slot
+  __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } // element offset of the current send slot
+  __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); } // recvBuff[i] advanced to the current slot
+  __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); } // sendBuff[i] advanced to the current slot
 
-    using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
-
-    int sliceSize = len / SUBSTEPS;
-    int sliceOffset = 0;
-
-#pragma unroll 1
-    for (int sub=0; sub<SUBSTEPS; ++sub) {
-      int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
-      if (tid < nthreads) {
-        if (AnyAre<WaitFlag>(flags...)) {
-          if (tid == 0) {
-            WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
-          }
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-          __syncthreads();
+  __device__ void barrier() { // block-level synchronization used by GenericOp before the copy
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    __syncthreads(); // HIP builds
 #else
-          asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); // other builds: barrier 1 with nthreads as thread count
 #endif
-        }
-        ReduceOrCopy
-        <
-        UNROLL,
-        OpType,
-        T,
-        !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
-        !std::is_same<SRC2_T, std::nullptr_t>::value  // HAS_SRC1
-        >
-        (
-            tid, nthreads,
-            ptradd(dst1, sliceOffset),
-            ptradd(dst2, sliceOffset),
-            ptradd(src1, sliceOffset),
-            ptradd(src2, sliceOffset),
-            realSize
-        );
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-          if(tid == 0)
-            PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
-          __threadfence_system();
-          if(tid == 0)
-            PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+  }
+
+  uint32_t mismatch = 0; // set once the peer's op count was seen ahead of ours; reset by waitRecv/waitSend
+  const uint64_t opCount; // op count of this collective, given to the constructor
+
+  __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { // called from checkAbort while spinning
+    if (mismatch) {
+      // In non-LL, we use __threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
+      STORE(comm->fatalDevError, ncclDevAssertedMismatch);
+    } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) { // first sighting only records it; the next call reports
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0; // polls since the last abort check; reset by waitRecv/waitSend
+  uint32_t abort = 0; // last value loaded from comm->abortFlag
+
+  __device__ int checkAbort(volatile uint64_t* remoteOpCount) { // returns the cached abort value; nonzero tells the caller to stop waiting
+    spins++;
+    if (spins == SPINS_BEFORE_CHECK_ABORT) { // only every SPINS_BEFORE_CHECK_ABORT calls: reload the flag and check for a mismatch
+      abort = LOAD(comm->abortFlag);
+      checkMismatch(remoteOpCount);
+      spins = 0;
+    }
+    return abort;
+  }
+
+  __device__ void waitRecv(int i) { // advance recv step i and, on thread i only, wait for the tail flag to reach it
+    spins = 0;
+    mismatch = 0;
+    recvStep[i] += SLICESTEPS; // every thread advances its own copy of the step
+    if (tid == i) { // thread i is the one whose waitPtr was set to this connection's tail in loadRecvConn
+#ifdef ENABLE_PROFILING
+      auto devProf = comm->devProf;
+      uint64_t t0 = clock64();
+#endif
+      while (LOAD(waitPtr) < recvStep[i]) { // spin until the flag reaches the new step
+        if (checkAbort(recvConn[i]->opCountRem)) break; // stop waiting once an abort was observed
+      }
+#ifdef ENABLE_PROFILING
+      __atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST); // accumulate cycles spent waiting
+#endif
+    }
+  }
+
+  __device__ void waitSend(int i) { // advance send step i and, on thread WARP_SIZE+i only, wait until the head is within NCCL_STEPS of it
+    spins = 0;
+    mismatch = 0;
+    sendStep[i] += SLICESTEPS; // every thread advances its own copy of the step
+    if (tid == WARP_SIZE+i) { // this thread's waitPtr was set to the connection's head in loadSendConn
+#ifdef ENABLE_PROFILING
+      auto devProf = comm->devProf;
+      uint64_t t0 = clock64();
+#endif
+      while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { // the cached head is tested first, so no load happens when it already suffices
+        sendConnHead[i] = LOAD(waitPtr);
+        if (checkAbort(sendConn[i]->opCountRem)) break; // stop waiting once an abort was observed
+      }
+#ifdef ENABLE_PROFILING
+      __atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST); // accumulate cycles spent waiting
+#endif
+    }
+  }
+
+  inline __device__ void postRecv(int i) { // publish the current recv step to the connection's head flag
+    STORE(recvConn[i]->head, recvStep[i]);
+  }
+
+  inline __device__ void postSend(int i) { // publish the current send step to the connection's tail flag
+    if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1); // HDP register write before the flag; the replaced PostFlag code noted remote writes can otherwise be reordered
+    STORE(sendConn[i]->tail, sendStep[i]);
+  }
+
+  __device__ void postSendSize(int i, int size) { // record size in the fifo entry of the step that preceded the waitSend advance; no-op without a fifo
+    if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size);
+  }
+
+  template <int DIRECTRECV>
+  __device__ const T* directRecvPtr(int i, int directOffset) { // source pointer for recv connection i
+    return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i); // direct buffer when enabled and present, else the current recv slot
+  }
+
+  template <int DIRECTSEND>
+  __device__ T* directSendPtr(int i, int directOffset) { // destination pointer for send connection i
+    return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i); // direct buffer when enabled and present, else the current send slot
+  }
+
+  template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST> // RECV/SEND: use peer buffers; SRC/DST: use srcPtr/dstPtr
+  __device__ void
+  GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { // moves one chunk of nelem elements in SLICESPERCHUNK slices
+    int offset = 0; // elements of the chunk covered by previous slices
+    int sliceSize = stepSize * SLICESTEPS; // elements per slice
+
+    const T* srcs[RECV*NRECV+SRC]; // sources: srcPtr first (when SRC), then the recv peers
+    srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
+    if (RECV) {
+      if (SRC) srcs[1] = recvPtr(0);
+      for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
+    }
+
+    T* dsts[SEND*NSEND+DST]; // destinations: dstPtr first (when DST), then the send peers
+    dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
+    if (SEND) {
+      if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
+      for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
+    }
+
+    #pragma unroll 1
+    for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
+      int realSize = max(0, min(sliceSize, nelem-offset)); // elements left for this slice, possibly 0
+      FOR_SEND(waitSend); // both waits also advance the step counters by SLICESTEPS
+      FOR_RECV(waitRecv);
+      if (realSize > 0) {
+        barrier();
+        if (DIRECTRECV && recvDirectBuff[0]) {
+          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
+          if (SEND) {
+            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+          }
+        } else {
+          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
         }
       }
-      sliceOffset += sliceSize;
+      exitIfAbortBarrier(abort, abortCount);
+      if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T)); // size is posted in bytes
+      if (SEND) __threadfence_system(); // fence between the data stores and the tail update
+      if (tid == 0) FOR_SEND(postSend);
+      if (tid == 0) FOR_RECV(postRecv);
+      for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize; // advance per slice, in step with the step counters bumped by the waits
+      for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
+      offset += sliceSize; // must change inside the loop, otherwise realSize is always computed from offset 0
+    }
+  }
+
+  __device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) { // bind recv slot i to conn and load its buffer and step
+    recvConn[i] = conn;
+    recvBuff[i] = (const T*)LOAD(&recvConn[i]->buff);
+    recvStep[i] = LOAD(&recvConn[i]->step);
+    recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); // start on a whole-chunk step boundary
+    // Return credits in case we rounded up.
+    if (tid == 0) STORE(recvConn[i]->head, recvStep[i]);
+    if (tid == i) { // thread i becomes the waiter for this connection (see waitRecv)
+      waitPtr = LOAD(&recvConn[i]->tail);
+      STORE(recvConn[i]->opCountLoc, opCount); // publish our op count for this connection
+    }
+    recvDirectBuff[i] = NULL;
+    if (directBuff && recvConn[i]->direct) { // direct mode needs both a caller buffer and a direct connection
+      recvDirectBuff[i] = directBuff;
+      if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff); // hand the buffer address over through ptrExchange
+    }
+    nrecv++;
+  }
+
+  __device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { // bind send slot i to conn and load its buffer and step
+    sendConn[i] = conn;
+    sendBuff[i] = (T*)LOAD(&sendConn[i]->buff);
+    sendStep[i] = LOAD(&sendConn[i]->step);
+    sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); // start on a whole-chunk step boundary
+    if (tid == WARP_SIZE+i) { // thread WARP_SIZE+i becomes the waiter for this connection (see waitSend)
+      waitPtr = LOAD(&sendConn[i]->head);
+      sendConnHead[i] = LOAD(waitPtr); // prime the cached head value
+      STORE(sendConn[i]->opCountLoc, opCount); // publish our op count for this connection
+    }
+    sendDirectBuff[i] = NULL;
+    if (directBuff && sendConn[i]->direct) { // direct mode needs both a caller buffer and a direct connection
+      void* volatile* ptr = sendConn[i]->ptrExchange;
+      while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL); // spin until a non-NULL pointer appears in ptrExchange
+      __syncthreads(); // barrier before thread 0 clears the slot, so the clear follows every thread's read
+      if (tid == 0) STORE(ptr, NULL);
+    }
+    nsend++;
+  }
+
+  __device__ void saveRecvConn(int i) { // called from the destructor; only thread i writes
+    if (tid == i) {
+      STORE(&recvConn[i]->step, recvStep[i]); // store the step back into the connection
+      __threadfence_system(); // fence between the step store and the op-count increment
+      __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
+    }
+  }
+
+  __device__ void saveSendConn(int i) { // called from the destructor; only thread WARP_SIZE+i writes
+    if (tid == WARP_SIZE+i) {
+      STORE(&sendConn[i]->step, sendStep[i]); // store the step back into the connection
+      __threadfence_system(); // fence between the step store and the op-count increment
+      __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
     }
   }
 
  public:
-  template <typename... SYNC_Ts>
-  static __device__ void
-  Copy(const int tid, const int nthreads, const T* src, T* dst,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<COPY_UNROLL>(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
+  __device__
+  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : tid(tid), nthreads(nthreads), stepSize(stepSize), comm(comm), opCount(opCount) { // initializers listed in member declaration order
+    abortCount = channel->abortCount;
+    // Make sure step is updated before we read it
+    __syncthreads();
+
+    // disable directBuff: 0 is passed instead of the directBuff argument, so recvDirectBuff/sendDirectBuff stay NULL
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0); // peer list ends at the first negative entry
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
   }
 
-  template <typename... SYNC_Ts>
-  static __device__ void
-  DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<DOUBLECOPY_UNROLL>(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
+  __device__ void
+  send(const T* src, int nelem) {
+    GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0); // SEND+SRC: srcs = {src}, dsts = {send peers}
+  }
+  __device__ void
+  directSend(const T* src, int directOffset, int nelem) {
+    GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset); // like send, with DIRECTSEND: peers with a sendDirectBuff are written at directOffset
   }
 
-  template <typename... SYNC_Ts>
-  static __device__ void
-  Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<REDUCE_UNROLL>(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
+  __device__ void
+  recv(T* dst, int nelem) {
+    GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0); // RECV+DST: srcs = {recv peers}, dsts = {dst}
+  }
+  __device__ void
+  directRecv(T* dst, int directOffset, int nelem) {
+    GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset); // like recv, with DIRECTRECV: the copy is skipped when recvDirectBuff[0] is set
   }
 
-  template <typename... SYNC_Ts>
-  static __device__ void
-  ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
-      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
-    GenericOp<REDUCECOPY_UNROLL>(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
+  __device__ void
+  copySend(const T* src, T* dst, int nelem) {
+    GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0); // SEND+SRC+DST: srcs = {src}, dsts = {dst, send peers}
+  }
+  __device__ void
+  directCopySend(const T* src, T* dst, int directOffset, int nelem) {
+    GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset); // like copySend, with DIRECTSEND and directOffset
+  }
+
+  __device__ void
+  recvCopySend(T* dst, int nelem) {
+    GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0); // RECV+SEND+DST: srcs = {recv peers}, dsts = {dst, send peers}
+  }
+  __device__ void
+  directRecvCopySend(T* dst, int directOffset, int nelem) {
+    GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset); // like recvCopySend, with both DIRECTRECV and DIRECTSEND
+  }
+
+  __device__ void
+  recvReduceCopy(const T* src, T* dst, int nelem) {
+    GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0); // RECV+SRC+DST: srcs = {src, recv peers}, dsts = {dst}
+  }
+
+  __device__ void
+  recvReduceSend(const T* src, int nelem) {
+    GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0); // RECV+SEND+SRC: srcs = {src, recv peers}, dsts = {send peers}
+  }
+
+  __device__ void
+  recvReduceCopySend(const T* src, T* dst, int nelem) {
+    GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0); // all four: srcs = {src, recv peers}, dsts = {dst, send peers}
+  }
+  __device__ void
+  directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
+    // Direct is only for the send part (DIRECTRECV=0, DIRECTSEND=1)
+    GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset); // srcs = {src, recv peers}, dsts = {dst, send peers}
+  }
+
+  __device__ ~ncclPrimitives() {
+    // Save steps for next collective. saveRecvConn(i) writes from thread i and
+    // saveSendConn(i) from thread WARP_SIZE+i (not from thread 0).
+    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
   }
 };
 
-#endif // end include guard
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLLPrimitives {
+ private:
+  const int tid;
+  const int nthreads;
+  int nrecv = 0;
+  int nsend = 0;
+  struct ncclConnInfo* recvConn[NRECV];
+  struct ncclConnInfo* sendConn[NSEND];
+  volatile uint64_t* waitPtr;
+  volatile uint64_t* postPtr;
+  volatile int* fifoPtr;
+  uint64_t recvStep[NRECV];
+  uint64_t sendStep[NSEND];
+  uint64_t sendConnHead;
+  union ncclLLFifoLine* recvBuff[NRECV];
+  union ncclLLFifoLine* sendBuff[NSEND];
+  struct ncclDevComm* comm;
+  uint32_t* abortCount;
+
+  __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } // offset of the current recv slot, counted in ncclLLFifoLine entries
+  __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } // offset of the current send slot, counted in ncclLLFifoLine entries
+  __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } // recvBuff[i] advanced to the current slot
+  __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } // sendBuff[i] advanced to the current slot
+  __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } // flag value derived from the step after the current recv step
+  __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } // flag value derived from the step after the current send step
+
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
+  // Exit If Abort Barrier : make sure all threads exit consistently
+  // Each thread sets a predicate to true if val == 1
+  // all CTA's threads enter the barrier and do a popc on their predicates being True
+  // If any of the thread's predicate was True, all the threads call exit()
+  __device__ void exitIfAbortLocalBarrier() {
+    uint32_t popc;
+    asm ("{");
+    asm volatile ("   .reg .pred barr_pred;");
+    asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+    asm volatile ("   bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
+    asm ("}");
+    if (popc) {
+      // Make sure threads not participating in the operation get the abort and all threads exit
+      exitIfAbortBarrier(1);
+    }
+  }
+#endif
+
+  __device__ void barrier() {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    __syncthreads();
+#else
+    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+#endif
+  }
+
+  uint32_t mismatch = 0;
+  const uint64_t opCount;
+
+  __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+    if (mismatch > 20) {
+      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
+      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
+      STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
+    } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0;
+  uint32_t abort = 0;
+
+  __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+    spins++;  // count calls made since the abort flag was last sampled
+    if (spins == SPINS_BEFORE_CHECK_ABORT) {  // sample only on every SPINS_BEFORE_CHECK_ABORT-th call
+      abort = LOAD(comm->abortFlag);
+      checkMismatch(remoteOpCount);  // same cadence: see whether the peer's op count has moved past ours
+      spins = 0;
+    }
+    return abort;  // most recently sampled value; unchanged between samples
+  }
+
+  __device__ void waitSend(int i, int nbytes) {
+    spins = 0;  // restart the abort-poll cadence for this wait
+    mismatch = 0;
+    if (tid == WARP_SIZE+i) {  // the same thread that loadSendConn() handed waitPtr/fifoPtr for peer i
+      while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {  // spin while sendStep[i] is NCCL_STEPS or more ahead of the cached head
+        sendConnHead = LOAD(waitPtr);
+        if (checkAbort(sendConn[i]->opCountRem)) break;  // stop waiting once an abort has been observed
+      }
+      if (fifoPtr) {
+        int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;  // cleanup steps publish the whole slice size, presumably because postSend() then flags the slice up to its end
+        STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);  // one size entry per step slot
+      }
+    }
+  }
+
+  __device__ void postRecv(int i) {
+    recvStep[i]++;
+    if (tid == i) STORE(postPtr, recvStep[i]);
+  }
+
+  __device__ void postSend(int i, int offset) {
+    // LL Cleanup : write all flags in the slice to make sure we don't have
+    // data corruption when flag loops over.
+    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {  // only on steps whose low bits all match the clean mask
+      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));  // LLGenericOp passes the offset where its data loop stopped; zero payload, current flag
+    }
+    sendStep[i]++;  // move on to the next step for peer i
+  }
+
+  __device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
+    union ncclLLFifoLine* src = recvPtr(i) + offset;  // line `offset` of recv peer i's current step slot
+    uint32_t flag = recvFlag(i);  // flag value both halves of the line must carry for this step
+    spins = 0;
+    mismatch = 0;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    using Vec = uint32_t __attribute__((ext_vector_type(4)));
+    Vec i4;  // lanes: [0]=data low, [1]=flag, [2]=data high, [3]=flag (same layout storeLL writes)
+    do {
+      asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
+        "s_waitcnt vmcnt(0)\n"
+        "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
+      if (i4[1] == flag && i4[3] == flag) break;  // both flags match: the 64-bit payload is complete
+    } while (!checkAbort(recvConn[i]->opCountRem));  // keep polling unless an abort has been observed
+    uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
+#else
+    uint32_t data1, flag1, data2, flag2;  // declared here because only this branch uses them
+    do {
+      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+      if (checkAbort(recvConn[i]->opCountRem)) break;
+    } while ((flag1 != flag) || (flag2 != flag));
+    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+#endif
+    return val64;  // after an abort this may hold a partially written line
+  }
+
+  __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  using Vec = uint32_t __attribute__((ext_vector_type(4)));
+  Vec i4;
+  i4[0] = val & 0xffffffff;
+  i4[1] = flag;
+  i4[2] = (val >> 32);
+  i4[3] = flag;
+  asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
+    "s_waitcnt vmcnt(0)\n"
+    "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
+#else
+    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+#endif
+  }
+
+  // Using memcpy handles misaligned pointers.
+  __device__ uint64_t readAL(uint64_t* src) {
+    uint64_t val;
+    memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+    return val;
+  }
+
+  __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
+    memcpy((char*)dst, (char*)&val, nbytes);
+  }
+
+  template <int RECV, int SEND, int SRC, int DST>  // RECV: fold in peer data, SEND: forward to peers, SRC: read srcPtr, DST: write dstPtr
+  __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+    uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);  // a negative element count is treated as an empty transfer
+    FOR_SEND(waitSend, nbytes*2);  // each 8-byte word travels as one 16-byte line (two data words + two flags, see storeLL)
+    barrier();
+    uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));  // number of 64-bit words, counting a trailing partial one
+    uint64_t* srcPack = (uint64_t*)srcPtr;
+    uint64_t* dstPack = (uint64_t*)dstPtr;
+    int offset = tid;  // each thread handles words tid, tid+nthreads, ...
+    // Do multiples of 64 bits
+    #pragma unroll 1
+    for (; offset<npack; offset+=nthreads) {
+      // Recv : local, then intra-node, then inter-node
+      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);  // without a local source the first recv peer supplies the starting value
+      if (RECV) {
+        if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);  // peer 0 was not consumed above, so combine it now
+        for (int i=1; i<NRECV && i<nrecv; i++) {
+          val = MULTI<FUNC, T>()(readLL(i, offset), val);
+        }
+      }
+
+      // Send : inter-node, then intra-node, then local
+      if (SEND) {
+        for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+        storeLL(sendPtr(0)+offset, val, sendFlag(0));  // peer 0 is written after the others
+      }
+      if (DST) {
+        if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {  // true only when this word's byte offset equals nbytes with its low 3 bits cleared
+          // Last incomplete word
+          storeAL(dstPack+offset, val, nbytes & 0x7);  // copy just the bytes that belong to the buffer
+        } else {
+          storeAL(dstPack+offset, val, sizeof(uint64_t));
+        }
+      }
+    }
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+    exitIfAbortBarrier(abort, abortCount);  // defined outside this class; takes this thread's abort value and the channel's abort counter
+#else
+    exitIfAbortLocalBarrier();
+#endif
+    FOR_RECV(postRecv);
+    FOR_SEND(postSend, offset);  // offset is where this thread's loop stopped; postSend uses it as the start of its cleanup pass
+  }
+
+  __device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+    recvConn[i] = conn;
+    recvBuff[i] = recvConn[i]->llBuff;
+    recvStep[i] = recvConn[i]->step;
+    if (tid == i) {
+      postPtr = recvConn[i]->head;
+      STORE(recvConn[i]->opCountLoc, opCount);
+    }
+    nrecv++;
+  }
+
+  __device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+    sendConn[i] = conn;
+    sendBuff[i] = sendConn[i]->llBuff;
+    sendStep[i] = sendConn[i]->step;
+    if (tid == WARP_SIZE+i) {
+      waitPtr = sendConn[i]->head;
+      fifoPtr = sendConn[i]->fifo;
+      sendConnHead = LOAD(waitPtr);
+      STORE(sendConn[i]->opCountLoc, opCount);
+    }
+    nsend++;
+  }
+
+  __device__ void saveRecvConn(int i) {
+    if (tid == i) {
+      recvConn[i]->step = recvStep[i];
+      __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
+      __threadfence_block();
+    }
+  }
+
+  __device__ void saveSendConn(int i) {
+    if (tid == WARP_SIZE+i) {
+      sendConn[i]->step = sendStep[i];
+      __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
+      __threadfence_block();
+    }
+  }
+
+ public:
+  __device__
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : tid(tid), nthreads(nthreads), comm(comm), opCount(opCount) {  // listed in member declaration order, which is the order C++ initializes them in
+    abortCount = channel->abortCount;
+    // Make sure step is updated before we read it.
+    barrier();
+
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);  // a negative peer id ends the list early
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+  }
+
+  __device__ void send(const T* src, int nelem) {
+    return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recv(T* dst, int nelem) {
+    return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceSend(const T* src, int nelem) {
+    return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+    return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void copySend(const T* src, T* dst, int nelem) {
+    return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void recvCopySend(T* dst, int nelem) {
+    return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ ~ncclLLPrimitives() {
+    // Save steps for the next operation
+    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+  }
+};
+
+#ifdef ENABLE_PROFILING  // both macros use tid, t0, ws, wr and devProf from the expansion site; ACCUMULATE_COUNTER also uses nelem and T
+#define INIT_COUNTER \
+  if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
+    wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }
+
+#define ACCUMULATE_COUNTER(prim) \
+  if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
+    + ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
+    + wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
+    __ATOMIC_SEQ_CST); \
+    __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
+#else  // profiling disabled: both macros expand to nothing
+#define INIT_COUNTER
+#define ACCUMULATE_COUNTER(prim)
+#endif  // ENABLE_PROFILING
+
+#endif
diff --git a/projects/rccl/src/collectives/device/reduce.cu b/projects/rccl/src/collectives/device/reduce.cu
index bd1d23ce79..dbfa1b7fad 100644
--- a/projects/rccl/src/collectives/device/reduce.cu
+++ b/projects/rccl/src/collectives/device/reduce.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,12 +11,7 @@
 
 #define UNROLL 4
 
-#if NCCL_OP == 0
 IMPL_COLL2(ncclReduce, sum,  FuncSum,  ncclCollReduce, ncclSum);
-#elif NCCL_OP == 1
 IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
-#elif NCCL_OP == 2
 IMPL_COLL2(ncclReduce, min,  FuncMin,  ncclCollReduce, ncclMin);
-#elif NCCL_OP == 3
 IMPL_COLL2(ncclReduce, max,  FuncMax,  ncclCollReduce, ncclMax);
-#endif
diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h
index c7d6eb11b7..fca4714faf 100644
--- a/projects/rccl/src/collectives/device/reduce.h
+++ b/projects/rccl/src/collectives/device/reduce.h
@@ -1,153 +1,82 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
-  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
   const int rank = ring->devUserRanks[0];
   const int prevRank = ring->devUserRanks[nranks-1];
   const int root = args->root;
 
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-
-    if (rank != root) {
-      // Wait for next to be ready
-      WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-      waitOpCountNext.wait(args->opCount);
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int boffset = 0;
-
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t offset = gridOffset + bid*chunkSize;
-    int maxOffset = min(chunkSize, size-offset);
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t offset = gridOffset + bid*realChunkSize;
+    int nelem = min(realChunkSize, size-offset);
     if (prevRank == root) {
-      Prims::Copy(tid, nthreads,
-          thisInput + offset,
-          nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.send(thisInput+offset, nelem);
     } else if (rank == root) {
-      Prims::Reduce(tid, nthreads,
-          prevInput  + boffset,
-          thisInput + offset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
     } else {
-      Prims::Reduce(tid, nthreads,
-          prevInput + boffset,
-          thisInput + offset,
-          nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
+      prims.recvReduceSend(thisInput+offset, nelem);
     }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  if (tid == 0) {
-    if (rank != root) {
-      // Wait for next to have consumed data before resetting the flag
-      waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
-      STORE(ring->send.conn.head, 0ULL);
-    }
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
   }
 }
 
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  boffset += NCCL_LL_SLICE_LINES; \
-  if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
-  flag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }  // empty stub: this header provides no tree-based Reduce
 
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
-  const int nranks = comm->nRanks;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t size = args->N;
   const int rank = comm->rank;
+  const int nranks = comm->nRanks;
   const int prevRank = ring->devUserRanks[nranks-1];
   const int root = args->root;
 
-  typedef LLPrimitives<T, FUNC> LL;
-
-  const ssize_t size = args->N;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t flag = step + 1;
-  int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -155,39 +84,17 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
     }
     ssize_t offset = gridOffset + bid*chunkSize;
 
-    int maxOffset = min(chunkSize, size-offset);
+    int nelem = min(chunkSize, size-offset);
     if (prevRank == root) {
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput + offset,
-          nextOutput + boffset,
-          maxOffset, flag, llNthreads);
-      POST_SIZE;
-      NEXT_STEP_LL;
+      LLprims.send(thisInput+offset, nelem);
     } else if (rank == root) {
-      LL::ReduceCopy(
-          thisInput + offset,
-          prevInput  + boffset,
-          thisOutput + offset,
-          maxOffset, flag, llNthreads);
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
     } else {
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput + offset,
-          prevInput + boffset,
-          nextOutput + boffset,
-          maxOffset, flag, flag, llNthreads);
-      POST_SIZE;
-      NEXT_STEP_LL;
-      ACK_PREV;
+      LLprims.recvReduceSend(thisInput+offset, nelem);
     }
   }
-
-  // We need everyone to acknowledge data even if they didn't receive anything
-  // so that the next collective can start right away.
-  ACK_PREV;
-
-  FIFO_CLEANING_AND_SAVE_STEP(flag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }  // empty stub: this header provides no tree-based LL Reduce
diff --git a/projects/rccl/src/collectives/device/reduce_0.cpp b/projects/rccl/src/collectives/device/reduce_0.cpp
deleted file mode 100644
index f1b83bc655..0000000000
--- a/projects/rccl/src/collectives/device/reduce_0.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 0
-#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_1.cpp b/projects/rccl/src/collectives/device/reduce_1.cpp
deleted file mode 100644
index 63b157075e..0000000000
--- a/projects/rccl/src/collectives/device/reduce_1.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 1
-#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_2.cpp b/projects/rccl/src/collectives/device/reduce_2.cpp
deleted file mode 100644
index 7c84b0ada3..0000000000
--- a/projects/rccl/src/collectives/device/reduce_2.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 2
-#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_3.cpp b/projects/rccl/src/collectives/device/reduce_3.cpp
deleted file mode 100644
index c590bdd3c6..0000000000
--- a/projects/rccl/src/collectives/device/reduce_3.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 3
-#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_kernel.h b/projects/rccl/src/collectives/device/reduce_kernel.h
index 86e0f56a12..4c5caa9f28 100644
--- a/projects/rccl/src/collectives/device/reduce_kernel.h
+++ b/projects/rccl/src/collectives/device/reduce_kernel.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -19,7 +19,7 @@ struct FuncNull {
   }
 };
 
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 
 //we really don't need any specializations and we don't need
 //to break things into uint32_t
@@ -164,30 +164,31 @@ struct FuncMin {
   }
 };
 
+// Adds the four packed 8-bit lanes of x and y; every lane wraps on its own.
+// Valid for signed and unsigned lanes: two's-complement addition yields the same bits.
+static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
+  // The masks are function-local constants rather than macros so nothing generic leaks from this header.
+  const uint32_t evenLanes = 0x00ff00ff;  // bytes 0 and 2
+  const uint32_t oddLanes = 0xff00ff00;   // bytes 1 and 3
+  // Even and odd lanes are summed separately: that leaves a zero byte above each lane,
+  // so a carry out of one lane lands in bits the final mask throws away.
+  const uint32_t evenSum = (x & evenLanes) + (y & evenLanes);
+  const uint32_t oddSum = (x & oddLanes) + (y & oddLanes);
+  return (evenSum & evenLanes) | (oddSum & oddLanes);
+}
+
 template<>
 struct FuncSum<int8_t> {
-  union converter { uint32_t storage; char4 a; };
   __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
     int32_t rv, z=0;
     asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
     return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vadd.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = cx.a.x + cy.a.x;
-    cr.a.y = cx.a.y + cy.a.y;
-    cr.a.z = cx.a.z + cy.a.z;
-    cr.a.w = cx.a.w + cy.a.w;
-    return cr.storage;
+    return addChar4(x, y);
+#endif
 #endif
   }
   __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -196,28 +197,16 @@ struct FuncSum<int8_t> {
 };
 template<>
 struct FuncSum<uint8_t> {
-  union converter { uint32_t storage; uchar4 a; };
   __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
     int32_t rv, z=0;
     asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
     return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vadd.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-        "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = cx.a.x + cy.a.x;
-    cr.a.y = cx.a.y + cy.a.y;
-    cr.a.z = cx.a.z + cy.a.z;
-    cr.a.w = cx.a.w + cy.a.w;
-    return cr.storage;
+    return addChar4(x, y);
+#endif
 #endif
   }
   __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -227,22 +216,6 @@ struct FuncSum<uint8_t> {
 
 static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
   /* This can be used both for signed and unsigned 8-bit multiplication */
-#if (__CUDA_ARCH__ >= 300)
-  uint32_t rv;
-  asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
-      " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
-      " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
-      " shl.b32          t3, t3, 16;\n\t"
-      " shl.b32          t2, t2, 16;\n\t"
-      " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
-      " shl.b32          t1, t1, 8;\n\t"
-      " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
-      " and.b32          t1, t1, 0xff00ff00;\n\t"
-      " and.b32          t0, t0, 0x00ff00ff;\n\t"
-      " or.b32           %0,  t0, t1;\n\t"
-      "}" : "=r"(rv) : "r"(x), "r"(y));
-  return rv;
-#else
   union converter { uint32_t storage; char4 a; };
   converter cx, cy, cr;
   cx.storage = x;
@@ -252,7 +225,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
   cr.a.z = cx.a.z * cy.a.z;
   cr.a.w = cx.a.w * cy.a.w;
   return cr.storage;
-#endif
 }
 
 template<>
@@ -278,17 +250,12 @@ template<>
 struct FuncMax<int8_t> {
   union converter { uint32_t storage; char4 a; };
   __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
     int32_t rv, z=0;
     asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
     return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmax.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
     converter cx, cy, cr;
     cx.storage = x;
@@ -298,6 +265,7 @@ struct FuncMax<int8_t> {
     cr.a.z = max(cx.a.z, cy.a.z);
     cr.a.w = max(cx.a.w, cy.a.w);
     return cr.storage;
+#endif
 #endif
   }
   __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -308,17 +276,12 @@ template<>
 struct FuncMax<uint8_t> {
   union converter { uint32_t storage; uchar4 a; };
   __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
     int32_t rv, z=0;
     asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
     return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmax.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
     converter cx, cy, cr;
     cx.storage = x;
@@ -328,6 +291,7 @@ struct FuncMax<uint8_t> {
     cr.a.z = max(cx.a.z, cy.a.z);
     cr.a.w = max(cx.a.w, cy.a.w);
     return cr.storage;
+#endif
 #endif
   }
   __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -339,17 +303,12 @@ template<>
 struct FuncMin<int8_t> {
   union converter { uint32_t storage; char4 a; };
   __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
     int32_t rv, z=0;
     asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
     return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmin.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
     converter cx, cy, cr;
     cx.storage = x;
@@ -359,6 +318,7 @@ struct FuncMin<int8_t> {
     cr.a.z = min(cx.a.z, cy.a.z);
     cr.a.w = min(cx.a.w, cy.a.w);
     return cr.storage;
+#endif
 #endif
   }
   __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -369,17 +329,12 @@ template<>
 struct FuncMin<uint8_t> {
   union converter { uint32_t storage; uchar4 a; };
   __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
     int32_t rv, z=0;
     asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
     return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
-    int32_t rv;
-    asm("vmin.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-        "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
 #else
     converter cx, cy, cr;
     cx.storage = x;
@@ -389,6 +344,7 @@ struct FuncMin<uint8_t> {
     cr.a.z = min(cx.a.z, cy.a.z);
     cr.a.w = min(cx.a.w, cy.a.w);
     return cr.storage;
+#endif
 #endif
   }
   __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -480,6 +436,6 @@ struct FuncMin<half> {
   }
 };
 
-#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 
 #endif // REDUCE_KERNEL_H_
diff --git a/projects/rccl/src/collectives/device/reduce_scatter.cu b/projects/rccl/src/collectives/device/reduce_scatter.cu
index efff65deba..82cb408a16 100644
--- a/projects/rccl/src/collectives/device/reduce_scatter.cu
+++ b/projects/rccl/src/collectives/device/reduce_scatter.cu
@@ -11,12 +11,7 @@
 
 #define UNROLL 4
 
-#if NCCL_OP == 0
 IMPL_COLL2(ncclReduceScatter, sum,  FuncSum,  ncclCollReduceScatter, ncclSum);
-#elif NCCL_OP == 1
 IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
-#elif NCCL_OP == 2
 IMPL_COLL2(ncclReduceScatter, min,  FuncMin,  ncclCollReduceScatter, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclReduceScatter, max,  FuncMax,  ncclCollReduceScatter, ncclMax);
-#endif
+IMPL_COLL2(ncclReduceScatter, max,  FuncMax,  ncclCollReduceScatter, ncclMax);
diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h
index bb738766f1..c768d6a365 100644
--- a/projects/rccl/src/collectives/device/reduce_scatter.h
+++ b/projects/rccl/src/collectives/device/reduce_scatter.h
@@ -1,166 +1,93 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->next_hdp_reg);
-
-  typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    STORE(ring->recv.conn.opCount, args->opCount);
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS; // same constant the primitives below and the host-side ncclReduceScatter use
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*realChunkSize;
 
     /////////////// begin ReduceScatter steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(realChunkSize, size-chunkOffset);
     int rankDest;
 
     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[nranks-1];
     offset = chunkOffset + rankDest * size;
 
-    Prims::Copy(tid, nthreads,
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext,
-        postReadyToNext);
-
-    NEXT_STEP; // Increases step, poffset, noffset
+    prims.send(thisInput+offset, nelem);
 
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;
 
-      Prims::Reduce(tid, nthreads,
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
+      prims.recvReduceSend(thisInput+offset, nelem);
     }
 
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
+    // step k-1: reduce this buffer and data, which will produce the final result
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;
 
-    Prims::Reduce(tid, nthreads,
-        prevInput  + poffset,
-        thisInput  + offset,
-        thisOutput + chunkOffset,
-        sliceSize, maxOffset,
-        step,
-        waitReadyFromPrev,
-        postDoneToPrev);
-  }
-
-  if (tid == 0) {
-    waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
-    STORE(ring->send.conn.head, 0ULL);
-    STORE(ring->recv.conn.tail, 0ULL);
-    __threadfence_system();
-    STORE(ring->recv.conn.opCount, args->opCount+1);
+    prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
   }
 }
 
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { /* empty stub: this header provides no tree implementation of ReduceScatter */ }
 
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
-__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
-  const int llNthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
 
-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
 
   const ssize_t size = args->N;
   //const int rank = comm->rank;
   const int nranks = comm->nRanks;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -170,37 +97,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
 
     /////////////// begin ReduceScatter steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(chunkSize, size-chunkOffset);
     int rankDest;
 
     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[nranks-1];
     offset = chunkOffset + rankDest * size;
 
-    WAIT_NEXT;
-    LL::ReduceCopy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        maxOffset, nflag, llNthreads);
-    POST_SIZE;
-
-    NEXT_STEP_LL;
+    LLprims.send(thisInput+offset, nelem);
 
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;
 
-      WAIT_NEXT;
-      LL::ReduceCopy(
-          thisInput  + offset,
-          prevInput  + poffset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvReduceSend(thisInput+offset, nelem);
     }
 
     // step k-1: reduce this buffer and data, which will produce the final
@@ -208,13 +119,10 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;
 
-    LL::ReduceCopy(
-        thisInput  + offset,
-        prevInput  + poffset,
-        thisOutput + chunkOffset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
   }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { /* empty stub: this header provides no tree LL implementation of ReduceScatter */ }
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_0.cpp b/projects/rccl/src/collectives/device/reduce_scatter_0.cpp
deleted file mode 100644
index 936f164605..0000000000
--- a/projects/rccl/src/collectives/device/reduce_scatter_0.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 0
-#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_1.cpp b/projects/rccl/src/collectives/device/reduce_scatter_1.cpp
deleted file mode 100644
index 3dbd2466d7..0000000000
--- a/projects/rccl/src/collectives/device/reduce_scatter_1.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 1
-#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_2.cpp b/projects/rccl/src/collectives/device/reduce_scatter_2.cpp
deleted file mode 100644
index 7302f55739..0000000000
--- a/projects/rccl/src/collectives/device/reduce_scatter_2.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 2
-#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_3.cpp b/projects/rccl/src/collectives/device/reduce_scatter_3.cpp
deleted file mode 100644
index 95a2fc93b7..0000000000
--- a/projects/rccl/src/collectives/device/reduce_scatter_3.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#define NCCL_OP 3
-#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/reduce.cc b/projects/rccl/src/collectives/reduce.cc
new file mode 100644
index 0000000000..f53437f86d
--- /dev/null
+++ b/projects/rccl/src/collectives/reduce.cc
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+// Public entry point: fill in an ncclInfo for Reduce and return ncclEnqueueCheck's result.
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+  struct ncclInfo reduceInfo = { ncclCollReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root,
+    comm, stream, /* chunk steps, slice steps */ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&reduceInfo);
+}
diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cu
deleted file mode 100644
index 89dc804b7f..0000000000
--- a/projects/rccl/src/collectives/reduce.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
-    NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
-  }
-
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
-          op, root, comm, stream);
-}
diff --git a/projects/rccl/src/collectives/reduce_scatter.cc b/projects/rccl/src/collectives/reduce_scatter.cc
new file mode 100644
index 0000000000..0ded7c557a
--- /dev/null
+++ b/projects/rccl/src/collectives/reduce_scatter.cc
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
+// Public entry point: fill in an ncclInfo for ReduceScatter and return ncclEnqueueCheck's result.
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype,
+    ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
+  struct ncclInfo scatterInfo = { ncclCollReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op,
+    /* root: this API has no root parameter */ 0, comm, stream, REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+  return ncclEnqueueCheck(&scatterInfo);
+}
diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cu
deleted file mode 100644
index f73d50948d..0000000000
--- a/projects/rccl/src/collectives/reduce_scatter.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "collectives.h"
-
-ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
-  }
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
-  return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
-          op, 0, comm, stream);
-}
diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc
new file mode 100644
index 0000000000..0c7b897ec4
--- /dev/null
+++ b/projects/rccl/src/enqueue.cc
@@ -0,0 +1,441 @@
+/*************************************************************************
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "checks.h"
+#include "param.h"
+
+#include "collectives/collectives.h"
+
+// Only generate inline kernels for LL: NCCL_FUNC5 names the same coll##LL kernel twice, and NCCL_FUNC4 picks the Ring variant.
+#define NCCL_FUNC5(coll, op, dtype) \
+  NCCL_KERN_NAME(coll##LL, op, dtype), \
+  NCCL_KERN_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+  NCCL_FUNC5(coll##Ring, op, dtype)
+
+// Nine entries per op; must be consistent with ncclDataType_t. 3A lists each data type, 3B repeats the i8 entry nine times.
+#define NCCL_FUNCS3A(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  u8), \
+  NCCL_FUNC4(coll, op, i32), \
+  NCCL_FUNC4(coll, op, u32), \
+  NCCL_FUNC4(coll, op, i64), \
+  NCCL_FUNC4(coll, op, u64), \
+  NCCL_FUNC4(coll, op, f16), \
+  NCCL_FUNC4(coll, op, f32), \
+  NCCL_FUNC4(coll, op, f64)
+#define NCCL_FUNCS3B(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8)
+
+// Four groups per collective; must be consistent with ncclRedOp_t. Only sum kernels are generated: 2A repeats sum, 2B repeats copy.
+#define NCCL_FUNCS2A(coll) \
+  NCCL_FUNCS3A(coll, sum), \
+  NCCL_FUNCS3A(coll, sum), \
+  NCCL_FUNCS3A(coll, sum), \
+  NCCL_FUNCS3A(coll, sum)
+#define NCCL_FUNCS2B(coll) \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy)
+
+typedef void(*ncclKern_t)(struct ncclColl);
+// Kernel table looked up with coll->funcIndex in setupLaunch. Must be consistent with the ncclFuncSet enum
+static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { // NOTE(review): FUNC_INDEX also takes treeMode; confirm it never indexes past these 2 slots per (coll, op, type)
+  NCCL_FUNCS2B(ncclBroadcast),
+  NCCL_FUNCS2A(ncclReduce),
+  NCCL_FUNCS2B(ncclAllGather),
+  NCCL_FUNCS2A(ncclReduceScatter),
+  NCCL_FUNCS2A(ncclAllReduce)
+};
+
+/*****************************************************************************/
+/*       Launch system : synchronization and CUDA kernel launch              */
+/*****************************************************************************/
+
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
+  if (cgMode & 0x01) { // bit 0 of cgMode set: launch on every device with a single multi-device call
+    CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
+            // No launch flags are requested here (the flags argument is 0)
+            0));
+    return ncclSuccess;
+  }
+  int savedDev; // device that is current on entry; set again after the loop
+  CUDACHECK(hipGetDevice(&savedDev));
+  for (int i = 0; i < numDevices; i++) { // otherwise launch one kernel per device, switching devices in turn
+    hipLaunchParams* params = paramsList+i;
+    CUDACHECK(hipSetDevice(cudaDevs[i]));
+    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
+  }
+  CUDACHECK(hipSetDevice(savedDev)); // NOTE(review): presumably not reached if a CUDACHECK above returns early; confirm
+  return ncclSuccess;
+}
+
+ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
+  // Clamp the grid width to the number of channels of the communicator.
+  params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
+  // Mark the last queued operation of each launched channel with active = 2.
+  for (int c=0; c<params->gridDim.x; c++) {
+    struct ncclChannel* chan = &comm->channels[c];
+    const auto lastSlot = (chan->collStart+chan->collCount-1)%NCCL_MAX_OPS;
+    STORE(&chan->collectives[lastSlot].active, 2);
+  }
+
+  // The first operation of channel 0 selects the kernel and is passed as its
+  // first argument: snapshot it into comm->args.
+  struct ncclColl* firstOp = &comm->channels[0].collectives[comm->channels[0].collStart];
+  memcpy(&comm->args, firstOp, sizeof(struct ncclColl));
+  // The snapshot is what gets passed, so the queue slot can be freed immediately.
+  STORE(&firstOp->active, 0);
+  params->func = (void *)ncclKerns[firstOp->funcIndex];
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); // counter of the current phase
+  int val = LOAD(ptr);
+  bool done = false;
+  while (done == false) {
+    if (val >= comm->intraRanks) {
+      WARN("Trying to launch too many collectives");
+      return ncclInvalidUsage;
+    }
+    if (val+1 == comm->intraRanks) {
+      // This arrival would complete the count: reset the other phase's counter and report isLast without incrementing here.
+      comm->intraBarrier[comm->intraPhase^1] = 0;
+      *isLast = 1;
+      return ncclSuccess;
+    }
+    done = __sync_bool_compare_and_swap(ptr, val, val+1); // try to add this arrival to the counter
+    val++; // if the swap failed, retry with the next expected counter value
+  }
+  *isLast = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
+  // Add one arrival to the current phase's counter with a single compare-and-swap;
+  // if the counter changed between the load and the swap, report an internal error.
+  volatile int* counter = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  const int seen = LOAD(counter);
+  if (__sync_bool_compare_and_swap(counter, seen, seen+1)) return ncclSuccess;
+  WARN("Trying to launch too many collectives");
+  return ncclInternalError;
+}
+
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { // wait until the current phase's counter reaches intraRanks, then switch phase
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  while (LOAD(ptr) < comm->intraRanks) sched_yield(); // POSIX yield; pthread_yield() is a deprecated non-standard extension
+  comm->intraPhase ^= 1; // the next launch uses the other barrier counter
+  return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
+  if (comm->nRanks == 1) return ncclSuccess; // returns immediately for a single-rank communicator
+  hipLaunchParams* params = comm->myParams;
+
+  NCCLCHECK(setupLaunch(comm, params)); // clamps the grid and selects params->func
+
+  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+    // Enqueue event in user stream
+    CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
+    // Create dependency between user stream and internal NCCL stream
+    CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+    params->stream = comm->groupStream;
+  } else {
+    if (comm->userStream != params->stream) {
+      // Stream changed from last call, create dependency against last NCCL kernel launch
+      CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+    }
+    params->stream = comm->userStream;
+  }
+
+  int isLast = 0; // set to 1 by ncclCpuBarrierIn when this rank's arrival completes the count
+  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+
+  if (isLast) {
+    if (comm->launchMode == ncclComm::GROUP) {
+      // I'm the last. Launch all operations.
+      NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+    }
+    NCCLCHECK(ncclCpuBarrierLast(comm)); // adds the final arrival to the counter that ncclCpuBarrierOut polls
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
+  if (comm->nRanks == 1) return ncclSuccess; // returns immediately for a single-rank communicator
+  // We can't print the CG mode before the first barrier happened.
+  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
+    *comm->intraCGMode ^= 0x10; // clear bit 0x10 so the message below is logged only once
+    INFO(NCCL_INIT,"Launch mode %s%s%s",
+        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
+        *comm->intraCGMode ? "/CGMD" : "",
+        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
+  }
+
+  NCCLCHECK(ncclCpuBarrierOut(comm)); // wait until the barrier counter of this phase is complete
+
+  hipLaunchParams *params = comm->myParams;
+  if (comm->launchMode == ncclComm::PARALLEL) { // in PARALLEL mode each rank launches its own kernel here
+    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
+  }
+  // Start the network proxies as soon as the kernel has been launched. We can't
+  // perform any CUDA call between the two: for example, a hipFree between the
+  // kernel launch and the transportStartProxy call could cause a deadlock.
+  // Also, starting the proxies after the CUDA launch seems to be better for
+  // performance (latency).
+  for (int r=0; r<params->gridDim.x; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    channel->collStart = channel->collFifoTail; // the next batch starts at the current FIFO tail
+    channel->collCount = 0;
+  }
+  params->gridDim.x = params->blockDim.x = 0; // reset the launch dimensions for the next launch
+  NCCLCHECK(transportStartProxy(comm));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
+  // Record doneEvent on the stream held in the launch parameters, i.e. after the NCCL kernel.
+  CUDACHECK(hipEventRecord(comm->doneEvent, comm->myParams->stream));
+  // If the internal NCCL stream was used (GROUP launch, when required or with a NULL user stream), make the user stream wait.
+  const bool usedGroupStream = comm->launchMode == ncclComm::GROUP &&
+      (comm->groupCudaStream || comm->userStream == NULL);
+  if (usedGroupStream) {
+    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+  }
+  comm->userStreamSet = false;
+  return ncclSuccess;
+}
+
+/*****************************************************************************/
+/* Enqueueing system : computation of kernel and proxy operations parameters */
+/*****************************************************************************/
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) {
+  // Map the collective to its communication pattern; only AllReduce depends on the message size.
+  switch (info->coll) {
+    case ncclCollBroadcast: info->pattern = ncclPatternPipelineFrom; break;
+    case ncclCollReduce: info->pattern = ncclPatternPipelineTo; break;
+    case ncclCollAllGather:
+    case ncclCollReduceScatter: info->pattern = ncclPatternRing; break;
+    case ncclCollAllReduce:
+      info->pattern = (info->nBytes <= info->comm->treeThreshold) ? ncclPatternTreeUpDown : ncclPatternRingTwice;
+      break;
+    default:
+      WARN("Unknown collective %d", info->coll);
+      return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t getLoopInfo(struct ncclInfo* info) {
+  switch (info->pattern) { // sets info->nstepsPerLoop and info->nchunksPerLoop from the pattern
+    case ncclPatternTreeUp:
+    case ncclPatternTreeDown:
+    case ncclPatternTreeUpDown:
+    case ncclPatternPipelineFrom:
+    case ncclPatternPipelineTo:
+      info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
+    case ncclPatternRing:
+      info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
+    case ncclPatternRingTwice:
+      info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
+    default:
+      WARN("Unknown pattern %d", info->pattern); // no trailing newline, like the other WARN calls in this file
+      return ncclInternalError;
+  }
+  return ncclSuccess;
+}
+
+static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
+  // Compute thresholds and limits that users can override
+  ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
+  int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
+
+  // First compute nThreads: double from the LL minimum while each thread would exceed its threshold
+  int nt = NCCL_LL_MIN_NTHREADS;
+  while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
+
+  // Then compute nChannels, clamped to [1, comm->nChannels]
+  int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
+  if (nc == 0) nc = 1;
+  if (nc > info->comm->nChannels) nc = info->comm->nChannels;
+
+  // Check if we have a fixed LL threshold, otherwise compute it (in ssize_t, so large thresholds or rank counts cannot overflow int).
+  ssize_t perThreadThreshold = info->comm->threadThreshold;
+  if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
+  ssize_t llThreshold = info->comm->llThreshold >= 0 ?
+    info->comm->llThreshold :
+    (ssize_t)nc*nt*info->nchunksPerLoop*perThreadThreshold;
+
+  if (info->nBytes <= llThreshold) {
+    *llMode = 1; // LL mode: use the nc / nt values computed above
+    *nChannels = nc;
+    *nThreads = nt;
+  } else {
+    *llMode = 0; // otherwise use all channels and threads of the communicator
+    *nChannels = info->comm->nChannels;
+    *nThreads = info->comm->nThreads;
+  }
+}
+
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
+  // Fill coll (kernel arguments) and proxyArgs (proxy step counts) for one collective; first set nstepsPerLoop and nchunksPerLoop
+  NCCLCHECK(getPatternInfo(info));
+  NCCLCHECK(getLoopInfo(info));
+
+  coll->args.root = info->root;
+  coll->args.N = info->count;
+  coll->args.ThisInput = info->sendbuff;
+  coll->args.ThisOutput = info->recvbuff;
+  coll->args.comm = info->comm->devComm;
+  coll->args.opCount = info->comm->opCount;
+
+  // Compute llMode, nChannels, nThreads
+  int llMode;
+  getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
+
+  int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;  // patterns ordered at or after ncclPatternTreeUp select the tree variant
+  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
+
+  int stepSize   = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;  // chosen buffer split into NCCL_STEPS equal steps
+  int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;  // LL and tree modes force one step per chunk
+  int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;  // ... and one step per slice
+  int chunkSize  = stepSize*chunkSteps;
+
+  // Compute lastChunkSize (it is not assigned when neither branch below applies)
+  if (treeMode == 1 && llMode == 0) {
+    if (info->pattern == ncclPatternTreeUpDown) {
+      // Optimize chunkSize / nSteps: halve chunkSize while the chunks per channel stay below a multiple of the tree depth
+      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
+      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+    }
+    // Use lastChunkSize as chunkSize
+    coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);  // divided by the element size of the datatype
+  } else if (llMode == 1) {
+    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
+    const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);  // bytes left after the whole loops, split per channel and chunk
+    ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+    coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+  }
+
+  // Compute nSteps for proxies
+  size_t nBytes  = llMode ? info->nBytes*2 : info->nBytes;  // LL mode doubles the byte count used for the loop count below
+
+  int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+  proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+  proxyArgs->sliceSteps = sliceSteps;
+  proxyArgs->chunkSteps = chunkSteps;
+  proxyArgs->llMode = llMode;
+  proxyArgs->opCount = info->comm->opCount;
+  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+      nLoops, proxyArgs->nsteps, info->comm);
+  return ncclSuccess;
+}
+
+// Queue the collective described by info on coll.args.nChannels channels of info->comm (single rank: just a copy).
+static ncclResult_t saveKernel(struct ncclInfo* info) {
+  if (info->comm->nRanks == 1) {
+    if (info->sendbuff != info->recvbuff)
+      CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
+    return ncclSuccess;
+  }
+
+  struct ncclColl coll;
+  struct ncclProxyArgs proxyArgs;
+  // Zero both structs: computeColl leaves some coll fields unset and coll is copied whole into the FIFO below.
+  memset(&coll, 0, sizeof(struct ncclColl));
+  memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
+  NCCLCHECK(computeColl(info, &coll, &proxyArgs));
+
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
+  if (info->comm->userStreamSet == false) {
+    info->comm->userStream = info->stream;
+    info->comm->userStreamSet = true;
+  } else if (info->stream != info->comm->userStream) {
+    WARN("Error : mixing different streams within a group call is not supported.");
+    return ncclInvalidUsage;
+  }
+  for (int bid=0; bid<coll.args.nChannels; bid++) {
+    // gridDim.x grows by one per queued block, so this walks the channels round-robin.
+    struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+    if (channel->collCount == NCCL_MAX_OPS) {
+      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+      return ncclInvalidUsage;
+    }
+
+    // Proxy
+    proxyArgs.channel = channel;
+    NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+    info->comm->myParams->gridDim.x++;
+    // Wait until the tail slot is no longer marked active before overwriting it.
+    int opIndex = channel->collFifoTail;
+    struct ncclColl* c = channel->collectives+opIndex;
+    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+    while (LOAD(activePtr) != 0) sched_yield();
+    memcpy(c, &coll, sizeof(struct ncclColl));
+    c->args.bid = bid;
+    STORE(&c->active, 1);
+    opIndex = (opIndex+1)%NCCL_MAX_OPS;
+    c->nextIndex = opIndex;
+    channel->collFifoTail = opIndex;
+    channel->collCount++;
+  }
+  info->comm->opCount++;
+  return ncclSuccess;
+}
+
+
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {  // Validate info and queue the collective; outside async mode also run the barrier/enqueue steps right away
+  if (info->comm == NULL) return ncclInvalidArgument;
+
+  INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+       info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
+       info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+
+  // Launch asynchronously if needed
+  if (ncclAsyncMode()) {
+    ncclResult_t ret = ncclSuccess;
+    int savedDev = -1;  // stays -1 unless the current device was queried below, in which case it is restored at end
+    if (info->comm->checkPointers) {
+      CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
+      CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
+    }
+    // Check arguments
+    NCCLCHECKGOTO(ArgsCheck(info), ret, end);
+    // Always register comm even in case of error to make sure ncclGroupEnd
+    // cleans it up. NOTE(review): a failed ArgsCheck above jumps to end before this call.
+    NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
+    NCCLCHECKGOTO(saveKernel(info), ret, end);
+end:
+    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));  // NOTE(review): a failure here returns before ncclAsyncErrCheck(ret) runs
+    ncclAsyncErrCheck(ret);
+    return ret;
+  } else {
+    NCCLCHECK(ArgsCheck(info));  // synchronous path: each step returns its error immediately
+    NCCLCHECK(saveKernel(info));
+    NCCLCHECK(ncclBarrierEnqueue(info->comm));
+    NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
+    NCCLCHECK(ncclEnqueueEvents(info->comm));
+    return ncclSuccess;
+  }
+}
diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h
new file mode 100644
index 0000000000..3d0f07aa95
--- /dev/null
+++ b/projects/rccl/src/include/alloc.h
@@ -0,0 +1,54 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {  // Allocate size bytes of zeroed, mapped host memory
+  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));  // returns ncclUnhandledCudaError on failure
+  memset(*ptr, 0, size);
+  *devPtr = *ptr;  // the same address is handed back as the device-side pointer
+  return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {  // Release memory through hipHostFree
+  CUDACHECK(hipHostFree(ptr));  // returns ncclUnhandledCudaError on failure
+  return ncclSuccess;
+}
+
+template <typename T>  // Allocate a zero-initialized host array of nelem elements of type T and store it in *ptr.
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+  // calloc zero-fills the block and returns NULL when nelem*sizeof(T) cannot be represented.
+  void* p = calloc(nelem, sizeof(T));
+  if (p == NULL) {
+    WARN("Failed to malloc %zu bytes", nelem*sizeof(T));
+    return ncclSystemError;
+  }
+  *ptr = (T*)p;
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
+  const size_t nBytes = nelem*sizeof(T);  // total size of the zeroed device array
+  // Fine-grained requests go through hipExtMallocWithFlags, everything else through hipMalloc.
+  if (isFineGrain) CUDACHECK(hipExtMallocWithFlags((void**)ptr, nBytes, hipDeviceMallocFinegrained));
+  else CUDACHECK(hipMalloc(ptr, nBytes));
+  CUDACHECK(hipMemset(*ptr, 0, nBytes));
+  return ncclSuccess;
+}
+
+template <typename T>  // Copy nelem elements of type T from src to dst with a blocking hipMemcpy.
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));  // hipMemcpyDefault: copy direction is not given explicitly
+  return ncclSuccess;
+}
+
+#endif
diff --git a/projects/rccl/src/include/argcheck.h b/projects/rccl/src/include/argcheck.h
new file mode 100644
index 0000000000..0d6cca7c30
--- /dev/null
+++ b/projects/rccl/src/include/argcheck.h
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
diff --git a/projects/rccl/src/include/bootstrap.h b/projects/rccl/src/include/bootstrap.h
index 278593c8cd..dacbc7c5e1 100644
--- a/projects/rccl/src/include/bootstrap.h
+++ b/projects/rccl/src/include/bootstrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,9 +9,12 @@
 
 #include "nccl.h"
 
+ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
 ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapClose(void* commState);
 #endif
diff --git a/projects/rccl/src/include/channel.h b/projects/rccl/src/include/channel.h
new file mode 100644
index 0000000000..c01d942e4f
--- /dev/null
+++ b/projects/rccl/src/include/channel.h
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CHANNEL_H_
+#define NCCL_CHANNEL_H_
+#include "core.h"
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid);
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+
+#endif
diff --git a/projects/rccl/src/include/checks.h b/projects/rccl/src/include/checks.h
new file mode 100644
index 0000000000..5636338d94
--- /dev/null
+++ b/projects/rccl/src/include/checks.h
@@ -0,0 +1,73 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
+
+#include "debug.h"
+
+// Check CUDA calls: run cmd once; on any result other than hipSuccess, WARN and return ncclUnhandledCudaError from the caller.
+#define CUDACHECK(cmd) do {                                 \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
+        return ncclUnhandledCudaError;                      \
+    }                                                       \
+} while(false)
+// Same check, but on failure store ncclUnhandledCudaError in res and jump to label instead of returning.
+#define CUDACHECKGOTO(cmd, res, label) do {                 \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
+        res = ncclUnhandledCudaError;                       \
+        goto label;                                         \
+    }                                                       \
+} while(false)
+
+#include <errno.h>
+// Check system calls: SYSCHECK behaves like SYSCHECKVAL but keeps the call's result in a local that is then discarded.
+#define SYSCHECK(call, name) do { \
+  int retval; \
+  SYSCHECKVAL(call, name, retval); \
+} while (false)
+// SYSCHECKVAL stores the result in retval; if it is still -1 after the retries, WARN with strerror(errno) and return ncclSystemError.
+#define SYSCHECKVAL(call, name, retval) do { \
+  SYSCHECKSYNC(call, name, retval); \
+  if (retval == -1) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (false)
+// SYSCHECKSYNC repeats call while it returns -1 with errno EINTR, EWOULDBLOCK or EAGAIN; any other outcome leaves the loop.
+#define SYSCHECKSYNC(call, name, retval) do { \
+  retval = call; \
+  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+  } else { \
+    break; \
+  } \
+} while(true)
+
+// Propagate errors up: on a result other than ncclSuccess, log file/line and return it. The expansion already ends in ';'.
+#define NCCLCHECK(call) do { \
+  ncclResult_t res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    return res; \
+  } \
+} while (0);
+// Same check, but store the result in res and jump to label instead of returning. The expansion already ends in ';'.
+#define NCCLCHECKGOTO(call, res, label) do { \
+  res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#endif
diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h
new file mode 100644
index 0000000000..57a9b12c48
--- /dev/null
+++ b/projects/rccl/src/include/comm.h
@@ -0,0 +1,117 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#define MAXCHANNELS 16
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 64
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL
+
+struct ncclSendMem {  // Control block whose size is padded up to at least MEM_ALIGN bytes by the union below
+  union {
+    struct {
+      uint64_t head;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];  // fills the rest of a CACHE_LINE_SIZE-byte span after head
+      void* ptrExchange;
+      char pad2[CACHE_LINE_SIZE-sizeof(void*)];  // fills the rest of a CACHE_LINE_SIZE-byte span after ptrExchange
+      uint64_t opCount;
+    };
+    char pad3[MEM_ALIGN];  // makes the union occupy at least MEM_ALIGN bytes
+  };
+};
+
+struct ncclRecvMem {  // Control block padded to at least MEM_ALIGN bytes, followed by the LL buffer and the data buffer
+  union {
+    struct {
+      uint64_t tail;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];  // fills the rest of a CACHE_LINE_SIZE-byte span after tail
+      uint64_t opCount;
+      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];  // fills the rest of a CACHE_LINE_SIZE-byte span after opCount
+      int sizesFifo[NCCL_STEPS];  // one entry per step
+    };
+    char pad4[MEM_ALIGN];  // makes the union occupy at least MEM_ALIGN bytes
+  };
+  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {  // Host-side communicator state
+  struct ncclChannel channels[MAXCHANNELS];  // fixed-size array; nChannels below is the count used for collectives
+
+  struct ncclPeerInfo* peerInfo;
+
+  void* bootstrap;  // opaque handle; presumably the bootstrap commState — TODO confirm
+
+  int rank;    // my rank in the communicator
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; // my cuda device index
+  int nvmlDev; // my NVML device number
+
+  enum { GROUP, PARALLEL } launchMode;
+  hipStream_t userStream;  // stream of the first collective queued; later ones must use the same stream
+  bool userStreamSet;  // true once userStream has been recorded
+  hipEvent_t doneEvent;
+  bool checkPointers;
+
+  // Counter to make sure collectives match (needed for bcast/reduce
+  // where syncs are not symmetric).
+  uint64_t opCount;
+
+  // Channels for collectives
+  int nChannels;
+  int nThreads;
+
+  // Low-latency algorithm threshold
+  ssize_t llThreshold;  // >= 0: fixed total threshold; negative: computed per call
+  ssize_t threadThreshold;  // per-thread size used when computing that threshold
+
+  // Tree algorithm threshold
+  ssize_t treeThreshold;
+
+  // An internal CUDA stream for NCCL kernel CGMD launches
+  int groupCudaStream;
+  hipStream_t groupStream;
+
+  // Whether there has been a fatal error in this communicator.
+  ncclResult_t fatalError;
+
+  // Error reported by GPU
+  volatile ncclDevError_t* fatalDevError;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+
+  // Device side of the communicator
+  struct ncclDevComm *devComm;
+  // Host copy of the devComm (to free CUDA allocs)
+  struct ncclDevComm hostDevComm;
+
+  // Intra-process sync
+  int intraRank;
+  int intraRanks;
+  int* intraBarrier;
+  int intraPhase;
+
+  // Storage for deferred intra-process launch
+  hipLaunchParams * intraParams;
+  hipLaunchParams *myParams;  // blockDim.x and gridDim.x are grown as collectives are queued
+  int* intraCudaDevs;
+  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+  struct ncclColl args;
+  struct ncclColl* argsptr;
+
+  // Global proxy thread
+  pthread_t proxyThread;
+  struct ncclProxyState proxyState;
+};
+
+#endif
diff --git a/projects/rccl/src/include/common_coll.h b/projects/rccl/src/include/common_coll.h
deleted file mode 100644
index be9aa0023f..0000000000
--- a/projects/rccl/src/include/common_coll.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef COMMON_COLL_H_
-#define COMMON_COLL_H_
-
-#include "core.h"
-#include "enqueue.h"
-#include "collectives/collectives.h"
-
-static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
-  hipPointerAttribute_t attr;
-  hipError_t err = hipPointerGetAttributes(&attr, pointer);
-  if (err != hipSuccess || attr.devicePointer == NULL) {
-    WARN("%s : %s is not a valid pointer", opname, ptrname);
-    return ncclInvalidArgument;
-  }
-#if CUDART_VERSION >= 10000
-  if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
-#else
-  if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
-#endif
-    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
-    return ncclInvalidArgument;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
-  if (ptr == NULL) {
-    WARN("%s : %s argument is NULL", opname, ptrname);
-    return ncclInvalidArgument;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
-  NCCLCHECK(PtrCheck(comm, opname, "comm"));
-  // First, the easy ones
-  if (root < 0 || root >= comm->nRanks) {
-    WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
-    return ncclInvalidArgument;
-  }
-  if (type < 0 || type >= ncclNumTypes) {
-    WARN("%s : invalid type %d", opname, type);
-    return ncclInvalidArgument;
-  }
-  if (op < 0 || op >= ncclNumOps) {
-    WARN("%s : invalid reduction operation %d", opname, op);
-    return ncclInvalidArgument;
-  }
-
-  if (comm->checkPointers) {
-    // Check CUDA device pointers
-    if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
-      NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
-    }
-    if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
-      NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
-    }
-  }
-  return ncclSuccess;
-}
-
-static __inline__ int ncclTypeSize(ncclDataType_t type) {
-  switch (type) {
-    case ncclInt8:
-    case ncclUint8:
-      return 1;
-    case ncclFloat16:
-      return 2;
-    case ncclInt32:
-    case ncclUint32:
-    case ncclFloat32:
-      return 4;
-    case ncclInt64:
-    case ncclUint64:
-    case ncclFloat64:
-      return 8;
-    default:
-      return -1;
-  }
-}
-
-// In : comm, nbytes ; Out : nrings, nthreads, ll
-// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
-//   If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
-// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
-//   This ensures we don't use a large number of rings with a small number of threads
-// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
-//   we use NCCL_THREAD_THRESHOLD when we reach the max
-// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
-// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
-static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
-  *ll = 0;
-  int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
-  if (comm->llThreshold >= 0) { /* user sets total LL threshold */
-    if (nbytes > comm->llThreshold) { /* non-LL */
-      *nthreads = comm->nThreads;
-      *nrings = comm->nRings;
-      return;
-    } else {
-      llEnforced = 1; /* user wants to use LL */
-    }
-  }
-  int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
-  size_t nr;
-  int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
-  int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
-  ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
-  while (nt < ll_max_nthreads && *ll == 0) {
-    nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
-    if (nr <= maxRings) { /* avoid using few threads but many rings */
-      nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
-      *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
-    }
-    if (*ll == 0) {
-      nt = nt << 1;
-    }
-  }
-  if (*ll == 1) {
-    *nthreads = nt;
-    *nrings = (int)nr;
-    return; /* we can use smaller number of threads to make LL work, stop here */
-  }
-  nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
-  nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
-  *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
-  *nthreads = *ll ? ll_max_nthreads : comm->nThreads;
-  *nrings = *ll ? (int)nr : comm->nRings;
-}
-
-static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream, size_t nbytes, int loopFactor) {
-  int llMode, nBlocks, nThreads;
-  ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
-  comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
-  if (comm->userStreamSet == false) {
-    comm->userStream = stream;
-    comm->userStreamSet = true;
-  } else if (stream != comm->userStream) {
-    WARN("Error : mixing different streams within a group call is not supported.");
-    return ncclInvalidUsage;
-  }
-  int lastChunkSize = 0;
-  if (llMode == 1) {
-    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
-    const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
-    lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
-    ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
-  }
-  for (int bid=0; bid<nBlocks; bid++) {
-    struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
-    if (ring->collCount == NCCL_MAX_OPS) {
-      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
-      return ncclInvalidUsage;
-    }
-
-    comm->myParams->gridDim.x++;
-
-    int opIndex = ring->collFifoTail;
-    struct ncclColl* c = ring->collectives+opIndex;
-    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-    while (LOAD(activePtr) != 0) sched_yield();
-
-    struct CollectiveArgs* args = &c->args;
-    args->root = root;
-    args->N = count;
-    args->ThisInput = sendbuff;
-    args->ThisOutput = recvbuff;
-    args->comm = comm->devComm;
-    args->opCount = comm->opCount;
-    args->bid = bid;
-    args->nRings = nBlocks;
-    args->nThreads = nThreads;
-    args->lastChunkSize = lastChunkSize;
-
-    c->nThreads = nThreads;
-    c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
-    STORE(&c->active, 1);
-    opIndex = (opIndex+1)%NCCL_MAX_OPS;
-    c->nextIndex = opIndex;
-    ring->collFifoTail = opIndex;
-    ring->collCount++;
-  }
-  /*if (llMode == 0)*/ comm->opCount++;
-  return ncclSuccess;
-}
-
-extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
-
-#endif
diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h
index 2e803facbc..8a08b914b0 100644
--- a/projects/rccl/src/include/core.h
+++ b/projects/rccl/src/include/core.h
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,313 +7,20 @@
 #ifndef NCCL_CORE_H_
 #define NCCL_CORE_H_
 
-#define NCCL_MAX_OPS 2048
-
+#include <pthread.h>
+#include <algorithm>
 #include "nccl.h"
-#include "transport.h"
 #include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "transport.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "info.h"
+#include "argcheck.h"
 #include <cstdio>
-#include <algorithm> // std::min/std::max
 #include <unistd.h>
 #include <stdlib.h>
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_runtime.h>
-
-#define MAXRINGS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Rings / LL tuning
-#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL for Volta and above
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS 256
-#define NCCL_LL_MIN_NTHREADS 256
-
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
-  /* Flags have to be *after* data, because otherwise, an incomplete receive
-     from the network may receive the flag but not the data.
-     Note this is assuming that either we receive contiguous chunks of data
-     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
-  struct {
-    uint32_t data1;
-    uint32_t flag1;
-    uint32_t data2;
-    uint32_t flag2;
-  };
-  uint64_t v[2];
-  int4 i4;
-};
-
-struct ncclConnInfo {
-  // Regular comm mechanism
-  char *buff;         // Local for recv, remote for send
-  uint64_t *tail;     // Local for recv, remote for send
-  uint64_t *head;     // Local for send, remote for recv
-  uint64_t *opCount;  // Local for recv, remote for send
-
-  int direct;         // Direct communication
-  void **ptrExchange; // Pointer exchange for direct communication
-
-  int *fifo;          // Size fifo for proxy
-
-  // Low latency mechanism
-  char *llBuff;       // Local for recv, remote for send
-  uint64_t *llHead;   // Local for send, remote for recv
-  int *llFifo;        // LL Size fifo for proxy
-  uint64_t llStep;    // Keep where we are
-  uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
-  struct transportProxyInfo* proxyInfo;
-  struct ncclTransport* transport;
-  void* transportResources; // Host-side resources
-  struct ncclConnInfo conn;
-};
-
-#define CACHE_LINE_SIZE 64
-#define MEM_ALIGN 4096
-#define SIZES_FIFO_SIZE 16
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NCCL_LL_CHUNKS 8
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
-#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
-#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
-  union {
-    struct {
-      uint64_t head;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      void* ptrExchange;
-      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
-      uint64_t llHead;
-    };
-    char pad3[MEM_ALIGN];
-  };
-};
-
-struct ncclRecvMem {
-  union {
-    struct {
-      uint64_t tail;
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      uint64_t opCount;
-      char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      int sizesFifo[SIZES_FIFO_SIZE];
-      int llSizesFifo[SIZES_FIFO_SIZE];
-    };
-    char pad5[MEM_ALIGN];
-  };
-  char llBuff[NCCL_LL_BUFF_SIZE];
-  char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
-  union {
-    struct {
-      int id;
-      int nthreads;
-      // Per ring resources
-      struct ncclSendMem* devMemSend;   // CUDA-size resources
-      struct ncclRecvMem* devMemRecv;   // CUDA-size resources
-      int buffSize;
-      int devMemSendSize;    // Keep the size for IPCs
-      int devMemRecvSize;    // Keep the size for IPCs
-      struct ncclConnector send;
-      struct ncclConnector recv;
-
-      // Maps an internal nccl index to user-specified rank order. This is necessary
-      // since we need to know how the user expects data to be ordered across
-      // devices. Ordered from current device.
-      int* userRanks;
-      int* devUserRanks;
-
-      // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
-      // allows software to explicitly initiate a flush read to HDP memory. See more
-      // descriptions in primitives.h.
-      uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
-      uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
-      
-      // Operation list for aggregation
-      struct ncclColl* collectives;
-      struct ncclColl* devCollectives;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
-    };
-    int data[0x80];
-  };
-};
-static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
-
-#pragma pack(push)  /* push current alignment to stack */
-#pragma pack(4)     /* set alignment to 4 bytes boundary */
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclComm* comm;
-  uint64_t opCount;
-
-  // local and remote input, output, and buffer
-  const void * ThisInput;
-  void * ThisOutput;
-
-  // general parameters
-  size_t N;
-  uint32_t root;
-  uint8_t bid;
-  uint8_t nRings;
-  uint16_t nThreads;
-
-  int lastChunkSize;
-};
-struct ncclColl {
-  union {
-    struct {
-      struct CollectiveArgs args;
-      uint16_t nThreads;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
-  };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-#pragma pack(pop)   /* restore original alignment from stack */
-
-struct ncclComm {
-  struct ncclRing rings[MAXRINGS];
-
-  int rank;    // my rank in the communicator
-  int nRanks;  // number of GPUs in communicator
-  int cudaDev; // my cuda device index
-
-  enum { GROUP, PARALLEL } launchMode;
-  hipStream_t userStream;
-  bool userStreamSet;
-  hipEvent_t doneEvent;
-  bool checkPointers;
-
-  // Counter to make sure collectives match (needed for bcast/reduce
-  // where syncs are not symmetric).
-  uint64_t opCount;
-
-  // Rings for collectives
-  int nRings;
-  int nThreads;
-
-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
-
-  // An internal CUDA stream for NCCL kernel CGMD launches
-  int groupCudaStream;
-  hipStream_t groupStream;
-
-  // Device copy of the communicator
-  struct ncclComm *devComm;
-
-  // Intra-process sync
-  int intraRank;
-  int intraRanks;
-  int* intraBarrier;
-  int intraPhase;
-
-  // Storage for deferred intra-process launch
-  hipLaunchParams* intraParams;
-  hipLaunchParams* myParams;
-  int* intraCudaDevs;
-  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
-  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
-  struct ncclColl* argsptr;
-};
-
-// Convert volatile access to atomic
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
-#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
-#else
-#define LOAD(VAR) *(VAR)
-#define STORE(DST, SRC) *(DST) = (SRC)
-#endif
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do {                                 \
-    hipError_t e = cmd;                                    \
-    if( e != hipSuccess ) {                                \
-        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
-        return ncclUnhandledCudaError;                      \
-    }                                                       \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do {                 \
-    hipError_t e = cmd;                                    \
-    if( e != hipSuccess ) {                                \
-        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
-        res = ncclUnhandledCudaError;                       \
-        goto label;                                         \
-    }                                                       \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
-  int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
-  if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
-    return ncclSystemError; \
-  } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
-  } else { \
-    break; \
-  } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
 
 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...)        \
@@ -333,51 +39,27 @@ struct ncclComm {
 #endif // end PROFAPI
 
 int ncclCudaCompCap();
+ncclResult_t ncclNvlinkGpu(int* nvlink);
+int64_t ncclTreeThreshold();
 
-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
-  memset(*ptr, 0, size);
-  *devPtr = *ptr;
-  return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(hipHostFree(ptr));
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
-  void* p = malloc(nelem*sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
-    return ncclSystemError;
+static __inline__ int ncclTypeSize(ncclDataType_t type) {
+  switch (type) {
+    case ncclInt8:
+    case ncclUint8:
+      return 1;
+    case ncclFloat16:
+      return 2;
+    case ncclInt32:
+    case ncclUint32:
+    case ncclFloat32:
+      return 4;
+    case ncclInt64:
+    case ncclUint64:
+    case ncclFloat64:
+      return 8;
+    default:
+      return -1;
   }
-  memset(p, 0, nelem*sizeof(T));
-  *ptr = (T*)p;
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
-  if (isFineGrain) {
-    hipError_t e = hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained);
-    if (e != hipSuccess) {
-      *ptr = 0;
-      return ncclInvalidUsage;
-    }
-  }
-  else
-    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
-  return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
-  return ncclSuccess;
 }
 
 #endif // end include guard
diff --git a/projects/rccl/src/include/cpuset.h b/projects/rccl/src/include/cpuset.h
new file mode 100644
index 0000000000..98b93de87d
--- /dev/null
+++ b/projects/rccl/src/include/cpuset.h
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CPUSET_H_
+#define NCCL_CPUSET_H_
+
+// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
+
+static int hexToInt(char c) {  // value 0-15 of one hex digit (either case), else -1
+  // Explicit range checks so characters just below 'a' (e.g. 'W') are rejected.
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return 10 + c - 'a';
+  if (c >= 'A' && c <= 'F') return 10 + c - 'A';
+  return -1;
+}
+
+#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
+
+// Parse a hex CPU mask ("0003ff,f0003fff": 32-bit words, most significant first).
+ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+  uint32_t cpumasks[CPU_SET_N_U32];
+  int m = CPU_SET_N_U32-1;
+  cpumasks[m] = 0;
+  for (int o=0; str[o] != '\0'; o++) {
+    if (str[o] == ',') {
+      // More words than a cpu_set_t holds: reject rather than index cpumasks[-1].
+      if (m == 0) return ncclInvalidArgument;
+      cpumasks[--m] = 0;
+    } else {
+      int v = hexToInt(str[o]);
+      if (v == -1) break;  // stop at the first non-hex character (e.g. a newline)
+      cpumasks[m] = (cpumasks[m] << 4) + v;
+    }
+  }
+  // Copy parsed words to the front of mask, lowest first; the rest of mask is not written.
+  for (int a=0; m<CPU_SET_N_U32; a++,m++) {
+    memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
+  }
+  return ncclSuccess;
+}
+
+// Write mask as hex bytes, highest byte first, skipping leading zero bytes; a ','
+// follows each 32-bit word except the lowest. An all-zero mask gives "".
+ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
+  const uint8_t* bytes = (const uint8_t*)mask;
+  int len = 0;
+  for (int idx = (int)sizeof(cpu_set_t); idx-- > 0; ) {
+    if (len == 0 && bytes[idx] == 0) continue;
+    len += sprintf(str+len, "%02x", bytes[idx]);
+    // idx%4 == 0 is the last byte of a word; no separator after the final word.
+    if (idx != 0 && idx%4 == 0) len += sprintf(str+len, ",");
+  }
+  // NOTE(review): str's capacity is not passed in; the caller must size it.
+  str[len] = '\0';
+  return ncclSuccess;
+}
+
+#endif
diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h
index 1ef87d9f6a..c3e8fa04bd 100644
--- a/projects/rccl/src/include/debug.h
+++ b/projects/rccl/src/include/debug.h
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -25,7 +24,8 @@ extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen);
+extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
 
 extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
 
@@ -108,7 +108,7 @@ static inline void initDebug() {
           break;
         case 'h': // %h = hostname
           char hostname[1024];
-          getHostName(hostname, 1024);
+          getHostName(hostname, 1024, '.');
           dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
           break;
         case 'p': // %p = pid
diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h
new file mode 100644
index 0000000000..30eccab7b8
--- /dev/null
+++ b/projects/rccl/src/include/devcomm.h
@@ -0,0 +1,259 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_H_
+#define NCCL_DEVICE_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+// Convert volatile access to atomic
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+#else
+#define LOAD(VAR) *(VAR)
+#define STORE(DST, SRC) *(DST) = (SRC)
+#endif
+
+#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
+
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+    (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+  /* Flags have to be *after* data, because otherwise, an incomplete receive
+     from the network may receive the flag but not the data.
+     Note this is assuming that either we receive contiguous chunks of data
+     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+  struct {
+    uint32_t data1;
+    uint32_t flag1;
+    uint32_t data2;
+    uint32_t flag2;
+  };
+  uint64_t v[2];
+  int4 i4;
+};
+
+#define MAXTHREADS 256
+#define NCCL_LL_MAX_NTHREADS MAXTHREADS
+#define NUM_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
+#ifdef DEBUG_LL  // debug variant: flag values wrap at NCCL_LL_FLAG_MAX
+#define NCCL_LL_CLEAN_MASK 0x00000ff8
+#define NCCL_LL_FLAG_MAX   0x00001000
+#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))  // (a) parenthesized so expression arguments wrap as a whole
+#else
+#define NCCL_LL_CLEAN_MASK 0x7ffffff8
+#define NCCL_LL_FLAG(a) ((uint32_t)(a))
+#endif
+// Make sure the clean mask will last for at least NCCL_STEPS
+static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
+
+struct ncclConnInfo {
+  // Regular comm mechanism
+  char *buff;         // Local for recv, remote for send
+  uint64_t *tail;     // Local for recv, remote for send
+  uint64_t *head;     // Local for send, remote for recv
+  uint64_t *opCountLoc; // opCount of local rank
+  uint64_t *opCountRem; // opCount of remote rank
+
+  int direct;         // Direct communication
+  void **ptrExchange; // Pointer exchange for direct communication
+
+  int *fifo;          // Size fifo for proxy
+
+  uint64_t step;      // Keep where we are
+
+  // Low latency mechanism
+  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
+  uint64_t llLastCleaning;
+
+  // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
+  // allows software to explicitly initiate a flush read to HDP memory. See more
+  // descriptions in primitives.h.
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
+};
+
+struct ncclConnector {
+  int connected;
+  struct ncclProxyArgs *proxyAppend;
+  struct ncclTransportComm* transportComm;
+  void* transportResources; // Host-side resources
+  struct ncclConnInfo conn;
+  struct ncclComm *comm;
+};
+
+struct ncclRing {
+  // Shortcuts for userRanks[1] and userRanks[n-1]
+  int prev;
+  int next;
+
+  // Maps an internal nccl index to user-specified rank order. This is necessary
+  // since we need to know how the user expects data to be ordered across
+  // devices. Ordered from current device.
+  int* userRanks;
+  int* devUserRanks;
+};
+
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+  int depth;
+  int up;
+  int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+  struct ncclConnector send;
+  struct ncclConnector recv;
+};
+
+struct ncclDevComm;
+
+#pragma pack(push)  /* push current alignment to stack */
+#pragma pack(4)     /* set alignment to 4 bytes boundary */
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+  struct ncclDevComm* comm;
+  uint64_t opCount;
+
+  // local and remote input, output, and buffer
+  const void * ThisInput;
+  void * ThisOutput;
+
+  // general parameters
+  size_t N;
+  uint32_t root;
+  uint8_t bid;
+  uint8_t nChannels;
+  uint16_t nThreads;
+
+  int lastChunkSize;
+};
+struct ncclColl {
+  union {
+    struct {
+      struct CollectiveArgs args;
+      uint16_t funcIndex;
+      uint16_t nextIndex;
+      uint8_t  active;
+    };
+    int data[0x10];
+  };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclChannel {
+  union {
+    struct {
+      struct ncclRing ring;
+      struct ncclTree tree;
+
+      int id;
+      int nthreads;
+      int buffSize;
+
+      // Communication structures
+      struct ncclPeer* peers;
+      struct ncclPeer* devPeers;
+
+      // Operation list for aggregation
+      struct ncclColl* collectives;
+      struct ncclColl* devCollectives;
+      int collStart;
+      int collCount;
+      int collFifoHead; // Only used by GPU
+      int collFifoTail; // Only used by CPU
+
+      uint32_t* abortCount;
+    };
+    int data[0x80];
+  };
+};
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+#pragma pack(pop)   /* restore original alignment from stack */
+
+#define MAXCHANNELS 16
+
+#ifdef ENABLE_PROFILING
+struct ncclProf {
+  union {
+    struct {
+      uint64_t total_cycle;
+      uint64_t wait_send_cycle[MAXCHANNELS];
+      uint64_t wait_recv_cycle[MAXCHANNELS];
+      // primitive cycles
+      uint64_t send_cycle;
+      uint64_t directSend_cycle;
+      uint64_t recv_cycle;
+      uint64_t directRecv_cycle;
+      uint64_t copySend_cycle;
+      uint64_t directCopySend_cycle;
+      uint64_t recvCopySend_cycle;
+      uint64_t directRecvCopySend_cycle;
+      uint64_t recvReduceCopy_cycle;
+      uint64_t recvReduceSend_cycle;
+      uint64_t recvReduceCopySend_cycle;
+      uint64_t directRecvReduceCopySend_cycle;
+      // primitive bytes
+      uint64_t send_byte;
+      uint64_t directSend_byte;
+      uint64_t recv_byte;
+      uint64_t directRecv_byte;
+      uint64_t copySend_byte;
+      uint64_t directCopySend_byte;
+      uint64_t recvCopySend_byte;
+      uint64_t directRecvCopySend_byte;
+      uint64_t recvReduceCopy_byte;
+      uint64_t recvReduceSend_byte;
+      uint64_t recvReduceCopySend_byte;
+      uint64_t directRecvReduceCopySend_byte;
+    };
+    int data[0x80];
+  };
+};
+#endif
+
+typedef enum {
+  ncclDevSuccess,
+  ncclDevAssertedMismatch,
+  ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+struct ncclDevComm {
+  int rank;
+  int nRanks;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+  volatile ncclDevError_t *fatalDevError;
+
+  // Channels, device side
+  struct ncclChannel* channels;
+
+#ifdef ENABLE_PROFILING
+  // Profiling counters
+  struct ncclProf* devProf;
+#endif
+};
+
+#endif
diff --git a/projects/rccl/src/include/enqueue.h b/projects/rccl/src/include/enqueue.h
index f17639826e..c40957df91 100644
--- a/projects/rccl/src/include/enqueue.h
+++ b/projects/rccl/src/include/enqueue.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -11,12 +11,14 @@
 #include "core.h"
 #include "group.h"
 
-typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+// Channels / LL tuning
+#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
+#define NCCL_THREAD_THRESHOLD 256  // Per thread size before we switch to non-LL
+#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
+#define NCCL_THREAD_THRESHOLD_VEGA 8 // Per thread size before we switch to non-LL for VEGA
+#define NCCL_LL_MIN_NTHREADS 256
 
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
-    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, hipStream_t stream);
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
 ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
 ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
diff --git a/projects/rccl/src/include/ibvwrap.h b/projects/rccl/src/include/ibvwrap.h
index 4f3e8311dc..0943f9962c 100644
--- a/projects/rccl/src/include/ibvwrap.h
+++ b/projects/rccl/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
  *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/src/include/info.h b/projects/rccl/src/include/info.h
new file mode 100644
index 0000000000..dfb8c2f280
--- /dev/null
+++ b/projects/rccl/src/include/info.h
@@ -0,0 +1,45 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+
+typedef enum {
+  ncclPatternRing,
+  ncclPatternRingTwice,
+  ncclPatternPipelineFrom,
+  ncclPatternPipelineTo,
+  ncclPatternTreeUp,
+  ncclPatternTreeDown,
+  ncclPatternTreeUpDown
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+  ncclColl_t coll;
+  const char* opName;
+  // NCCL Coll Args
+  const void* sendbuff;
+  void* recvbuff;
+  size_t count;
+  ncclDataType_t datatype;
+  ncclRedOp_t op;
+  int root;
+  ncclComm_t comm;
+  hipStream_t stream;
+  // Algorithm details
+  int chunkSteps;
+  int sliceSteps;
+  // Computed later
+  ncclPattern_t pattern;
+  size_t nBytes;
+  int nstepsPerLoop;
+  int nchunksPerLoop;
+};
+
+#endif
diff --git a/projects/rccl/src/include/nccl_net.h b/projects/rccl/src/include/nccl_net.h
index ce3f6cab6d..797c759e69 100644
--- a/projects/rccl/src/include/nccl_net.h
+++ b/projects/rccl/src/include/nccl_net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -58,8 +58,51 @@ typedef struct {
   ncclResult_t (*closeListen)(void* listenComm);
 } ncclNet_v1_t;
 
-typedef ncclNet_v1_t ncclNet_t;
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Return the device path in /sys. NCCL will call free on this path.
+  ncclResult_t (*pciPath)(int dev, char** path);
+  // Return whether this device supports host pointers and/or CUDA pointers
+  // as data from the current GPU. Supported types should be composed with
+  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v2_t;
 
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
+typedef ncclNet_v2_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
 
 #endif // end include guard
diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h
index ebc967782c..950b5e5c0c 100644
--- a/projects/rccl/src/include/net.h
+++ b/projects/rccl/src/include/net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,11 +13,6 @@
 extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 
-/* Socket Interface Selection type */
-typedef enum { findSubnetIf   = -1,
-    dontCareIf     = -2
-} ncclSocketIfSl_t;
-
 // Translation to external API
 static const char* ncclNetName() { return ncclNet->name; }
 static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
@@ -26,15 +21,16 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
+static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
 static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
 
-extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
 
diff --git a/projects/rccl/src/include/nvlink.h b/projects/rccl/src/include/nvlink.h
index 28976386bb..5806b4d511 100644
--- a/projects/rccl/src/include/nvlink.h
+++ b/projects/rccl/src/include/nvlink.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -19,6 +19,7 @@
 enum ncclNvLinkDeviceType {
   ncclNvLinkDeviceGpu,
   ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
 };
 
 static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@@ -26,7 +27,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
   memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
   char* rPath = realpath(classPath, NULL);
   int fd;
-  SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
+  if ((fd = open(rPath, O_RDONLY)) == -1) {
+    // Could not find device. It might be because we're in a VM and we don't see
+    // the whole machine. This is handled silently, so no INFO error is printed.
+    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
+    free(rPath);  // realpath() allocated it; the free() below is skipped on this path
+    return ncclSystemError;
+  }
   free(rPath);
   char pciClass[9];
   strncpy(pciClass, "0x000000", 9);
@@ -36,6 +43,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
   if (strcmp(pciClass, "0x068000") == 0) {
     // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
     *type = ncclNvLinkDeviceSwitch;
+  } else if (strcmp(pciClass, "0x068001") == 0) {
+    // PCI device is of type "Bridge: IBM Device 04ea"
+    *type = ncclNvLinkDeviceBridge;
   } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
       || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
     *type = ncclNvLinkDeviceGpu;
@@ -49,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
 /* Get the maximum number of NVLinks based on the GPU generation */
 static ncclResult_t getMaxNvlinks(int* maxLinks) {
   int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
   int ccMajor;
-  CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
+  CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
   // 6 for Volta, 4 for Pascal
   *maxLinks = (ccMajor > 6) ? 6 : 4;
   // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
@@ -68,18 +78,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
   if (res != ncclSuccess) return 0;
 
   for(int l=0; l<maxNvLinks; ++l) {
-    // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
-    // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
-    // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
-    // the POWER CPU case, so it seems best to check this as well.
+    // Check whether we can use this NVLink for P2P
     unsigned canP2P;
     if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
 
-    // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
-    // if the links don't exist, or are disabled. So checking for that return
-    // here would probably make the nvmlDeviceGetNvLinkCapability check above
-    // redundant. Presumably, we still need to check the P2P capability above,
-    // since even non-GPUs would possess PCI info.
+    // Make sure the Nvlink is up. The previous call should have trained the link.
+    nvmlEnableState_t isActive;
+    if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+    // Try to figure out what's on the other side of the NVLink
     nvmlPciInfo_t remoteProc;
     if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
 
@@ -90,7 +97,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
       p[c] = toupper(p[c]);
     }
 
-    if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
+    if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
       links++;
     } else {
       // Make a lower case copy of the bus ID for calling ncclDeviceType
@@ -102,11 +109,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
         lowerId[c] = tolower(p[c]);
       }
 
-      // Determine if the remote side is NVswitch
+      // Determine if the remote side is NVswitch or a GPU
       enum ncclNvLinkDeviceType type;
-      if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
-        //TODO: we are making an assumption that all GPUs are connected to this switch
-        //This assumption may change for future architectures
+      ncclResult_t ret = ncclDeviceType(lowerId, &type);
+      if (ret == ncclSuccess) {
+        if (type == ncclNvLinkDeviceSwitch) {
+          //TODO: we are making an assumption that all GPUs are connected to this switch
+          //This assumption may change for future architectures
+          nvswitch_links++;
+        } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
+          links++;
+        }
+      } else {
+        // The NVLink is up but we couldn't find the PCI device on the other
+        // side. Assume it's an NVswitch outside a VM.
+        if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
         nvswitch_links++;
       }
     }
@@ -114,43 +131,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
   return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
 }
 
-static int getNumNvlinks(const char* busId) {
-  nvmlDevice_t nvmlDev;
-  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
-  if (res != ncclSuccess) return 0;
-
-  int nvlinks = 0, nvswitch_links = 0;
-  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
-  for(int l=0; l<maxNvLinks; ++l) {
-    unsigned canP2P;
-    nvmlEnableState_t isActive;
-    if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
-        wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
-      nvlinks++;
-    } else {
-      continue;
-    }
-
-    nvmlPciInfo_t remoteProc;
-    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
-    // Make a lower case copy of the bus ID for calling ncclDeviceType
-    // PCI system path is in lower case
-    char* p = remoteProc.busId;
-    char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-      if (p[c] == 0) break;
-      lowerId[c] = tolower(p[c]);
-    }
-
-    // Determine if the remote side is NVswitch
-    enum ncclNvLinkDeviceType type;
-    if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
-      //TODO: we are making an assumption that all GPUs are connected to this switch
-      //This assumption may change for future architectures
-      nvswitch_links++;
-    }
-  }
-  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
-}
 #endif
diff --git a/projects/rccl/src/include/nvlink_stub.h b/projects/rccl/src/include/nvlink_stub.h
index 9ee176edf0..32f3e6b2c1 100644
--- a/projects/rccl/src/include/nvlink_stub.h
+++ b/projects/rccl/src/include/nvlink_stub.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -8,13 +8,23 @@
 #ifndef NCCL_NVLINK_H_
 #define NCCL_NVLINK_H_
 
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "nvmlwrap.h"
 #include "topo.h"
 
 #define CONNECT_NVLINK 0x10
 #define CONNECT_NVSWITCH 0x100
 
-static int getNumNvlinks(const char* busId) {
-  return 0;
+enum ncclNvLinkDeviceType {
+  ncclNvLinkDeviceGpu,
+  ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+static int getNvlinkGpu(const char* busId1, const char* busId2) {
+  int links = 0;
+  return CONNECT_NVLINK*links;
 }
 
 #endif
diff --git a/projects/rccl/src/include/nvmlwrap.h b/projects/rccl/src/include/nvmlwrap.h
index ddfd233d74..f658279807 100644
--- a/projects/rccl/src/include/nvmlwrap.h
+++ b/projects/rccl/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_NVMLWRAP_H_
 #define NCCL_NVMLWRAP_H_
 
-#include "core.h"
+#include "nccl.h"
 
 //#define NVML_DIRECT 1
 #ifdef NVML_DIRECT
@@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
   NVMLCHECK(nvmlDeviceGetIndex(device, index));
   return ncclSuccess;
 }
-static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
-  return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
-  return ncclSuccess;
-}
 static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
   NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
   return ncclSuccess;
@@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
   NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
   return ncclSuccess;
 }
+static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+  NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
+  return ncclSuccess;
+}
 #else
 // Dynamically handle dependencies on NVML
 
@@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void);
 ncclResult_t wrapNvmlShutdown(void);
 ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
 ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
 ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
                                                    nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+
 #endif // NVML_DIRECT
 
 #endif // End include guard
diff --git a/projects/rccl/src/include/param.h b/projects/rccl/src/include/param.h
index dd5f697e34..54317571e7 100644
--- a/projects/rccl/src/include/param.h
+++ b/projects/rccl/src/include/param.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
     s++;
     strncpy(envValue, line+s, 1024);
     setenv(envVar, envValue, 0);
-    char *str = getenv(envVar);
   }
   if (line) free(line);
   fclose(file);
diff --git a/projects/rccl/src/include/ring.h b/projects/rccl/src/include/ring.h
deleted file mode 100644
index fa5e09959f..0000000000
--- a/projects/rccl/src/include/ring.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RING_H_
-#define NCCL_RING_H_
-#include "core.h"
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid);
-ncclResult_t freeRing(struct ncclRing* ring);
-
-#endif
diff --git a/projects/rccl/src/include/rings.h b/projects/rccl/src/include/rings.h
index 3b4c311102..f634cbe071 100644
--- a/projects/rccl/src/include/rings.h
+++ b/projects/rccl/src/include/rings.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -9,14 +9,13 @@
 #define NCCL_RINGS_H_
 
 static int getDefaultThreads() {
-  // On Kepler, rings are doubled later.
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
   return 256;
-#else
+#else  // On Kepler, rings are doubled later.
   return ncclCudaCompCap() == 3 ? 128 : 256;
 #endif
 }
 
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
 
 #endif
diff --git a/projects/rccl/src/include/shm.h b/projects/rccl/src/include/shm.h
index 850ecae5ce..17861bed62 100644
--- a/projects/rccl/src/include/shm.h
+++ b/projects/rccl/src/include/shm.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h
index 624af403f8..68ce235d62 100644
--- a/projects/rccl/src/include/socket.h
+++ b/projects/rccl/src/include/socket.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,8 +18,9 @@
 
 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT     1000  // sleep interval in usec
-#define RETRY_TIMES   2e4   // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT            1000 // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
 
 /* Common socket address storage structure for IPv4/IPv6 */
 union socketAddress {
@@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
   return buf;
 }
 
-static inline short socketToPort(struct sockaddr *saddr) {
+static inline uint16_t socketToPort(struct sockaddr *saddr) {
   return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
 }
 
@@ -60,9 +61,12 @@ static inline int envSocketFamily(void) {
 }
 
 static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+#ifdef ENABLE_TRACE
   char line[1024];
+#endif
   struct netIf userIfs[MAX_IFS];
   bool searchNot = prefixList && prefixList[0] == '^';
+  bool searchExact = prefixList && prefixList[0] == '=';
   int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
 
   int found = 0;
@@ -89,7 +93,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
     }
 
     // check against user specified interfaces
-    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
+    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
       continue;
     }
 
@@ -106,7 +110,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
       // Store the IP address
       int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
       memcpy(addrs+found, interface->ifa_addr, salen);
-      INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
       found++;
     }
   }
@@ -159,7 +162,10 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
 }
 
 static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
-  char line[1024], line_a[1024];
+#ifdef ENABLE_TRACE
+  char line[1024];
+#endif
+  char line_a[1024];
   int found = 0;
   struct ifaddrs *interfaces, *interface;
   getifaddrs(&interfaces);
@@ -183,7 +189,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
     // Store the interface name
     strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
 
-    INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
     found++;
     if (found == maxIfs) break;
   }
@@ -336,8 +342,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
   TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
 #endif
 
-  /* Put the socket in listen mode */
-  SYSCHECK(listen(sockfd, 128), "listen");
+  /* Put the socket in listen mode
+   * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+   */
+  SYSCHECK(listen(sockfd, 16384), "listen");
   *fd = sockfd;
   return ncclSuccess;
 }
@@ -367,14 +375,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
 #endif
 
   int ret;
-  int retries = 0;
+  int timedout_retries = 0;
+  int refused_retries = 0;
 retry:
   SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
   if (ret == 0) return ncclSuccess;
-  if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
-    INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
-    usleep(SLEEP_INT);
-    goto retry;
+  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+      INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+      usleep(SLEEP_INT);
+      goto retry;
+    }
   }
   WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
   return ncclSystemError;
@@ -382,12 +394,12 @@ retry:
 
 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
   int bytes = 0;
   char* data = (char*)ptr;
   do {
-    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
     if (op == NCCL_SOCKET_RECV && bytes == 0) {
       WARN("Net : Connection closed by remote peer");
       return ncclSystemError;
@@ -405,9 +417,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off
   return ncclSuccess;
 }
 
+static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+  return socketProgressOpt(op, fd, ptr, size, offset, 0);  // block=0: recv/send are issued with MSG_DONTWAIT
+}
+
 static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) {
   while (*offset < size)
-    NCCLCHECK(socketProgress(op, fd, ptr, size, offset));
+    NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1));
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/include/topo.h b/projects/rccl/src/include/topo.h
index d14e38690e..69cd100743 100644
--- a/projects/rccl/src/include/topo.h
+++ b/projects/rccl/src/include/topo.h
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,78 +11,35 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
-#include <iostream>
-#include <fstream>
-#include <string>
+#include <stdio.h>
 
-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+ncclResult_t getCudaPath(int cudaDev, char** path);
 
-static bool isEPYC() {
-  std::ifstream cpuinfo("/proc/cpuinfo");
-  std::string line;
-  int needed = 2;
-  static bool vendor_id = true, cpu_family = false, initialized = false;
-  if (initialized) return (vendor_id && cpu_family);
-  while (std::getline(cpuinfo, line)) {
-    if (line.compare(0, 9, "vendor_id") == 0) {
-      if(line.find("AuthenticAMD") == std::string::npos)
-        vendor_id = false;
-      needed --;
-    }
-    if (line.compare(0, 10, "cpu family") == 0) {
-      std::string family_str = line.substr(line.find(": ") + 2);
-      if (std::stoi(family_str) >= 23)
-        cpu_family = true;
-      needed --;
-    }
-    if (!needed)
-      break;
-  }
-  initialized = true;
-  return (vendor_id && cpu_family);
-}
+static int getNumaId(char *path) {
+  char npath[PATH_MAX];
+  snprintf(npath, PATH_MAX, "%s/numa_node", path);
+  npath[PATH_MAX-1] = '\0';
 
-static ncclResult_t getCudaPath(int cudaDev, char** path) {
-  char busId[BUSID_SIZE];
-  CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-  *path = realpath(busPath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", busPath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
+  int numaId = -1;
+  FILE *file = fopen(npath, "r");
+  if (file == NULL) return -1;
+  if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+  fclose(file);
+
+  return numaId;
 }
 
 enum ncclPathDist {
-  PATH_PIX = 0,
-  PATH_PXB = 1,
-  PATH_PHB = 2,
-  PATH_SOC = 3
+  PATH_PIX  = 0,
+  PATH_PXB  = 1,
+  PATH_PHB  = 2,
+  PATH_NODE = 3,
+  PATH_SYS  = 4,
+  PATH_ARRAY_SIZE = 5
 };
 
-static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
+extern const char* pathDists[PATH_ARRAY_SIZE];
 
-static int pciDistance(char* path1, char* path2) {
-  int score = 0;
-  int depth = 0;
-  int same = 1;
-  for (int i=0; i<strlen(path1); i++) {
-    if (path1[i] != path2[i]) same = 0;
-    if (path1[i] == '/') {
-      depth++;
-      if (same == 1) score++;
-    }
-  }
-  if (isEPYC() && score <= 3) return PATH_PHB;
-  if (score <= 3) return PATH_SOC;
-  if (score == 4) return PATH_PHB;
-  if (score == depth-1) return PATH_PIX;
-  return PATH_PXB;
-}
+int pciDistance(char* path1, char* path2);
 
 #endif
diff --git a/projects/rccl/src/include/transport.h b/projects/rccl/src/include/transport.h
index bc9b7779d8..91628f6b65 100644
--- a/projects/rccl/src/include/transport.h
+++ b/projects/rccl/src/include/transport.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,7 +8,9 @@
 #define NCCL_TRANSPORT_H_
 
 #include "nccl.h"
+#include "devcomm.h"
 #include <stdint.h>
+#include "nvmlwrap.h"
 
 #define NTRANSPORTS 3
 
@@ -19,11 +21,13 @@ struct ncclRing;
 struct ncclConnector;
 struct ncclComm;
 
-#define RANK_INFO_SIZE 64
-typedef char ncclTinfo_t[RANK_INFO_SIZE];
-
-struct ncclInfo {
-  ncclTinfo_t tinfo[NTRANSPORTS];
+struct ncclPeerInfo {
+  int rank;
+  int cudaDev;
+  int nvmlDev;
+  uint64_t hostHash;
+  uint64_t pidHash;
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
 };
 
 // Used to hold the transport connection values
@@ -34,18 +38,47 @@ struct ncclConnect {
   char data[CONNECT_SIZE];
 };
 
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
+
+struct ncclProxyArgs;
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+
 struct ncclProxyArgs {
-  struct ncclRing* ring;
-  int substeps;
+  proxyProgressFunc_t progress;
+  struct ncclChannel* channel;
+  struct ncclConnector* connector;
+  int sliceSteps;
+  int chunkSteps;
   int nsteps;
   uint64_t opCount;
   int llMode;
-  bool needProxy;
-  int active;   // add component before this line -- it is left out during initialization
+  int state;   // add component before this line -- it is left out during initialization
+
+  // Internal state
+  uint64_t head;
+  uint64_t tail;
+  uint64_t end;
+  void* requests[NCCL_STEPS];
+  int idle;
+
+  // Element linking
+  pthread_mutex_t mutex;
+  struct ncclProxyArgs* next;
+  struct ncclProxyArgs* nextPeer;
+};
+
+struct ncclProxyPool;
+struct ncclProxyState {
+  pthread_cond_t cond;
+  pthread_mutex_t mutex;
+  bool stop;
+  struct ncclProxyArgs* ops;
+  struct ncclProxyArgs* pool;
+  struct ncclProxyPool* pools;
 };
 
 struct ncclTransportComm {
-  ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
+  ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
   ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
   ncclResult_t (*free)(void*);
   ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -53,8 +86,7 @@ struct ncclTransportComm {
 
 struct ncclTransport {
   const char name[4];
-  ncclResult_t (*fillInfo)(ncclTinfo_t*, int, uint64_t);
-  ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
+  ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
   ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
   struct ncclTransportComm send;
   struct ncclTransportComm recv;
@@ -64,37 +96,17 @@ struct ncclTransport {
 
 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
 
-#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
-
-struct transportProxyInfo {
-  struct ncclComm* comm;
-  pthread_t thread;
-  threadFunc_t func;
-  volatile int proxyReady;
-  struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
-  volatile uint64_t argsFifoHead;
-  volatile uint64_t argsFifoTail;
-  pthread_cond_t cond;
-  pthread_mutex_t mutex;
-};
-
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
-
 enum proxyMode {
   proxyRing = 0,
   proxyFrom = 1,
   proxyTo = 2
 };
 
-static int proxyPatternRing = proxyRing;
-static inline int proxyPatternFrom(int root) { return 1+root; }
-static inline int proxyPatternTo(int root) { return -1-root; }
-static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
-static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
-
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
-ncclResult_t transportStartProxies(struct ncclComm* comm);
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
+ncclResult_t transportStartProxy(struct ncclComm* comm);
+ncclResult_t transportCreateProxy(struct ncclComm* comm);
+ncclResult_t transportDestroyProxy(struct ncclComm* comm);
 
 #include <unistd.h>
 
@@ -106,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
   }
 }
 
-inline void transportProxyIdle(int idle) {
-  sched_yield();
-}
-
 #endif
diff --git a/projects/rccl/src/include/trees.h b/projects/rccl/src/include/trees.h
new file mode 100644
index 0000000000..7eadd8556e
--- /dev/null
+++ b/projects/rccl/src/include/trees.h
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TREES_H_
+#define NCCL_TREES_H_
+
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
+ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
+
+#endif
diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h
index 0ed875c161..2282f5cce3 100644
--- a/projects/rccl/src/include/utils.h
+++ b/projects/rccl/src/include/utils.h
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,7 +11,7 @@
 #include "nccl.h"
 #include <stdint.h>
 
-ncclResult_t getHostName(char* hostname, int maxlen);
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
@@ -21,6 +22,6 @@ struct netIf {
 };
 
 int parseStringList(const char* string, struct netIf* ifList, int maxList);
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
 
 #endif
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
new file mode 100644
index 0000000000..320b5d4f35
--- /dev/null
+++ b/projects/rccl/src/init.cc
@@ -0,0 +1,1369 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "channel.h"
+#include "param.h"
+#include "nvmlwrap.h"
+#include "rings.h"
+#include "trees.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "group.h"
+#include "utils.h"
+#include "net.h"
+#include "checks.h"
+#include "enqueue.h"
+#include "topo.h"
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#include "nvlink_stub.h"
+#else
+#include "nvlink.h"
+#endif
+#include "cpuset.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <hip/hip_runtime.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+int ncclDebugLevel;
+uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
+pthread_mutex_t ncclDebugOutputLock;
+FILE *ncclDebugFile = stdout;
+
+#ifdef ENABLE_TRACE
+std::chrono::high_resolution_clock::time_point ncclEpoch;
+#endif
+
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
+#else
+#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
+#endif
+
+NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
+
+NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
+
+ncclNet_t* ncclNet = NULL;
+
+// Weak symbol so that tests can link their own ncclNvlinkGpu(); stores getNvlinkGpu() for the current device in *nvlink.
+#pragma weak ncclNvlinkGpu
+ncclResult_t ncclNvlinkGpu(int* nvlink) {
+  char pciBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  int dev;
+  CUDACHECK(hipGetDevice(&dev));
+  CUDACHECK(hipDeviceGetPCIBusId(pciBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, dev));
+  *nvlink = getNvlinkGpu(pciBusId, NULL);
+  return ncclSuccess;
+}
+// Weak symbol so that tests can link their own version; returns the major compute capability, or 0 on any HIP error.
+#pragma weak ncclCudaCompCap
+int ncclCudaCompCap() {
+  int dev, capMajor;
+  // The attribute query only runs when the current device could be determined (short-circuit).
+  const bool ok = hipGetDevice(&dev) == hipSuccess &&
+                  hipDeviceGetAttribute(&capMajor, hipDeviceAttributeComputeCapabilityMajor, dev) == hipSuccess;
+  return ok ? capMajor : 0;
+}
+// Compute capability of the current device encoded as major*10+minor; 0 if any HIP query fails.
+int ncclCudaFullCompCap() {
+  int dev, capMajor, capMinor;
+  if (hipGetDevice(&dev) == hipSuccess &&
+      hipDeviceGetAttribute(&capMajor, hipDeviceAttributeComputeCapabilityMajor, dev) == hipSuccess &&
+      hipDeviceGetAttribute(&capMinor, hipDeviceAttributeComputeCapabilityMinor, dev) == hipSuccess) return capMajor*10+capMinor;
+  return 0;
+}
+
+// Probes one network implementation. Returns ncclInternalError when its init() or devices()
+// call fails, ncclSystemError when it reports no device (ndev <= 0), and ncclSuccess otherwise.
+ncclResult_t initNet(ncclNet_t* net) {
+  int nDevices;
+  const bool probed = net->init(ncclDebugLog) == ncclSuccess && net->devices(&nDevices) == ncclSuccess;
+  if (!probed) return ncclInternalError;
+  return nDevices > 0 ? ncclSuccess : ncclSystemError;
+}
+
+// Tries to load an external network plugin from libnccl-net.so and, when the plugin
+// initializes successfully, stores it in *net. Always returns ncclSuccess: a missing or
+// unusable plugin is only logged and leaves *net untouched.
+ncclResult_t initNetPlugin(ncclNet_t** net) {
+  // dlopen is not guaranteed to set errno; clear it first so that a stale value left by an
+  // earlier call cannot select the wrong log message below.
+  errno = 0;
+  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
+  if (netPluginLib == NULL) {
+    if (errno == ENOENT) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so).");
+    } else {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+    }
+    return ncclSuccess;
+  }
+  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
+  if (extNet == NULL) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
+  } else if (initNet(extNet) == ncclSuccess) {
+    *net = extNet;  // the library stays loaded because extNet was resolved from it
+    return ncclSuccess;
+  }
+  dlclose(netPluginLib);  // symbol missing or plugin failed to initialize: unload it
+  return ncclSuccess;
+}
+
+// Initializes the bootstrap network, then selects ncclNet: a plugin if initNetPlugin() set one, else ncclNetIb, else ncclNetSocket.
+ncclResult_t initNet() {
+  NCCLCHECK(bootstrapNetInit());
+  NCCLCHECK(initNetPlugin(&ncclNet));
+  if (ncclNet == NULL) {  // no plugin took over: try ncclNetIb first, then ncclNetSocket
+    if (initNet(&ncclNetIb) != ncclSuccess) {
+      NCCLCHECK(initNet(&ncclNetSocket));
+      ncclNet = &ncclNetSocket;
+    } else {
+      ncclNet = &ncclNetIb;
+    }
+  }
+  return ncclSuccess;
+}
+
+NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
+NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
+NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0);
+
+// Returns the thread threshold: the "THREAD_THRESHOLD" parameter when it differs from its -2
+// default, otherwise a compile-time platform default that is doubled when multiNode is non-zero.
+int ncclThreadThreshold(int minCompCap, int multiNode) {
+  const int requested = ncclParamThreadThreshold();
+  // -2 is the default the parameter is declared with, i.e. the user did not override it.
+  if (requested != -2) return requested;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  const int fallback = NCCL_THREAD_THRESHOLD_VEGA;
+#else
+  // minCompCap is only consulted on this non-HIP path.
+  const int fallback = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
+#endif
+  // Multi-node runs use twice the single-node default.
+  return multiNode ? fallback * 2 : fallback;
+}
+
+bool useFineGrainVramPcie = false;
+
+// Reads HSA_FORCE_FINE_GRAIN_PCIE and, when it parses to 0 or 1, stores it in useFineGrainVramPcie; logs either outcome.
+void parseHsaForceFineGrainVramPcie() {
+  const char* raw = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
+  if (raw == NULL || raw[0] == '\0') return;  // unset or empty: keep the current value silently
+  errno = 0;
+  const int64_t parsed = strtoll(raw, NULL, 0);
+  if (errno == 0 && (parsed == 0 || parsed == 1)) {
+    useFineGrainVramPcie = parsed;
+    INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
+  } else {
+    INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", raw, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
+  }
+}
+
+pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
+static bool initialized = false;
+// One-time library setup (environment, debug output, network), serialized by initLock; returns the initNet() error if network setup fails.
+static ncclResult_t ncclInit() {
+  if (initialized) return ncclSuccess;
+  ncclResult_t res = ncclSuccess;
+  pthread_mutex_lock(&initLock);
+  if (!initialized) {
+    initEnv();
+    initDebug();
+    if ((res = initNet()) == ncclSuccess) initialized = true;  // on failure stay uninitialized: ncclNet may still be NULL
+  }
+  parseHsaForceFineGrainVramPcie();  // check whether HSA_FORCE_FINE_GRAIN_PCIE is set in the environment
+  pthread_mutex_unlock(&initLock);
+  return res;
+}
+
+NCCL_API(ncclResult_t, ncclGetVersion, int* version);
+// Stores NCCL_VERSION_CODE in *version; a NULL pointer yields ncclInvalidArgument.
+ncclResult_t ncclGetVersion(int* version) {
+  if (version != NULL) { *version = NCCL_VERSION_CODE; return ncclSuccess; }
+  return ncclInvalidArgument;
+}
+
+NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
+ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {  // fills *out via bootstrapGetUniqueId()
+  NCCLCHECK(ncclInit());  // library setup runs before the argument is checked
+  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));  // validate the caller-supplied pointer
+  return bootstrapGetUniqueId(out);
+}
+
+// Overwrite the comm's rank/cudaDev/nvmlDev/nRanks fields with -1; optimize("O0") prevents the compiler from optimizing out these stores
+void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
+  comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
+}
+
+/*
+ * Tear down a communicator: release its device and host resources, then the
+ * communicator structure itself.
+ *
+ * comm: communicator to destroy; NULL is accepted and reported as success.
+ *
+ * Returns ncclSuccess when everything was released. The CUDACHECK/NCCLCHECK
+ * macros (defined elsewhere) are expected to return their error code early,
+ * in which case the resources that follow are not released.
+ */
+static ncclResult_t commFree(ncclComm_t comm) {
+  if (comm == NULL)
+    return ncclSuccess;
+
+#ifdef ENABLE_PROFILING
+  // Fetch the device-side profiling counters and log a per-rank summary.
+  struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
+  if (prof == NULL) {
+    WARN("Failed to allocate %zu bytes for the profiling summary", sizeof(struct ncclProf));
+    return ncclSystemError;
+  }
+  hipError_t profCopyErr = hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost);
+  if (profCopyErr != hipSuccess) {
+    // Do not leak the host staging buffer when the copy fails.
+    free(prof);
+    CUDACHECK(profCopyErr);
+  }
+  uint64_t wait_send_cycle = 0, wait_recv_cycle = 0;
+  for (int chan=0; chan<comm->nChannels; chan++) {
+    wait_send_cycle += prof->wait_send_cycle[chan];
+    wait_recv_cycle += prof->wait_recv_cycle[chan];
+  }
+  #define VEGA_GPU_RTC_FREQUENCY 2.7E7
+  if (comm->rank == 0) {
+    // Both header rows carry exactly 12 columns, matching the 12 conversion specifiers.
+    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", "w_send", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
+    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
+  }
+  // Times are cycle counts divided by the RTC frequency and the channel count;
+  // bandwidth columns print 0 when the matching cycle counter is 0.
+  INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
+    comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
+    (double)wait_send_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
+    (double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
+    (prof->send_cycle) ? (double)prof->send_byte*comm->nChannels/((double)prof->send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->recvReduceSend_cycle) ? (double)prof->recvReduceSend_byte*comm->nChannels/((double)prof->recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->directRecvReduceCopySend_cycle) ? (double)prof->directRecvReduceCopySend_byte*comm->nChannels/((double)prof->directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->directRecvCopySend_cycle) ? (double)prof->directRecvCopySend_byte*comm->nChannels/((double)prof->directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->directRecv_cycle) ? (double)prof->directRecv_byte*comm->nChannels/((double)prof->directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->copySend_cycle) ? (double)prof->copySend_byte*comm->nChannels/((double)prof->copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->recv_cycle) ? (double)prof->recv_byte*comm->nChannels/((double)prof->recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->recvCopySend_cycle) ? (double)prof->recvCopySend_byte*comm->nChannels/((double)prof->recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
+  free(prof);
+  CUDACHECK(hipFree(comm->hostDevComm.devProf));
+#endif
+
+  free(comm->peerInfo);
+
+  if (comm->bootstrap)
+    NCCLCHECK(bootstrapClose(comm->bootstrap));
+
+  // Device copies of the channel array and of the dev comm.
+  CUDACHECK(hipFree(comm->hostDevComm.channels));
+  CUDACHECK(hipFree(comm->devComm));
+
+  for (int channel=0; channel<comm->nChannels; channel++)
+    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
+
+  if (comm->doneEvent != NULL)
+    CUDACHECK(hipEventDestroy(comm->doneEvent));
+
+  // The group stream is only created in GROUP launch mode.
+  if (comm->launchMode == ncclComm::GROUP) {
+    CUDACHECK(hipStreamDestroy(comm->groupStream));
+  }
+
+  // Last rank frees shared resources between threads
+  int isLast;
+  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+  if (isLast) {
+    free(comm->intraBarrier);
+    free(comm->intraParams);
+    free(comm->intraCudaDevs);
+    free(comm->intraCGMode);
+    free(comm->intraCC);
+  }
+  CUDACHECK(hipHostFree((void *)comm->abortFlag));
+  CUDACHECK(hipHostFree((void *)comm->fatalDevError));
+
+  // Poison comm to try and catch a double free
+  commPoison(comm);
+
+  free(comm);
+  return ncclSuccess;
+}
+
+/*
+ * Allocate and initialize the host-side communicator structure.
+ *
+ * comret: receives the new communicator on success.
+ * ndev  : number of ranks; must be >= 1.
+ * rank  : this rank; must satisfy 0 <= rank < ndev.
+ *
+ * Returns ncclInvalidArgument for an out-of-range ndev/rank, otherwise the
+ * error of the first failing HIP/NCCL call (via CUDACHECK/NCCLCHECK), or
+ * ncclSuccess.
+ */
+static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
+  if (ndev < 1) {
+    WARN("invalid device count (%d) requested", ndev);
+    return ncclInvalidArgument;
+  }
+  if (rank >= ndev || rank < 0) {
+    WARN("rank %d exceeds ndev=%d", rank, ndev);
+    return ncclInvalidArgument;
+  }
+
+  // Try to create a CUDA object right away. If there is something wrong with
+  // the device we're on (failure cause #1) , better know it early.
+  hipEvent_t doneEvent;
+  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
+
+  struct ncclComm* comm;
+  NCCLCHECK(ncclCalloc(&comm, 1));
+
+  // Rank and size are mirrored into the structure that is later copied to the device.
+  comm->rank = comm->hostDevComm.rank =rank;
+  comm->nRanks = comm->hostDevComm.nRanks = ndev;
+  // Check the device queries (fillInfo() checks the same two calls) so that a
+  // failure is reported here instead of leaving cudaDev/nvmlDev unset.
+  CUDACHECK(hipGetDevice(&comm->cudaDev));
+  NCCLCHECK(getNvmlDevice(comm->cudaDev, &comm->nvmlDev));
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
+
+  comm->doneEvent = doneEvent;
+  comm->llThreshold = ncclParamLlThreshold();
+  comm->treeThreshold = ncclParamTreeThreshold();
+  comm->checkPointers = ncclParamCheckPointers() == 1;
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  comm->groupCudaStream = ncclParamGroupCudaStream();
+#else
+  // Don't allow the user to overload the default setting in older CUDA builds
+  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
+#endif
+  comm->fatalError = ncclSuccess;
+
+  // Each flag gets a host pointer plus the pointer stored in hostDevComm.
+  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
+  STORE(comm->fatalDevError, ncclDevSuccess);
+
+  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
+  STORE(comm->abortFlag, 0);
+
+  comm->argsptr = &comm->args;
+#ifdef ENABLE_PROFILING
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1));
+#endif
+
+  *comret = comm;
+  return ncclSuccess;
+}
+
+// Mirror the host-side channel and communicator state into device memory.
+static ncclResult_t devCommSetup(ncclComm_t comm) {
+  // Device copy of the channel array
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
+
+  // Per channel: ring user ranks and peers, nRanks entries each
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    NCCLCHECK(ncclCudaMemcpy(channel->ring.devUserRanks, channel->ring.userRanks, comm->nRanks));
+    NCCLCHECK(ncclCudaMemcpy(channel->devPeers, channel->peers, comm->nRanks));
+  }
+
+  // Finally the dev comm itself, once the pointers above are filled in
+  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+  NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
+
+  return ncclSuccess;
+}
+
+// The version text is assembled by the preprocessor so that running "strings"
+// on the library reveals it directly.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
+#else
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+#endif
+// Print the version once, provided the debug level is at least NCCL_LOG_VERSION.
+static void showVersion() {
+  static int shown = 0;
+  if (shown != 0 || ncclDebugLevel < NCCL_LOG_VERSION) return;
+
+  printf("%s\n", VERSION_STRING);
+  fflush(stdout);
+  if (ncclDebugFile != stdout) {
+    INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
+  }
+  shown = 1;
+}
+
+/*
+ * Fill the peer descriptor of the calling rank.
+ *
+ * info    : descriptor to fill (rank, device numbers, host/pid hashes, bus id).
+ * rank    : rank stored in the descriptor.
+ * commHash: value added to both the host hash and the pid hash.
+ *
+ * Returns ncclSuccess, or the error propagated by CUDACHECK/NCCLCHECK.
+ */
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
+  info->rank = rank;
+  CUDACHECK(hipGetDevice(&info->cudaDev));
+  NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev));
+  info->hostHash=getHostHash()+commHash;
+  info->pidHash=getPidHash()+commHash;
+
+  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
+  // cudaDev is a CUDA runtime dev number which could be different from the
+  // NVML device number. Then we get the busID from NVML to be sure it is
+  // consistent with NVML remote PCI bus Ids.
+  CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  // HIP builds keep the bus id reported by the runtime.
+#else
+  nvmlDevice_t nvmlDevice;
+  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
+  nvmlPciInfo_t pciInfo;
+  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
+  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
+  // strncpy does not terminate the destination when the source fills it.
+  info->busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE-1] = '\0';
+#endif
+  return ncclSuccess;
+}
+
+static ncclResult_t setCpuAffinity(int cudaDev);
+
+/*
+ * Pick the first transport whose canConnect() reports a positive value for
+ * the (myInfo, peerInfo) pair and run its setup on `connector`.
+ *
+ * type == 1 selects the send side of the transport, any other value the
+ * receive side. While setup runs, the thread affinity is switched with
+ * setCpuAffinity() for the current device and restored afterwards, whether or
+ * not setup succeeded.
+ *
+ * Returns ncclInternalError when no transport can connect, otherwise the
+ * result of the selected transport's setup.
+ */
+template <int type>
+static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
+  for (int t=0; t<NTRANSPORTS; t++) {
+    struct ncclTransport *transport = ncclTransports+t;
+    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
+    ncclTvalue_t ret = 0;
+    NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
+    if (ret > 0) {
+      int cudaDev;
+      CUDACHECK(hipGetDevice(&cudaDev));
+      // Only restore the mask later if it was actually captured.
+      cpu_set_t affinitySave;
+      bool affinitySaved = (sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave) == 0);
+      // Best effort: a failure to change the affinity does not abort the setup.
+      setCpuAffinity(cudaDev);
+      connector->transportComm = transportComm;
+      ncclResult_t setupRes = transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId);
+      // Restore before checking the result so an error does not leave the thread pinned.
+      if (affinitySaved) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+      NCCLCHECK(setupRes);
+      return ncclSuccess;
+    }
+  }
+  WARN("No transport found !");
+  return ncclInternalError;
+}
+
+// Number of times n can be shifted right by one before it becomes zero.
+static int log2(int n) {
+ int bits = 0;
+ for (n >>= 1; n != 0; n >>= 1) bits++;
+ return bits;
+}
+
+// Estimate the size threshold stored in *treeThreshold from a simple
+// bandwidth/latency model of the ring and tree paths.
+//   0                  : written when nnodes < 2 or ring latency <= tree latency
+//   0x7fffffffffffffff : written when the tree bandwidth exceeds the ring bandwidth
+//   otherwise          : the value computed from both bandwidths and latencies
+static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) {
+  int nvlink;
+  NCCLCHECK(ncclNvlinkGpu(&nvlink));
+
+  // Model constants
+  const float ringlatinter = 6;
+  const float treelatintra = 4;
+  const float treelatinter = 15;
+
+  float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us
+  float treebw = nvlink ? ringbw * 3 / 4 : ringbw * 2 / 3;
+  if (nvlink && nnodes == 2) treebw *= 2;
+
+  float ringlat = ringlatinter*(nranks-1);
+  float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1);
+
+  if (nnodes < 2 || ringlat <= treelat) {
+    *treeThreshold = 0;
+    return ncclSuccess;
+  }
+  if (treebw > ringbw) {
+    *treeThreshold = 0x7fffffffffffffff;
+    return ncclSuccess;
+  }
+  *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat));
+  return ncclSuccess;
+}
+
+/*
+ * Initialize one channel: build this rank's view of the ring and, when trees
+ * are enabled (comm->treeThreshold > 0), its up/down links in the tree.
+ *
+ * comm        : communicator owning the channel.
+ * channelId   : index of the channel inside comm->channels.
+ * rank/nranks : this rank and the number of ranks in the ring.
+ * ringRanks   : ring order for this channel (nranks entries).
+ * treeMasters : per-rank flags, non-zero for ranks acting as tree masters;
+ *               entry 0 is set by this function when no master is flagged.
+ *
+ * Returns ncclSuccess, or the error propagated by NCCLCHECK.
+ */
+static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) {
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+  NCCLCHECK(initChannel(comm, channelId));
+
+  struct ncclChannel* channel = comm->channels+channelId;
+  struct ncclRing* ring = &channel->ring;
+
+  // Reorganize ranks to start with rank.
+  int shift;
+  for (shift = 0; shift<nranks; shift++) {
+    if (ringRanks[shift] == rank) {
+      break;
+    }
+  }
+  for (int i=0; i<nranks; i++) {
+    ring->userRanks[i] = ringRanks[(i+shift)%nranks];
+  }
+  int prev = ring->prev = ring->userRanks[nranks-1];
+  // With a single rank the ring only has entry 0: wrap around instead of
+  // reading userRanks[1], so that the rank is its own successor.
+  int next = ring->next = ring->userRanks[1%nranks];
+
+  struct ncclTree* tree = &channel->tree;
+  tree->up = -1;
+  tree->down[0] = tree->down[1] = tree->down[2] = -1;
+
+  //
+  // Find per-node masters and connect them via a binary tree
+  //
+
+  int nMasters = 0;
+  for (int r=0; r<nranks; r++) nMasters += treeMasters[r];
+  if (nMasters == 0) {
+    nMasters = 1;
+    treeMasters[0] = 1;
+  }
+
+  // -2 requests that the threshold be computed from the topology.
+  if (comm->treeThreshold == -2)
+    NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold));
+
+  if (comm->treeThreshold > 0) {
+    // Compute tree depth. Not an exact value but a good approximation in most
+    // cases and consistent across nodes
+    tree->depth = nranks/nMasters + log2(nMasters);
+
+    // Find my master : go backwards in the ring to find my root
+    int master = 0;
+    for (int i = 0; i<nranks; i++) {
+      int r = ring->userRanks[(nranks-i)%nranks];
+      if (treeMasters[r]) {
+        master = r;
+        break;
+      }
+    }
+
+    int* ranks;
+    NCCLCHECK(ncclCalloc(&ranks, nMasters));
+    int i = 0, masterIndex = -1;
+    // Build binary tree
+    for (int r=0; r<nranks; r++) {
+      // Create index table
+      if (r == master) masterIndex = i;
+      if (treeMasters[r]) ranks[i++] = r;
+    }
+    int btreeUp, btreeDown0, btreeDown1;
+    int u0, d0_0, d0_1, u1, d1_0, d1_1;
+    ncclResult_t dtreeRes = ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1);
+    if (dtreeRes != ncclSuccess) {
+      // Release the index table before reporting the failure.
+      free(ranks);
+      NCCLCHECK(dtreeRes);
+    }
+    // The first half of the channels uses the first tree, the rest the second one.
+    if (channelId < DIVUP(comm->nChannels, 2)) {
+      btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1;
+    } else {
+      btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1;
+    }
+
+    //
+    // Now build the full tree, combining the intra-node ring and the
+    // inter-node binary tree.
+    //
+
+    if (rank == master) {
+      int nDown = 0;
+      if (btreeUp != -1) tree->up = ranks[btreeUp];
+      if (treeMasters[next] == 0) tree->down[nDown++] = next;
+      if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0];
+      if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1];
+    } else {
+      tree->up = prev;
+      if (treeMasters[next] == 0) tree->down[0] = next;
+    }
+    free(ranks);
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+  return ncclSuccess;
+}
+
+// For every peer, record the index of the first transport whose canConnect()
+// yields a positive value (or -1 when none does) in connectTransport, and the
+// value written by the last canConnect() call in connectValue.
+static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
+  struct ncclPeerInfo* self = peerInfo+rank;
+  for (int peer=0; peer<nranks; peer++) {
+    ncclTvalue_t* value = connectValue+peer;
+    connectTransport[peer] = -1;
+    int t = 0;
+    while (t < NTRANSPORTS) {
+      NCCLCHECK(ncclTransports[t].canConnect(value, self, peerInfo+peer));
+      if (*value > 0) {
+        connectTransport[peer] = t;
+        break;
+      }
+      t++;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Layout of the debug dump lines: at most MAXWIDTH columns are printed.
+#define MAXWIDTH 20
+#define PREFIXLEN 15
+#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
+// Log an nranks x nranks int matrix: one header line with the column indices,
+// then one line per row. Columns beyond MAXWIDTH are not printed.
+void dumpMatrix(int* connectMatrix, int nranks) {
+  const int ncols = nranks < MAXWIDTH ? nranks : MAXWIDTH;
+  char line[STRLENGTH+1];
+  line[STRLENGTH] = '\0';
+
+  // Header: column indices, 4 characters each, after a 4 character margin
+  memset(line, ' ', STRLENGTH);
+  for (int col=0; col<ncols; col++) {
+    sprintf(line+4+4*col, " %3d", col);
+  }
+  INFO(NCCL_INIT,"%s", line);
+
+  // Body: row index followed by the row values
+  for (int row=0; row<nranks; row++) {
+    int* rowValues = connectMatrix+row*nranks;
+    memset(line, ' ', STRLENGTH);
+    sprintf(line, "%3d ", row);
+    for (int col=0; col<ncols; col++) {
+      sprintf(line+4+4*col, " %3d", rowValues[col]);
+    }
+    INFO(NCCL_INIT,"%s", line);
+  }
+}
+
+// Same layout as the int matrix dump but with 5 character wide columns; the
+// header shows decimal column indices, the cells are printed in octal after
+// a cast to int.
+void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
+  const int ncols = nranks < MAXWIDTH ? nranks : MAXWIDTH;
+  char line[STRLENGTH+1];
+  line[STRLENGTH] = '\0';
+
+  memset(line, ' ', STRLENGTH);
+  for (int col=0; col<ncols; col++) {
+    sprintf(line+4+5*col, " %4d", col);
+  }
+  INFO(NCCL_INIT,"%s", line);
+
+  for (int row=0; row<nranks; row++) {
+    ncclTvalue_t* rowValues = connectMatrix+row*nranks;
+    memset(line, ' ', STRLENGTH);
+    sprintf(line, "%3d ", row);
+    for (int col=0; col<ncols; col++) {
+      sprintf(line+4+5*col, " %4o", (int)rowValues[col]);
+    }
+    INFO(NCCL_INIT,"%s", line);
+  }
+}
+
+
+// Log `prefix` followed by up to MAXWIDTH entries of `values`, 4 characters
+// per entry. Only the first PREFIXLEN characters of the prefix are copied,
+// while the values start at offset strlen(prefix).
+void dumpLine(int* values, int nranks, const char* prefix) {
+  char line[STRLENGTH+1];
+  memset(line, ' ', STRLENGTH);
+  line[STRLENGTH] = '\0';
+  strncpy(line, prefix, PREFIXLEN);
+
+  const int offset = strlen(prefix);
+  const int count = nranks < MAXWIDTH ? nranks : MAXWIDTH;
+  for (int idx=0; idx<count; idx++) {
+    sprintf(line+offset+4*idx, " %3d", values[idx]);
+  }
+  INFO(NCCL_INIT,"%s", line);
+}
+
+// Expand the per-ring `next` tables into explicit rank sequences starting at
+// `rank`, and validate each ring: it must return to `rank` after nranks hops
+// and contain every rank. Rank 0 logs each ring through dumpLine().
+// `prev` is accepted but not read.
+static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+  for (int r=0; r<nrings; r++) {
+    int* ring = rings+r*nranks;
+    int* ringNext = next+r*nranks;
+    char prefix[30];
+
+    // Follow the successor links for nranks hops
+    int current = rank;
+    for (int hop=0; hop<nranks; hop++) {
+      ring[hop] = current;
+      current = ringNext[current];
+    }
+
+    sprintf(prefix, "Channel %02d : ", r);
+    if (rank == 0) dumpLine(ring, nranks, prefix);
+
+    if (current != rank) {
+      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+      return ncclInternalError;
+    }
+
+    // Check that all ranks are there
+    for (int wanted=0; wanted<nranks; wanted++) {
+      int pos = 0;
+      while (pos < nranks && ring[pos] != wanted) pos++;
+      if (pos == nranks) {
+        WARN("Error : ring %d does not contain rank %d", r, wanted);
+        return ncclInternalError;
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+// Yield the CPU until the pointer stored at `p` is non-NULL, then return it.
+void* waitForNonNullPtr(void* p) {
+  volatile void** ptr = (volatile void**) p;
+  for (;;) {
+    if (LOAD(ptr) != NULL) break;
+    sched_yield();
+  }
+  return (void*)(LOAD(ptr));
+}
+
+// Point comm->myParams at this rank's slot of the shared launch-parameter
+// array and reset that slot: no stream, no shared memory, x dimensions 0 and
+// y/z dimensions 1.
+ncclResult_t initParams(struct ncclComm* comm) {
+  comm->myParams = comm->intraParams+comm->intraRank;
+  hipLaunchParams* params = comm->myParams;
+
+  params->args =(void **)&comm->argsptr;
+  params->stream = NULL;
+  params->sharedMem = 0;
+
+  params->blockDim.x = 0;
+  params->blockDim.z = 1;
+  params->blockDim.y = 1;
+
+  params->gridDim.x = 0;
+  params->gridDim.z = 1;
+  params->gridDim.y = 1;
+
+  return ncclSuccess;
+}
+
+// Allocate/Set Intra Process Structures and set CG options
+//
+// comm  : communicator being set up.
+// rank  : stored in comm->intraRank.
+// ranks : stored in comm->intraRanks.
+// comm0 : communicator whose shared intra* pointers are adopted when
+//         rank != 0; asserted to be `comm` itself when rank == 0.
+//
+// Returns ncclSuccess, or the error propagated by NCCLCHECK/CUDACHECK.
+ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
+  comm->intraRank = rank;
+  comm->intraRanks = ranks;
+  comm->intraPhase = 0;
+
+  // Alloc shared structures
+  if (rank == 0) {
+    // Rank 0 allocates every shared structure and publishes it through its own fields.
+    assert(comm == comm0);
+    int* bar;
+    NCCLCHECK(ncclCalloc(&bar, 2));
+    bar[0] = bar[1] = 0;
+    comm->intraBarrier = bar;
+    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
+    int* CGMode;
+    NCCLCHECK(ncclCalloc(&CGMode, 1));
+    *CGMode = 0x11;
+    comm->intraCGMode = CGMode;
+    int* CC;
+    NCCLCHECK(ncclCalloc(&CC, 1));
+    *CC = ncclCudaFullCompCap();
+    comm->intraCC = CC;
+  } else {
+    // Other ranks wait until each of comm0's pointers becomes non-NULL and share it.
+    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
+    comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
+    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
+    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
+  }
+  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+  NCCLCHECK(initParams(comm));
+
+  // Stays 1 unless the attribute query below (CUDART_VERSION >= 9000 only) overwrites it.
+  int cgMdLaunch = 1;
+
+  // Set CG Mode
+  // GROUP is the default; PARALLEL is used for a single intra rank or when
+  // NCCL_LAUNCH_MODE is set to "PARALLEL".
+  comm->launchMode = ncclComm::GROUP;
+  char* str = getenv("NCCL_LAUNCH_MODE");
+  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
+    comm->launchMode = ncclComm::PARALLEL;
+  }
+  if (comm->launchMode == ncclComm::GROUP) {
+    // GROUP mode gets its own non-blocking stream.
+    CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
+#if CUDART_VERSION >= 9000
+    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
+      // Check whether the GPU supports Cooperative Group Multi Device Launch
+      (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+    }
+#endif
+  }
+
+  // Disable cgMdLaunch if any rank does not support it
+  if (cgMdLaunch == 0) {
+    *comm->intraCGMode = 0x10;
+  }
+  return ncclSuccess;
+}
+
+// Connect one channel to a set of receive peers and a set of send peers.
+//
+// comm              : communicator providing peerInfo and the bootstrap handle.
+// channel           : channel whose peer connectors are set up.
+// nrecv / peerRecv  : number of receive peers and their ranks.
+// nsend / peerSend  : number of send peers and their ranks.
+//
+// Entries equal to -1 are ignored and connectors already marked connected are
+// skipped. The work is done in four passes: transport setup plus sending the
+// connect info for the receive side, the same for the send side, then
+// receiving the peer's connect info and connecting the send side, and finally
+// the receive side.
+//
+// Returns ncclSuccess, or the error propagated by NCCLCHECK.
+static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+  // Note: an already-connected peer is counted in both the setup pass and the connect pass.
+  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+  struct ncclConnect connect;
+  struct ncclConnector* conn;
+  // Pass 1: set up the receive connectors and send our connect info to each peer.
+  for (int i=0; i<nrecv; i++) {
+    int peer = peerRecv[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].recv;
+    if (conn->connected) { ++nSkippedRecv; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+  }
+  // Pass 2: same for the send connectors.
+  for (int i=0; i<nsend; i++) {
+    int peer = peerSend[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].send;
+    if (conn->connected) { ++nSkippedSend; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+  }
+  // Pass 3: receive the peer's connect info and finish the send connectors.
+  for (int i=0; i<nsend; i++) {
+    int peer = peerSend[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].send;
+    if (conn->connected) {++nSkippedSend; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  // Pass 4: same for the receive connectors.
+  for (int i=0; i<nrecv; i++) {
+    int peer = peerRecv[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].recv;
+    if (conn->connected) {++nSkippedRecv; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+  return ncclSuccess;
+}
+
+/*
+ * Per-rank transport initialization for a communicator created from a unique
+ * id: bootstrap with the other ranks, exchange peer information and
+ * connectivity, build the rings (and trees when enabled), connect to the
+ * neighbours on every channel, and set up the intra-process shared state.
+ *
+ * comm  : communicator allocated by commAlloc().
+ * commId: unique id shared by all ranks of the communicator.
+ *
+ * Returns ncclSuccess, ncclInternalError when the intra-process ranks cannot
+ * be determined, or the error propagated by NCCLCHECK. Early NCCLCHECK
+ * returns do not release the temporary buffers allocated up to that point.
+ */
+static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+  // We use 3 AllGathers
+  // 1. { peerInfo, comm }
+  // 2. ConnectTransport[nranks], ConnectValue[nranks]
+  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+
+  // AllGather1 - begin
+  struct {
+    struct ncclPeerInfo peerInfo;
+    struct ncclComm* comm;
+  } *allGather1Data;
+
+  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+  allGather1Data[rank].comm = comm;
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+  }
+  // AllGather1 data is used again below
+  // AllGather1 - end
+
+  // AllGather2 - begin
+  // One row per rank: nranks transport indices followed by nranks connect values.
+  size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
+  void *allGather2Data;
+  NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
+  int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
+  ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
+
+  NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));
+
+  // Split the gathered rows into two nranks x nranks matrices.
+  int* connectTransport;
+  ncclTvalue_t* connectValue;
+  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
+  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
+    memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
+  }
+  free(allGather2Data);
+  // AllGather2 - end
+
+  //if (rank == 0) dumpMatrix(connectTransport, nranks);
+  //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
+
+  // Get my rings
+  int nrings;
+  int* prev, *next, *treeIn, *treeOut;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
+  comm->nThreads = getDefaultThreads();
+  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
+  assert(nrings <= MAXCHANNELS);
+  free(connectTransport);
+  free(connectValue);
+
+  // AllGather3 - begin
+  struct {
+    int nThreads;
+    int nrings;
+    int cudaCompCap;
+    int prev[MAXCHANNELS];
+    int next[MAXCHANNELS];
+  } *allGather3Data;
+
+  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+  allGather3Data[rank].nThreads = comm->nThreads;
+  allGather3Data[rank].nrings = nrings;
+  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  for (int r=0; r<nrings; r++) {
+    allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
+    allGather3Data[rank].next[r] = *(next+r*nranks+rank);
+  }
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
+  // Find max nThreads
+  for (int i=0; i<nranks; i++)
+    comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);
+
+  // Determine the minimum CUDA Compute capability of all GPUs
+  int myCompCap = allGather3Data[rank].cudaCompCap;
+  int minCompCap = myCompCap;
+  for (int i = 0; i < nranks; i++)
+    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+
+  // Determine thread threshold across all GPUs
+  // nnodes is the sum of the first nranks entries of treeIn.
+  int nnodes = 0;
+  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+  comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);
+
+  // Find min nrings across ranks
+  for (int i=0; i<nranks; i++)
+    nrings = std::min(allGather3Data[i].nrings, nrings);
+  comm->nChannels = nrings;
+
+  // Unpack the per ring prev/next arrays
+  for (int i = 0; i < nranks; i++) {
+    for (int r = 0; r < nrings; r++) {
+      prev[r*nranks+i] = allGather3Data[i].prev[r];
+      next[r*nranks+i] = allGather3Data[i].next[r];
+    }
+  }
+  free(allGather3Data);
+  // AllGather3 - end
+
+  int *rings;
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
+  NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
+  free(prev);
+  free(next);
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);
+
+  // Connect with prev/next for each ring
+  for (int r=0; r<nrings; r++) {
+    struct ncclChannel* channel = comm->channels+r;
+    NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
+    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+    // Tree links are connected in both directions.
+    NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
+    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
+  }
+  if (comm->treeThreshold > 0) {
+    // One "[channel] up->rank->down0/down1/down2" entry per channel.
+    char line[1024];
+    line[0]='\0';
+    for (int c=0; c<nrings; c++) {
+      struct ncclTree* tree = &comm->channels[c].tree;
+      snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
+          c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
+    }
+    line[1023] = '\0';
+    INFO(NCCL_INIT, "Trees%s", line);
+  }
+  if (rank == 0) {
+    char treeline[64];
+    snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
+    INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
+       comm->treeThreshold == 0 ? "disabled" :
+       comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
+       treeline);
+  }
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
+  free(rings);
+  free(treeIn);
+  free(treeOut);
+
+  // Compute intra ranks (using AllGather1 data)
+  // Ranks with the same host hash and pid hash as this rank are intra ranks;
+  // intraRank0 is the first of them.
+  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+  for (int i = 0; i < nranks; i++) {
+    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+      if (intraRanks == 0) intraRank0 = i;
+      if (i == rank) intraRank = intraRanks;
+      intraRanks++;
+    }
+  }
+  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
+    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+         rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+    // Release the gathered data on this error path as well.
+    free(allGather1Data);
+    return ncclInternalError;
+  }
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
+
+  // Done with AllGather1 data
+  free(allGather1Data);
+
+  // The proxy is only created when nnodes is non-zero.
+  if (nnodes) NCCLCHECK(transportCreateProxy(comm));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+  return ncclSuccess;
+}
+
+/*
+ * Read the CPU set local to a GPU from the "local_cpus" file located under
+ * the path returned by getCudaPath().
+ *
+ * cudaDev: device index passed to getCudaPath().
+ * mask   : cleared first, then filled by ncclStrToCpuset() when the file
+ *          yields data; stays empty otherwise.
+ *
+ * Returns ncclSuccess, or the error propagated by NCCLCHECK/SYSCHECKVAL.
+ */
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+  CPU_ZERO_S(sizeof(cpu_set_t), mask);
+  char* cudaPath;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  // snprintf always terminates the buffer, even when the path is truncated.
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/local_cpus", cudaPath);
+  // The device path is not needed once `path` is built; freeing it here keeps
+  // the error returns below from leaking it.
+  free(cudaPath);
+  int fd;
+  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+  char affinityStr[sizeof(cpu_set_t)*2 + 1];
+  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+  // Close before parsing so a parse error does not leak the descriptor.
+  close(fd);
+  if (r > 0) {
+    affinityStr[r] = '\0';
+    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+  }
+  return ncclSuccess;
+}
+
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+// Restrict the calling thread to the CPUs that are local to `cudaDev`.
+// By default the GPU-local set is intersected with the current affinity;
+// when the IgnoreCpuAffinity parameter is non-zero the GPU-local set is used
+// as is. An empty resulting set leaves the affinity unchanged.
+static ncclResult_t setCpuAffinity(int cudaDev) {
+  // Affinity we were started with
+  cpu_set_t mask;
+  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
+  }
+#endif
+
+  // CPUs local to the GPU
+  cpu_set_t gpuMask;
+  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
+    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
+  }
+#endif
+
+  cpu_set_t finalMask;
+  if (ncclParamIgnoreCpuAffinity()) {
+    // Take the GPU set regardless of the current affinity
+    finalMask = gpuMask;
+  } else {
+    // Keep only the CPUs present in both sets
+    CPU_AND(&finalMask, &mask, &gpuMask);
+  }
+
+  // Nothing to apply when the set is empty
+  if (CPU_COUNT(&finalMask) == 0) return ncclSuccess;
+
+  char affinityStr[sizeof(cpu_set_t)*2];
+  NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+  INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+  SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+  return ncclSuccess;
+}
+
+// Synchronous communicator creation for one rank.
+//
+// newcomm: receives the communicator on success; set to NULL on failure
+//          after commAlloc has been attempted.
+// nranks : number of ranks in the communicator.
+// commId : unique id shared by all ranks.
+// myrank : rank of the caller.
+//
+// The thread affinity is switched with setCpuAffinity() for the current
+// device while the communicator is built, and restored to the saved mask on
+// both the success path and the cleanup path.
+ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  // Saved so it can be restored once initialization is over.
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+  NCCLCHECK(wrapNvmlSymbols());
+  NCCLCHECK(wrapNvmlInit());
+
+  // Make sure all host memory allocation are close to the GPU
+  int cudaDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  NCCLCHECK(setCpuAffinity(cudaDev));
+  ncclResult_t res;
+
+  // From here on, failures jump to `cleanup` instead of returning directly.
+  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
+  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
+  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
+
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
+
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
+
+  return ncclSuccess;
+cleanup:
+  // NOTE(review): this path only clears the caller's pointer; a communicator
+  // already allocated by commAlloc is not released here - confirm whether
+  // that is intended.
+  *newcomm = NULL;
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  return res;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
+/* Public entry point creating the communicator of one rank.
+ *
+ * When NCCL_COMM_ID is set in the environment, rank 0 first creates the
+ * bootstrap root. After argument validation the actual work is done by
+ * ncclCommInitRankSync, either directly or through ncclAsyncInit when
+ * ncclAsyncMode() is true. */
+ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  const char* commIdEnv = getenv("NCCL_COMM_ID");
+  if (commIdEnv != NULL && myrank == 0) {
+    NCCLCHECK(bootstrapCreateRoot(&commId, true));
+  }
+
+  NCCLCHECK(ncclInit());
+  if (myrank == 0) showVersion();
+
+  // Make sure the CUDA runtime is initialized.
+  CUDACHECK(hipFree(NULL));
+
+  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
+  const bool rankIsValid = (nranks >= 1) && (myrank >= 0) && (myrank < nranks);
+  if (!rankIsValid) {
+    WARN("Invalid rank requested : %d/%d", myrank, nranks);
+    return ncclInvalidArgument;
+  }
+
+  if (!ncclAsyncMode()) {
+    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
+  }
+
+  int cudaDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
+}
+// Wires up the ring transports for all nranks communicators in comms: gathers per-rank peer info, derives the rings, then sets up and connects each channel's recv/send connectors.
+static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
+  struct ncclPeerInfo* allInfo;
+  NCCLCHECK(ncclCalloc(&allInfo, nranks));
+  for (int rank=0; rank<nranks; rank++) {
+    CUDACHECK(hipSetDevice(devs[rank]));  // fillInfo runs with this rank's device selected
+    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
+  }
+  // connectTransport/connectValue hold nranks*nranks entries; fillConnect is handed row `rank` of each.
+  int* connectTransport;
+  ncclTvalue_t* connectValue;
+  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
+  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+  for (int rank=0; rank<nranks; rank++)
+    NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
+  // Scratch tables of nranks*MAXCHANNELS ints; prevFinal/nextFinal are the copies later passed to buildRings.
+  int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
+  int nrings = MAXCHANNELS;  // lowered to the smallest ring count any rank reports
+  int nthreads=0;  // raised to the largest thread count any rank reports
+  int myCompCap = ncclCudaCompCap();
+  int minCompCap = myCompCap;
+  for (int rank=0; rank<nranks; rank++) {
+    CUDACHECK(hipSetDevice(devs[rank]));
+    int nringsRank;
+    int nthreadsRank = getDefaultThreads();
+    myCompCap = ncclCudaCompCap();  // re-queried after switching to this rank's device
+    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
+    nrings = std::min(nrings, nringsRank);
+    nthreads = std::max(nthreads, nthreadsRank);
+    minCompCap = std::min(minCompCap, myCompCap);
+    for (int ring=0; ring<nrings; ring++) {  // nrings is the running minimum at this point
+      int index = ring*nranks+rank;
+      prevFinal[index] = prev[index];
+      nextFinal[index] = next[index];
+    }
+  }
+  free(connectTransport);
+  free(connectValue);
+  free(prev);
+  free(next);
+  // NOTE(review): if NCCLCHECK/CUDACHECK return on failure (macro bodies are not visible here), the buffers allocated in this function are not released on those paths - confirm.
+  INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);
+  // rings receives the output of buildRings; it is indexed below as rings + r*nranks for ring r.
+  int* rings;
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
+  NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
+  free(prevFinal);
+  free(nextFinal);
+
+  // Determine thread threshold across all GPUs
+  int threadThreshold = ncclThreadThreshold(minCompCap, 0);
+  // Give every communicator the same channel count, thread count and threshold.
+  for (int rank=0; rank<nranks; rank++) {
+    comms[rank]->nChannels = nrings;
+    comms[rank]->nThreads = nthreads;
+    comms[rank]->threadThreshold = threadThreshold;
+  }
+  // Two ncclConnect slots per rank: slot 2*rank+0 is passed to the recv setup, slot 2*rank+1 to the send setup.
+  struct ncclConnect* connect;
+  NCCLCHECK(ncclCalloc(&connect, 2*nranks));
+  for (int r=0; r<nrings; r++) {
+    int* ringRanks = rings+r*nranks;
+    for (int rank=0; rank<nranks; rank++) {  // pass 1: set up the channel and select a transport for both directions on every rank
+      CUDACHECK(hipSetDevice(devs[rank]));
+      struct ncclChannel* channel = comms[rank]->channels+r;
+      struct ncclRing *ring = &channel->ring;
+      NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
+      // Make sure we don't use trees, we cannot use them with initAll
+      comms[rank]->treeThreshold = 0;
+      int prev = channel->ring.prev = ring->userRanks[nranks-1];  // last userRanks entry is used as the ring predecessor
+      int next = channel->ring.next = ring->userRanks[1];  // entry 1 is used as the ring successor
+      struct ncclConnector* recv = &channel->peers[prev].recv;
+      struct ncclConnector* send = &channel->peers[next].send;
+      NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
+      NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
+    }
+    for (int rank=0; rank<nranks; rank++) {  // pass 2: connect, once every rank has filled its two slots
+      CUDACHECK(hipSetDevice(devs[rank]));
+      struct ncclChannel* channel = comms[rank]->channels+r;
+      struct ncclRing *ring = &channel->ring;
+      struct ncclConnector* recv = &channel->peers[ring->prev].recv;
+      struct ncclConnector* send = &channel->peers[ring->next].send;
+      NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));  // uses the slot prev filled during its send setup
+      NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));  // uses the slot next filled during its recv setup
+    }
+  }
+  free(connect);
+  free(allInfo);
+  free(rings);
+  free(treeIn);
+  free(treeOut);
+  return ncclSuccess;
+}
+
+// Creates ndev communicators in this process, one per device in devlist (devices 0..ndev-1 when devlist is NULL). On failure every communicator created so far is freed and its comms[] slot is reset to NULL.
+NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
+ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+  NCCLCHECK(ncclInit());
+  NCCLCHECK(wrapNvmlSymbols());
+  NCCLCHECK(wrapNvmlInit());
+  showVersion();
+
+  INFO(NCCL_INIT,"nranks %d", ndev);
+
+  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
+  if (ndev < 1) {
+    WARN("Invalid device count requested : %d", ndev);
+    return ncclInvalidArgument;
+  }
+  // NOTE(review): the early exits above and the ncclCalloc check below leave without calling wrapNvmlShutdown - confirm whether the NVML init needs balancing there.
+  ncclResult_t res;
+  int savedDevice = -1;  // -1 = current device not captured yet, so `final` must not restore it
+  int rank, cudaDev;
+  ncclComm_t comm = NULL;
+  int* ncclDevList = NULL;
+  NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
+  for (int i=0; i<ndev; i++) {
+    ncclDevList[i] = devlist ? devlist[i] : i;
+  }
+  // Everything `cleanup`/`final` read (comms[], affinitySave) must be valid before the first goto can fire.
+  for(rank=0; rank<ndev; ++rank)
+    comms[rank] = NULL;
+
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+  CUDACHECKGOTO(hipGetDevice(&savedDevice), res, cleanup);
+
+  for (rank=0; rank<ndev; ++rank) {
+    cudaDev = ncclDevList[rank];
+    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
+
+    NCCLCHECKGOTO(setCpuAffinity(cudaDev), res, cleanup);  // goto variant: a failure must still free earlier comms and restore state
+
+    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
+    comms[rank] = comm;
+
+    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
+  }
+
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
+
+  for(rank=0; rank<ndev; ++rank) {
+    cudaDev = ncclDevList[rank];
+    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
+    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
+  }
+
+  res = ncclSuccess;
+  goto final;
+
+cleanup:
+  for(rank=0; rank<ndev; ++rank) {
+    if(comms[rank] != NULL) {
+      commFree(comms[rank]);
+      comms[rank] = NULL;  // do not hand a dangling handle back to the caller
+    }
+  }
+final:
+  free(ncclDevList);
+  if(wrapNvmlShutdown() != ncclSuccess)
+    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
+  if (savedDevice != -1) hipSetDevice(savedDevice);
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  return res;
+}
+
+// Tears down one communicator: switches to its device if needed, synchronizes groupStream, destroys the transport proxy, frees the comm, then switches back to the caller's device.
+static ncclResult_t commDestroy(ncclComm_t comm) {
+  int savedDevice;
+#ifdef ENABLE_TRACE
+  int rank = comm->rank;  // cached up front so the final TRACE does not read comm after commFree
+#endif
+  CUDACHECK(hipGetDevice(&savedDevice));
+  int commDevice = comm->cudaDev;
+  // Run the teardown with the communicator's own device selected.
+  if (savedDevice != commDevice) {
+    CUDACHECK(hipSetDevice(commDevice));
+  }
+
+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);
+  // Synchronize groupStream before the proxy and the comm itself are released.
+  CUDACHECK(hipStreamSynchronize(comm->groupStream));
+  NCCLCHECK(transportDestroyProxy(comm));
+  NCCLCHECK(commFree(comm));
+  // NOTE(review): if any check above returns early (macro bodies not visible here), this device restore is skipped - confirm that is acceptable.
+  if (savedDevice != commDevice)
+    CUDACHECK(hipSetDevice(savedDevice));
+
+  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+
+  return ncclSuccess;
+}
+// Public destroy call: accepts NULL as a no-op, rejects a handle whose fields mark it as already destroyed, otherwise forwards to commDestroy.
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  // Nothing to tear down for a NULL handle.
+  if (!comm) return ncclSuccess;
+
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
+  // Double-destroy guard (user error): the handle is only usable when none of these fields holds its "destroyed" value.
+  const bool usable = comm->rank != -1 && comm->nRanks > 0 && comm->cudaDev != -1 && comm->nvmlDev != -1;
+  if (!usable) {
+    WARN("comm %p has already been destroyed", comm);
+    return ncclInvalidArgument;
+  }
+
+  return commDestroy(comm);
+}
+// Public abort call: raises the communicator's abort flag and returns; the communicator itself is not freed here.
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  if (comm == NULL)  // a NULL handle is accepted and reported as success
+    return ncclSuccess;
+
+  // Ask anything that might still be running on the device to quit.
+  STORE(comm->abortFlag, 1);
+
+  // Do not destroy the comm here because a kernel may still be running.
+  // return commDestroy(comm);
+  return ncclSuccess;
+}
+// Maps an ncclResult_t code to a constant, human-readable description.
+NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
+const char* ncclGetErrorString(ncclResult_t code) {
+  // Each known code has a fixed message, checked in turn.
+  if (code == ncclSuccess)            return "no error";
+  if (code == ncclUnhandledCudaError) return "unhandled cuda error";
+  if (code == ncclSystemError)        return "unhandled system error";
+  if (code == ncclInternalError)      return "internal error";
+  if (code == ncclInvalidArgument)    return "invalid argument";
+  if (code == ncclInvalidUsage)       return "invalid usage";
+  // Any other value gets the generic text.
+  return "unknown result code";
+}
+
+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+  NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+  // Check device reported error
+  static ncclDevError_t printedDevErr = ncclDevSuccess;
+  switch(LOAD(comm->fatalDevError)) {
+    case ncclDevSuccess :
+      break;
+    case ncclDevAssertedMismatch :
+      if (printedDevErr != ncclDevAssertedMismatch) {
+        WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+        printedDevErr = ncclDevAssertedMismatch;
+      }
+      if (comm->fatalError == ncclSuccess) {
+        comm->fatalError = ncclInvalidUsage;
+      }
+      break;
+    case ncclDevSuspectedMismatch :
+      if (printedDevErr != ncclDevSuspectedMismatch) {
+        WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+        printedDevErr = ncclDevSuspectedMismatch;
+      }
+      break;
+    default:
+      WARN("Unknown device error %d", *comm->fatalDevError);
+      return ncclInternalError;
+  }
+  *asyncError = comm->fatalError;
+  return ncclSuccess;
+}
+// Writes the communicator's rank count (comm->nRanks) to *count; both pointers are validated with PtrCheck first.
+NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
+  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
+  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
+  *count = comm->nRanks;
+  return ncclSuccess;
+}
+// Writes the device index stored in the communicator (comm->cudaDev) to *devid; both pointers are validated with PtrCheck first.
+NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
+  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
+  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
+  *devid = comm->cudaDev;
+  return ncclSuccess;
+}
+// Writes this communicator's rank (comm->rank) to *rank; both pointers are validated with PtrCheck first.
+NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
+  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
+  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
+  *rank = comm->rank;
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu
deleted file mode 100644
index 95f70cde2b..0000000000
--- a/projects/rccl/src/init.cu
+++ /dev/null
@@ -1,970 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "ring.h"
-#include "param.h"
-#include "nvmlwrap.h"
-#include "rings.h"
-#include "bootstrap.h"
-#include "transport.h"
-#include "common_coll.h"
-#include "group.h"
-#include "utils.h"
-#include "net.h"
-#include "topo.h"
-#include <numa.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <hip/hip_runtime_api.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-#include <dlfcn.h>
-
-#define STR2(v) #v
-#define STR(v) STR2(v)
-
-int ncclDebugLevel;
-uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
-pthread_mutex_t ncclDebugOutputLock;
-FILE *ncclDebugFile = stdout;
-
-#ifdef ENABLE_TRACE
-std::chrono::high_resolution_clock::time_point ncclEpoch;
-#endif
-
-#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
-#else
-#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
-#endif
-
-NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
-
-NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-
-ncclNet_t* ncclNet = NULL;
-
-// We define this as weak to let tests redefine their own
-#pragma weak ncclCudaCompCap
-int ncclCudaCompCap() {
-  int cudaDev;
-  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
-  int ccMajor;
-  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
-  return ccMajor;
-}
-int ncclCudaFullCompCap() {
-  int cudaDev;
-  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
-  int ccMajor, ccMinor;
-  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
-  if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
-  return ccMajor*10+ccMinor;
-}
-
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
-  int ndev;
-  if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
-  if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
-  if (ndev <= 0) {
-    INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net) {
-  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
-  if (netPluginLib == NULL) {
-    // dlopen does not guarantee to set errno, but dlerror only gives us a
-    // string, so checking errno doesn't hurt to try to provide a better
-    // error message
-    if (errno == ENOENT) {
-      INFO(NCCL_INIT|NCCL_NET, "No network plugin found.");
-    } else {
-      INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror());
-    }
-    return ncclSuccess;
-  }
-  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
-  if (extNet == NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
-    goto cleanup;
-  }
-  if (initNet(extNet) == ncclSuccess) {
-    *net = extNet;
-    return ncclSuccess;
-  }
-cleanup:
-  if (netPluginLib != NULL) dlclose(netPluginLib);
-  return ncclSuccess;
-}
-
-ncclResult_t initNet() {
-  // Always initialize sockets as we use it for bootstrap
-  NCCLCHECK(initNet(&ncclNetSocket));
-
-  NCCLCHECK(initNetPlugin(&ncclNet));
-  if (ncclNet != NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName());
-    return ncclSuccess;
-  }
-  if (initNet(&ncclNetIb) == ncclSuccess) {
-    ncclNet = &ncclNetIb;
-  } else {
-    ncclNet = &ncclNetSocket;
-  }
-  INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName());
-  return ncclSuccess;
-}
-
-NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
-NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
-
-int ncclThreadThreshold(int minCompCap, int multiNode) {
-  int threshold = ncclParamThreadThreshold();
-  if (threshold == -2) { // user has not set this env variable
-    threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
-    // multiply by 2 if running on multiple nodes
-    if (multiNode) {
-      threshold *= 2;
-    }
-  }
-  return threshold;
-}
-
-bool useFineGrainVramPcie = false;
-
-void parseHsaForceFineGrainVramPcie() {
-  char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
-  if (str && strlen(str) > 0) {
-    errno = 0;
-    int64_t v = strtoll(str, NULL, 0);
-    if (errno || (v != 0 && v != 1)) {
-      INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \
-    } else {
-      useFineGrainVramPcie = v;
-      INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);  \
-    }
-  }
-}
-
-pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
-static bool initialized = false;
-static ncclResult_t ncclInit() {
-  if (initialized) return ncclSuccess;
-  pthread_mutex_lock(&initLock);
-  if (!initialized) {
-    initEnv();
-    initDebug();
-    initNet();
-    // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
-    parseHsaForceFineGrainVramPcie();
-    initialized = true;
-  }
-  pthread_mutex_unlock(&initLock);
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetVersion, int* version);
-ncclResult_t ncclGetVersion(int* version) {
-  if (version == NULL) return ncclInvalidArgument;
-  *version = NCCL_VERSION_CODE;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
-ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
-  NCCLCHECK(ncclInit());
-  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
-  return bootstrapGetUniqueId(out);
-}
-
-static ncclResult_t commFree(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
-
-  CUDACHECK(hipFree(comm->devComm));
-
-  for (int ring=0; ring<comm->nRings; ring++)
-    NCCLCHECK(freeRing(comm->rings+ring));
-
-  if (comm->doneEvent != NULL)
-    CUDACHECK(hipEventDestroy(comm->doneEvent));
-
-  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(hipStreamDestroy(comm->groupStream));
-  }
-
-  // Last rank frees shared resources between threads
-  int isLast;
-  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-  if (isLast) {
-    free(comm->intraBarrier);
-    free(comm->intraParams);
-    free(comm->intraCudaDevs);
-    free(comm->intraCGMode);
-    free(comm->intraCC);
-  }
-
-  free(comm);
-  return ncclSuccess;
-}
-
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
-  if (ndev < 1) {
-    WARN("invalid device count (%d) requested", ndev);
-    return ncclInvalidArgument;
-  }
-  if (rank >= ndev || rank < 0) {
-    WARN("rank %d exceeds ndev=%d", rank, ndev);
-    return ncclInvalidArgument;
-  }
-
-  // Try to create a CUDA object right away. If there is something wrong with
-  // the device we're on (failure cause #1) , better know it early.
-  hipEvent_t doneEvent;
-  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
-
-  struct ncclComm* comm;
-  NCCLCHECK(ncclCalloc(&comm, 1));
-
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
-  comm->rank = rank;
-  comm->nRanks = ndev;
-  hipGetDevice(&comm->cudaDev);
-  comm->doneEvent = doneEvent;
-  comm->llThreshold = ncclParamLlThreshold();
-  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9200 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  comm->groupCudaStream = ncclParamGroupCudaStream();
-#else
-  // Don't allow the user to overload the default setting in older CUDA builds
-  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
-#endif
-
-  comm->argsptr = &comm->args;
-
-  *comret = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t devCommSetup(ncclComm_t comm) {
-  // Fully duplicate the comm on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
-  // Copy the comm on the device
-  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
-  // Copy userRanks
-  for (int r=0; r<comm->nRings; r++) {
-    NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
-// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
-#else
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
-#endif
-static void showVersion() {
-  static int shown = 0;
-  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
-    printf("%s\n", VERSION_STRING);
-    fflush(stdout);
-    if (ncclDebugFile != stdout)
-      INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
-    shown = 1;
-  }
-}
-
-static ncclResult_t fillInfo(struct ncclInfo* info, int rank, uint64_t commHash) {
-  for (int t=0; t<NTRANSPORTS; t++) {
-    NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank, commHash));
-  }
-  return ncclSuccess;
-}
-
-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice);
-
-template <int type>
-static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
-  for (int t=0; t<NTRANSPORTS; t++) {
-    struct ncclTransport *transport = ncclTransports+t;
-    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
-    ncclTvalue_t ret = 0;
-    NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
-    if (ret > 0) {
-      cpu_set_t affinitySave;
-      nvmlDevice_t nvmlDevice;
-      int cudaDev;
-      CUDACHECK(hipGetDevice(&cudaDev));
-      sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      SetCpuAffinity(cudaDev, &nvmlDevice);
-      NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
-      *transportRet = transport;
-      sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-      return ncclSuccess;
-    }
-  }
-  WARN("No transport found !");
-  *transportRet = NULL;
-  return ncclInternalError;
-}
-
-static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
-  NCCLCHECK(initRing(comm, ringid));
-
-  struct ncclRing* ring = comm->rings+ringid;
-  // Reorganize ranks to start with rank.
-  int shift;
-  for (shift = 0; shift<nranks; shift++) {
-    if (ringRanks[shift] == rank) {
-      break;
-    }
-  }
-  for (int i=0; i<nranks; i++) {
-    ring->userRanks[i] = ringRanks[(i+shift)%nranks];
-  }
-  int prev = ring->userRanks[nranks-1];
-  int next = ring->userRanks[1];
-
-  NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
-  NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
-  NCCLCHECK(transportCreateProxy(0, ring, comm));
-  NCCLCHECK(transportCreateProxy(1, ring, comm));
-  return ncclSuccess;
-}
-
-static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
-  for (int r=0; r<nranks; r++) {
-    connectTransport[r] = -1;
-    for (int t=0; t<NTRANSPORTS; t++) {
-      NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
-      if (connectValue[r] > 0) {
-        connectTransport[r] = t;
-        break;
-      }
-    }
-  }
-  return ncclSuccess;
-}
-
-static void swap(void* mem1, void* mem2, int size) {
-  char tmp[size];
-  memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
-}
-
-#define MAXWIDTH 64
-#define PREFIXLEN 15
-#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
-void dumpMatrix(int* connectMatrix, int nranks) {
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
-  INFO(NCCL_INIT,"%s", line);
-  for (int i=0; i<nranks; i++) {
-    memset(line, ' ', STRLENGTH);
-    sprintf(line, "%3d ", i);
-    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
-    INFO(NCCL_INIT,"%s", line);
-  }
-}
-
-void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
-  INFO(NCCL_INIT,"%s", line);
-  for (int i=0; i<nranks; i++) {
-    memset(line, ' ', STRLENGTH);
-    sprintf(line, "%3d ", i);
-    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
-    INFO(NCCL_INIT,"%s", line);
-  }
-}
-
-
-void dumpLine(int* values, int nranks, const char* prefix) {
-  int prefixlen = strlen(prefix);
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  strncpy(line, prefix, PREFIXLEN);
-  for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
-  INFO(NCCL_INIT,"%s", line);
-}
-
-static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
-  for (int r=0; r<nrings; r++) {
-    char prefix[30];
-    /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
-    dumpLine(prev+r*nranks, nranks, prefix);
-    sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
-    dumpLine(next+r*nranks, nranks, prefix);*/
-
-    int current = rank;
-    for (int i=0; i<nranks; i++) {
-      rings[r*nranks+i] = current;
-      current = next[r*nranks+current];
-    }
-    sprintf(prefix, "Ring %02d : ", r);
-    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
-    if (current != rank) {
-      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
-      return ncclInternalError;
-    }
-    // Check that all ranks are there
-    for (int i=0; i<nranks; i++) {
-      int found = 0;
-      for (int j=0; j<nranks; j++) {
-        if (rings[r*nranks+j] == i) {
-          found = 1;
-          break;
-        }
-      }
-      if (found == 0) {
-        WARN("Error : ring %d does not contain rank %d", r, i);
-        return ncclInternalError;
-      }
-    }
-  }
-  return ncclSuccess;
-}
-
-void* waitForNonNullPtr(void* p) {
-  volatile void** ptr = (volatile void**) p;
-  while (LOAD(ptr) == NULL) sched_yield();
-  return (void*)LOAD(ptr);
-}
-
-ncclResult_t initParams(struct ncclComm* comm) {
-  hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
-  params->args = (void **)&comm->argsptr;
-  params->stream = NULL;
-  params->sharedMem = 0;
-  params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
-  params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
-  return ncclSuccess;
-}
-
-// Allocate/Set Intra Process Structures and set CG options
-ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
-  comm->intraRank = rank;
-  comm->intraRanks = ranks;
-  comm->intraPhase = 0;
-
-  // Alloc shared structures
-  if (rank == 0) {
-    assert(comm == comm0);
-    int* bar;
-    NCCLCHECK(ncclCalloc(&bar, 2));
-    bar[0] = bar[1] = 0;
-    comm->intraBarrier = bar;
-    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
-    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
-    int* CGMode;
-    NCCLCHECK(ncclCalloc(&CGMode, 1));
-    *CGMode = 0x11;
-    comm->intraCGMode = CGMode;
-    int* CC;
-    NCCLCHECK(ncclCalloc(&CC, 1));
-    *CC = ncclCudaFullCompCap();
-    comm->intraCC = CC;
-  } else {
-    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
-    comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
-    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
-    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
-    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
-  }
-  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
-  NCCLCHECK(initParams(comm));
-
-  int cgMdLaunch = 1;
-
-  // Set CG Mode
-  comm->launchMode = ncclComm::GROUP;
-  char* str = getenv("NCCL_LAUNCH_MODE");
-  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
-    comm->launchMode = ncclComm::PARALLEL;
-  }
-  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
-#if CUDART_VERSION >= 9000
-    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
-      // Check whether the GPU supports Cooperative Group Multi Device Launch
-      (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
-    }
-#endif
-  }
-
-  // Disable cgMdLaunch if any rank does not support it
-  if (cgMdLaunch == 0) {
-    *comm->intraCGMode = 0x10;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
-  int rank = comm->rank;
-  int nranks = comm->nRanks;
-  void* commState;
-  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
-  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
-
-  struct ncclInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  NCCLCHECK(fillInfo(allInfo+rank, rank, commHash));
-  NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
-
-  int* connectTransport;
-  ncclTvalue_t* connectValue;
-  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
-  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
-
-  NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-  NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
-  NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
-  //if (rank == 0) dumpMatrix(connectTransport, nranks);
-  //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
-
-  // Get my rings
-  int nrings;
-  int* prev, *next;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
-  comm->nThreads = getDefaultThreads();
-  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
-  free(connectTransport);
-  free(connectValue);
-
-  // Find max nThreads
-  int allData[nranks];
-  allData[rank] = comm->nThreads;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    comm->nThreads = std::max(allData[i], comm->nThreads);
-  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
-
-  // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = ncclCudaCompCap();
-  int minCompCap = myCompCap;
-  allData[rank] = myCompCap;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    minCompCap = std::min(allData[i], minCompCap);
-  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
-
-  // Find min nrings across ranks
-  allData[rank] = nrings;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    nrings = std::min(allData[i], nrings);
-
-  // Exchange data with others to build complete rings
-  comm->nRings = nrings;
-  for (int r=0; r<nrings; r++) {
-    NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
-    NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
-  }
-  int *rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
-  NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
-  free(prev);
-  free(next);
-
-  // Connect with prev/next for each ring
-  struct ncclConnect *connectData;
-  NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
-  for (int r=0; r<nrings; r++) {
-    int* ringRanks = rings+r*nranks;
-    struct ncclRing *ring = comm->rings+r;
-    NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
-    int prev_offset = ring->userRanks[nranks-1]*2+1;
-    int next_offset = ring->userRanks[1]*2;
-    NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
-    NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
-    NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
-  }
-  free(connectData);
-  free(rings);
-  free(allInfo);
-
-  // Intra-process barrier setup
-  struct rankInfo {
-    uint64_t hostHash;
-    uint64_t pidHash;
-    struct ncclComm* comm;
-  } rankInfos[nranks];
-  rankInfos[rank].hostHash = getHostHash();
-  rankInfos[rank].pidHash = getPidHash();
-  rankInfos[rank].comm = comm;
-  NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
-
-  // Compute intra ranks
-  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-  int multiNode = 0;
-  for (int r=0; r<nranks; r++) {
-    if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
-        (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
-      if (intraRanks == 0) intraRank0 = r;
-      if (r == rank) intraRank = intraRanks;
-      intraRanks++;
-    } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
-      multiNode = 1;
-    }
-  }
-  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-      rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
-  if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
-    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
-    return ncclInternalError;
-  }
-  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
-
-  // Determine thread threshold across all GPUs
-  comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
-
-  // Barrier
-  bootstrapClose(commState);
-  return ncclSuccess;
-}
-
-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-  if (numa_available() < 0) {
-    WARN("System does not support NUMA API!");
-    return false;
-  }
-  char* cudaPath;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
-  strcat(cudaPath, "/numa_node");
-  int fd;
-  SYSCHECKVAL(open(cudaPath, O_RDONLY), "open", fd);
-  char numa_node[5];
-  int len;
-  SYSCHECKVAL(read(fd, numa_node, 4), "read", len);
-  SYSCHECK(close(fd), "close");
-  errno = 0;
-  long node = strtol(numa_node, NULL, 10);
-  if (errno == ERANGE || errno == EINVAL) {
-    INFO(NCCL_ALL,"%s: Call to strtol returned %s", __func__, strerror(errno));
-    free(cudaPath);
-    return false;
-  }
-  numa_run_on_node(node);
-  numa_set_preferred(node);
-  free(cudaPath);
-  return true;
-#else
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  if (hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != hipSuccess) return false;
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
-  if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
-    WARN("Failed to set CPU affinity");
-    return false;
-  }
-  return true;
-#endif
-}
-
-ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-
-  // Make sure all host memory allocation are close to the GPU
-  int cudaDev;
-  nvmlDevice_t nvmlDevice;
-  CUDACHECK(hipGetDevice(&cudaDev));
-  SetCpuAffinity(cudaDev, &nvmlDevice);
-  ncclResult_t res;
-
-  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
-  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
-  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
-
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
-
-  return ncclSuccess;
-cleanup:
-  *newcomm = NULL;
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
-ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
-  char* env = getenv("NCCL_COMM_ID");
-  if (env && myrank == 0) {
-    NCCLCHECK(bootstrapCreateRoot(&commId, true));
-  }
-
-  NCCLCHECK(ncclInit());
-  if (myrank == 0) showVersion();
-
-  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
-
-  // Make sure the CUDA runtime is initialized.
-  CUDACHECK(hipFree(NULL));
-
-  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
-  if (nranks < 1 || myrank < 0 || myrank >= nranks) {
-    WARN("Invalid rank requested : %d/%d", myrank, nranks);
-    return ncclInvalidArgument;
-  }
-
-  if (ncclAsyncMode()) {
-    int cudaDev;
-    CUDACHECK(hipGetDevice(&cudaDev));
-    return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
-  } else {
-    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
-  }
-}
-
-static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
-  struct ncclInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(hipSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
-  }
-
-  int* connectTransport;
-  ncclTvalue_t* connectValue;
-  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
-  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
-  for (int rank=0; rank<nranks; rank++)
-    NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-
-  int* prev, *prevFinal, *next, *nextFinal;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
-  int nrings = MAXRINGS;
-  int nthreads=0;
-  int myCompCap = ncclCudaCompCap();
-  int minCompCap = myCompCap;
-  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(hipSetDevice(devs[rank]));
-    int nringsRank;
-    int nthreadsRank = getDefaultThreads();
-    myCompCap = ncclCudaCompCap();
-    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
-    nrings = std::min(nrings, nringsRank);
-    nthreads = std::max(nthreads, nthreadsRank);
-    minCompCap = std::min(minCompCap, myCompCap);
-    for (int ring=0; ring<nrings; ring++) {
-      int index = ring*nranks+rank;
-      prevFinal[index] = prev[index];
-      nextFinal[index] = next[index];
-    }
-  }
-  free(connectTransport);
-  free(connectValue);
-  free(prev);
-  free(next);
-
-  INFO(NCCL_INIT,"Using %d threads", nthreads);
-  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
-
-  int* rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
-  NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
-  free(prevFinal);
-  free(nextFinal);
-
-  // Determine thread threshold across all GPUs
-  int threadThreshold = ncclThreadThreshold(minCompCap, 0);
-
-  for (int rank=0; rank<nranks; rank++) {
-    comms[rank]->nRings = nrings;
-    comms[rank]->nThreads = nthreads;
-    comms[rank]->threadThreshold = threadThreshold;
-  }
-
-  for (int r=0; r<nrings; r++) {
-    struct ncclConnect connect[2*nranks];
-    int* ringRanks = rings+r*nranks;
-    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(hipSetDevice(devs[rank]));
-      NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
-    }
-    // RingExchange connect information
-    for (int rank=0; rank<nranks; rank++) {
-      // Swap rank->prev and prevRank->next
-      struct ncclRing *ring = comms[rank]->rings+r;
-      int prevRank = ring->userRanks[nranks-1];
-      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
-      struct ncclConnect* rankPrevConnect = connect+2*rank;
-      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
-    }
-    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(hipSetDevice(devs[rank]));
-      struct ncclRing *ring = comms[rank]->rings+r;
-      NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
-      NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
-    }
-  }
-  free(rings);
-  free(allInfo);
-  return ncclSuccess;
-}
-
-
-NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
-ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
-  NCCLCHECK(ncclInit());
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-  showVersion();
-
-  INFO(NCCL_INIT,"nranks %d", ndev);
-
-  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
-  if (ndev < 1) {
-    WARN("Invalid device count requested : %d", ndev);
-    return ncclInvalidArgument;
-  }
-
-  ncclResult_t res;
-  int savedDevice;
-  int rank, cudaDev;
-  ncclComm_t comm = NULL;
-  nvmlDevice_t nvmlDevice;
-  int ncclDevList[ndev];
-  for (int i=0; i<ndev; i++) {
-    ncclDevList[i] = devlist ? devlist[i] : i;
-  }
-
-  hipGetDevice(&savedDevice);
-
-  for(rank=0; rank<ndev; ++rank)
-    comms[rank] = NULL;
-
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  for (rank=0; rank<ndev; ++rank) {
-    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
-
-    SetCpuAffinity(cudaDev, &nvmlDevice);
-
-    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
-    comms[rank] = comm;
-
-    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
-  }
-
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
-
-  for(rank=0; rank<ndev; ++rank) {
-    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
-    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
-  }
-
-  res = ncclSuccess;
-  goto final;
-
-cleanup:
-  for(rank=0; rank<ndev; ++rank) {
-    if(comms[rank] != NULL) {
-      commFree(comms[rank]);
-    }
-  }
-
-final:
-  if(wrapNvmlShutdown() != ncclSuccess)
-    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
-  hipSetDevice(savedDevice);
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
-
-  if (comm == NULL)
-    return ncclSuccess;
-  int savedDevice;
-  CUDACHECK(hipGetDevice(&savedDevice));
-  int commDevice = comm->cudaDev;
-
-  if (savedDevice != commDevice) {
-    CUDACHECK(hipSetDevice(commDevice));
-  }
-
-  NCCLCHECK(commFree(comm));
-
-  if (savedDevice != commDevice)
-    CUDACHECK(hipSetDevice(savedDevice));
-
-  return ncclSuccess;
-}
-
-NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
-const char* ncclGetErrorString(ncclResult_t code) {
-  switch (code) {
-    case ncclSuccess                : return "no error";
-    case ncclUnhandledCudaError     : return "unhandled cuda error";
-    case ncclSystemError            : return "unhandled system error";
-    case ncclInternalError          : return "internal error";
-    case ncclInvalidArgument        : return "invalid argument";
-    case ncclInvalidUsage           : return "invalid usage";
-    default                         : return "unknown result code";
-  }
-}
-
-NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
-  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
-  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
-  *count = comm->nRanks;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
-  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
-  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
-  *devid = comm->cudaDev;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
-  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
-  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
-  *rank = comm->rank;
-  return ncclSuccess;
-}
diff --git a/projects/rccl/src/misc/argcheck.cc b/projects/rccl/src/misc/argcheck.cc
new file mode 100644
index 0000000000..b906a68f5c
--- /dev/null
+++ b/projects/rccl/src/misc/argcheck.cc
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "argcheck.h"
+
+// Checks that `pointer` is known to the HIP runtime and, if it is device memory, lives on comm->cudaDev.
+static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+  hipPointerAttribute_t attr;
+  if (hipPointerGetAttributes(&attr, pointer) != hipSuccess || attr.devicePointer == NULL) {
+    WARN("%s : %s is not a valid pointer", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 10000
+  const bool onDevice = (attr.type == hipMemoryTypeDevice);
+#else
+  const bool onDevice = (attr.memoryType == hipMemoryTypeDevice);
+#endif
+  // Only device allocations are compared against the communicator's GPU; other memory types pass.
+  if (!onDevice || attr.device == comm->cudaDev) return ncclSuccess;
+  WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+  return ncclInvalidArgument;
+}
+
+// Rejects a NULL `ptr` with ncclInvalidArgument.
+// `opname` and `ptrname` are only used to label the warning message.
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+  if (ptr != NULL) return ncclSuccess;
+  WARN("%s : %s argument is NULL", opname, ptrname);
+  return ncclInvalidArgument;
+}
+
+// Validates root, datatype, reduction op and (when comm->checkPointers is set) the device buffers of
+// info; also fills in info->nBytes and rewrites AllGather/Broadcast as byte-wise (ncclInt8) operations.
+ncclResult_t ArgsCheck(struct ncclInfo* info) {
+  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
+  // First, the easy ones. Valid roots are 0..nRanks-1, so the message reports nRanks-1 as the bound.
+  if (info->root < 0 || info->root >= info->comm->nRanks) {
+    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks-1);
+    return ncclInvalidArgument;
+  }
+  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
+    WARN("%s : invalid type %d", info->opName, info->datatype);
+    return ncclInvalidArgument;
+  }
+  // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
+  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+    info->count = info->nBytes;
+    info->datatype = ncclInt8;
+  }
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+  if (info->op < 0 || info->op >= ncclNumOps) {
+    WARN("%s : invalid reduction operation %d", info->opName, info->op);
+    return ncclInvalidArgument;
+  }
+  if (info->comm->checkPointers) {
+    // Check CUDA device pointers: sendbuff is skipped on non-root Broadcast ranks, recvbuff on non-root Reduce ranks.
+    if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+    }
+    if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+    }
+  }
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/misc/enqueue.cu b/projects/rccl/src/misc/enqueue.cu
deleted file mode 100644
index eb56de55ae..0000000000
--- a/projects/rccl/src/misc/enqueue.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include <hip/hip_runtime.h>
-
-#include "enqueue.h"
-#include "common_coll.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
-
-#define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_KERN_NAME(coll, op, dtype), \
-  NCCL_KERN_NAME(coll##LL, op, dtype)
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  u8), \
-  NCCL_FUNC4(coll, op, i32), \
-  NCCL_FUNC4(coll, op, u32), \
-  NCCL_FUNC4(coll, op, i64), \
-  NCCL_FUNC4(coll, op, u64), \
-  NCCL_FUNC4(coll, op, f16), \
-  NCCL_FUNC4(coll, op, f32), \
-  NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8)
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
-
-typedef void(*ncclKern_t)(struct ncclColl);
-// Must be consistent with the ncclFuncSet enum
-static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
-};
-
-ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
-  if (cgMode & 0x01) {
-    CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices, 0));
-    return ncclSuccess;
-  }
-  int savedDev;
-  CUDACHECK(hipGetDevice(&savedDev));
-  for (int i = 0; i < numDevices; i++) {
-    hipLaunchParams* params = paramsList+i;
-    CUDACHECK(hipSetDevice(cudaDevs[i]));
-    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
-  }
-  CUDACHECK(hipSetDevice(savedDev));
-  return ncclSuccess;
-}
-
-ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
-  params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
-
-  // Set active = 2 for the last operation
-  for (int r=0; r<params->gridDim.x; r++) {
-    struct ncclRing* ring = comm->rings+r;
-    STORE(&ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active, 2);
-  }
-
-  // Find the first operation, choose the kernel accordingly and pass it
-  // as the first argument.
-  struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
-  memcpy(&comm->args, coll, sizeof(struct ncclColl));
-  // As we pass that coll directly, we can free it immediately.
-  STORE(&coll->active, 0);
-
-  params->func = (void *)ncclKerns[coll->funcIndex];
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = LOAD(ptr);
-  bool done = false;
-  while (done == false) {
-    if (val >= comm->intraRanks) {
-      WARN("Trying to launch too many collectives");
-      return ncclInvalidUsage;
-    }
-    if (val+1 == comm->intraRanks) {
-      // Reset the barrier.
-      comm->intraBarrier[comm->intraPhase^1] = 0;
-      *isLast = 1;
-      return ncclSuccess;
-    }
-    done = __sync_bool_compare_and_swap(ptr, val, val+1);
-    val++;
-  }
-  *isLast = 0;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = LOAD(ptr);
-  if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
-    WARN("Trying to launch too many collectives");
-    return ncclInternalError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  while (LOAD(ptr) < comm->intraRanks) pthread_yield();
-  comm->intraPhase ^= 1;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
-  if (comm->nRanks == 1) return ncclSuccess;
-  hipLaunchParams* params = comm->myParams;
-
-  NCCLCHECK(setupLaunch(comm, params));
-
-  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
-  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
-    // Enqueue event in user stream
-    CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
-    // Create dependency between user stream and internal NCCL stream
-    CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
-    params->stream = comm->groupStream;
-  } else {
-    if (comm->userStream != params->stream) {
-      // Stream changed from last call, create dependency against last NCCL kernel launch
-      CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
-    }
-    params->stream = comm->userStream;
-  }
-
-  int isLast = 0;
-  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-
-  if (isLast) {
-    if (comm->launchMode == ncclComm::GROUP) {
-      // I'm the last. Launch all operations.
-      NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
-    }
-    NCCLCHECK(ncclCpuBarrierLast(comm));
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
-  if (comm->nRanks == 1) return ncclSuccess;
-  // We can't print the CG mode before the first barrier happened.
-  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
-    *comm->intraCGMode ^= 0x10;
-    INFO(NCCL_INIT,"Launch mode %s%s%s",
-        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
-        *comm->intraCGMode ? "/CGMD" : "",
-        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
-  }
-
-  NCCLCHECK(ncclCpuBarrierOut(comm));
-
-  hipLaunchParams *params = comm->myParams;
-  if (comm->launchMode == ncclComm::PARALLEL) {
-    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
-  }
-  // Start the network proxies as soon as the kernel has been launched. We can't
-  // perform any CUDA call between the two or having a hipFree between the CUDA
-  // launch and the transportStartProxies call could cause a deadlock.
-  // Also, starting the proxies after the CUDA launch seems to be better for
-  // performance (latency).
-  for (int r=0; r<params->gridDim.x; r++) {
-    struct ncclRing* ring = comm->rings+r;
-    ring->collStart = ring->collFifoTail;
-    ring->collCount = 0;
-  }
-  params->gridDim.x = params->blockDim.x = 0;
-  NCCLCHECK(transportStartProxies(comm));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
-  hipLaunchParams *params = comm->myParams;
-  // Enqueue event after NCCL kernel
-  CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
-  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
-  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
-    // Create dependency between NCCL internal stream and user stream
-    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
-  }
-  comm->userStreamSet = false;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
-    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, hipStream_t stream) {
-  if (comm == NULL) return ncclInvalidArgument;
-  // Launch asynchronously if needed
-  if (ncclAsyncMode()) {
-    ncclResult_t ret = ncclSuccess;
-    int savedDev = -1;
-    if (comm->checkPointers) {
-      CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
-      CUDACHECKGOTO(hipSetDevice(comm->cudaDev), ret, end);
-    }
-    // Check arguments
-    NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
-    // Always register comm even in case of error to make sure ncclGroupEnd
-    // cleans it up.
-    NCCLCHECK(ncclAsyncColl(comm));
-    NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
-end:
-    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
-    ncclAsyncErrCheck(ret);
-    return ret;
-  } else {
-    NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
-    NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
-    NCCLCHECK(ncclBarrierEnqueue(comm));
-    NCCLCHECK(ncclBarrierEnqueueWait(comm));
-    NCCLCHECK(ncclEnqueueEvents(comm));
-    return ncclSuccess;
-  }
-}
diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cc
similarity index 93%
rename from projects/rccl/src/misc/group.cu
rename to projects/rccl/src/misc/group.cc
index 0144bee78d..8b0628197e 100644
--- a/projects/rccl/src/misc/group.cu
+++ b/projects/rccl/src/misc/group.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -119,7 +119,7 @@ ncclResult_t ncclGroupEnd() {
   int savedDev;
   CUDACHECK(hipGetDevice(&savedDev));
   int done = ncclGroupIndex;
-  int doneArray[ncclGroupIndex];
+  int doneArray[MAX_ASYNC_OPS];
   for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
 
   ncclResult_t ret = ncclGroupError;
@@ -180,13 +180,13 @@ group_cleanup:
   // an atomic operation, we need to cancel all operations.
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
-    for (int r=0; r<comm->nRings; r++) {
-      struct ncclRing* ring = comm->rings+r;
-      for (int i=0; i<ring->collCount; i++) {
-        STORE(&ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active, 0);
+    for (int c=0; c<comm->nChannels; c++) {
+      struct ncclChannel* channel = comm->channels+c;
+      for (int i=0; i<channel->collCount; i++) {
+        STORE(&channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active, 0);
       }
-      ring->collFifoTail = ring->collStart;
-      ring->collCount = 0;
+      channel->collFifoTail = channel->collStart;
+      channel->collCount = 0;
     }
     comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
     comm->userStreamSet = false;
diff --git a/projects/rccl/src/misc/ibvwrap.cu b/projects/rccl/src/misc/ibvwrap.cc
similarity index 99%
rename from projects/rccl/src/misc/ibvwrap.cu
rename to projects/rccl/src/misc/ibvwrap.cc
index 7ac3431c37..f47c141bc1 100644
--- a/projects/rccl/src/misc/ibvwrap.cu
+++ b/projects/rccl/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/src/misc/nvmlwrap.cu b/projects/rccl/src/misc/nvmlwrap.cc
similarity index 84%
rename from projects/rccl/src/misc/nvmlwrap.cu
rename to projects/rccl/src/misc/nvmlwrap.cc
index f3ee2ac9ae..fbe481fdd8 100644
--- a/projects/rccl/src/misc/nvmlwrap.cu
+++ b/projects/rccl/src/misc/nvmlwrap.cc
@@ -1,6 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -17,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
     nvmlNvLinkCapability_t capability, unsigned int *capResult);
+static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+
 
 ncclResult_t wrapNvmlSymbols(void) {
   if (nvmlState == nvmlInitialized)
@@ -71,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -87,9 +85,8 @@ teardown:
   nvmlInternalShutdown = NULL;
   nvmlInternalDeviceGetHandleByPciBusId = NULL;
   nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceSetCpuAffinity = NULL;
-  nvmlInternalDeviceClearCpuAffinity = NULL;
   nvmlInternalDeviceGetPciInfo = NULL;
+  nvmlInternalDeviceGetMinorNumber = NULL;
   nvmlInternalDeviceGetNvLinkState = NULL;
   nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
   nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -156,38 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalDeviceSetCpuAffinity == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  // Workaround : it seems SetCpuAffinity is not thread safe.
-  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-  pthread_mutex_lock(&lock);
-  nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
-  pthread_mutex_unlock(&lock);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalInit == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
   if (nvmlInternalDeviceGetPciInfo == NULL) {
     WARN("lib wrapper not initialized.");
@@ -202,6 +167,20 @@ ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
   return ncclSuccess;
 }
 
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { // forwards to the nvmlInternalDeviceGetMinorNumber pointer and maps its result to ncclResult_t
+  if (nvmlInternalDeviceGetMinorNumber == NULL) { // function pointer unset (it is reset to NULL in the teardown path)
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber); // minorNumber is handed straight through to the NVML call
+  if (ret != NVML_SUCCESS) { // every NVML failure is logged and reported as ncclSystemError
+    WARN("nvmlDeviceGetMinorNumber() failed: %s ",
+        nvmlInternalErrorString(ret));
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
   if (nvmlInternalDeviceGetNvLinkState == NULL) {
     /* Do not warn, this symbol is optional. */
@@ -209,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
   }
   nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
   if (ret != NVML_SUCCESS) {
-    INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
-        nvmlInternalErrorString(ret));
+    if (ret != NVML_ERROR_NOT_SUPPORTED)
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+          nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
   return ncclSuccess;
diff --git a/projects/rccl/src/misc/nvmlwrap_stub.cu b/projects/rccl/src/misc/nvmlwrap_stub.cc
similarity index 85%
rename from projects/rccl/src/misc/nvmlwrap_stub.cu
rename to projects/rccl/src/misc/nvmlwrap_stub.cc
index 85a389a1a9..b3bf5b7439 100644
--- a/projects/rccl/src/misc/nvmlwrap_stub.cu
+++ b/projects/rccl/src/misc/nvmlwrap_stub.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -27,18 +27,14 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   return ncclSuccess;
 }
 
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  return ncclSuccess;
-}
-
 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
   return ncclSuccess;
 }
 
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { // stub variant: no NVML call is made
+  return ncclSuccess; // NOTE(review): reports success without writing *minorNumber, so callers must not rely on the output
+}
+
 ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
   return ncclSuccess;
 }
@@ -50,4 +46,4 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
     nvmlNvLinkCapability_t capability, unsigned int *capResult) {
   return ncclSuccess;
-}
\ No newline at end of file
+}
diff --git a/projects/rccl/src/misc/rings.cu b/projects/rccl/src/misc/rings.cc
similarity index 84%
rename from projects/rccl/src/misc/rings.cu
rename to projects/rccl/src/misc/rings.cc
index 359e26b359..1fc58f08d0 100644
--- a/projects/rccl/src/misc/rings.cu
+++ b/projects/rccl/src/misc/rings.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -161,14 +161,25 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
     while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
       current[transport] = 0;
       transport++;
-      if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+      if (transport == NTRANSPORTS) {
+        WARN("Error : Could not find transport to connect next group\n");
+        free(p2pConnected);
+        return ncclInternalError; }
     }
     curRank = rank;
     current[transport]++;
   }
 }
 
-NCCL_PARAM(MinNrings, "MIN_NRINGS", 0);
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) // any of the HIP/HCC compiler macros
+#define DEFAULT_MIN_NRINGS 2
+#elif defined(__PPC__)
+// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
+#define DEFAULT_MIN_NRINGS 4
+#else
+#define DEFAULT_MIN_NRINGS 0
+#endif
+NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS); // default now comes from the platform selection above instead of a literal 0
 NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
 
 /* Users can force the number of threads with an environment variable */
@@ -180,8 +191,20 @@ ncclResult_t getEnvThreads(int* nthreads) {
   return ncclSuccess;
 }
 
+static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) { // extends four per-ring tables from nrings to newNrings rows and returns the (clamped) newNrings
+  if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS; // never go beyond MAXCHANNELS rows
+  for (int r=nrings; r<newNrings; r++) { // only rows at index nrings and above are written
+    for (int i=0; i<nranks; i++) { // each row holds nranks entries
+      a[r*nranks+i] = a[(r-nrings)*nranks+i]; // row r is a copy of row r-nrings, so existing rows repeat in order
+      b[r*nranks+i] = b[(r-nrings)*nranks+i];
+      c[r*nranks+i] = c[(r-nrings)*nranks+i];
+      d[r*nranks+i] = d[(r-nrings)*nranks+i];
+    }
+  }
+  return newNrings;
+}
 /* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
   *nrings = 0;
 
   if (nranks == 1) return ncclSuccess;
@@ -192,6 +215,12 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     if (ret == ncclSuccess && *nrings > 0) {
       if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
       NCCLCHECK(getEnvThreads(nthreads));
+      for (int r = 0; r<*nrings; r++) {
+        for (int i = 0; i<nranks; i++) {
+          if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
+          if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
+        }
+      }
       return ncclSuccess;
     }
     if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
@@ -211,8 +240,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   int minScore = NCCL_MAX_SCORE;
   int nringsTmp;
   int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
-  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&idxToRank, nranks));
   NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
   NCCLCHECK(ncclCalloc(&groups, nranks));
@@ -221,8 +250,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   int nThreads;
   do {
     nThreads = *nthreads;
-    for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
-    nringsTmp = MAXRINGS;
+    for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
+    nringsTmp = MAXCHANNELS;
     // Loop over transports to connect groups
     for (int t=NTRANSPORTS-1; t>=0; t--) {
       for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
@@ -283,6 +312,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
           for (int i=0; i<nidx; i++) {
             if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
             if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+            if (t == NTRANSPORTS-1) {
+              // Save node-level masters for trees
+              treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+              treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+            }
           }
         }
         //for (int r=0; r<nringsTmp; r++) {
@@ -317,6 +351,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
 
   *nthreads = nThreads;
 
+  /* Duplicate the rings in case of multinode+NVLink */
+  int nnodes = 0;
+  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+  int nvlink;
+  NCCLCHECK(ncclNvlinkGpu(&nvlink));
+  if (nnodes > 1 && nvlink) {
+    *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
+  }
+
   if (*nrings == 0) {
     WARN("Could not create rings, falling back on simple ring");
     *nrings = 1;
@@ -330,15 +373,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
     minNrings = 0;
   }
-  if (minNrings > MAXRINGS) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
-    minNrings = MAXRINGS;
+  if (minNrings > MAXCHANNELS) {
+    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
+    minNrings = MAXCHANNELS;
   }
   if (maxNrings > 0 && maxNrings <= *nrings) {
     if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
     *nrings = maxNrings;
   } else {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
     int defaultMinNrings = 1;
 #else
     int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
@@ -346,13 +389,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
     if (minNrings > 0 && minNrings > *nrings) {
       if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
-      for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
-        for (int i=0; i<nranks; i++) {
-          prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
-          next[r*nranks+i] = next[(r-*nrings)*nranks+i];
-        }
-      }
-      *nrings = minNrings;
+      *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
     }
   }
 
diff --git a/projects/rccl/src/misc/topo.cc b/projects/rccl/src/misc/topo.cc
new file mode 100644
index 0000000000..4ce68e2430
--- /dev/null
+++ b/projects/rccl/src/misc/topo.cc
@@ -0,0 +1,58 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "topo.h"
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+
+ncclResult_t getCudaPath(int cudaDev, char** path) { // stores in *path the realpath() of a sysfs entry built from the device's PCI bus id
+  char busId[BUSID_SIZE];
+  CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]); // lower-case the id before splicing it into the path
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; // template; both placeholder ids are overwritten below
+  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); // first placeholder <- leading part of busId (as long as "0000:00")
+  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); // second placeholder <- full busId, terminator excluded
+  *path = realpath(busPath, NULL); // NULL buffer: realpath() allocates the result, so the caller has to free() it
+  if (*path == NULL) {
+    WARN("Could not find real path of %s", busPath);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" }; // printable distance names; presumably indexed by the PATH_* constants - TODO confirm against topo.h
+
+int pciDistance(char* path1, char* path2) { // maps two NUL-terminated device paths to one of the PATH_* distance classes
+  int score = 0; // '/' separators seen while both paths were still identical
+  int depth = 0; // total number of '/' separators in path1
+  int same = 1;  // cleared at the first differing character and never set again
+  for (int i=0; path1[i] != '\0'; i++) { // single pass over path1 without re-running strlen() every iteration
+    if (same == 1 && path1[i] != path2[i]) same = 0; // stop indexing path2 after the first mismatch: it may be shorter than path1
+    if (path1[i] == '/') {
+      depth++;
+      if (same == 1) score++;
+    }
+  }
+  if (score <= 3) {
+#ifdef __PPC__
+    // NUMA distance detection and PATH_SYS are not supported on IBM/Power
+    // nodes currently
+    return PATH_NODE;
+#else
+    /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
+    int numaId1 = getNumaId(path1);
+    int numaId2 = getNumaId(path2);
+    TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
+    return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
+#endif
+  }
+  if (score == 4) return PATH_PHB;
+  if (score == depth-1) return PATH_PIX;
+  return PATH_PXB;
+}
diff --git a/projects/rccl/src/misc/trees.cc b/projects/rccl/src/misc/trees.cc
new file mode 100644
index 0000000000..f672abe302
--- /dev/null
+++ b/projects/rccl/src/misc/trees.cc
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+#define RANK_TO_INDEX(r) ((r) > root ? (r)-1 : (r)) /* index of rank r once root is skipped; uses the argument (it was ignored before in favour of a variable named rank); expects `root` in scope at the expansion site */
+
+/* Btree which alternates leaves and nodes.
+ * Assumes root is 0, which conveniently builds a tree on powers of two,
+ * (because we have pow2-1 ranks) which lets us manipulate bits.
+ * Find first non-zero bit, then :
+ * Find the parent :
+ *   xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
+ *   xx11[0] -> xx10[0] (3,7,11 below)
+ * Find the children :
+ *   xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
+ *   xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
+ *
+ * Illustration :
+ * 0---------------8
+ *          ______/ \______
+ *         4               12
+ *       /   \            /  \
+ *     2       6       10     \
+ *    / \     / \     /  \     \
+ *   1   3   5   7   9   11    13
+ */
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) { // outputs: *u = parent, *d0 and *d1 = children; -1 means "none"
+  int up, down0, down1;
+  int bit;
+  for (bit=1; bit<nranks; bit<<=1) { // find the lowest set bit of rank
+    if (bit & rank) break;
+  }
+
+  if (rank == 0) { // rank 0 has no set bit, so the loop above ran until bit >= nranks
+    *u = -1;
+    *d0 = nranks > 1 ? bit >> 1 : -1; // single child: the largest power of two below nranks
+    *d1 = -1;
+    return ncclSuccess;
+  }
+
+  up = (rank ^ bit) | (bit << 1); // clear the lowest set bit and set the next higher one
+  if (up >= nranks) up = (rank ^ bit); // out of bounds: fall back to only clearing the bit
+  *u = up;
+
+  int lowbit = bit >> 1;
+  // down0 is always within bounds
+  down0 = lowbit == 0 ? -1 : rank-lowbit; // odd ranks (lowbit == 0) have no children
+
+  down1 = lowbit == 0 ? -1 : rank+lowbit;
+  // Make sure down1 is within bounds
+  while (down1 >= nranks) { // retry with a halved offset each round; becomes -1 once lowbit reaches 0
+    down1 = lowbit == 0 ? -1 : rank+lowbit;
+    lowbit >>= 1;
+  }
+  *d0 = down0; *d1 = down1;
+
+  return ncclSuccess;
+}
+
+/* Build a double binary tree. Take the previous tree for the first tree.
+ * For the second tree, we use a mirror tree (if nranks is odd)
+ *
+ *                 8---------0---------5
+ *          ______/ \______      _____/ \______
+ *         4               12   1              9
+ *       /   \            /      \           /   \
+ *     2       6       10          3       7      10
+ *    / \     / \     /  \        / \     / \    /  \
+ *   1   3   5   7   9   11      2   4   6   8  11  12
+ *
+ * or shift it by one rank (if nranks is even)
+ *
+ *                 8---------0--------------9
+ *          ______/ \                ______/ \
+ *         4         \              5         \
+ *       /   \        \           /   \        \
+ *     2       6       10       3       7       11
+ *    / \     / \     /  \     / \     / \     /  \
+ *   1   3   5   7   9   11   2   4   6   8   10   1
+ */
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) { // s0, d0_0, d0_1: parent and children in the first tree; s1, d1_0, d1_1: the same for the second tree
+  // First tree ... use a btree
+  ncclGetBtree(nranks, rank, s0, d0_0, d0_1); // result not checked; ncclGetBtree only ever returns ncclSuccess
+  // Second tree ... mirror or shift
+  if (nranks % 2 == 0) {
+    // shift
+    int shiftrank = (rank-1+nranks) % nranks; // this rank's position in the btree before shifting; +nranks keeps the operand non-negative for rank 0
+    int u, d0, d1;
+    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+    *s1 = u == -1 ? -1 : (u+1) % nranks; // shift btree positions back by +1 modulo nranks; -1 ("none") is passed through
+    *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
+    *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
+  } else {
+    // mirror
+    int u, d0, d1;
+    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1); // look up the mirrored position nranks-1-rank
+    *s1 = u == -1 ? -1 : nranks-1-u; // mirror the results back; -1 ("none") is passed through
+    *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
+    *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+  }
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/misc/utils.cu b/projects/rccl/src/misc/utils.cc
similarity index 79%
rename from projects/rccl/src/misc/utils.cu
rename to projects/rccl/src/misc/utils.cc
index c42b7ca122..614c78b936 100644
--- a/projects/rccl/src/misc/utils.cu
+++ b/projects/rccl/src/misc/utils.cc
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,13 +12,31 @@
 #include <string.h>
 #include <stdarg.h>
 
-ncclResult_t getHostName(char* hostname, int maxlen) {
+#include "nvmlwrap.h"
+#include "core.h"
+
+// Convert a logical cudaDev index to the NVML device minor number (*nvmlDev is preset to -1 before the lookups)
+ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  nvmlDevice_t nvmlDevice;
+  // A wrapper may report success without writing its output (the stub build does), so never leave dev indeterminate.
+  unsigned int dev = (unsigned int)cudaDev; // NOTE(review): fallback is the logical index; confirm this is the desired mapping
+  *nvmlDev = -1;
+  CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
+  NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
+  *nvmlDev = dev;
+
+  return ncclSuccess;
+}
+
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
   if (gethostname(hostname, maxlen) != 0) {
     strncpy(hostname, "unknown", maxlen);
     return ncclSystemError;
   }
   int i = 0;
-  while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
   hostname[i] = '\0';
   return ncclSuccess;
 }
@@ -30,7 +49,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   if (ncclDebugLevel <= NCCL_LOG_NONE) return;
 
   char hostname[1024];
-  getHostName(hostname, 1024);
+  getHostName(hostname, 1024, '.');
   int cudaDev;
   hipGetDevice(&cudaDev);
 
@@ -95,8 +114,8 @@ uint64_t getnHash(const char* string, int n) {
  */
 uint64_t getHostHash(void) {
   char uname[1024];
-  // Start off with the hostname
-  (void) getHostName(uname, sizeof(uname));
+  // Start off with the full hostname
+  (void) getHostName(uname, sizeof(uname), '\0');
   int offset = strlen(uname);
   int len;
   // $(readlink /proc/self/ns/uts)
@@ -138,8 +157,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
   if (!string) return 0;
 
   const char* ptr = string;
-  // Ignore "^" prefix, will be detected outside of this function
-  if (ptr[0] == '^') ptr++;
+  // Ignore "^" or "=" prefix, will be detected outside of this function
+  if (ptr[0] == '^' || ptr[0] == '=') ptr++;
 
   int ifNum = 0;
   int ifC = 0;
@@ -168,8 +187,10 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) {
   return ifNum;
 }
 
-static bool matchPrefix(const char* string, const char* prefix) {
-  return (strncmp(string, prefix, strlen(prefix)) == 0);
+static bool matchIf(const char* string, const char* ref, bool matchExact) { // matchExact: string must equal ref; otherwise string only has to start with ref
+  // Make sure to include '\0' in the exact case
+  int matchLen = matchExact ? strlen(string) + 1 : strlen(ref); // +1 extends the comparison over string's terminator
+  return strncmp(string, ref, matchLen) == 0;
 }
 
 static bool matchPort(const int port1, const int port2) {
@@ -180,12 +201,12 @@ static bool matchPort(const int port1, const int port2) {
 }
 
 
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) {
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
   // Make an exception for the case where no user list is defined
   if (listSize == 0) return true;
 
   for (int i=0; i<listSize; i++) {
-    if (matchPrefix(string, ifList[i].prefix)
+    if (matchIf(string, ifList[i].prefix, matchExact)
         && matchPort(port, ifList[i].port)) {
       return true;
     }
diff --git a/projects/rccl/src/nccl.h.in b/projects/rccl/src/nccl.h.in
index 4d0a5a94d4..686ed42406 100644
--- a/projects/rccl/src/nccl.h.in
+++ b/projects/rccl/src/nccl.h.in
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -23,16 +23,14 @@
 extern "C" {
 #endif
 
-/*! @brief Opaque handle to communicator 
-*/
+/* Opaque handle to communicator */
 typedef struct ncclComm* ncclComm_t;
 
 #define NCCL_UNIQUE_ID_BYTES 128
-/*! @brief struct to store ncclUniqueId */
 typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
 
-/*! @brief Error type */
-  typedef enum { ncclSuccess                 =  0, /**< Successfuly ran */
+/* Error type */
+typedef enum { ncclSuccess                 =  0,
                ncclUnhandledCudaError      =  1,
                ncclSystemError             =  2,
                ncclInternalError           =  3,
@@ -40,86 +38,75 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
                ncclInvalidUsage            =  5,
                ncclNumResults              =  6 } ncclResult_t;
 
-
-  
-/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
- * 
- * @details This integer is coded with the MAJOR, MINOR and PATCH level of the
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
  * NCCL library
  */
 ncclResult_t ncclGetVersion(int *version);
 ncclResult_t pncclGetVersion(int *version);
 
-/*! @brief Generates an ID for ncclCommInitRank
-
-    @details
-    Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
-    called once and the Id should be distributed to all ranks in the
-    communicator before calling ncclCommInitRank. 
-    
-    @param[in]
-    uniqueId     ncclUniqueId*
-                 pointer to uniqueId
-
-*/
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
 ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
 ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
 
-/*! @brief Creates a new communicator (multi thread/process version).
-    
-    @details
-    rank must be between 0 and nranks-1 and unique within a communicator clique.
-    Each rank is associated to a CUDA device, which has to be set before calling
-    ncclCommInitRank.
-    ncclCommInitRank implicitly syncronizes with other ranks, so it must be
-    called by different threads/processes or use ncclGroupStart/ncclGroupEnd. 
-    
-    @param[in]
-    comm        ncclComm_t*
-                communicator struct pointer
-    */
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated to a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
 ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
 ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
 
-/*! @brief Creates a clique of communicators (single process version).
- *
- * @details This is a convenience function to create a single-process communicator clique.
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
  * Returns an array of ndev newly initialized communicators in comm.
  * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
- * If devlist is NULL, the first ndev HIP devices are used.
- * Order of devlist defines user-order of processors within the communicator. 
- * */
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
 ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 
-/*! @brief Frees resources associated with communicator object. */
+/* Frees resources associated with communicator object, but waits for any operations
+ * that might still be running on the device. */
 ncclResult_t  ncclCommDestroy(ncclComm_t comm);
 ncclResult_t pncclCommDestroy(ncclComm_t comm);
 
-/*! @brief Returns a human-readable error message. */
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t  ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+/* Returns a human-readable error message. */
 const char*  ncclGetErrorString(ncclResult_t result);
 const char* pncclGetErrorString(ncclResult_t result);
 
-/*! @brief Gets the number of ranks in the communicator clique. */
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+/* Gets the number of ranks in the communicator clique. */
 ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
 ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
 
-/*! @brief Returns the rocm device number associated with the communicator. */
+/* Returns the cuda device number associated with the communicator. */
 ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
 ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
 
-/*! @brief Returns the user-ordered "rank" associated with the communicator. */
+/* Returns the user-ordered "rank" associated with the communicator. */
 ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
 ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
 
-/*! @brief Reduction operation selector */
+/* Reduction operation selector */
 typedef enum { ncclSum        = 0,
                ncclProd       = 1,
                ncclMax        = 2,
                ncclMin        = 3,
                ncclNumOps     = 4 } ncclRedOp_t;
 
-/*! @brief Data types */
+/* Data types */
 typedef enum { ncclInt8       = 0, ncclChar       = 0,
                ncclUint8      = 1,
                ncclInt32      = 2, ncclInt        = 2,
@@ -137,17 +124,17 @@ typedef enum { ncclInt8       = 0, ncclChar       = 0,
  * Collective communication operations must be called separately for each
  * communicator in a communicator clique.
  *
- * They return when operations have been enqueued on the hipstream.
+ * They return when operations have been enqueued on the CUDA stream.
  *
  * Since they may perform inter-CPU synchronization, each call has to be done
  * from a different thread or process, or need to use Group Semantics (see
  * below).
  */
 
-/*!
- * @brief Reduce collective communication
+/*
+ * Reduce
  *
- * @details Reduces data arrays of length count in sendbuff into recvbuff using op
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
  * operation.
  * recvbuff may be NULL on all calls except for root device.
  * root is the rank (not the CUDA device) where data will reside after the
@@ -160,9 +147,10 @@ ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc
 ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
     ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 
-/*! @brief (deprecated) Broadcast (in-place)
+/*
+ * (deprecated) Broadcast (in-place)
  *
- * @details Copies count values from root to all other devices.
+ * Copies count values from root to all other devices.
  * root is the rank (not the CUDA device) where data resides before the
  * operation is started.
  *
@@ -173,10 +161,11 @@ ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r
 ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, hipStream_t stream);
 
-/*! @brief Broadcast
+/*
+ * Broadcast
  *
- * @details Copies count values from root to all other devices.
- * root is the rank (not the HIP device) where data resides before the
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
  * operation is started.
  *
  * In-place operation will happen if sendbuff == recvbuff.
@@ -186,9 +175,10 @@ ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
 ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, hipStream_t stream);
 
-/*! @brief All-Reduce
+/*
+ * All-Reduce
  *
- * @details Reduces data arrays of length count in sendbuff using op operation, and
+ * Reduces data arrays of length count in sendbuff using op operation, and
  * leaves identical copies of result on each recvbuff.
  *
  * In-place operation will happen if sendbuff == recvbuff.
@@ -198,10 +188,10 @@ ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
 ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
 
-/*!
- * @brief Reduce-Scatter
+/*
+ * Reduce-Scatter
  *
- * @details Reduces data in sendbuff using op operation and leaves reduced result
+ * Reduces data in sendbuff using op operation and leaves reduced result
  * scattered over the devices so that recvbuff on rank i will contain the i-th
  * block of the result.
  * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
@@ -216,9 +206,10 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
     hipStream_t stream);
 
-/*! @brief All-Gather
+/*
+ * All-Gather
  *
- * @details Each device gathers sendcount values from other GPUs into recvbuff,
+ * Each device gathers sendcount values from other GPUs into recvbuff,
  * receiving data from rank i at offset i*sendcount.
  * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
  * should have a size of at least nranks*sendcount elements.
@@ -248,16 +239,18 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
  * of ncclGroupStart/ncclGroupEnd.
  */
 
-/*! @brief Group Start
+/*
+ * Group Start
  *
- * @details Start a group call. All subsequent calls to NCCL may not block due to
+ * Start a group call. All subsequent calls to NCCL may not block due to
  * inter-CPU synchronization.
  */
 ncclResult_t ncclGroupStart();
 
-/*! @brief Group End
+/*
+ * Group End
  *
- * @details End a group call. Wait for all calls since ncclGroupStart to complete
+ * End a group call. Wait for all calls since ncclGroupStart to complete
  * before returning.
  */
 ncclResult_t ncclGroupEnd();
diff --git a/projects/rccl/src/nccl.pc.in b/projects/rccl/src/nccl.pc.in
new file mode 100755
index 0000000000..0d98494999
--- /dev/null
+++ b/projects/rccl/src/nccl.pc.in
@@ -0,0 +1,10 @@
+prefix=${nccl:Prefix}
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: nccl
+Description: Optimized primitives for collective multi-GPU communication
+Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+Libs: -L${libdir} -lnccl
+Cflags: -I${includedir}
diff --git a/projects/rccl/src/ring.cu b/projects/rccl/src/ring.cu
deleted file mode 100644
index 293557fa39..0000000000
--- a/projects/rccl/src/ring.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "ring.h"
-#include "param.h"
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid) {
-  struct ncclRing* ring = comm->rings+ringid;
-  ring->id = ringid;
-
-  // Setup intermediate buffering
-  ring->buffSize = ncclParamBuffsize();
-
-  // attempt to allocate buffers in fine grain
-  const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
-  struct ncclSendMem* sendMem;
-  ncclCudaCalloc((char**)&sendMem, sendSize, true);
-  ring->devMemSend = sendMem;
-
-  const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  struct ncclRecvMem* recvMem;
-  ncclCudaCalloc((char**)&recvMem, recvSize, true);
-  ring->devMemRecv = recvMem;
-
-  TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
-
-  // Pre-configure send/recv pointers. Those are the default, they may change later.
-  if (recvMem){
-    ring->recv.conn.buff = recvMem->buff;
-    ring->recv.conn.llBuff = recvMem->llBuff;
-    ring->recv.conn.tail = &recvMem->tail;
-    ring->recv.conn.opCount = &recvMem->opCount;
-  } else {
-    ring->recv.conn.buff = 0;
-    ring->recv.conn.llBuff = 0;
-    ring->recv.conn.tail = 0;
-    ring->recv.conn.opCount = 0;
-  }
-  ring->recv.conn.direct = 0;
-
-  if (sendMem) {
-    ring->send.conn.head = &sendMem->head;
-    ring->send.conn.llHead = &sendMem->llHead;
-  } else {
-    ring->send.conn.head = 0;
-    ring->send.conn.llHead = 0;
-  }
-  ring->send.conn.direct = 0;
-  ring->send.conn.llStep = 0;
-  ring->send.conn.llLastCleaning = 0;
-
-  // Ring index to user rank table.
-  NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
-
-  // Per-ring operation list.
-  NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
-  return ncclSuccess;
-}
-
-ncclResult_t freeRing(struct ncclRing* ring) {
-  // Intermediate buffering
-  CUDACHECK(hipFree(ring->devMemSend));
-  CUDACHECK(hipFree(ring->devMemRecv));
-
-  // Index to rank table
-  free(ring->userRanks);
-  CUDACHECK(hipFree(ring->devUserRanks));
-
-  // Operation list
-  NCCLCHECK(ncclCudaHostFree(ring->collectives));
-
-  // Free transport proxy resources
-  if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
-  NCCLCHECK(transportDestroyProxy(&ring->send));
-  if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
-  NCCLCHECK(transportDestroyProxy(&ring->recv));
-  return ncclSuccess;
-}
diff --git a/projects/rccl/src/transport.cc b/projects/rccl/src/transport.cc
new file mode 100644
index 0000000000..3b08e377cf
--- /dev/null
+++ b/projects/rccl/src/transport.cc
@@ -0,0 +1,249 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+extern struct ncclTransport p2pTransport;
+extern struct ncclTransport shmTransport;
+extern struct ncclTransport netTransport;
+
+struct ncclTransport ncclTransports[NTRANSPORTS] = {
+  p2pTransport,
+  shmTransport,
+  netTransport,
+};
+
+#define RECV 0
+#define SEND 1
+
+static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
+  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;  // ring patterns always need a proxy
+
+  /* Remaining (pipeline/chain) patterns: one rank does not need a proxy for a given direction. */
+  /* Select the position in ring->userRanks whose rank is compared against root below. */
+  const int myrank = 0, nextrank = 1, prevrank = nranks-1;
+  int index = pattern == ncclPatternPipelineFrom ?
+      /*                            no recv /  no send    if root = */
+      /* bcast  */ (type == RECV ?   myrank : nextrank ):
+      /* reduce */ (type == RECV ? prevrank :   myrank );
+  int rank = ring->userRanks[index];
+  return (root != rank);  // the proxy is skipped only when root is the rank selected above
+}
+
+enum { proxyRecv=0, proxySend=1 };
+
+#define PROXYARGS_ALLOCATE_SIZE 32
+struct ncclProxyPool {
+  struct ncclProxyPool *next;  // next block on the proxyState.pools list (walked and freed in transportDestroyProxy)
+  struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];  // elements chained onto the proxyState.pool free list
+};
+
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
+  // Pops one element off comm->proxyState.pool (growing it by one block when empty) and returns it in *argsptr.
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->mutex);
+  if (state->pool == NULL) {
+    struct ncclProxyPool* newPool;
+    ncclResult_t res = ncclCalloc(&newPool, 1);
+    if (res != ncclSuccess) {  // never return with the mutex held: later callers would block forever
+      pthread_mutex_unlock(&state->mutex);
+      return res;
+    }
+    struct ncclProxyArgs* newElems = newPool->elems;
+    // Chain newly allocated elements (as before, the last element's next is left as allocated)
+    for (int i=0; i+1<PROXYARGS_ALLOCATE_SIZE; i++) newElems[i].next = newElems+i+1;
+    // Add them all to the free list and save the block for later resource release
+    state->pool = newElems;
+    newPool->next = state->pools;
+    state->pools = newPool;
+  }
+  struct ncclProxyArgs* elem = state->pool;
+  state->pool = state->pool->next;
+  pthread_mutex_unlock(&state->mutex);
+  elem->next = elem->nextPeer = NULL;
+  *argsptr = elem;
+  return ncclSuccess;
+}
+
+static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
+  struct ncclComm* comm = connector->comm;
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->mutex);  // state->ops and connector->proxyAppend are modified under this lock
+  if (connector->proxyAppend == NULL) {
+    // Nothing is queued for this connector yet: args joins the circular state->ops list
+    if (state->ops == NULL) {
+      // Empty list: args becomes a one-element ring pointing at itself
+      args->next = args;
+      state->ops = args;
+    } else {
+      // Non-empty list: splice args in right after the current head
+      args->next = state->ops->next;
+      state->ops->next = args;
+    }
+    connector->proxyAppend = args;
+  } else {
+    // An operation is already queued for this connector:
+    // chain args behind it through nextPeer instead of adding it to the circular list
+    connector->proxyAppend->nextPeer = args;
+    connector->proxyAppend = args;
+  }
+  pthread_mutex_unlock(&state->mutex);
+}
+
+template <int type>
+static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+  if (peer < 0) return ncclSuccess;  // negative peer index: nothing to queue
+
+  struct ncclPeer* peerComm = args->channel->peers+peer;
+  struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;  // type selects the recv or send connector
+  if (connector->transportComm->proxy == NULL) return ncclSuccess;  // this connector has no proxy function
+
+  struct ncclProxyArgs* op;
+  NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
+  memcpy(op, args, sizeof(struct ncclProxyArgs));  // copy the caller's arguments, then fill in the per-connector fields
+  op->connector = connector;
+  op->progress = connector->transportComm->proxy;
+  op->state = ncclProxyOpReady;
+  ProxyAppend(connector, op);
+  return ncclSuccess;
+}
+
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
+  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
+    struct ncclRing* ring = &args->channel->ring;  // ring and pipeline patterns: receive from ring->prev, send to ring->next
+    if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
+    if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+  }
+  if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
+    // Tree up: receive from every tree->down[] entry, send to tree->up
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
+    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+  }
+  if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
+    // Tree down: send to every tree->down[] entry, receive from tree->up
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
+    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+  }
+  return ncclSuccess;  // SaveProxy itself ignores negative peer indices
+}
+
+void* persistentThread(void *comm_) {  // Thread entry point: runs op->progress over the circular state->ops list until abort, stop or error
+  struct ncclComm* comm = (struct ncclComm*)comm_;
+  struct ncclProxyState* state = &comm->proxyState;
+  struct ncclProxyArgs* op = NULL;
+  ncclResult_t ret = ncclSuccess;
+  int idle = 1;
+  int idleSpin = 0;
+  while (1) {
+    do {
+      if (LOAD(comm->abortFlag)) return NULL;  // abort flag set: leave the thread immediately
+      if (op == NULL) {
+        pthread_mutex_lock(&state->mutex);
+        op = state->ops;
+        if (op == NULL) {
+          if (state->stop) {
+            // No more commands to process and proxy has been requested to stop
+            pthread_mutex_unlock(&state->mutex);
+            return NULL;
+          }
+          pthread_cond_wait(&state->cond, &state->mutex);  // sleep until signalled; the loop then re-reads state->ops
+        }
+        pthread_mutex_unlock(&state->mutex);
+      }
+    } while (op == NULL);
+    op->idle = 0;  // NOTE(review): progress() presumably sets op->idle when it made no progress — confirm
+    if (op->state != ncclProxyOpNone) ret = op->progress(op);
+    if (ret != ncclSuccess) {
+      comm->fatalError = ret;  // record the failure on the communicator before the thread exits
+      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+      return NULL;
+    }
+    idle &= op->idle;  // idle stays 1 only while every op visited in this pass reported idle
+    pthread_mutex_lock(&state->mutex);
+    if (!idle) idleSpin = 0;
+    struct ncclProxyArgs *next = op->next;
+    if (next->state == ncclProxyOpNone) {  // the following op is in state None: unlink it and recycle it
+      struct ncclProxyArgs *freeOp = next;
+      if (next->nextPeer) {
+        // Replace next by its next per-peer element.
+        next = next->nextPeer;
+        if (op != freeOp) {
+          next->next = freeOp->next;
+          op->next = next;
+        } else {
+          next->next = next;  // freeOp was the only list element: its successor forms a one-element ring
+        }
+      } else {
+        // Remove next from circular list
+        next->connector->proxyAppend = NULL;  // nothing queued for that connector any more
+        if (op != freeOp) {
+          next = next->next;
+          op->next = next;
+        } else {
+          next = NULL;  // the list becomes empty
+        }
+      }
+      if (freeOp == state->ops) state->ops = next;
+      freeOp->next = state->pool;  // push the finished element back on the free list
+      state->pool = freeOp;
+    }
+    op = next;
+    if (op == state->ops) {  // back at the list head: one full pass completed
+      if (idle == 1) {
+        if (++idleSpin == 10) {  // ten consecutive fully idle passes: yield the CPU
+          sched_yield();
+          idleSpin = 0;
+        }
+      }
+      idle = 1;
+    }
+    pthread_mutex_unlock(&state->mutex);
+  }
+}
+
+ncclResult_t transportStartProxy(struct ncclComm* comm) {
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->mutex);
+  if (state->ops != NULL) pthread_cond_signal(&state->cond);  // wake the proxy thread only when operations are queued
+  pthread_mutex_unlock(&state->mutex);
+  return ncclSuccess;
+}
+
+ncclResult_t transportCreateProxy(struct ncclComm* comm) {
+  if (comm->proxyThread) return ncclSuccess;  // the proxy thread was already started for this communicator
+  comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
+  comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+  comm->proxyState.ops = NULL;
+  int err = pthread_create(&comm->proxyThread, NULL, persistentThread, comm);  // returns 0 on success, an errno value otherwise
+  if (err != 0) { WARN("Failed to create proxy thread (pthread_create returned %d)", err); return ncclSystemError; }
+  return ncclSuccess;
+}
+
+ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
+  struct ncclProxyState* proxy = &comm->proxyState;
+
+  // Raise the stop flag under the lock, wake the proxy thread, then wait for it to exit
+  pthread_mutex_lock(&proxy->mutex);
+  proxy->stop = true;
+  pthread_cond_signal(&proxy->cond);
+  pthread_mutex_unlock(&proxy->mutex);
+  if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+
+  // Release every ncclProxyPool block recorded on the pools list
+  pthread_mutex_lock(&proxy->mutex);
+  struct ncclProxyPool* block = proxy->pools;
+  while (block != NULL) {
+    struct ncclProxyPool* following = block->next;
+    free(block);
+    block = following;
+  }
+  proxy->pools = NULL;
+  pthread_mutex_unlock(&proxy->mutex);
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/transport.cu b/projects/rccl/src/transport.cu
deleted file mode 100644
index 240453cb3d..0000000000
--- a/projects/rccl/src/transport.cu
+++ /dev/null
@@ -1,190 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include <hsa/hsa.h>
-#include <hsa/hsa_ext_amd.h>
-
-extern struct ncclTransport p2pTransport;
-extern struct ncclTransport shmTransport;
-extern struct ncclTransport netTransport;
-
-struct ncclTransport ncclTransports[NTRANSPORTS] = {
-  p2pTransport,
-  shmTransport,
-  netTransport,
-};
-
-static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
-  struct ncclProxyArgs *fifoArgs = info->argsFifo + (LOAD(&info->argsFifoHead) % TRANSPORT_PROXY_FIFO_SIZE);
-  pthread_mutex_lock(&info->mutex);
-  while (LOAD(&fifoArgs->active) == 0)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  __sync_synchronize();
-  memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
-  __sync_synchronize();
-  STORE(&fifoArgs->active, 0);
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-  __atomic_fetch_add(&info->argsFifoHead, 1, __ATOMIC_SEQ_CST);
-}
-
-static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
-  if (info == NULL) return NULL;
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + (LOAD(&info->argsFifoTail) % TRANSPORT_PROXY_FIFO_SIZE);
-  pthread_mutex_lock(&info->mutex);
-  while (LOAD(&fifoArgs->active) == 1)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  pthread_mutex_unlock(&info->mutex);
-  __atomic_fetch_add(&info->argsFifoTail, 1, __ATOMIC_SEQ_CST);
-  return fifoArgs;
-}
-
-static void FifoPushArgs(struct transportProxyInfo* info) {
-  if (info == NULL) return;
-
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + ((LOAD(&info->argsFifoTail)-1) % TRANSPORT_PROXY_FIFO_SIZE);
-  if (LOAD(&fifoArgs->active) == 0) return;
-
-  pthread_mutex_lock(&info->mutex);
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void WaitProxyReady(struct transportProxyInfo* info) {
-  pthread_mutex_lock(&info->mutex);
-  while (LOAD(&info->proxyReady) == 0)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void SetProxyReady(struct transportProxyInfo* info) {
-  pthread_mutex_lock(&info->mutex);
-  STORE(&info->proxyReady, 1);
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void StopProxy(struct transportProxyInfo* info) {
-  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  STORE(&fifoArgs->active, -1);
-  FifoPushArgs(info);
-}
-
-#define RECV 0
-#define SEND 1
-
-static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
-  enum proxyMode mode = proxyPatternMode(pattern);
-  if (mode == proxyRing) return true;
-
-  /* In chains, one rank does not need a proxy. Let's figure out which one it is */
-  int root = proxyPatternRoot(pattern);
-  // Which index in the reorganized rings should we compare root against */
-  const int myrank = 0, nextrank = 1, prevrank = nranks-1;
-  int index = mode == proxyFrom ?
-      /*                            no recv /  no send    if root = */
-      /* bcast  */ (type == RECV ?   myrank : nextrank ):
-      /* reduce */ (type == RECV ? prevrank :   myrank );
-  int rank = ring->userRanks[index];
-  return (root != rank);
-}
-
-static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {
-  struct transportProxyInfo* info = connector->proxyInfo;
-  if (info == NULL) return;
-  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  args->needProxy = needProxy;
-  __sync_synchronize();
-  memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
-  __sync_synchronize();
-  STORE(&fifoArgs->active, 1);
-}
-
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
-  int llMode, nrings, nthreads;
-  ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
-  nbytes       = llMode ? nbytes * 2    : nbytes;
-  substeps     = llMode ? 1             : substeps;
-  subchunks    = llMode ? NCCL_LL_CHUNKS : subchunks;
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
-
-  int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
-  int nsteps = nstepsPerRound * nrounds * substeps;
-  TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
-  TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
-  for (int r=0; r<nrings; r++) {
-    struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
-    struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
-    SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
-    SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t transportStartProxies(ncclComm* comm) {
-  for (int r=0; r<comm->nRings; r++) {
-    FifoPushArgs(comm->rings[r].send.proxyInfo);
-    FifoPushArgs(comm->rings[r].recv.proxyInfo);
-  }
-  pthread_yield(); // Let other threads run
-  return ncclSuccess;
-}
-
-void* persistentThread(void *opaqueInfo) {
-  struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
-  // We need to initialize the context before launching any NCCL cuda kernel,
-  // otherwise we would create it during the first hipMemcpyAsync inside the
-  // proxy function and that would cause a deadlock
-  hipSetDevice(info->comm->cudaDev);
-  // Signal the main thread the context is created and it can proceed.
-  SetProxyReady(info);
-  while (1) {
-    struct ncclProxyArgs args;
-    FifoPullArgs(info, &args);
-    if (args.active == -1) {
-      // Main thread asked to stop
-      return NULL;
-    }
-    ncclResult_t res = info->func(&args);
-    if (res != ncclSuccess) {
-      WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);
-    }
-  }
-}
-
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
-  struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
-  threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
-  if (proxyfunc) {
-    TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
-    struct transportProxyInfo* info;
-    NCCLCHECK(ncclCalloc(&info, 1));
-    connector->proxyInfo = info;
-    info->comm = comm;
-    info->cond = PTHREAD_COND_INITIALIZER;
-    info->mutex = PTHREAD_MUTEX_INITIALIZER;
-    info->func = proxyfunc;
-    STORE(&info->argsFifoHead, 0); STORE(&info->argsFifoTail, 0);
-    STORE(&info->proxyReady, 0);
-    pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
-    // Wait for thread to initialize its CUDA context.
-    WaitProxyReady(info);
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
-  if (connector->proxyInfo) {
-    StopProxy(connector->proxyInfo);
-    pthread_join(connector->proxyInfo->thread, NULL);
-    free(connector->proxyInfo);
-    connector->proxyInfo = NULL;
-  }
-  return ncclSuccess;
-}
diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc
new file mode 100644
index 0000000000..7991754f65
--- /dev/null
+++ b/projects/rccl/src/transport/net.cc
@@ -0,0 +1,574 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "transport.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include "param.h"
+#include "topo.h"
+#include <hip/hip_runtime.h>
+#include <assert.h>
+
+#define NET_MAX_IFS 16
+#define NET_MAX_GPUS 32
+
+// Cache GPU-NIC distances to avoid re-computing them
+#define NET_TVALUE_UNKNOWN 0ULL
+static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
+static int ncclNetNDev;
+
+// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
+#define NET_BITS_PER_IF 3
+#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
+static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
+static ncclTvalue_t getTvalue(short* distances, int ndev) {
+  ncclTvalue_t packed = 0;
+  for (int d=0; d<ndev; d++) {
+    // A smaller distance gives a larger score; only the low NET_BITS_PER_IF bits are kept per dev
+    const ncclTvalue_t devScore = 1 + PATH_SYS - distances[d];
+    packed |= ((devScore & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
+  }
+  return packed;
+}
+static int getScore(ncclTvalue_t tvalue, int dev) {
+  return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;  // extract the NET_BITS_PER_IF-bit field stored for interface dev
+}
+
+struct netConnectInfo {
+  ncclNetHandle_t netHandle;
+};
+
+struct netSendResources {
+  void* netSendComm;
+  struct ncclSendMem* hostSendMem;  // host/device pointer pair filled by ncclCudaHostAlloc in netSendSetup
+  struct ncclRecvMem* hostRecvMem;  // host/device pointer pair filled by ncclCudaHostAlloc in netSendSetup
+  struct ncclSendMem* devHostSendMem;
+  struct ncclRecvMem* devHostRecvMem;
+  int netDev;  // network device index chosen by getDev() for this channel
+  int useGdr;  // result of netGetGdrSupport(); non-zero when GPU Direct RDMA is enabled
+  int buffSize;  // copy of the buffSize argument passed to netSendSetup
+  void* mhandle;
+  void* llMhandle;
+  struct ncclRecvMem* devRecvMem;  // allocated with ncclCudaCalloc only when useGdr is set
+  uint64_t step;
+  uint64_t llLastCleaning;
+};
+
+struct netRecvResources {
+  void* netListenComm;
+  void* netRecvComm;
+  struct ncclSendMem* hostSendMem;  // host/device pointer pair filled by ncclCudaHostAlloc in netRecvSetup
+  struct ncclRecvMem* hostRecvMem;  // host/device pointer pair filled by ncclCudaHostAlloc in netRecvSetup
+  struct ncclSendMem* devHostSendMem;
+  struct ncclRecvMem* devHostRecvMem;
+  int netDev;  // network device index chosen by getDev() for this channel
+  int useGdr;  // result of netGetGdrSupport(); non-zero when GPU Direct RDMA is enabled
+  int buffSize;  // copy of the buffSize argument passed to netRecvSetup
+  void* mhandle;
+  void* llMhandle;
+  struct ncclRecvMem* devRecvMem;  // allocated with ncclCudaCalloc only when useGdr is set
+  uint64_t step;
+  uint64_t llLastCleaning;
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only); set via hipDeviceAttributeHdpMemFlushCntl when useGdr is set
+};
+
+static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
+  char* gpuPciPath = NULL;
+  char* nicPciPath = NULL;
+  NCCLCHECK(getCudaPath(cudaDev, &gpuPciPath));
+  const ncclResult_t nicRes = ncclNetPciPath(dev, &nicPciPath);
+  const bool resolved = (nicRes == ncclSuccess && nicPciPath != NULL && gpuPciPath != NULL);
+  *distance = resolved ? pciDistance(nicPciPath, gpuPciPath) : PATH_SYS;  // an unresolved path counts as PATH_SYS
+  free(nicPciPath);  // free(NULL) is a no-op, so no guard is needed
+  free(gpuPciPath);
+  return ncclSuccess;
+}
+
+static ncclResult_t netDevices(int* ndev, short** distances) {
+  // Outputs the NIC count (capped at NET_MAX_IFS) and a malloc'ed per-NIC distance array that the caller frees.
+  NCCLCHECK(ncclNetDevices(ndev));
+  if (*ndev == 0) {
+    WARN("Error : Network returned 0 device");
+    return ncclSystemError;
+  }
+  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
+  // Query the current GPU before allocating, so these early returns cannot leak the array
+  int cudaDev, nvmlDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev));
+  *distances = (short*)malloc(*ndev*sizeof(short));
+  if (*distances == NULL) return ncclSystemError;
+  char line[1024];
+  sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
+  for (int d=0; d<*ndev; d++) {
+    ncclResult_t res = netDistance(cudaDev, d, *distances+d);
+    if (res != ncclSuccess) { free(*distances); *distances = NULL; return res; }  // release the array on failure
+    sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
+  }
+  INFO(NCCL_INIT|NCCL_NET, "%s", line);
+  return ncclSuccess;
+}
+
+/* Returns in ret[0] the (cached) NIC score value of the current GPU; the device index is validated before the cache is indexed */
+ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
+  int cudaDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  if (cudaDev >= NET_MAX_GPUS) {
+    WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
+    return ncclInternalError;
+  }
+  ret[0] = ncclNetTvalues[cudaDev];
+  if (ret[0] == NET_TVALUE_UNKNOWN) {
+    int nDev;
+    short* distances;
+    NCCLCHECK(netDevices(&nDev, &distances));
+    ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
+    ncclNetNDev = nDev;
+    free(distances);
+  }
+  return ncclSuccess;
+}
+
+static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
+  int bestRank = -1;  // -1 is returned when no rank of the group qualifies
+  int bestScore = 0;
+  for (int rank=0; rank<nranks; rank++) {
+    if (groups[rank] != group) continue;  // only ranks belonging to the requested group
+    for (int i=0; i<nranks; i++) {
+      ncclTvalue_t netValue = values[rank*nranks+i];  // values is indexed as an nranks x nranks table, row = rank
+      if (netValue != 0) {
+        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;  // this rank's score for card
+        if (score >= minScore && score > bestScore) {  // strict '>' keeps the first rank among equal scores
+          bestScore = score;
+          bestRank = rank;
+        }
+        // All other values should be the same, stop here for this rank
+        break;
+      }
+    }
+  }
+  return bestRank;
+}
+static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
+  // For the last rank, we don't need the absolute best score, just to be within minScore.
+  for (int rank=nranks-1; rank>=0; rank--) {  // scan from the highest rank down; the first qualifying rank wins
+    if (groups[rank] != group) continue;
+    if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;  // skip ranks in the start rank's subgroup (-1 disables this filter)
+    if (startRank == rank) continue;  // never reuse the start rank itself
+    for (int i=0; i<nranks; i++) {
+      ncclTvalue_t netValue = values[rank*nranks+i];
+      if (netValue != 0) {
+        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;  // this rank's score for card
+        if (score >= minScore) {
+          return rank;
+        }
+        // All other values should be the same, stop here for this rank
+        break;
+      }
+    }
+  }
+  return -1;  // no rank of the group qualifies
+}
+
+ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
+  // For each ring, picks a start and an end rank per group (one unused card each) and links consecutive groups in prev/next.
+  int nGroups = groups[nranks-1] + 1;
+  int *cardUsed, *starts, *ends;
+  NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
+  NCCLCHECK(ncclCalloc(&starts, nGroups));
+  NCCLCHECK(ncclCalloc(&ends, nGroups));
+
+  for (int ring = 0; ring<*nringsRet; ring++) {
+    for (int group = 0; group<nGroups; group++) {
+      int nranksInGroup = 0, nsubGroups = 0;
+      for (int rank=0; rank<nranks; rank++)
+        if (groups[rank] == group) {
+          nranksInGroup++;
+          nsubGroups = std::max(subgroups[rank], nsubGroups);
+        }
+      starts[group] = ends[group] = -1;
+      // Receive on the rank closest to the NIC
+      for (int card=0; card<NET_MAX_IFS; card++) {
+        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
+        int start = groupBestStart(nranks, groups, group, values, card, minScore);
+        if (start == -1) continue;  // no eligible rank on this card; also keeps subgroups[start] in bounds
+        // Send from any rank, but best on a different subgroup and close to the NIC also.
+        int end = (nranksInGroup == 1) ? start
+            : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
+        if (end != -1) {
+          cardUsed[group*NET_MAX_IFS+card] = 1;
+          starts[group] = start;
+          ends[group] = end;
+          break;
+        }
+      }
+      if (starts[group] == -1 || ends[group] == -1) {
+        *nringsRet = ring;  // this group cannot be served any more: keep only the rings completed so far
+        goto done;
+      }
+    }
+    // Link groups together
+    for (int group = 0; group<nGroups; group++) {
+      int nextGroup = (group+1)%nGroups;
+      next[ring*nranks+ends[group]] = starts[nextGroup];
+      prev[ring*nranks+starts[nextGroup]] = ends[group];
+    }
+  }
+done:
+  free(cardUsed);
+  free(starts);
+  free(ends);
+  return ncclSuccess;
+}
+
+int getDev(int cudaDev, int ringId) {
+  const ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
+  // Highest score found on any of the ncclNetNDev interfaces for this GPU
+  int best = 0;
+  for (int d=0; d<ncclNetNDev; d++) {
+    const int score = getScore(tvalues, d);
+    if (score > best) best = score;
+  }
+  // Cycle over the best-scoring interfaces and stop on the (ringId+1)-th one
+  int remaining = ringId+1;
+  while (remaining) {
+    for (int d=0; d<ncclNetNDev; d++) {
+      if (getScore(tvalues, d) != best) continue;
+      if (--remaining == 0) return d;
+    }
+  }
+  return 0;
+}
+
+extern bool useFineGrainVramPcie;
+
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
+
+static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
+  *useGdr = 0;  // default: disabled; every early 'return ncclSuccess' below leaves it at 0
+
+  int cudaDev, nvmlDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
+
+  if (!useFineGrainVramPcie) {
+    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
+    return ncclSuccess;
+  }
+
+  if (read) { // For reads (sends) only enable under certain conditions
+    int gdrReadParam = ncclParamNetGdrRead();  // NET_GDR_READ parameter, declared with default -2
+    if (gdrReadParam == 0) return ncclSuccess;  // explicitly disabled
+    if (gdrReadParam < 0) {
+       int nvlink;
+       NCCLCHECK(ncclNvlinkGpu(&nvlink));
+       if (!nvlink) return ncclSuccess;  // negative setting: reads are allowed only when ncclNvlinkGpu reports non-zero
+    }
+  }
+
+  // Check if we are close enough that it makes sense to enable GDR
+  int netGdrLevel = ncclParamNetGdrLevel();  // NET_GDR_LEVEL parameter, declared with default PATH_PHB
+  short distance;
+  NCCLCHECK(netDistance(cudaDev, dev, &distance));
+  if (distance >= netGdrLevel) {
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
+    return ncclSuccess;
+  }
+
+  // Finally, check if the NIC supports it
+  int flags;
+  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
+  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;  // the NIC does not report NCCL_PTR_CUDA support
+  *useGdr = 1;
+  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
+  return ncclSuccess;
+}
+
+/* Send-side setup for one channel: allocates the netSendResources, selects the network device,
+ * decides on GPU Direct RDMA and allocates the send/receive memory areas. connectInfo is not written here. */
+ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+  struct netSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;  // stored on the connector before the remaining steps run
+
+  int cudaDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  resources->netDev = getDev(cudaDev, channelId);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));  // read=1: the send side reads from GPU memory
+
+  int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;  // ncclRecvMem header followed by buffSize bytes of buffer
+  if (resources->useGdr) {
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));  // extra copy allocated with ncclCudaCalloc only when GDR is enabled
+  }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+  resources->buffSize = buffSize;
+
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "");
+  return ncclSuccess;
+}
+
+ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { // Receive-side setup: allocate resources, open a listen endpoint and return its handle in connectInfo
+  struct netRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  recv->transportResources = resources; // netRecvConnect reads it back via recv->transportResources
+  // NIC choice depends on the current HIP device and the channel id (getDev is defined outside this chunk)
+  int cudaDev;
+  CUDACHECK(hipGetDevice(&cudaDev));
+  resources->netDev = getDev(cudaDev, channelId);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr)); // read=0: receive path
+
+  int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+  // Receive-memory size: the ncclRecvMem layout up to its `buff` member plus buffSize bytes
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  if (resources->useGdr) {
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
+    CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, cudaDev)); // must be the LOCAL GPU: netRecvProxy writes this register after data lands in this process's devRecvMem; peerInfo->cudaDev is the sender's device index
+  }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+  resources->buffSize = buffSize; // kept for the ncclNetRegMr call in netRecvConnect
+
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "");
+  struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
+  NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); // netRecvConnect accepts on netListenComm; the handle is presumably exchanged with the peer, whose netSendConnect reads info->netHandle
+  return ncclSuccess;
+}
+
+ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+  // Send-side connect: publish buffer/counter pointers in send->conn, connect to the peer's handle, then register both buffers with the NIC
+  struct netSendResources* resources = (struct netSendResources*)send->transportResources;
+
+  // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  send->conn.buff = recvMem->buff;
+  send->conn.llBuff = resources->devHostRecvMem->llBuff;
+
+  // Head/Tail/Opcount/Fifos are always on host
+  send->conn.tail = &resources->devHostRecvMem->tail;
+  send->conn.opCountRem = &resources->devHostRecvMem->opCount;
+  send->conn.fifo = resources->devHostRecvMem->sizesFifo;
+  send->conn.head = &resources->devHostSendMem->head;
+  send->conn.opCountLoc = &resources->devHostSendMem->opCount;
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1; // -1 = empty slot (netSendProxy's LL path skips a slot while its size is -1); NOTE(review): written from host code through the devHost* pointer -- confirm it is host-dereferenceable
+
+  // Connect to remote peer
+  struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
+  NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+  // Register the data buffer (NCCL_PTR_CUDA when GDR is on, else NCCL_PTR_HOST) and the host LL buffer; netSendFree deregisters both handles
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
+        NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+
+  return ncclSuccess;
+}
+
+/* Receive-side connect: publish pointers in recv->conn, accept the peer on the listen endpoint opened by netRecvSetup, then register both buffers with the NIC */
+ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+  // Setup device pointers (connectInfo is not read in this function)
+  struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
+
+  // Intermediate buffering on GPU for GPU Direct RDMA; unlike the send side, the LL buffer follows recvMem as well
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  recv->conn.buff = recvMem->buff;
+  recv->conn.llBuff = recvMem->llBuff;
+
+  // Head/Tail/Opcount are always on host
+  recv->conn.tail = &resources->devHostRecvMem->tail;
+  recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
+  recv->conn.head = &resources->devHostSendMem->head;
+  recv->conn.opCountRem = &resources->devHostSendMem->opCount;
+
+  // Finish connection establishment from remote peer
+  NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+  NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); // the listen endpoint is closed as soon as the connection is accepted
+  // Register the data and LL buffers with the same pointer type as recvMem; netRecvFree deregisters both handles
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+
+  return ncclSuccess;
+}
+
+ncclResult_t netSendFree(void* transportResources) { // Release what netSendSetup/netSendConnect acquired, then the resources struct itself
+  struct netSendResources* resources = (struct netSendResources*)transportResources;
+  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle)); // handles come from ncclNetRegMr in netSendConnect
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
+  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  if (resources->useGdr) // devRecvMem is only allocated when useGdr was set in netSendSetup
+    CUDACHECK(hipFree(resources->devRecvMem));
+  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+  free(resources); // NOTE(review): if the CHECK macros return early on error, the releases after the failing call are skipped -- confirm this is acceptable
+  return ncclSuccess;
+}
+
+ncclResult_t netRecvFree(void* transportResources) { // Release what netRecvSetup/netRecvConnect acquired, then the resources struct itself
+  struct netRecvResources* resources = (struct netRecvResources*)transportResources;
+  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle)); // handles come from ncclNetRegMr in netRecvConnect
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
+  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  if (resources->useGdr) // devRecvMem is only allocated when useGdr was set in netRecvSetup
+    CUDACHECK(hipFree(resources->devRecvMem));
+  NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); // netListenComm was already closed in netRecvConnect
+  free(resources); // NOTE(review): if the CHECK macros return early on error, the releases after the failing call are skipped -- confirm this is acceptable
+  return ncclSuccess;
+}
+
+ncclResult_t netSendProxy(struct ncclProxyArgs* args) { // One progress step of the send-side proxy (no waiting loops here): post ready slices, retire completed sends, publish the new head
+  struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    resources->hostRecvMem->opCount = args->opCount;
+
+    // Round the running step counter up to a multiple of chunkSteps, then derive this operation's [head, end) window
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1; // cleared below whenever this call makes progress
+    if (args->head < args->end) {
+      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { // at most NCCL_STEPS steps may be posted but not yet retired
+        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+        if (args->llMode) {
+          int buffSlot = args->tail%NCCL_STEPS;
+          int size = LOAD(sizesFifo+buffSlot);
+          if (size != -1) { // -1 means the slot has no size posted yet
+            uint32_t flag = NCCL_LL_FLAG(args->tail + 1); // flag value every line of this step must carry
+            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+            size = nFifoLines * sizeof(union ncclLLFifoLine); // round the size up to whole FIFO lines
+            union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+            int ready = 1;
+            for (int i=0; i<nFifoLines; i++) { // send only once both flags of every line match; otherwise retry on a later call
+              volatile uint32_t *f1 = &lines[i].flag1;
+              volatile uint32_t *f2 = &lines[i].flag2;
+              if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
+            }
+            if (ready) {
+              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+              if (args->requests[buffSlot] != NULL) { // presumably NULL means nothing was posted; the slot is retried on a later call
+                STORE(sizesFifo+buffSlot, -1);
+                // Fence so the reset of the slot size to -1 is ordered before the bookkeeping that follows.
+                __sync_synchronize();
+                args->tail += args->sliceSteps;
+                args->idle = 0;
+              }
+            }
+          }
+        } else if (args->tail < LOAD(recvTail)) { // non-LL path: only send steps below hostRecvMem->tail
+          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+          int stepSize = args->channel->buffSize/NCCL_STEPS;
+          // Send through network
+          int buffSlot = args->tail%NCCL_STEPS;
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
+            STORE(sizesFifo+buffSlot, -1);
+            // Fence so the reset of the slot size to -1 is ordered before the bookkeeping that follows.
+            __sync_synchronize();
+            args->tail += args->sliceSteps;
+            args->idle = 0;
+          }
+        }
+      }
+      if (args->head < args->tail) { // at least one send is in flight: test the oldest one
+        int done;
+        int buffSlot = args->head%NCCL_STEPS;
+        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+        if (done) {
+          args->head += args->sliceSteps;
+          STORE(&resources->hostSendMem->head, args->head); // publish the new head through hostSendMem
+          args->idle = 0;
+        }
+      }
+    }
+    if (args->head == args->end) { // whole window retired: remember the position for the next operation
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { // One progress step of the receive-side proxy (no waiting loops here): post receives, retire completed ones, flush and publish the new tail
+  struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    resources->hostSendMem->opCount = args->opCount;
+
+    // Round the running step counter up to a multiple of chunkSteps, then derive this operation's [head, end) window
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1; // cleared below whenever this call makes progress
+    int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+    if (args->head < args->end) {
+      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+      char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
+      void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
+      volatile uint64_t* sendHead = &resources->hostSendMem->head;
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) { // room both relative to our own head and to hostSendMem->head
+        int buffSlot = args->tail%NCCL_STEPS;
+        int sliceSize = stepSize * args->sliceSteps;
+        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
+        if (args->requests[buffSlot] != NULL) { // presumably NULL means nothing was posted; the slot is retried on a later call
+          args->tail += args->sliceSteps;
+          args->idle = 0;
+        }
+      }
+      if (args->tail > args->head) { // at least one receive is in flight: test the oldest one
+        int buffSlot = args->head%NCCL_STEPS;
+        int done, size;
+        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+        if (done) {
+          args->head += args->sliceSteps;
+          if (args->llMode == 0) {
+            if (resources->useGdr) {
+              NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle)); // checked like the other net calls: a failed flush must not be followed by publishing the tail
+              // Flush local HDP register after local read-back finishes
+              STORE(resources->curr_hdp_reg, 0x1);
+              TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", resources->curr_hdp_reg);
+            }
+            STORE(&resources->hostRecvMem->tail, args->head); // publish the new tail through hostRecvMem (non-LL mode only)
+          }
+          args->idle = 0;
+        }
+      }
+    }
+    if (args->head == args->end) { // whole window retired: remember the position for the next operation
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpNone;
+    }
+  }
+  return ncclSuccess;
+}
+
+struct ncclTransport netTransport = { // positional initializer; field order must match struct ncclTransport, which is declared outside this chunk
+  "NET", // transport name
+  netCanConnect,
+  netGetRings,
+  { netSendSetup, netSendConnect, netSendFree, netSendProxy }, // send side: setup, connect, free, proxy
+  { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } // receive side: setup, connect, free, proxy
+};
diff --git a/projects/rccl/src/transport/net.cu b/projects/rccl/src/transport/net.cu
deleted file mode 100644
index 1c09c91378..0000000000
--- a/projects/rccl/src/transport/net.cu
+++ /dev/null
@@ -1,584 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "transport.h"
-#include "nvmlwrap.h"
-#include "net.h"
-#include "param.h"
-#include <hip/hip_runtime_api.h>
-#include <assert.h>
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#include "nvlink_stub.h"
-#else
-#include "nvlink.h"
-#endif
-
-#define NET_MAX_IFS 16
-
-// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
-#define NET_BITS_PER_IF 3
-#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
-static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
-static ncclTvalue_t getTvalue(short* distances, int ndev) {
-  ncclTvalue_t tvalue = 0;
-  for (int d=0; d<ndev; d++) {
-    int score = 1 + PATH_SOC - distances[d];
-    // Keep 3 bits of score info per dev
-    tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
-  }
-  return tvalue;
-}
-
-struct netInfo {
-  int rank;
-  int ndev;
-  ncclTvalue_t tValue;
-  short distances[NET_MAX_IFS];
-};
-
-struct netConnectInfo {
-  ncclNetHandle_t netHandle;
-};
-
-struct netSendResources {
-  void* netSendComm;
-  struct ncclSendMem* hostSendMem;
-  struct ncclRecvMem* hostRecvMem;
-  struct ncclSendMem* devHostSendMem;
-  struct ncclRecvMem* devHostRecvMem;
-  struct ncclSendMem* hostDevMem;
-  int netDev;
-  int useGdr;
-  struct ncclRecvMem* devNetMem;
-  uint64_t llStep;
-  uint64_t llLastCleaning;
-};
-
-struct netRecvResources {
-  void* netListenComm;
-  void* netRecvComm;
-  struct ncclSendMem* hostSendMem;
-  struct ncclRecvMem* hostRecvMem;
-  struct ncclSendMem* devHostSendMem;
-  struct ncclRecvMem* devHostRecvMem;
-  struct ncclRecvMem* hostDevMem;
-  int netDev;
-  int useGdr;
-  uint64_t llStep;
-  uint64_t llLastCleaning;
-};
-
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) {
-  struct netInfo* info = (struct netInfo*)opaqueInfo;
-  static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
-  info->rank = rank;
-  NCCLCHECK(ncclNetDevices(&info->ndev));
-  if (info->ndev == 0) {
-    WARN("Error : Network returned 0 device");
-    return ncclSystemError;
-  }
-  if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
-
-  // Find distance with current GPU
-  int cudaDev;
-  hipGetDevice(&cudaDev);
-  char* cudaPath;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
-
-  char line[1024];
-  sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
-  for (int d=0; d<info->ndev; d++) {
-    char* nicPath;
-    ncclResult_t err = ncclNetPciPath(d, &nicPath);
-    info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
-    sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
-    if (err == ncclSuccess) free(nicPath);
-  }
-  INFO(NCCL_INIT|NCCL_NET, "%s", line);
-  free(cudaPath);
-  return ncclSuccess;
-}
-
-/* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
-  return ncclSuccess;
-}
-
-static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
-  int bestRank = -1;
-  int bestScore = 0;
-  for (int rank=0; rank<nranks; rank++) {
-    if (groups[rank] != group) continue;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t netValue = values[rank*nranks+i];
-      if (netValue != 0) {
-        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
-        if (score >= minScore && score > bestScore) {
-          bestScore = score;
-          bestRank = rank;
-        }
-        // All other values should be the same, stop here for this rank
-        break;
-      }
-    }
-  }
-  return bestRank;
-}
-static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
-  // For the last rank, we don't need the absolute best score, just to be within minScore.
-  for (int rank=nranks-1; rank>=0; rank--) {
-    if (groups[rank] != group) continue;
-    if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
-    if (startRank == rank) continue;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t netValue = values[rank*nranks+i];
-      if (netValue != 0) {
-        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
-        if (score >= minScore) {
-          return rank;
-        }
-        // All other values should be the same, stop here for this rank
-        break;
-      }
-    }
-  }
-  return -1;
-}
-
-
-ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  int nGroups = groups[nranks-1] + 1;
-  int cardUsed[NET_MAX_IFS*nGroups];
-  for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
-
-  for (int ring = 0; ring<*nringsRet; ring++) {
-    int starts[nGroups];
-    int ends[nGroups];
-    for (int group = 0; group<nGroups; group++) {
-      int nranksInGroup = 0;
-      int nsubGroups = 0;
-      for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
-          nranksInGroup++;
-          nsubGroups = std::max(subgroups[rank], nsubGroups);
-        }
-      starts[group] = ends[group] = -1;
-      // Receive on the rank closest to the NIC
-      for (int card=0; card<NET_MAX_IFS; card++) {
-        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
-        int start = groupBestStart(nranks, groups, group, values, card, minScore);
-        // Send from any rank, but best on a different subgroup and close to the NIC also.
-        int end = (nranksInGroup == 1) ? start
-            : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
-        //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end);
-        if (start != -1 && end != -1) {
-          cardUsed[group*NET_MAX_IFS+card] = 1;
-          starts[group] = start;
-          ends[group] = end;
-          break;
-        }
-      }
-      if (starts[group] == -1 || ends[group] == -1) {
-        *nringsRet = ring;
-        return ncclSuccess;
-      }
-    }
-    // Link groups together
-    for (int group = 0; group<nGroups; group++) {
-      int nextGroup = (group+1)%nGroups;
-      next[ring*nranks+ends[group]] = starts[nextGroup];
-      prev[ring*nranks+starts[nextGroup]] = ends[group];
-    }
-  }
-  return ncclSuccess;
-}
-
-int getDev(int ringId, int nDev, short* distances) {
-  int minDistance = PATH_SOC;
-  for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
-  int skip = ringId+1;
-  while (skip) {
-    for (int d=0; d<nDev; d++) {
-      if (distances[d] == minDistance) {
-        skip--;
-        if (skip == 0) return d;
-      }
-    }
-  }
-  return 0;
-}
-
-NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
-NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-
-extern bool useFineGrainVramPcie;
-
-static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
-  *useGdr = 0;
-
-  int cudaDev;
-  CUDACHECK(hipGetDevice(&cudaDev));
-
-  if (!useFineGrainVramPcie) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
-    return ncclSuccess;
-  }
-
-  if (read) { // For reads (sends) only enable under certain conditions
-    int gdrReadParam = ncclParamNetGdrRead();
-    if (gdrReadParam == 0) return ncclSuccess;
-    else if (gdrReadParam < 0) { // default : enable only on DGX2
-      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-      int nvlinks = getNumNvlinks(busId);
-      if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
-    }
-  }
-
-  // Check if we are close enough that it makes sense to enable GDR
-  int netGdrLevel = ncclParamNetGdrLevel();
-  if (distance >= netGdrLevel) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
-    return ncclSuccess;
-  }
-
-  // Finally, check if the NIC supports it
-  int flags;
-  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
-  if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
-  *useGdr = 1;
-  INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
-  return ncclSuccess;
-}
-
-/* Determine if we will use this transport for this peer and return connect
- * information for this peer */
-ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct netSendResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->send.transportResources = resources;
-
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
-
-  int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size, true));
-  }
-
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size));
-
-  return ncclSuccess;
-}
-
-ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct netRecvResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->recv.transportResources = resources;
-
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
-
-  if (resources->useGdr) {
-    // Collect HDR register for local GPU to initiate flush after receive
-    int cudaDev;
-    hipGetDevice(&cudaDev);
-    CUDACHECK(hipDeviceGetAttribute((int*)&ring->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, cudaDev));
-  }
-
-  int sendSize = sizeof(struct ncclSendMem);
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
-
-  int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
-
-  struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "",
-      (resources->hostDevMem != NULL) ? "/GDCopy" : "");
-  struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
-  NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
-  return ncclSuccess;
-}
-
-ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
-  // Setup device pointers
-  struct netSendResources* resources = (struct netSendResources*)send->transportResources;
-
-  if (resources->useGdr) {
-    send->conn.buff = resources->devNetMem->buff;
-    // We don't use devMem for llMode because the CPU has to read the data
-    send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  } else {
-    send->conn.buff = resources->devHostRecvMem->buff;
-    send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  }
-  send->conn.tail = &resources->devHostRecvMem->tail;
-  send->conn.opCount = &resources->devHostRecvMem->opCount;
-  send->conn.fifo = resources->devHostRecvMem->sizesFifo;
-  send->conn.llFifo = resources->devHostRecvMem->llSizesFifo;
-
-  if (resources->hostDevMem == NULL) {
-    send->conn.head = &resources->devHostSendMem->head;
-    send->conn.llHead = &resources->devHostSendMem->llHead;
-  }
-
-  // Connect to remote peer
-  struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
-  NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
-  return ncclSuccess;
-}
-
-/* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
-  // Setup device pointers
-  struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
-
-  recv->conn.head = &resources->devHostSendMem->head;
-  recv->conn.llHead = &resources->devHostSendMem->llHead;
-
-  if (resources->useGdr == 0) {
-    recv->conn.buff = resources->devHostRecvMem->buff;
-    recv->conn.llBuff = resources->devHostRecvMem->llBuff;
-  }
-
-  if (resources->hostDevMem == NULL) {
-    recv->conn.tail = &resources->devHostRecvMem->tail;
-    recv->conn.opCount = &resources->devHostRecvMem->opCount;
-  }
-
-  // Finish connection establishment
-  NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
-  NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
-
-  return ncclSuccess;
-}
-
-ncclResult_t netSendFree(void* transportResources) {
-  struct netSendResources* resources = (struct netSendResources*)transportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
-  if (resources->useGdr)
-    CUDACHECK(hipFree(resources->devNetMem));
-  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
-  free(resources);
-  return ncclSuccess;
-}
-
-ncclResult_t netRecvFree(void* transportResources) {
-  struct netRecvResources* resources = (struct netRecvResources*)transportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
-  NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
-  free(resources);
-  return ncclSuccess;
-}
-
-ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
-  struct ncclRing* ring = args->ring;
-  struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
-  const int llMode = args->llMode;
-
-  volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
-  struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
-  uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
-  struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
-  char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
-  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-  volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
-  int sliceSize = buffSize / args->substeps;
-
-  assert(args->substeps <= SIZES_FIFO_SIZE);
-
-  uint64_t head = llMode ? resources->llStep : 0ULL;
-  uint64_t tail = llMode ? resources->llStep : 0ULL;
-  uint64_t end = head + args->nsteps;
-
-  int idle = 0;
-  void* requests[args->substeps];
-
-  if (!args->needProxy) goto nextColl;
-
-  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
-  // Update in case we skipped some collectives
-  if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
-
-  while (head < end) {
-    idle++;
-    if (llMode) {
-      if (tail < end && tail < head + args->substeps) {
-        int slot = tail%args->substeps;
-        int size = LOAD(&sizesFifo[slot]);
-        if (size != 0) {
-          if (size == -1) size = 0;
-          uint32_t flag = tail + 1;
-          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-          size = nFifoLines * sizeof(union ncclLLFifoLine);
-          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
-          for (int i=0; i<nFifoLines; i++) {
-            volatile uint32_t *f1 = &lines[i].flag1;
-            volatile uint32_t *f2 = &lines[i].flag2;
-            while (LOAD(&f1[0]) != flag || LOAD(&f2[0]) != flag);
-          }
-          NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
-          if (requests[slot] != NULL) {
-            STORE(&sizesFifo[slot], size);
-            tail++;
-            idle = 0;
-          }
-        }
-      }
-    } else while (tail < LOAD(prevTail)) {
-        // Send through network
-        int slot = tail%args->substeps;
-        //TRACE(NCCL_NET,"head %d tail %d prevTail %d slot %d size %d ptrType %d", head, tail, LOAD(prevTail), slot, LOAD(&sizesFifo[slot]), ptrType);
-        NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, LOAD(&sizesFifo[slot]), ptrType, requests+slot));
-        if (requests[slot] != NULL) {
-          tail++;
-          idle = 0;
-        }
-      }
-    if (head < tail) {
-      int done;
-      int slot = head%args->substeps;
-      NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
-      if (done) {
-        if (llMode) {
-          STORE(&sizesFifo[slot], 0);
-          // Make sure size is reset to zero before we update the head.
-          __sync_synchronize();
-        }
-        head++;
-        STORE(prevHead, head);
-        idle = 0;
-      }
-    }
-    if (idle) transportProxyIdle(idle);
-  }
-
-  // Reset
-  if (llMode == 0) STORE(prevTail, 0);
-
-nextColl:
-  if (llMode) {
-    resources->llStep += args->nsteps;
-    // Don't forget to ack otherwise the GPU won't be able to push data.
-    STORE(prevHead, resources->llStep);
-    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
-      resources->llStep += NCCL_LL_CHUNKS;
-      STORE(prevHead, resources->llStep);
-      resources->llLastCleaning = resources->llStep;
-    }
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
-  struct ncclRing* ring = args->ring;
-  struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
-  int llMode = args->llMode;
-
-  volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
-  struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
-  char* localBuff = llMode ? localMem->llBuff : localMem->buff;
-  char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
-  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-  uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
-
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
-  int sliceSize = buffSize / args->substeps;
-
-  uint64_t head = llMode ? resources->llStep : 0ULL;
-  uint64_t tail = llMode ? resources->llStep : 0ULL;
-  uint64_t end = head + args->nsteps;
-
-  int idle = 0;
-  void* requests[args->substeps];
-
-  if (!args->needProxy) goto nextColl;
-
-  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
-  if (llMode == 0) {
-    // Waiting for next opCount is only needed before writing nextTail.
-    uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
-    transportProxyWait([=] { return *nextOpCount >= args->opCount; });
-  }
-
-  while (head < end) {
-    idle++;
-    if ((tail < head + args->substeps) && (tail < LOAD(nextHead) + args->substeps) && (tail < end)) {
-      int slot = tail%args->substeps;
-      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
-      if (requests[slot] != NULL) {
-        tail++;
-        idle = 0;
-      }
-    }
-    if (tail > head) {
-      int done;
-      int slot = head%args->substeps;
-      int size;
-      NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
-      if (done) {
-        if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
-        head++;
-        if (llMode == 0) {
-          if (ptrType == NCCL_PTR_CUDA) {
-              ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
-
-              // Flush local HDP register after local read-back finishes
-              STORE(ring->curr_hdp_reg, 0x1);
-              TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", ring->curr_hdp_reg);
-          }
-          //TRACE(NCCL_NET,"head %d tail %d slot %d size %d ptrType %d", head, tail, slot, size, ptrType);
-          STORE(nextTail, head);
-        }
-        idle = 0;
-      }
-    }
-    if (idle) transportProxyIdle(idle);
-  }
-
-  // Wait for last ack and reset
-  if (llMode == 0) {
-    transportProxyWait([=] { return LOAD(nextHead) == head; });
-    STORE(nextHead, 0);
-  }
-
-nextColl:
-  if (llMode) {
-    resources->llStep += args->nsteps;
-    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      resources->llStep += NCCL_LL_CHUNKS;
-      while (LOAD(nextHead) < resources->llStep);
-      resources->llLastCleaning = resources->llStep;
-    }
-  }
-  return ncclSuccess;
-}
-
-struct ncclTransport netTransport = {
-  "NET",
-  netFillInfo,
-  netCanConnect,
-  netGetRings,
-  { netSendSetup, netSendConnect, netSendFree, netSendProxy },
-  { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
-};
diff --git a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cc
similarity index 78%
rename from projects/rccl/src/transport/net_ib.cu
rename to projects/rccl/src/transport/net_ib.cc
index cbe2f9c45f..bfb2d8d437 100644
--- a/projects/rccl/src/transport/net_ib.cu
+++ b/projects/rccl/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -33,6 +33,7 @@ static int ncclNIbDevs = -1;
 struct ncclIbDev {
   int device;
   uint8_t port;
+  uint8_t link;
   ibv_context* context;
   char devName[MAXNAMESIZE];
 };
@@ -98,7 +99,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
         WARN("NET/IB : No IP interface found.");
         return ncclInternalError;
       }
-      INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
 
       // Detect IB cards
       int nIbDevs;
@@ -108,53 +108,67 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
       char* userIbEnv = getenv("NCCL_IB_HCA");
       struct netIf userIfs[MAX_IB_DEVS];
       bool searchNot = userIbEnv && userIbEnv[0] == '^';
+      bool searchExact = userIbEnv && userIbEnv[0] == '=';
       int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
 
       if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
 
-      for (int d=0; d<nIbDevs; d++) {
+      for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
         struct ibv_context * context;
-        if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+        if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
           WARN("NET/IB : Unable to open device %s", devices[d]->name);
           continue;
         }
-        int found = 0;
-        if (context) {
-          struct ibv_device_attr devAttr;
-          if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
-            WARN("NET/IB : Unable to query device %s", devices[d]->name);
+        int nPorts = 0;
+        struct ibv_device_attr devAttr;
+        memset(&devAttr, 0, sizeof(devAttr));
+        if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+          WARN("NET/IB : Unable to query device %s", devices[d]->name);
+          if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+          continue;
+        }
+        for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+          struct ibv_port_attr portAttr;
+          if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+            WARN("NET/IB : Unable to query port %d", port);
             continue;
           }
-          for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
-            struct ibv_port_attr portAttr;
-            if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
-              WARN("NET/IB : Unable to query port %d", port);
-              continue;
-            }
-            if (portAttr.state != IBV_PORT_ACTIVE) continue;
-            if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
-                && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
+          if (portAttr.state != IBV_PORT_ACTIVE) continue;
+          if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+              && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
 
-            // check against user specified HCAs/ports
-            if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
-              continue;
-            }
-            INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
-                portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
-            ncclIbDevs[ncclNIbDevs].device = d;
-            ncclIbDevs[ncclNIbDevs].port = port;
-            ncclIbDevs[ncclNIbDevs].context = context;
-            strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
-            ncclNIbDevs++;
-            found++;
-            pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
+          // check against user specified HCAs/ports
+          if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) {
+            continue;
           }
-
-          if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
+          TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+              portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+          ncclIbDevs[ncclNIbDevs].device = d;
+          ncclIbDevs[ncclNIbDevs].port = port;
+          ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+          ncclIbDevs[ncclNIbDevs].context = context;
+          strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+          ncclNIbDevs++;
+          nPorts++;
+          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
         }
+        if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
       }
       if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
     }
+    if (ncclNIbDevs == 0) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
+    } else {
+      char line[1024];
+      line[0] = '\0';
+      for (int d=0; d<ncclNIbDevs; d++) {
+        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
+            ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+      }
+      line[1023] = '\0';
+      char addrline[1024];
+      INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
+    }
     pthread_mutex_unlock(&ncclIbLock);
   }
   return ncclSuccess;
@@ -183,7 +197,7 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
 ncclResult_t ncclIbGdrSupport(int ibDev) {
   static int moduleLoaded = -1;
   if (moduleLoaded == -1) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
     moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
 #else
     moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
@@ -210,11 +224,12 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
   *supportedTypes = NCCL_PTR_HOST;
 
-  int cudaDev;
+  int cudaDev, nvmlDev;
   CUDACHECK(hipGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
 
   if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
     return ncclSuccess;
   }
   *supportedTypes |= NCCL_PTR_CUDA;
@@ -247,23 +262,15 @@ struct ncclIbHandle {
   union socketAddress connectAddr;
 };
 
-struct ncclIbMr {
-  struct ibv_mr* mr;
-  int refcnt;
-};
-
 struct ncclIbVerbs {
   struct ibv_pd* pd;
   struct ibv_cq* cq;
-  struct ncclIbMr mrPool[MAX_REQUESTS];
-  int mrRotation;
 };
 
 struct ncclIbRequest {
   int used;
   int type;
   struct ncclIbVerbs* verbs;
-  struct ncclIbMr * ibMr;
   int done;
   int size;
   int free;
@@ -283,12 +290,12 @@ struct ncclIbSendFifo {
 };
 
 struct ncclIbSendComm {
+  struct ncclIbVerbs verbs;
   struct ncclIbSendFifo fifo[MAX_REQUESTS];
   struct ncclIbRequest reqs[MAX_REQUESTS];
   uint32_t fifoHead;
   int fd;
   int ready;
-  struct ncclIbVerbs verbs;
   struct ibv_qp* qp;
   struct ibv_mr* fifoMr;
 };
@@ -312,11 +319,11 @@ struct ncclIbRemFifo {
 };
 
 struct ncclIbRecvComm {
+  struct ncclIbVerbs verbs;
   struct ncclIbRemFifo remFifo;
   struct ncclIbRequest reqs[MAX_REQUESTS];
   int fd;
   int ready;
-  struct ncclIbVerbs verbs;
   struct ibv_qp* qp;
   struct ncclIbGpuFlush gpuFlush;
 };
@@ -439,13 +446,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   // RoCE support
   qpInfo.lid = portAttr.lid;
   if (qpInfo.lid) { // IB
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+    INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
   } else { // RoCE
     union ibv_gid gid;
     NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
     qpInfo.spn = gid.global.subnet_prefix;
     qpInfo.iid = gid.global.interface_id;
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+    INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
   }
 
   NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@@ -542,7 +549,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
       r->used = 1;
       r->type = 0;
       r->verbs = NULL;
-      r->ibMr = NULL;
       r->done = 0;
       r->size = -1;
       r->free = 0;
@@ -588,57 +594,34 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
 
 #define REG_ALIGN (4096)
 
-// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv
-ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
   uint64_t addr = (uint64_t)data;
-  int elem = -1;
   assert(size > 0);
 
-  // Look for an already existing MR
-  for (int i=0; i<MAX_REQUESTS; i++) {
-    if (verbs->mrPool[i].mr == NULL) continue;
-    uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
-    uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
-    if (regAddr <= addr && addr+size <= regAddr+regSize) {
-      *mrRet = verbs->mrPool+i;
-      verbs->mrPool[i].refcnt++;
-      return ncclSuccess;
-    }
-  }
-
-  // Find an unused element
-  if (elem == -1) {
-    elem = (verbs->mrRotation++);
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      elem %= MAX_REQUESTS;
-      if (verbs->mrPool[elem].refcnt > 0) elem++; else break;
-    }
-    if (verbs->mrPool[elem].refcnt > 0) {
-      WARN("NET/IB : memory register : no MR available");
-      return ncclInternalError;
-    }
-  }
-
-  assert(elem < MAX_REQUESTS);
-  assert(verbs->mrPool[elem].refcnt == 0);
-
   // Deregister / register
   uint64_t regAddr = addr & (~(REG_ALIGN-1));
   uint64_t regSize = addr+size - regAddr;
   regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
-  if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
-  NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
-  *mrRet = verbs->mrPool+elem;
-  verbs->mrPool[elem].refcnt++;
-  TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+  struct ibv_mr* mr;
+  NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+  *mhandle = (void*)mr;
+  TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+  NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
   // Wait for the receiver to have posted the corresponding receive
   volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
   volatile uint32_t * readyPtr = &slot->ready;
@@ -646,7 +629,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->type = type;
   req->verbs = &comm->verbs;
   req->size = size;
 
@@ -659,8 +641,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
     wr.sg_list = NULL;
     wr.num_sge = 0;
   } else {
-    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
-    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
@@ -671,22 +652,22 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
   __sync_synchronize(); // order the readyPtr load against rkey load below
   // Sanity checks to catch user collective call count/size mismatches
   // plus any potential programming errors
-  if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
+  if (size > LOAD(&slot->size) || LOAD(&slot->size) <= 0 || LOAD(&slot->addr) == 0 || LOAD(&slot->rkey) == 0 || LOAD(&slot->seq) != comm->fifoHead) {
     WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
-        size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
+        size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
     return ncclInternalError;
   }
   wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-  wr.wr.rdma.remote_addr = slot->addr;
-  wr.wr.rdma.rkey = slot->rkey;
+  wr.wr.rdma.remote_addr = LOAD(&slot->addr);
+  wr.wr.rdma.rkey = LOAD(&slot->rkey);
   wr.imm_data = size; // Send the message size via imm_data
   __sync_synchronize();
 #endif
   // We must clear slot->ready, but reset other fields to aid
   // debugging and sanity checks
   STORE(&slot->ready, 0);
-  slot->addr = 0ULL;
-  slot->rkey = slot->size = slot->seq = 0;
+  STORE(&slot->addr, 0);
+  STORE(&slot->rkey, 0); STORE(&slot->size, 0); STORE(&slot->seq, 0);
   comm->fifoHead++;
 
   struct ibv_send_wr* bad_wr;
@@ -725,14 +706,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->type = type;
   req->verbs = &comm->verbs;
   req->size = size;
 
@@ -744,10 +726,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
   if (size == 0) {
     wr.sg_list = NULL;
     wr.num_sge = 0;
-    req->ibMr = NULL;
   } else {
-    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
-    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
@@ -757,25 +737,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
   *request = req;
 
   // Post to FIFO to notify sender
-  NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size));
+  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
   req->verbs = &comm->verbs;
-  NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr));
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
   wr.wr_id = (uint64_t)req;
 
   wr.wr.rdma.remote_addr = (uint64_t)data;
-  wr.wr.rdma.rkey = req->ibMr->mr->rkey;
+  wr.wr.rdma.rkey = mr->rkey;
   wr.sg_list = &comm->gpuFlush.sge;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_READ;
@@ -805,32 +785,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
     }
 
     int wrDone = 0;
-    struct ibv_wc wc;
-    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+    struct ibv_wc wcs[4];
+    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
     if (wrDone == 0) return ncclSuccess;
 
-    if (wc.status != IBV_WC_SUCCESS) {
-      WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
-      return ncclSystemError;
-    }
+    for (int w=0; w<wrDone; w++) {
+      struct ibv_wc *wc = wcs+w;
+      if (wc->status != IBV_WC_SUCCESS) {
+        WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+        return ncclSystemError;
+      }
 
-    struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
-    if (doneReq) {
-      if (wc.opcode == IBV_WC_RECV) {
-        doneReq->size = wc.byte_len;
+      struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
+      if (doneReq) {
+        if (wc->opcode == IBV_WC_RECV) {
+          doneReq->size = wc->byte_len;
 #if USE_RDMA_WRITE
-      } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-        doneReq->size = wc.imm_data;
+        } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+          doneReq->size = wc->imm_data;
 #endif
-      }
-      if (doneReq->ibMr != NULL) {
-        doneReq->ibMr->refcnt--;
-        if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
-      }
-      doneReq->done = 1;
-      if (doneReq->free == 1) {
-        // This is an internal (FIFO post) req. Free it immediately.
-        doneReq->used = 0;
+        }
+        doneReq->done = 1;
+        if (doneReq->free == 1) {
+          // This is an internal (FIFO post) req. Free it immediately.
+          doneReq->used = 0;
+        }
       }
     }
   }
@@ -842,12 +821,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
     close(comm->fd);
     if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
     if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      if (comm->verbs.mrPool[i].mr != NULL) {
-        if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
-        NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
-      }
-    }
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
@@ -864,12 +837,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
       if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
     }
     if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      if (comm->verbs.mrPool[i].mr != NULL) {
-        if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
-        NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
-      }
-    }
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
@@ -894,6 +861,8 @@ ncclNet_t ncclNetIb = {
   ncclIbListen,
   ncclIbConnect,
   ncclIbAccept,
+  ncclIbRegMr,
+  ncclIbDeregMr,
   ncclIbIsend,
   ncclIbIrecv,
   ncclIbFlush,
diff --git a/projects/rccl/src/transport/net_socket.cc b/projects/rccl/src/transport/net_socket.cc
new file mode 100644
index 0000000000..ec0e50d518
--- /dev/null
+++ b/projects/rccl/src/transport/net_socket.cc
@@ -0,0 +1,486 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "param.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <poll.h>
+#include <limits.h>
+#include <fcntl.h>
+
+/* Init functions */
+static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress ncclNetIfAddrs[MAX_IFS];
+static int ncclNetIfs = -1;
+pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+
+ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
+  if (ncclNetIfs == -1) {
+    pthread_mutex_lock(&ncclSocketLock);
+    if (ncclNetIfs == -1) {
+      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      if (ncclNetIfs <= 0) {
+        WARN("NET/Socket : no interface found");
+        return ncclInternalError;
+      } else {
+        char line[1024];
+        char addrline[1024];
+        line[0] = '\0';
+        for (int i=0; i<ncclNetIfs; i++) {
+          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+              socketToString(&ncclNetIfAddrs[i].sa, addrline));
+        }
+        line[1023] = '\0';
+        INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
+      }
+    }
+    pthread_mutex_unlock(&ncclSocketLock);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
+  *supportedTypes = NCCL_PTR_HOST;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketDevices(int* ndev) {
+  *ndev = ncclNetIfs;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
+  char devicepath[PATH_MAX];
+  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+  *path = realpath(devicepath, NULL);
+  if (*path == NULL) {
+    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
+  if (dev >= ncclNetIfs) return ncclInternalError;
+  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+  return ncclSuccess;
+}
+
+/* Communication functions */
+
+#define MAX_SOCKETS 64
+#define MAX_THREADS 16
+#define MAX_REQUESTS 128
+#define MAX_QUEUE_LEN MAX_REQUESTS
+#define MIN_CHUNKSIZE (64*1024)
+
+NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
+NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+
+struct ncclSocketHandle {
+  union socketAddress connectAddr;
+  int nSocks;
+  int nThreads;
+};
+
+struct ncclSocketTask {
+  int op;
+  void* data;
+  int size;
+  int fd;
+  int offset;
+  int used;
+  ncclResult_t result;
+};
+
+struct ncclSocketRequest {
+  int op;
+  void* data;
+  int size;
+  int ctrlFd;
+  int used;
+  struct ncclSocketComm* comm;
+  struct ncclSocketTask* tasks[MAX_SOCKETS];
+  int nSubs;
+};
+
+struct ncclSocketTaskQueue {
+  int next;
+  struct ncclSocketTask* tasks;
+};
+
+enum threadState {start, stop};
+
+struct ncclSocketThreadResources {
+  struct ncclSocketTaskQueue threadTaskQueue;
+  enum threadState state;
+  struct ncclSocketComm* comm;
+  pthread_mutex_t threadLock;
+  pthread_cond_t  threadCond;
+};
+
+struct ncclSocketListenComm {
+  int fd;
+  int nSocks;
+  int nThreads;
+};
+
+struct ncclSocketComm {
+  int ctrlFd;
+  int fds[MAX_SOCKETS];
+  int nSocks;
+  int nThreads;
+  int nextFd;
+  struct ncclSocketRequest requests[MAX_REQUESTS];
+  pthread_t helperThread[MAX_THREADS];
+  struct ncclSocketThreadResources threadResources[MAX_THREADS];
+};
+
+void* persistentSocketThread(void *args_) {
+  struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
+  struct ncclSocketComm* comm = resource->comm;
+  volatile enum threadState* state = &resource->state;
+  struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
+  int nSocksPerThread = comm->nSocks / comm->nThreads;
+  while (1) {
+    int idle = 1;
+    int mark = myQueue->next; // mark newest task seen
+    for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
+      int repeat;
+      do {
+        repeat = 0;
+        for (int j=0; j<nSocksPerThread; j++) {
+          struct ncclSocketTask* r = myQueue->tasks+i+j;
+          if (r != NULL && r->used == 1 && r->offset < r->size) {
+            r->result = socketProgress(r->op, r->fd, r->data, r->size, &r->offset);
+            if (r->result != ncclSuccess) {
+              WARN("NET/Socket : socket progress error");
+              return NULL;
+            }
+            idle = 0;
+            if (r->offset < r->size) repeat = 1;
+          }
+        }
+      } while (repeat);
+    }
+    if (idle) {
+      pthread_mutex_lock(&resource->threadLock);
+      while (mark == myQueue->next && LOAD(state) != stop) { // no new tasks, wait
+        pthread_cond_wait(&resource->threadCond, &resource->threadLock);
+      }
+      pthread_mutex_unlock(&resource->threadLock);
+    }
+    if (LOAD(state) == stop) return NULL;
+  }
+}
+
+ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
+  int nSocksPerThread = ncclParamSocketNsocksPerThread();
+  int nThreads = ncclParamSocketNthreads();
+  if (nThreads > MAX_THREADS) {
+    WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
+    nThreads = MAX_THREADS;
+  }
+  if (nThreads == -2 || nSocksPerThread == -2) {
+    // Auto-detection
+    int autoNt=1, autoNs=1;
+    char vendorPath[PATH_MAX];
+    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+    char* rPath = realpath(vendorPath, NULL);
+    int fd = open(rPath, O_RDONLY);
+    free(rPath);
+    if (fd == -1) {
+      // Could not find device vendor. This is handled silently so
+      // we don't want to print an INFO error.
+      TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
+      goto end;
+    }
+    char vendor[7];
+    strncpy(vendor, "0x0000", 7);
+    int len;
+    SYSCHECKVAL(read(fd, vendor, 6), "read", len);
+    SYSCHECK(close(fd), "close");
+    if (strcmp(vendor, "0x1d0f") == 0) { // AWS
+      autoNt = 2;
+      autoNs = 8;
+    }
+end:
+    if (nThreads == -2) nThreads = autoNt;
+    if (nSocksPerThread == -2) nSocksPerThread = autoNs;
+  }
+  int nSocks = nSocksPerThread * nThreads;
+  if (nSocks > MAX_SOCKETS) {
+    nSocksPerThread = MAX_SOCKETS/nThreads;
+    WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
+    nSocks = nSocksPerThread * nThreads;
+  }
+  *ns = nSocks;
+  *nt = nThreads;
+  INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
+  NCCLCHECK(ncclCalloc(comm, 1));
+  (*comm)->fd = -1;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
+  NCCLCHECK(ncclCalloc(comm, 1));
+  (*comm)->ctrlFd = -1;
+  for (int i=0; i < MAX_SOCKETS; i++) {
+    (*comm)->fds[i] = -1;
+  }
+  (*comm)->nextFd = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
+  if (dev < 0) { // data transfer socket is based on specified dev
+    return ncclInternalError;
+  }
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+  struct ncclSocketListenComm* comm;
+  NCCLCHECK(ncclSocketNewListenComm(&comm));
+  NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr));
+  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+  NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
+  handle->nSocks = comm->nSocks;
+  handle->nThreads = comm->nThreads;
+  *listenComm = comm;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
+  if (dev < 0) { // data transfer socket is based on specified dev
+    return ncclInternalError;
+  }
+  struct ncclSocketComm* comm;
+  NCCLCHECK(ncclSocketNewComm(&comm));
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  comm->nSocks = handle->nSocks;
+  comm->nThreads = handle->nThreads;
+  for (int i=0; i<comm->nSocks+1; i++) {
+    int tmpFd, offset=0;
+    NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr));
+    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &i, sizeof(int), &offset));
+    if (i == comm->nSocks) comm->ctrlFd = tmpFd;
+    else comm->fds[i] = tmpFd;
+  }
+  *sendComm = comm;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
+  struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
+  struct ncclSocketComm* rComm;
+  NCCLCHECK(ncclSocketNewComm(&rComm));
+  rComm->nSocks = lComm->nSocks;
+  rComm->nThreads = lComm->nThreads;
+  for (int i=0; i<rComm->nSocks+1; i++) {
+    int tmpFd, sendSockIdx, offset=0;
+    struct sockaddr_in sockaddr;
+    socklen_t socklen = sizeof(struct sockaddr_in);
+    SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", tmpFd);
+    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &sendSockIdx, sizeof(int), &offset));
+    if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd;
+    else rComm->fds[sendSockIdx] = tmpFd;
+  }
+  *recvComm = rComm;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) {
+  for (int i=0; i<MAX_REQUESTS; i++) {
+    struct ncclSocketRequest* r = comm->requests+i;
+    if (r->used == 0) {
+      r->op = op;
+      r->data = data;
+      r->size = size;
+      r->ctrlFd = comm->ctrlFd;
+      r->used = 1;
+      r->comm = comm;
+      r->nSubs = 0;
+      *req = r;
+      return ncclSuccess;
+    }
+  }
+  WARN("NET/Socket : unable to allocate requests");
+  return ncclInternalError;
+}
+
+// Enqueue one chunk of a request on a helper thread's task queue; the queue and thread are created on first use.
+ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
+  int threadIdx = comm->nextFd % comm->nThreads;
+  struct ncclSocketThreadResources* res = &comm->threadResources[threadIdx];
+  struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
+  if (queue->tasks == NULL) {  // first task routed to this thread slot: build its queue and start its helper
+    NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
+    queue->next = 0;
+    res->comm = comm;
+    pthread_mutex_init(&res->threadLock, NULL);
+    pthread_cond_init(&res->threadCond, NULL);
+    pthread_create(&comm->helperThread[threadIdx], NULL, persistentSocketThread, res);  // NOTE(review): result is not checked
+  }
+  struct ncclSocketTask* task = &queue->tasks[queue->next];
+  if (task->used != 0) {  // the slot at queue->next has not been released yet
+    WARN("NET/Socket : unable to allocate subtasks");
+    return ncclInternalError;
+  }
+  task->op = op;
+  task->data = data;
+  task->size = size;
+  task->fd = comm->fds[comm->nextFd];
+  task->offset = 0;
+  task->result = ncclSuccess;
+  comm->nextFd = (comm->nextFd + 1) % comm->nSocks;  // rotate over the data sockets
+  task->used = 1;
+  *req = task;
+  pthread_mutex_lock(&res->threadLock);  // advance the queue and signal threadCond under the thread's lock
+  queue->next = (queue->next + 1) % MAX_QUEUE_LEN;
+  res->state = start;
+  pthread_cond_signal(&res->threadCond);
+  pthread_mutex_unlock(&res->threadLock);
+  return ncclSuccess;
+}
+
+// Progress a request. While used == 1 the 4-byte size is exchanged on the control socket and the payload is
+// split into tasks; while used == 2 those tasks are polled, and *done becomes 1 once all of them have finished.
+ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
+  *done = 0;
+  struct ncclSocketRequest* sockReq = (struct ncclSocketRequest*)request;
+  if (sockReq == NULL) {
+    WARN("NET/Socket : test called with NULL request");
+    return ncclInternalError;
+  }
+  if (sockReq->used == 1) {
+    // Step 1: send/recv the message size over the control socket
+    int msgSize = sockReq->size;
+    int moved = 0;
+    NCCLCHECK(socketProgress(sockReq->op, sockReq->ctrlFd, &msgSize, sizeof(int), &moved));
+    if (moved == 0) return ncclSuccess;  // nothing transferred yet -- the caller retries later
+    // A partially transferred size is not expected, but complete it with socketWait if it happens
+    if (moved < sizeof(int)) NCCLCHECK(socketWait(sockReq->op, sockReq->ctrlFd, &msgSize, sizeof(int), &moved));
+    // An incoming message must fit in the size the user posted
+    if (sockReq->op == NCCL_SOCKET_RECV && msgSize > sockReq->size) {
+      WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", msgSize, sockReq->size);
+      return ncclInternalError;
+    }
+    sockReq->size = msgSize;
+    sockReq->used = 2;  // size exchange finished
+    // Step 2: cut the payload into chunks and queue one task per chunk
+    int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(sockReq->size, sockReq->comm->nSocks));
+    int nTasks = 0;
+    int consumed = 0;
+    while (consumed < sockReq->size) {
+      int len = std::min(taskSize, sockReq->size - consumed);
+      NCCLCHECK(ncclSocketGetTask(sockReq->comm, sockReq->op, (char*)(sockReq->data) + consumed, len, &sockReq->tasks[nTasks]));
+      nTasks++;
+      consumed += len;
+    }
+    sockReq->nSubs = nTasks;
+  }
+  if (sockReq->used == 2) {
+    // Step 3: poll the queued tasks; the first task error found is returned as-is
+    int nCompleted = 0;
+    for (int t = 0; t < sockReq->nSubs; t++) {
+      struct ncclSocketTask* task = sockReq->tasks[t];
+      if (task->result != ncclSuccess) return task->result;
+      if (task->offset == task->size) nCompleted++;
+    }
+    if (nCompleted == sockReq->nSubs) {
+      if (size) *size = sockReq->size;
+      *done = 1;
+      sockReq->used = 0;
+      for (int t = 0; t < sockReq->nSubs; t++) sockReq->tasks[t]->used = 0;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Memory registration does no work for sockets: only NCCL_PTR_HOST is accepted and *mhandle is left untouched.
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { return (type == NCCL_PTR_HOST) ? ncclSuccess : ncclInternalError; }
+// Deregistration always succeeds; neither argument is used.
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+
+// Reserve a SEND request for `data`; the transfer itself is carried out by later ncclSocketTest calls. mhandle is unused.
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+  NCCLCHECK(ncclSocketGetRequest((struct ncclSocketComm*)sendComm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
+  return ncclSuccess;
+}
+
+// Reserve a RECV request for `data`; the transfer itself is carried out by later ncclSocketTest calls. mhandle is unused.
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+  NCCLCHECK(ncclSocketGetRequest((struct ncclSocketComm*)recvComm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
+  // Flush is not implemented for sockets: CUDA pointers are unsupported here, so every call returns ncclInternalError
+  return ncclInternalError;
+}
+
+// Close the listening socket (when fd is not -1) and free the listen comm; a NULL comm is accepted.
+ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
+  struct ncclSocketListenComm* listenComm = (struct ncclSocketListenComm*)opaqueComm;
+  if (listenComm == NULL) return ncclSuccess;
+  if (listenComm->fd != -1) close(listenComm->fd);
+  free(listenComm);
+  return ncclSuccess;
+}
+
+// Stop and join every helper thread, free the task queues, close the control and data sockets, then free the comm.
+ncclResult_t ncclSocketClose(void* opaqueComm) {
+  struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
+  if (comm == NULL) return ncclSuccess;
+  for (int t = 0; t < comm->nThreads; t++) {
+    struct ncclSocketThreadResources* res = &comm->threadResources[t];
+    if (comm->helperThread[t]) {
+      pthread_mutex_lock(&res->threadLock);
+      res->state = stop;  // request shutdown under the thread's lock, then wait for it below
+      pthread_cond_signal(&res->threadCond);
+      pthread_mutex_unlock(&res->threadLock);
+      pthread_join(comm->helperThread[t], NULL);
+    }
+    free(res->threadTaskQueue.tasks);
+  }
+  if (comm->ctrlFd != -1) close(comm->ctrlFd);
+  for (int s = 0; s < comm->nSocks; s++) {
+    if (comm->fds[s] != -1) close(comm->fds[s]);
+  }
+  free(comm);
+  return ncclSuccess;
+}
+
+ncclNet_t ncclNetSocket = {  // Socket transport entry points; positional initializer, so the order must follow the ncclNet_t field order
+  "Socket",
+  ncclSocketInit,
+  ncclSocketDevices,
+  ncclSocketPciPath,
+  ncclSocketPtrSupport,
+  ncclSocketListen,
+  ncclSocketConnect,
+  ncclSocketAccept,
+  ncclSocketRegMr,
+  ncclSocketDeregMr,
+  ncclSocketIsend,
+  ncclSocketIrecv,
+  ncclSocketFlush,
+  ncclSocketTest,
+  ncclSocketClose,  // same routine fills two consecutive slots (presumably close-send and close-recv; confirm against ncclNet_t)
+  ncclSocketClose,
+  ncclSocketCloseListen  // listen comms are a different struct type, hence the dedicated close routine
+};
diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cu
deleted file mode 100644
index b09e2e7234..0000000000
--- a/projects/rccl/src/transport/net_socket.cu
+++ /dev/null
@@ -1,259 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "socket.h"
-#include "net.h"
-
-#include <assert.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <poll.h>
-#include <limits.h>
-
-/* Init functions */
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
-static int ncclNetIfs = -1;
-pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
-
-ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
-  if (ncclNetIfs == -1) {
-    pthread_mutex_lock(&ncclSocketLock);
-    if (ncclNetIfs == -1) {
-      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
-      if (ncclNetIfs <= 0) {
-        WARN("NET/Socket : no interface found");
-        return ncclInternalError;
-      }
-    }
-    pthread_mutex_unlock(&ncclSocketLock);
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
-  *supportedTypes = NCCL_PTR_HOST;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketDevices(int* ndev) {
-  *ndev = ncclNetIfs;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketPciPath(int dev, char** path) {
-  char devicepath[PATH_MAX];
-  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
-  *path = realpath(devicepath, NULL);
-  const char* string_virual_network_device_path="/sys/devices/virtual/net/";
-  if (*path && !strncmp(*path, string_virual_network_device_path, strlen(string_virual_network_device_path)))
-    return ncclSuccess;
-  free(*path);
-  *path = realpath(devicepath, NULL);
-  if (*path == NULL) {
-    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
-  if (dev >= ncclNetIfs) return ncclInternalError;
-  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
-  return ncclSuccess;
-}
-
-/* Communication functions */
-
-struct ncclSocketHandle {
-  union socketAddress connectAddr;
-};
-
-struct ncclSocketRequest {
-  int op;
-  void* data;
-  int size;
-  int fd;
-  int offset;
-  int used;
-};
-
-struct ncclSocketReqs {
-  struct ncclSocketRequest* requests;
-};
-
-struct ncclSocketComm {
-  int fd;
-  struct ncclSocketReqs reqs;
-};
-
-ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
-  NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->fd = -1;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) {
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
-  // if dev >= 0, listen based on dev
-  if (dev >= 0) {
-    NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr)));
-  } else if (dev == findSubnetIf) {
-    // handle stores a remote address
-    // need to find a local addr that is in the same network as the remote addr
-    union socketAddress localAddr;
-    char ifName[MAX_IF_NAME_SIZE];
-    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
-      WARN("No usable listening interface found");
-      return ncclSystemError;
-    }
-    // pass the local address back
-    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
-  } // Otherwise, handle stores a local address
-  struct ncclSocketComm* comm;
-  NCCLCHECK(ncclSocketNewComm(&comm));
-  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
-  *listenComm = comm;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
-  struct ncclSocketComm* comm;
-  NCCLCHECK(ncclSocketNewComm(&comm));
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
-  *sendComm = comm;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
-  struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm;
-  struct ncclSocketComm* rComm;
-  NCCLCHECK(ncclSocketNewComm(&rComm));
-  struct sockaddr_in sockaddr;
-  socklen_t socklen = sizeof(struct sockaddr_in);
-  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
-  *recvComm = rComm;
-  return ncclSuccess;
-}
-
-#define MAX_REQUESTS 128
-
-ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, int op, void* data, int size, int fd, struct ncclSocketRequest** req) {
-  if (reqs->requests == NULL) {
-    NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS));
-  }
-  for (int i=0; i<MAX_REQUESTS; i++) {
-    struct ncclSocketRequest* r = reqs->requests+i;
-    if (r->used == 0) {
-      r->op = op;
-      r->data = data;
-      r->size = size;
-      r->fd = fd;
-      r->offset = -1;
-      r->used = 1;
-      *req = r;
-      return ncclSuccess;
-    }
-  }
-  WARN("Socket : unable to allocate requests");
-  return ncclInternalError;
-}
-
-ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
-  *done = 0;
-  struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
-  if (r == NULL) {
-    WARN("NET/Socket : test called with NULL request");
-    return ncclInternalError;
-  }
-  if (r->offset == -1) { /* try to send/recv size */
-    int data = r->size;
-    int offset = 0;
-    NCCLCHECK(socketProgress(r->op, r->fd, &data, sizeof(int), &offset));
-
-    if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
-
-    // Not sure we could ever receive less than 4 bytes, but just in case ...
-    if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->fd, &data, sizeof(int), &offset));
-
-    // Check size is less or equal to the size provided by the user
-    if (r->op == NCCL_SOCKET_RECV && data > r->size) {
-      WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size);
-      return ncclInternalError;
-    }
-    r->size = data;
-    r->offset = 0;
-  }
-  if (r->offset < r->size) {
-    NCCLCHECK(socketProgress(r->op, r->fd, r->data, r->size, &r->offset));
-  }
-  if (r->offset == r->size) {
-    if (size) *size = r->size;
-    *done = 1;
-    r->used = 0;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {
-  if (type != NCCL_PTR_HOST) return ncclInternalError;
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
-  NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {
-  if (type != NCCL_PTR_HOST) return ncclInternalError;
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
-  NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {
-  // We don't support CUDA pointers, so we don't need a flush operation
-  return ncclInternalError;
-}
-
-ncclResult_t ncclSocketClose(void* opaqueComm) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
-  if (comm) {
-    free(comm->reqs.requests);
-    close(comm->fd);
-    free(comm);
-  }
-  return ncclSuccess;
-}
-
-ncclNet_t ncclNetSocket = {
-  "Socket",
-  ncclSocketInit,
-  ncclSocketDevices,
-  ncclSocketPciPath,
-  ncclSocketPtrSupport,
-  ncclSocketListen,
-  ncclSocketConnect,
-  ncclSocketAccept,
-  ncclSocketIsend,
-  ncclSocketIrecv,
-  ncclSocketFlush,
-  ncclSocketTest,
-  ncclSocketClose,
-  ncclSocketClose,
-  ncclSocketClose
-};
diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cc
similarity index 68%
rename from projects/rccl/src/transport/p2p.cu
rename to projects/rccl/src/transport/p2p.cc
index f5ea1f1cbb..61874c9d42 100644
--- a/projects/rccl/src/transport/p2p.cu
+++ b/projects/rccl/src/transport/p2p.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -11,26 +11,16 @@
 #include "transport.h"
 #include "param.h"
 #include <unistd.h>
-#include <hip/hip_runtime_api.h>
-#include <hsa/hsa_ext_amd.h>
-#include "nvmlwrap.h"
+#include <hip/hip_runtime.h>
 #include <ctype.h>
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #include "nvlink_stub.h"
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
 #else
 #include "nvlink.h"
 #endif
 
-extern bool useFineGrainVramPcie;
-
-struct p2pInfo {
-  int rank;
-  int cudaDev;
-  uint64_t hostHash;
-  uint64_t pidHash;
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-};
-
 struct p2pConnectInfo {
   int direct;
   union {
@@ -39,41 +29,45 @@ struct p2pConnectInfo {
   };
 };
 
+struct p2pSendResources {  // Send-side state that p2pSendSetup stores in send->transportResources
+  struct ncclSendMem* devMem;  // Allocated by p2pSendSetup with ncclCudaCalloc
+  void* ipcPtr;  // NOTE(review): never assigned in the visible hunks; presumably an opened IPC mapping -- confirm
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only); read from hipDeviceAttributeHdpMemFlushCntl of the peer for non-XGMI links, else 0
+};
+
+struct p2pRecvResources {  // Receive-side state that p2pRecvSetup stores in recv->transportResources
+  struct ncclRecvMem* devMem;  // Allocated by p2pRecvSetup with ncclCudaCalloc; size is offsetof(ncclRecvMem, buff)+buffSize after ALIGN_SIZE(.., CUDA_IPC_MIN)
+  void* ipcPtr;  // NOTE(review): never assigned in the visible hunks; presumably an opened IPC mapping -- confirm
+};
+
 #include <sys/types.h>
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) {
-  struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
-  static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
-  info->rank = rank;
-  CUDACHECK(hipGetDevice(&info->cudaDev));
-  info->hostHash=getHostHash()+commHash;
-  info->pidHash=getPidHash()+commHash;
-
-  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
-  // cudaDev is a CUDA runtime dev number which could be different from the
-  // NVML device number. Then we get the busID from NVML to be sure it is
-  // consistent with NVML remote PCI bus Ids.
-  CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
-#else
-  nvmlDevice_t nvmlDevice;
-  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
-  nvmlPciInfo_t pciInfo;
-  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
-  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
-#endif
-  return ncclSuccess;
-}
-
 NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
 
+extern bool useFineGrainVramPcie;
+
+/* Map a PCI busId string to the index of the matching locally visible device
+ * (cf. CUDA_VISIBLE_DEVICES). Returns -1 when device enumeration fails or when
+ * no visible device reports that busId. */
+static int busIdToCudaDev(const char* busId) {
+  int deviceCount;
+  if (hipGetDeviceCount(&deviceCount) != hipSuccess) return -1;
+  for (int dev = 0; dev < deviceCount; dev++) {
+    char candidate[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    hipError_t rc = hipDeviceGetPCIBusId(candidate, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, dev);
+    // A failed query aborts the whole search instead of skipping the device
+    if (rc != hipSuccess) return -1;
+    if (strcmp(busId, candidate) == 0) return dev;
+  }
+  // Every visible device was checked without a match
+  return -1;
+}
+
 /* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   // Do not use P2P across root complexes by default (provided CUDA permits it)
-  int p2pLevel = PATH_SOC;
+  int p2pLevel = PATH_NODE;
   if (ncclParamP2pDisable() == 1) p2pLevel = 0;
   if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
 
@@ -81,29 +75,44 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
 
   if (p2pLevel == 0) return ncclSuccess;
 
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
-
   // Rule out different nodes
   if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
 
+  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+  int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+  if (peerCudaDev == -1) {
+    // Peer's CUDA device is not visible in this process
+#if CUDART_VERSION >= 10010
+    // But in CUDA 10.1 we can still communicate with 'invisible' devices
+    TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
+    // Check for NVLink/NVswitch including P2P access
+    int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+    if (nvlinkp2p > 0) {
+      *ret = nvlinkp2p;
+      return ncclSuccess;
+    }
+#endif
+    return ncclSuccess;
+  }
+
+  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+
   // Do not detect topology if we're on the same GPU. Note this is not really supported.
-  if (myInfo->cudaDev == peerInfo->cudaDev) {
-    *ret = 1 + PATH_SOC;
+  if (myInfo->cudaDev == peerCudaDev) {
+    *ret = 1 + PATH_SYS;
     return ncclSuccess;
   }
 
   // See if CUDA can do P2P
   int p2p;
-  if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != hipSuccess) {
-    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
-        myInfo->cudaDev, peerInfo->cudaDev);
+  if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != hipSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
+         myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
     return ncclSuccess;
   }
-
   if (p2p == 0) return ncclSuccess;
 
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
   uint32_t link_type, hops;
   if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
     p2p = 0;
@@ -124,9 +133,8 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
     if (!useFineGrainVramPcie)
       return ncclSuccess;
   }
-
 #else
- // Check for NVLink/NVswitch
+// Check for NVLink/NVswitch
   int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
 #endif
   if (nvlinkp2p > 0) {
@@ -138,11 +146,11 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
   char* myPath;
   char* peerPath;
   ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
-  ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+  ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
   if (err1 == ncclSuccess && err2 == ncclSuccess) {
     int distance = pciDistance(myPath, peerPath);
     if (distance < p2pLevel) {
-      *ret = 1 + PATH_SOC - distance;
+      *ret = 1 + PATH_SYS - distance;
     }
   }
   if (err1 == ncclSuccess) free(myPath);
@@ -150,6 +158,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
   return ncclSuccess;
 }
 
+#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
+#define MAXGPUS_PCI 64
+
 static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
   int nrings = 0;
   ncclTvalue_t* line = matrix+current*n;
@@ -177,7 +188,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
       }
     }
   } else {
-    int ringsSave[nRingsMax*n];
+    int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
     int maxStep = 0;
     for (int i=0; i<n; i++) {
       if (inTheRing[i] == 0 && line[i] > 0) {
@@ -210,8 +221,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
 static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
   if (nrings == 0) return 0;
   // Copy rings by dup times
-  if (newNrings > MAXRINGS) {
-    newNrings = MAXRINGS;
+  if (newNrings > MAXCHANNELS) {
+    newNrings = MAXCHANNELS;
   }
   for (int r=nrings; r<newNrings; r++) {
     for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
@@ -227,7 +238,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nrin
   if (connect) {
     inTheRing[rings[0]] = 1;
     nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
-    nrings = copyRings(nranks, rings, nrings, nringsMax);
   } else {
     rings[0] = 0;
     nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
@@ -245,9 +255,9 @@ static inline int findConnect(int nranks, int* ranks) {
 
 int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
   if (nrings == 0) return 0;
-  if (nrings > MAXRINGS) {
-    WARN("Max rings reached, limiting to %d", MAXRINGS);
-    nrings = MAXRINGS;
+  if (nrings > MAXCHANNELS) {
+    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
+    nrings = MAXCHANNELS;
   }
   // Find existing constraints / connections
   int connect = 0;
@@ -275,9 +285,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
 
   if (compNrings && compNrings < nrings && nranks <= 4) {
     // Try to oversubscribe to get a better result
-    int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
-    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
-    for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+    int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
+    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
+    for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
     int nThreads = *nthreads;
     int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
     if (compNrings2 > compNrings*2) {
@@ -289,13 +299,12 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
   }
 
   // Duplicate the rings for direct NVLink
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
   compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
 #else
   compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
 #endif
 
-  if (ncclCudaCompCap() == 6) *nthreads /= 2;
   return compNrings;
 }
 
@@ -341,9 +350,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
 }
 
 static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
-  for (int score = PATH_SOC+1; score >= minScore; score--) {
+  for (int score = PATH_SYS+1; score >= minScore; score--) {
     int best = -1;
-    int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
+    int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
     for (int n = 0; n < nranks; n++) {
       if (inRing[n]) continue;
       if (values[rank*nranks+n] == score) {
@@ -365,7 +374,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
     int start = findConnect(nranks, prev+r*nranks);
     int end = findConnect(nranks, next+r*nranks);
 
-    int inRing[nranks];
+    int inRing[MAXGPUS_PCI];
     for (int i=0; i<nranks; i++) inRing[i] = 0;
 
     if (start == -1 && end == -1) {
@@ -407,8 +416,8 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
 ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
   if (*nringsRet == 0) return ncclSuccess;
   int *rings;
-  NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
-  for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
+  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
   int nrings = *nringsRet;
 
   // NVswitch
@@ -449,10 +458,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
       links += val/CONNECT_NVLINK;
     }
     if (rank == 0) directLinks = links;
-    else directLinks =  std::min(directLinks, links);
+    else directLinks = std::min(directLinks, links);
   }
   if (directLinks > 0) {
     // NVLink : Connect rings or create new ones
+    if (nranks > MAXGPUS_NVLINKP2P) {
+      WARN("Recursive P2P computation cannot work for >8 GPUs");
+      return ncclInternalError;
+    }
     nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
     goto end;
   }
@@ -486,48 +499,59 @@ end:
   } while (0)
 
 /* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
-  struct p2pConnectInfo info;
+ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+  struct p2pSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;
+  int sendSize = sizeof(struct ncclSendMem);
+  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
+
   uint32_t linktype, hops;
   if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
-    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank);
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
     return ncclInternalError;
   }
   if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
-    CUDACHECK(hipDeviceGetAttribute((int*)&ring->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->next_hdp_reg);
+    CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
   }
+  else
+    resources->next_hdp_reg = 0;
+
+  struct p2pConnectInfo info;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
-    info.directPtr = ring->devMemSend;
+    info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
       hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
       if (err == hipErrorPeerAccessAlreadyEnabled) {
         hipGetLastError();
       } else if (err != hipSuccess) {
-        WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, hipGetErrorString(err));
+        WARN("failed to peer with device %d(=%d): %d %s",
+             peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
         return ncclInternalError;
       }
       INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
-          ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+          channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     }
   } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
     info.direct = 0;
     // Map IPC and enable P2P access
-    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
     if (err != hipSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, hipGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
       return ncclInternalError;
     }
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
-        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -536,13 +560,20 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 }
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+
+  struct p2pRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  recv->transportResources = resources;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
+
   struct p2pConnectInfo info;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
-    info.directPtr = ring->devMemRecv;
+    info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
       TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
     } else {
@@ -551,22 +582,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
       if (err == hipErrorPeerAccessAlreadyEnabled) {
         hipGetLastError();
       } else if (err != hipSuccess) {
-        WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, hipGetErrorString(err));
+        WARN("failed to peer with device %d(=%d): %d %s",
+             peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
         return ncclInternalError;
       }
-      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     }
   } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
     info.direct = 0;
     // Map IPC and enable P2P access
-    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
     if (err != hipSuccess) {
-      WARN("rank %d failed to get HIP IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, hipGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
       return ncclInternalError;
     }
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -576,22 +609,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 
 /* Connect/Send to this peer */
 static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
-  void** resources = &send->transportResources;
+  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
   struct ncclRecvMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclRecvMem*)(info->directPtr);
     send->conn.direct = 1;
-    *resources = NULL;
   } else {
-    void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    hipError_t err = hipIpcOpenMemHandle(&remPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
-    void** ipcPtrSave;
-    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
-    *resources = ipcPtrSave;
-    *ipcPtrSave = remPtr;
-    remDevMem = (struct ncclRecvMem*)remPtr;
+    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
     if (err != hipSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
           err, hipGetErrorString(err));
@@ -602,30 +629,27 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
   send->conn.buff = remDevMem->buff;
   send->conn.llBuff = remDevMem->llBuff;
   send->conn.tail = &remDevMem->tail;
-  send->conn.opCount = &remDevMem->opCount;
-  // send->conn->head should have been set to devMemSend already
+  send->conn.opCountRem = &remDevMem->opCount;
+  send->conn.head = &resources->devMem->head;
+  send->conn.ptrExchange = &resources->devMem->ptrExchange;
+  send->conn.opCountLoc = &resources->devMem->opCount;
+  send->conn.next_hdp_reg = resources->next_hdp_reg;
   return ncclSuccess;
 }
 
 /* Connect/Recv from this peer */
 ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
-  void** resources = &recv->transportResources;
+  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
   struct ncclSendMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclSendMem*)(info->directPtr);
     recv->conn.direct = 1;
     recv->conn.ptrExchange = &remDevMem->ptrExchange;
-    *resources = NULL;
   } else {
-    void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    hipError_t err = hipIpcOpenMemHandle(&remPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
-    void** ipcPtrSave;
-    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
-    *resources = ipcPtrSave;
-    *ipcPtrSave = remPtr;
-    remDevMem = (struct ncclSendMem*)remPtr;
+    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
     if (err != hipSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
           err, hipGetErrorString(err));
@@ -633,28 +657,37 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
     }
   }
 
-  // recv->conn->buff should have been set to devMemRecv already
-  // recv->conn->tail should have been set to devMemRecv already
-  // recv->conn->opCount should have been set to devMemRecv already
+  recv->conn.buff = resources->devMem->buff;
+  recv->conn.llBuff = resources->devMem->llBuff;
+  recv->conn.tail = &resources->devMem->tail;
+  recv->conn.opCountLoc = &resources->devMem->opCount;
   recv->conn.head = &remDevMem->head;
-  recv->conn.llHead = &remDevMem->llHead;
+  recv->conn.opCountRem = &remDevMem->opCount;
   return ncclSuccess;
 }
 
-ncclResult_t p2pFree(void* resources) {
-  if (resources != NULL) {
-    void** ipcPtrSave = (void**) resources;
-    CUDACHECK(hipIpcCloseMemHandle(*ipcPtrSave));
-    free(resources);
-  }
+ncclResult_t p2pSendFree(void* resources) {
+  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+  if (sendRes->ipcPtr)
+    CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
+  CUDACHECK(hipFree(sendRes->devMem));
+  free(sendRes);
+  return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) {
+  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+  if (recvRes->ipcPtr)
+    CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
+  CUDACHECK(hipFree(recvRes->devMem));
+  free(recvRes);
   return ncclSuccess;
 }
 
 struct ncclTransport p2pTransport = {
   "P2P",
-  p2pFillInfo,
   p2pCanConnect,
   p2pGetRings,
-  { p2pSendSetup, p2pSendConnect, p2pFree, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
 };
diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cc
similarity index 69%
rename from projects/rccl/src/transport/shm.cu
rename to projects/rccl/src/transport/shm.cc
index 0ba168b2bf..730a8604b8 100644
--- a/projects/rccl/src/transport/shm.cu
+++ b/projects/rccl/src/transport/shm.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
@@ -11,26 +11,13 @@
 #include "param.h"
 #include "shm.h"
 #include <unistd.h>
-#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>
 
-struct shmInfo {
-  int rank;
-  int cudaDev;
-  uint64_t hostHash;
-  uint64_t pidHash;
-};
-
-struct shmSendConnectInfo {
+struct shmConnectInfo {
   uint64_t pidHash;
   int id;
-  int rank;
-  int shmSize;
-};
-
-struct shmRecvConnectInfo {
-  uint64_t pidHash;
-  int id;
-  int rank;
+  int sendRank;
+  int recvRank;
   int shmSize;
 };
 
@@ -52,24 +39,10 @@ struct shmRecvResources {
   struct ncclRecvMem* devHostMem;
 };
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) {
-  struct shmInfo* info = (struct shmInfo*)opaqueInfo;
-  static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
-  info->rank = rank;
-  CUDACHECK(hipGetDevice(&info->cudaDev));
-  info->hostHash=getHostHash()+commHash;
-  info->pidHash=getPidHash()+commHash;
-  return ncclSuccess;
-}
-
 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
 
 /* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
-  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
   return ncclSuccess;
 }
@@ -88,11 +61,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
   return -1;
 }
 
+#define MAXGROUPS 16
+
 ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == MAXRINGS) *nringsRet = 1;
+  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
   int nGroups = groups[nranks-1] + 1;
-  int starts[nGroups];
-  int ends[nGroups];
+  int starts[MAXGROUPS];
+  int ends[MAXGROUPS];
   for (int ring = 0; ring<*nringsRet; ring++) {
     int startGroup = -1, endGroup = -1;
     for (int group = 0; group<nGroups; group++) {
@@ -157,56 +132,60 @@ ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
 #define MAX_SHM_NAME_LEN 1024
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
-  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
 
   struct shmSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->send.transportResources = resources;
+  send->transportResources = resources;
+
+  struct shmConnectInfo info;
+  info.id = channelId;
+  info.pidHash = myInfo->pidHash;
+  info.sendRank = myInfo->rank;
+  info.recvRank = peerInfo->rank;
 
-  struct shmRecvConnectInfo info;
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+  sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
   info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
-  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
-  static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
   return ncclSuccess;
 }
 
-ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
   struct shmRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->recv.transportResources = resources;
+  recv->transportResources = resources;
 
-  struct shmSendConnectInfo info;
+  struct shmConnectInfo info;
+  info.id = channelId;
+  info.pidHash = myInfo->pidHash;
+  info.sendRank = peerInfo->rank;
+  info.recvRank = myInfo->rank;
 
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
-  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
+  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
-  static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
   return ncclSuccess;
 }
 
 /* Connect to this peer */
 ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
   // Setup device pointers
-  struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo;
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
   struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
 
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
+  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
   resources->remShmSize = info->shmSize;
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
   NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
@@ -217,31 +196,31 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   send->conn.buff = resources->devRemHostMem->buff;
   send->conn.llBuff = resources->devRemHostMem->llBuff;
   send->conn.tail = &resources->devRemHostMem->tail;
-  send->conn.opCount = &resources->devRemHostMem->opCount;
+  send->conn.opCountRem = &resources->devRemHostMem->opCount;
 
   send->conn.head = &resources->devHostMem->head;
-  send->conn.llHead = &resources->devHostMem->llHead;
+  send->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
 }
 
 ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
   // Setup device pointers
   struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
-  struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo;
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
 
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
+  sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
   resources->remShmSize = info->shmSize;
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
   NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
   NCCLCHECK(shmUnlink(shmName));
   recv->conn.head = &resources->devRemHostMem->head;
-  recv->conn.llHead = &resources->devRemHostMem->llHead;
+  recv->conn.opCountRem = &resources->devRemHostMem->opCount;
 
   recv->conn.buff = resources->devHostMem->buff;
   recv->conn.llBuff = resources->devHostMem->llBuff;
   recv->conn.tail = &resources->devHostMem->tail;
-  recv->conn.opCount = &resources->devHostMem->opCount;
+  recv->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
 }
 
@@ -263,7 +242,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
 
 struct ncclTransport shmTransport = {
   "SHM",
-  shmFillInfo,
   shmCanConnect,
   shmGetRings,
   { shmSendSetup, shmSendConnect, shmSendFree, NULL },
diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt
index 86709ee761..d8fef68c17 100644
--- a/projects/rccl/test/CMakeLists.txt
+++ b/projects/rccl/test/CMakeLists.txt
@@ -51,6 +51,8 @@ if(BUILD_TESTS)
     test_ReduceScatter.cpp
     test_GroupCalls.cpp
     test_CombinedCalls.cpp
+    test_AllReduceAbort.cpp
+    test_BroadcastAbort.cpp
   )
 
   add_executable(UnitTests ${TEST_SOURCES})
diff --git a/projects/rccl/test/test_AllReduceAbort.cpp b/projects/rccl/test/test_AllReduceAbort.cpp
new file mode 100644
index 0000000000..9400bd84fc
--- /dev/null
+++ b/projects/rccl/test/test_AllReduceAbort.cpp
@@ -0,0 +1,150 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_AllReduceAbort.hpp"
+#include "../include/core.h"
+#include <omp.h>
+
+#define NUM_ITER 8
+#define FAKE_OP_COUNT (NUM_ITER+1)  // parenthesized so it expands safely inside any expression
+
+namespace CorrectnessTests
+{
+    #define HIPCHECK(cmd)                                                          \
+    do {                                                                           \
+      hipError_t error = (cmd);                                                    \
+      if (error != hipSuccess) {                                                   \
+        std::cerr << "Encountered HIP error (" << error << ") at line "            \
+                  << __LINE__ << " in file " << __FILE__ << "\n";                  \
+        exit(-1);                                                                  \
+      }                                                                            \
+    } while (0)
+
+    #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+    #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+
+    // Redirects the device-side opCountRem and head pointers of one send connector to
+    // host-mapped fakes (presumably so the kernels on that GPU never see progress), then
+    // polls every stream and calls ncclCommAbort on an async error or after a 10 s timeout.
+    TEST_P(AllReduceAbortTest, Correctness) {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+
+        int gpu = 0; // GPU number to trigger abort
+        ncclComm_t comm = comms[gpu];
+
+        HIPCHECK(hipSetDevice(gpu));
+        hipStream_t stream;
+        HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+        struct ncclChannel* channel = comm->channels;
+        struct ncclRing *ring = &channel->ring;
+        struct ncclConnector* send = &channel->peers[ring->next].send;
+        size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers;  // in pointer-sized slots
+        size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers;      // in pointer-sized slots
+        uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset;
+        uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset;
+        uint64_t *real_opCount, *fake_opCount, *fake_o;
+        uint64_t *real_head, *fake_head, *fake_h;
+
+        // get original opCount and head
+        HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+        // allocate and install fakes (each fake holds one uint64_t value in mapped host memory)
+        HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_opCount = FAKE_OP_COUNT;
+        HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_head = 0;
+        HIPCHECK(hipStreamSynchronize(stream));
+        // read back fakes to confirm (fake_o / fake_h are only useful when debugging)
+        HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+
+        // Perform a number of iterations and introduce abort
+        for (int j = 0; j < NUM_ITER; j++) {
+            // Start a group call
+            ncclGroupStart();
+            for (int i = 0; i < numDevices; i++) {
+                ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
+                              numElements, dataType, op, comms[i], streams[i]);
+            }
+            // Signal end of group call
+            ncclGroupEnd();
+        }
+
+        // Wait for the reductions to complete (or to be aborted)
+        auto start = std::chrono::high_resolution_clock::now();
+        int remaining = numDevices;
+        int* done = (int*)calloc(numDevices, sizeof(int));  // done[i] != 0 once streams[i] has drained
+        bool timeout = false, abort_called = false;
+        while (remaining) {
+            int idle = 1;
+            for (int i=0; i<numDevices; i++) {
+                if (done[i]) continue;
+
+                hipError_t hipErr = hipStreamQuery(streams[i]);
+                if (hipErr == hipSuccess) {
+                    done[i] = 1;
+                    remaining--;
+                    idle = 0;
+                    continue;
+                }
+
+#if NCCL_MAJOR >= 2
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+                auto delta = std::chrono::high_resolution_clock::now() - start;
+                double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+                if (deltaSec > 10.0 && !timeout) {
+                    std::cerr << "[          ] timeout condition, calling ncclCommAbort ... " << std::endl;
+                    timeout = true;
+                }
+                // ncclCommAbort destroys the communicators, so never query them once it has run.
+                ncclResult_t ncclAsyncErr = ncclSuccess;
+                if (!abort_called) ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
+                if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
+                    // An asynchronous error happened. Stop the operation and destroy
+                    // the communicator
+                    std::cerr << "[          ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
+                    for (int d=0; d<numDevices; d++)
+                      ncclCommAbort(comms[d]);
+                    abort_called = true;
+                    break;
+                }
+#endif
+#endif
+            }
+            // We might want to let other threads (including NCCL threads) use the CPU.
+            if (idle) pthread_yield();
+        }
+
+        free(done);
+        HIPCHECK(hipHostFree(fake_opCount));
+        HIPCHECK(hipHostFree(fake_head));
+        HIPCHECK(hipStreamDestroy(stream));
+        dataset.Release();
+    }
+
+    INSTANTIATE_TEST_CASE_P(AllReduceAbortSweep,
+                            AllReduceAbortTest,
+                            testing::Combine(
+                                // Reduction operator
+                                testing::Values(ncclSum),
+                                // Data types
+                                testing::Values(ncclFloat32),
+                                // Number of elements
+                                testing::Values(1024, 1048576),
+                                // Number of devices
+                                testing::Values(2, 4),
+                                // In-place or not
+                                testing::Values(false)));
+} // namespace
diff --git a/projects/rccl/src/collectives/device/all_gather_0.cpp b/projects/rccl/test/test_AllReduceAbort.hpp
similarity index 55%
rename from projects/rccl/src/collectives/device/all_gather_0.cpp
rename to projects/rccl/test/test_AllReduceAbort.hpp
index 75f90ca5e6..f41473b65d 100644
--- a/projects/rccl/src/collectives/device/all_gather_0.cpp
+++ b/projects/rccl/test/test_AllReduceAbort.hpp
@@ -3,6 +3,18 @@
  *
  * See LICENSE.txt for license information
  ************************************************************************/
+#ifndef TEST_ALLREDUCEABORT_HPP
+#define TEST_ALLREDUCEABORT_HPP
 
-#define NCCL_OP 0
-#include "device/all_gather.cu"
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class AllReduceAbortTest : public CorrectnessTest
+    {
+    protected:
+    public:
+    };
+}
+
+#endif
diff --git a/projects/rccl/test/test_BroadcastAbort.cpp b/projects/rccl/test/test_BroadcastAbort.cpp
new file mode 100644
index 0000000000..28596cc52a
--- /dev/null
+++ b/projects/rccl/test/test_BroadcastAbort.cpp
@@ -0,0 +1,153 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_BroadcastAbort.hpp"
+#include "../include/core.h"
+#include <omp.h>
+
+#define NUM_ITER 8
+#define FAKE_OP_COUNT (NUM_ITER+1)  // parenthesized so it expands safely inside any expression
+
+namespace CorrectnessTests
+{
+    #define HIPCHECK(cmd)                                                          \
+    do {                                                                           \
+      hipError_t error = (cmd);                                                    \
+      if (error != hipSuccess) {                                                   \
+        std::cerr << "Encountered HIP error (" << error << ") at line "            \
+                  << __LINE__ << " in file " << __FILE__ << "\n";                  \
+        exit(-1);                                                                  \
+      }                                                                            \
+    } while (0)
+
+    #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+    #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+
+    // Redirects the device-side opCountRem and head pointers of one send connector to
+    // host-mapped fakes (presumably so the kernels on that GPU never see progress), then
+    // polls every stream and calls ncclCommAbort on an async error or after a 10 s timeout.
+    TEST_P(BroadcastAbortTest, Correctness) {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+
+        int root = 0;
+        int gpu = 0; // GPU number to trigger abort
+        ncclComm_t comm = comms[gpu];
+
+        HIPCHECK(hipSetDevice(gpu));
+        hipStream_t stream;
+        HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+        struct ncclChannel* channel = comm->channels;
+        struct ncclRing *ring = &channel->ring;
+        struct ncclConnector* send = &channel->peers[ring->next].send;
+        size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers;  // in pointer-sized slots
+        size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers;      // in pointer-sized slots
+        uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset;
+        uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset;
+        uint64_t *real_opCount, *fake_opCount, *fake_o;
+        uint64_t *real_head, *fake_head, *fake_h;
+
+        // get original opCount and head
+        HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+        // allocate and install fakes (each fake holds one uint64_t value in mapped host memory)
+        HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_opCount = FAKE_OP_COUNT;
+        HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_head = 0;
+        HIPCHECK(hipStreamSynchronize(stream));
+        // read back fakes to confirm (fake_o / fake_h are only useful when debugging)
+        HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+
+        // Perform a number of iterations and introduce abort
+        for (int j = 0; j < NUM_ITER; j++) {
+            // Start a group call
+            ncclGroupStart();
+            for (int i = 0; i < numDevices; i++) {
+                ncclBroadcast(dataset.inputs[i],
+                              dataset.outputs[i],
+                              numElements, dataType,
+                              root, comms[i], streams[i]);
+            }
+            // Signal end of group call
+            ncclGroupEnd();
+        }
+
+        // Wait for the broadcasts to complete (or to be aborted)
+        auto start = std::chrono::high_resolution_clock::now();
+        int remaining = numDevices;
+        int* done = (int*)calloc(numDevices, sizeof(int));  // done[i] != 0 once streams[i] has drained
+        bool timeout = false, abort_called = false;
+        while (remaining) {
+            int idle = 1;
+            for (int i=0; i<numDevices; i++) {
+                if (done[i]) continue;
+
+                hipError_t hipErr = hipStreamQuery(streams[i]);
+                if (hipErr == hipSuccess) {
+                    done[i] = 1;
+                    remaining--;
+                    idle = 0;
+                    continue;
+                }
+
+#if NCCL_MAJOR >= 2
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+                auto delta = std::chrono::high_resolution_clock::now() - start;
+                double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+                if (deltaSec > 10.0 && !timeout) {
+                    std::cerr << "[          ] timeout condition, calling ncclCommAbort ... " << std::endl;
+                    timeout = true;
+                }
+                // ncclCommAbort destroys the communicators, so never query them once it has run.
+                ncclResult_t ncclAsyncErr = ncclSuccess;
+                if (!abort_called) ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
+                if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
+                    // An asynchronous error happened. Stop the operation and destroy
+                    // the communicator
+                    std::cerr << "[          ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
+                    for (int d=0; d<numDevices; d++)
+                      ncclCommAbort(comms[d]);
+                    abort_called = true;
+                    break;
+                }
+#endif
+#endif
+            }
+            // We might want to let other threads (including NCCL threads) use the CPU.
+            if (idle) pthread_yield();
+        }
+
+        free(done);
+        HIPCHECK(hipHostFree(fake_opCount));
+        HIPCHECK(hipHostFree(fake_head));
+        HIPCHECK(hipStreamDestroy(stream));
+        dataset.Release();
+    }
+
+    INSTANTIATE_TEST_CASE_P(BroadcastAbortSweep,
+                            BroadcastAbortTest,
+                            testing::Combine(
+                                // Reduction operator
+                                testing::Values(ncclSum),
+                                // Data types
+                                testing::Values(ncclFloat32),
+                                // Number of elements
+                                testing::Values(1048576),
+                                // Number of devices
+                                testing::Values(2, 4),
+                                // In-place or not
+                                testing::Values(false)));
+} // namespace
diff --git a/projects/rccl/src/collectives/device/all_reduce_0.cpp b/projects/rccl/test/test_BroadcastAbort.hpp
similarity index 55%
rename from projects/rccl/src/collectives/device/all_reduce_0.cpp
rename to projects/rccl/test/test_BroadcastAbort.hpp
index 235005af1a..c566808389 100644
--- a/projects/rccl/src/collectives/device/all_reduce_0.cpp
+++ b/projects/rccl/test/test_BroadcastAbort.hpp
@@ -3,6 +3,18 @@
  *
  * See LICENSE.txt for license information
  ************************************************************************/
+#ifndef TEST_BROADCASTABORT_HPP
+#define TEST_BROADCASTABORT_HPP
 
-#define NCCL_OP 0
-#include "device/all_reduce.cu"
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class BroadcastAbortTest : public CorrectnessTest
+    {
+    protected:
+    public:
+    };
+}
+
+#endif
diff --git a/projects/rccl/tools/rccl-prim-test/copy_kernel.h b/projects/rccl/tools/rccl-prim-test/copy_kernel.h
index f4de543efd..e2377fdbb8 100644
--- a/projects/rccl/tools/rccl-prim-test/copy_kernel.h
+++ b/projects/rccl/tools/rccl-prim-test/copy_kernel.h
@@ -61,125 +61,6 @@ __device__ inline void ReduceCopy(
   }
 }
 
-typedef ulong2 Pack128;
-
-template<class FUNC, typename T>
-struct MULTI128 {
-  __device__ void operator()(Pack128& x, Pack128& y) {
-    x.x = MULTI<FUNC, T>()(x.x, y.x);
-    x.y = MULTI<FUNC, T>()(x.y, y.y);
-  }
-};
-
-inline __device__ void Fetch128(Pack128& v, Pack128* p) {
-  v.x = p->x;
-  v.y = p->y;
-}
-
-inline __device__ void Store128(Pack128* p, Pack128& v) {
-  p->x = v.x;
-  p->y = v.y;
-}
-
-#define WARP_SIZE 32
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
-__attribute__((noinline))
-__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
-    Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
-    const int N) {
-  Pack128 t0[UNROLL];
-  Pack128 t1[UNROLL];
-  const Pack128* src0_end = src0 + N;
-  const int inc = nw * UNROLL * WARP_SIZE;
-  const int offset = w * UNROLL * WARP_SIZE + t;
-  src0 += offset;  if (TWO_INPUTS)  src1 += offset;
-  dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
-
-  while (src0 < src0_end) {
-#pragma unroll
-    for (int u = 0; u < UNROLL; ++u) {
-      Fetch128(t0[u], src0+u*WARP_SIZE);
-      if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
-    }
-#pragma unroll
-    for (int u = 0; u < UNROLL; ++u) {
-      if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
-      Store128(dest0+u*WARP_SIZE, t0[u]);
-      if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
-    }
-    src0 += inc;  if (TWO_INPUTS)  src1 += inc;
-    dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
-  }
-}
-
-template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
-__attribute__((noinline))
-__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
-    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
-    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
-    int N) {
-  int Nrem = N;
-  if (Nrem <= 0) return;
-
-  int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
-
-  // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
-  // If not, we'll just use the slow preamble path for the whole operation
-  bool alignable = (((AlignUp(src0,  alignof(Pack128)) == src0  + Npreamble)) &&
-          (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
-          (!HAS_SRC1  || (AlignUp(src1,  alignof(Pack128)) == src1  + Npreamble)));
-
-  if (!alignable) {
-    Npreamble = Nrem;
-  }
-
-  // stage 1: preamble: handle any elements up to the point of everything coming
-  // into alignment
-  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
-
-  Nrem -= Npreamble;
-  if (Nrem == 0) return;
-
-  dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
-  src0  += Npreamble; if (HAS_SRC1)  { src1  += Npreamble; }
-
-  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
-  // assuming the pointers we have are all 128-bit alignable.
-  int w = tid / WARP_SIZE;       // Warp number
-  int nw = nthreads / WARP_SIZE; // Number of warps
-  int t = tid % WARP_SIZE;       // Thread (inside the warp)
-
-  const int PackFactor = sizeof(Pack128) / sizeof(T);
-
-  // stage 2a: main loop
-  int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
-      * (UNROLL * nthreads); // round down
-
-  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
-
-  int Ndone2a = Nalign2a * PackFactor;
-  Nrem -= Ndone2a;
-  if (Nrem == 0) return;
-  dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
-  src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
-
-  // stage 2b: slightly less optimized for section when we don't have full
-  // UNROLLs
-
-  int Nalign2b = Nrem / PackFactor;
-
-  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
-
-  int Ndone2b = Nalign2b * PackFactor;
-  Nrem -= Ndone2b;
-  if (Nrem == 0) return;
-  dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
-  src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
-
-  // stage 2c: tail
-  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
-}
-
 template<typename T>
 struct FuncPassA {
   __device__ T operator()(const T x, const T y) const {
@@ -217,6 +98,160 @@ struct MULTI<FUNC, float> {
   }
 };
 
+
+typedef ulong2 Pack128;
+
+template<class FUNC, typename T>
+struct MULTI128 {
+  __device__ void operator()(Pack128& x, Pack128& y) {
+    x.x = MULTI<FUNC, T>()(x.x, y.x);
+    x.y = MULTI<FUNC, T>()(x.y, y.y);
+  }
+};
+
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
+  v.x = p->x;
+  v.y = p->y;
+}
+inline __device__ void Store128(Pack128* p, Pack128& v) {
+  p->x = v.x;
+  p->y = v.y;
+}
+
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    const int offset, const int N) {
+  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+    T val = vFetch(srcs[0]+idx);
+    #pragma unroll
+    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+
+    #pragma unroll
+    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+  }
+}
+
+#define WARP_SIZE 64
+
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+    const int elemOffset, const int Npack) {
+  const int inc = nw * UNROLL * WARP_SIZE;
+  int offset = w * UNROLL * WARP_SIZE + t;
+
+  const Pack128* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
+  Pack128* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
+
+  while (offset < Npack) {
+    Pack128 vals[UNROLL];
+    // Load and reduce
+    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+    for (int i=1; i<MINSRCS; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+
+    // Store
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
+  }
+}
+
+template <typename T>
+__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
+
+// Try to limit consecutive load/stores to 8 (figures below assume UNROLL == 4).
+// Unroll 2*UNROLL (8) for a single source and a single destination, 1*UNROLL (4) when MINSRCS+MINDSTS is 3 or 4.
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    int N) {
+  int Nrem = N;
+  if (Nrem <= 0) return;
+
+  int alignDiff = 0;
+  int align = ptrAlign128(srcs[0]);
+  #pragma unroll
+  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+
+  int Npreamble = alignDiff ? Nrem :
+    N < alignof(Pack128) ? N :
+    (alignof(Pack128) - align) % alignof(Pack128);
+
+  // stage 1: preamble: handle any elements up to the point of everything coming
+  // into alignment
+  if (Npreamble) {
+    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+    Nrem -= Npreamble;
+    if (Nrem == 0) return;
+  }
+  int offset = Npreamble;
+
+  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
+  // assuming the pointers we have are all 128-bit alignable.
+  int w = tid / WARP_SIZE;       // Warp number
+  int nw = nthreads / WARP_SIZE; // Number of warps
+  int t = tid % WARP_SIZE;       // Thread (inside the warp)
+
+  const int packFactor = sizeof(Pack128) / sizeof(T);
+
+  // stage 2a: main loop
+  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+      * (AUTOUNROLL * WARP_SIZE); // round down
+  int Nelem2a = Npack2a * packFactor;
+
+  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
+
+  Nrem -= Nelem2a;
+  if (Nrem == 0) return;
+  offset += Nelem2a;
+
+  // stage 2b: slightly less optimized for section when we don't have full
+  // unrolling
+
+  int Npack2b = Nrem / packFactor;
+  int Nelem2b = Npack2b * packFactor;
+
+  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
+
+  Nrem -= Nelem2b;
+  if (Nrem == 0) return;
+  offset += Nelem2b;
+
+  // stage 2c: tail
+  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
+}
+
 // Assumptions:
 // - there is exactly 1 block
 // - THREADS is the number of producer threads
@@ -224,24 +259,38 @@ struct MULTI<FUNC, float> {
 template<int UNROLL, int THREADS, typename T>
 __device__ void Copy(volatile T * __restrict__ const dest,
     const volatile T * __restrict__ const src, const int N) {
-  ReduceOrCopy<UNROLL, FuncPassA<T>, T, false, false>(threadIdx.x, THREADS,
-      dest, nullptr, src, nullptr, N);
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src;
+  dsts[0] = (T*)dest;
+  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      1, srcs, 1, dsts, N);
 }
 
 template<int UNROLL, int THREADS, typename T>
 __device__ void DoubleCopy(volatile T * __restrict__ const dest0,
     volatile T * __restrict__ const dest1,
     const volatile T * __restrict__ const src, const int N) {
-  ReduceOrCopy<UNROLL, FuncPassA<T>, T, true, false>(threadIdx.x, THREADS,
-      dest0, dest1, src, nullptr, N);
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src;
+  dsts[0] = (T*)dest0;
+  dsts[1] = (T*)dest1;
+  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      1, srcs, 2, dsts, N);
 }
 
 template<int UNROLL, int THREADS, typename T>
 __device__ void Reduce(volatile T * __restrict__ const dest,
     const volatile T * __restrict__ const src0,
     const volatile T * __restrict__ const src1, const int N) {
-  ReduceOrCopy<UNROLL, FuncSum<T>, T, false, true>(threadIdx.x, THREADS,
-      dest, nullptr, src0, src1, N);
+  // Two sources, one destination: combine src0 and src1 with FuncSum into dest.
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src0;
+  srcs[1] = (const T*)src1;
+  dsts[0] = (T*)dest;
+  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS, 2, srcs, 1, dsts, N);
 }
 
 template<int UNROLL, int THREADS, typename T>
@@ -249,7 +298,13 @@ __device__ void ReduceCopy(volatile T * __restrict__ const dest0,
     volatile T * __restrict__ const dest1,
     const volatile T * __restrict__ const src0,
     const volatile T * __restrict__ const src1, const int N) {
-  ReduceOrCopy<UNROLL, FuncSum<T>, T, true, true>(threadIdx.x, THREADS,
-      dest0, dest1, src0, src1, N);
+  // Two sources, two destinations: combine src0 and src1 with FuncSum, store to dest0 and dest1.
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src0;
+  srcs[1] = (const T*)src1;
+  dsts[0] = (T*)dest0;
+  dsts[1] = (T*)dest1;
+  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS, 2, srcs, 2, dsts, N);
 }
 #endif // COPY_KERNEL_H_
diff --git a/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp b/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp
index 57d3bc6bfd..0e0fe3c63c 100644
--- a/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp
+++ b/projects/rccl/tools/rccl-prim-test/rccl_prim_test.cpp
@@ -427,10 +427,12 @@ int main(int argc,char* argv[])
                               sizeof(struct profiling_data_t), hipMemcpyDeviceToHost,
                               stream[i]));
       HIPCHECK(hipStreamSynchronize(stream[i]));
+
       int next_gpu = findNextGpu(ring_0, i, nGpu);
       uint32_t linktype;
       uint32_t hopcount;
       HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu , &linktype, &hopcount));
+
       hipDeviceProp_t prop;
       HIPCHECK(hipGetDeviceProperties(&prop, i));
       if(prop.gcnArch == 906 ) {
@@ -441,11 +443,11 @@ int main(int argc,char* argv[])
         double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_MI100)/(double)workgroups;
         fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n",
           i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9));
-	  } else {
+	    } else {
         double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ_DEFAULT)/(double)workgroups;
         fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n",
           i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9));
-	  }
+	    }
     }
     std::cout<<"***Application Level Transfer Profiling Data***"<<std::endl;
     double speed = (double)(profiling_data[0]->bytes_transferred) / (deltaSec*1.0E9);