RCCL 2.4 update

2019-07-05 15:43:00 -07:00
@@ -0,0 +1,206 @@
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+# Oldest CMake release this build description declares support for.
+cmake_minimum_required(VERSION 2.8.12)
+
+# Default install location. This is a non-FORCE cache entry, so a value given
+# with -DCMAKE_INSTALL_PREFIX=... on the command line is kept.
+set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+
+# Only the C++ language is enabled for this project.
+project(rccl CXX)
+
+# The ROCM package is searched for under /opt/rocm in addition to the default
+# locations.
+# NOTE(review): the ROCM* modules included below are presumably provided by
+# that package (rocm-cmake) - confirm.
+find_package(ROCM
+             REQUIRED
+             PATHS
+             /opt/rocm)
+
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
+
+# When ON, the test/ subdirectory is added at the end of this file.
+option(BUILD_TESTS "Build test programs" OFF)
+
+# Parse the version fields from makefiles/version.mk.
+# NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH and PKG_REVISION must each be assigned at
+# least one digit; NCCL_SUFFIX is optional.
+file(READ "${CMAKE_CURRENT_SOURCE_DIR}/makefiles/version.mk" version_mk_text)
+foreach(version_field NCCL_MAJOR NCCL_MINOR NCCL_PATCH PKG_REVISION)
+  # "[0-9]+" rather than "[0-9]*": an assignment with no digits must be
+  # reported as a parse failure instead of yielding an empty component.
+  if("${version_mk_text}" MATCHES "${version_field} *:= *([0-9]+)")
+    set(${version_field} "${CMAKE_MATCH_1}")
+  else()
+    message(FATAL_ERROR "Failed to parse ${version_field}")
+  endif()
+endforeach()
+
+# The suffix may legitimately be empty, in which case the variable is unset.
+if("${version_mk_text}" MATCHES "NCCL_SUFFIX *:= *([0-9]+)")
+  set(NCCL_SUFFIX "${CMAKE_MATCH_1}")
+else()
+  set(NCCL_SUFFIX)
+endif()
+
+# NCCL_VERSION is formatted as (X) * 1000 + (Y) * 100 + (Z), so the patch level
+# always occupies two digits: a single-digit patch is padded with a leading 0.
+if("${NCCL_PATCH}" MATCHES "[0-9][0-9]")
+  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}${NCCL_PATCH}")
+else()
+  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}0${NCCL_PATCH}")
+endif()
+
+# Setup VERSION: fixed "major.minor.patch." prefix; the last component is
+# filled in below.
+set(VERSION_STRING "2.6.0.")
+
+# Use BUILD_NUMBER (set by Jenkins) as the last version component when the
+# environment provides a numeric value; otherwise fall back to 0.
+# The expansion is quoted so that an unset or non-numeric value is compared as
+# a string instead of being handed to if() as a bare (possibly empty) argument,
+# and plain set() is used because string(CONCAT) is not available in the
+# CMake 2.8.12 declared as the minimum version.
+if("$ENV{BUILD_NUMBER}" MATCHES "^[0-9]+$")
+  set(BUILD_VERSION "${VERSION_STRING}$ENV{BUILD_NUMBER}")
+else()
+  set(BUILD_VERSION "${VERSION_STRING}0")
+endif()
+
+rocm_setup_version(VERSION ${BUILD_VERSION} NO_GIT_TAG_VERSION)
+
+# Add the ROCm, HIP and HCC install roots to the find_package() search path.
+list(APPEND CMAKE_PREFIX_PATH
+            /opt/rocm
+            /opt/rocm/hip
+            /opt/rocm/hcc)
+
+# HIP is mandatory. HIP_COMPILER is consulted further down to choose
+# compiler-specific flags.
+find_package(hip REQUIRED)
+message(STATUS "HIP compiler: ${HIP_COMPILER}")
+message(STATUS "HIP runtime: ${HIP_RUNTIME}")
+
+# Standard CMake switch consulted by add_library() when no library type is
+# given: ON builds shared, OFF builds static.
+option(BUILD_SHARED_LIBS "Build as a shared library" ON)
+
+# The same template is expanded twice so the generated header is available in
+# the build tree under both the rccl.h and the nccl.h name.
+configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/rccl.h)
+configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/nccl.h)
+
+# Directory-scoped include paths: they apply to every target defined in this
+# directory.
+include_directories(${PROJECT_BINARY_DIR}) # for generated rccl.h header
+include_directories(src)
+include_directories(src/include)
+include_directories(src/collectives)
+include_directories(src/collectives/device)
+
+# Device-side collective implementations, kept under their upstream .cu names.
+set(CU_SOURCES
+    src/collectives/device/all_reduce.cu
+    src/collectives/device/all_gather.cu
+    src/collectives/device/reduce.cu
+    src/collectives/device/broadcast.cu
+    src/collectives/device/reduce_scatter.cu
+    src/collectives/device/functions.cu)
+
+# Copy every .cu file into the build tree under a .cpp name and collect the
+# copies in CPP_SOURCES.
+# NOTE(review): presumably needed because only CXX is enabled for the project
+# and .cu is not a C++ source extension - confirm.
+set(CPP_SOURCES)
+foreach(cu_file ${CU_SOURCES})
+  # Anchor the match so only the file extension is rewritten, never a ".cu"
+  # that happens to appear earlier in the path.
+  string(REGEX REPLACE "\\.cu$" ".cpp" cpp_file "${cu_file}")
+  # Spell out both directories instead of relying on the implicit
+  # "relative to source dir / relative to binary dir" resolution rules.
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${cu_file}"
+                 "${CMAKE_CURRENT_BINARY_DIR}/${cpp_file}"
+                 COPYONLY)
+  list(APPEND CPP_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/${cpp_file}")
+endforeach()
+
+# Host-side sources of the library, compiled directly from the source tree.
+set(CC_SOURCES
+    src/init.cc
+    src/collectives/all_reduce.cc
+    src/collectives/all_gather.cc
+    src/collectives/reduce.cc
+    src/collectives/broadcast.cc
+    src/collectives/reduce_scatter.cc
+    src/channel.cc
+    src/misc/trees.cc
+    src/misc/rings.cc
+    src/misc/argcheck.cc
+    src/misc/group.cc
+    src/misc/utils.cc
+    src/misc/ibvwrap.cc
+    src/misc/nvmlwrap_stub.cc
+    src/misc/topo.cc
+    src/transport/net.cc
+    src/transport/net_ib.cc
+    src/transport/net_socket.cc
+    src/transport/p2p.cc
+    src/transport/shm.cc
+    src/transport.cc
+    src/bootstrap.cc
+    src/enqueue.cc)
+
+# Host sources follow the staged device sources already held in CPP_SOURCES.
+list(APPEND CPP_SOURCES ${CC_SOURCES})
+
+# Library type (shared/static) is decided by BUILD_SHARED_LIBS.
+add_library(rccl ${CPP_SOURCES})
+
+# Optional instrumentation, enabled with -DTRACE=ON / -DPROFILE=ON.
+# add_definitions() is directory-scoped, so the macros are not limited to the
+# rccl target.
+if(TRACE)
+  add_definitions(-DENABLE_TRACE)
+endif()
+
+if(PROFILE)
+  add_definitions(-DENABLE_PROFILING)
+endif()
+
+# GPU architectures passed on the link line for every HIP compiler.
+target_link_libraries(rccl
+  PRIVATE --amdgpu-target=gfx803
+  PRIVATE --amdgpu-target=gfx900
+  PRIVATE --amdgpu-target=gfx906)
+
+# hip-clang: the same architectures are also passed at compile time, and
+# -fgpu-rdc is given to both the compile and the link step.
+if("${HIP_COMPILER}" MATCHES "clang")
+  target_compile_options(rccl
+    PRIVATE --amdgpu-target=gfx803
+    PRIVATE --amdgpu-target=gfx900
+    PRIVATE --amdgpu-target=gfx906
+    PRIVATE -fgpu-rdc)
+  target_link_libraries(rccl PRIVATE -fgpu-rdc)
+  # NOTE(review): hard-coded HSA include path; it does not follow a
+  # non-default ROCm install location.
+  target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
+endif()
+
+# hcc: extra flag on the link line only.
+if("${HIP_COMPILER}" MATCHES "hcc")
+  target_link_libraries(rccl PRIVATE -hc-function-calls)
+endif()
+
+# Use the hip::device / hip::host imported targets when the hip package
+# defines them; otherwise link hip::hip_hcc, the hcc libraries and numa.
+if(TARGET hip::device)
+  target_link_libraries(rccl PRIVATE hip::device)
+  target_link_libraries(rccl INTERFACE hip::host)
+else()
+  target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
+endif()
+
+# Install the library under the "rccl" prefix below CMAKE_INSTALL_PREFIX.
+rocm_install_targets(TARGETS
+                     rccl
+                     PREFIX
+                     rccl)
+# Only the generated rccl.h is installed here.
+# NOTE(review): the nccl.h generated alongside it is not installed - confirm
+# this is intended.
+install(FILES ${PROJECT_BINARY_DIR}/rccl.h
+        DESTINATION rccl/${CMAKE_INSTALL_INCLUDEDIR})
+
+# Export the target in the roc:: namespace, with hip as a dependency of the
+# exported package.
+rocm_export_targets(NAMESPACE
+                    roc::
+                    PREFIX
+                    rccl
+                    TARGETS
+                    rccl
+                    DEPENDS
+                    hip)
+
+# Package dependency declared for both the .deb and the .rpm package.
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
+
+# Keep the RPM from listing /opt and /opt/rocm as directories it owns.
+set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
+
+rocm_create_package(
+  NAME
+  rccl
+  DESCRIPTION
+  "Optimized primitives for collective multi-GPU communication"
+  MAINTAINER
+  "<no-reply@amd.com>"
+  LDCONFIG)
+
+rocm_install_symlink_subdir(rccl)
+
+# Tests are built only when requested with -DBUILD_TESTS=ON.
+if(BUILD_TESTS)
+  add_subdirectory(test)
+endif()
@@ -0,0 +1,89 @@
+#!/usr/bin/env groovy
+// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+// This shared library is available at https://github.com/ROCmSoftwarePlatform/rccl
+@Library('rocJenkins@noDocker') _
+
+// This file is for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+
+////////////////////////////////////////////////////////////////////////
+// Mostly generated from snippet generator 'properties; set job properties'
+// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
+properties([
+    pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
+    buildDiscarder(logRotator(
+      artifactDaysToKeepStr: '',
+      artifactNumToKeepStr: '',
+      daysToKeepStr: '',
+      numToKeepStr: '10')),
+    disableConcurrentBuilds(),
+    [$class: 'CopyArtifactPermissionProperty', projectNames: '*']
+   ])
+
+
+////////////////////////////////////////////////////////////////////////
+import java.nio.file.Path;
+
+rcclCI:
+{
+
+    def rccl = new rocProject('rccl')
+    // customize for project
+    rccl.paths.build_command = './install.sh -t'
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(['RCCL'], rccl)
+
+    boolean formatCheck = false
+
+    // Build step, called with (platform, project): prepares the build prefix,
+    // then runs the project's build command from that directory in a bash script.
+    def compileCommand =
+    {
+        platform, project->
+
+        project.paths.construct_build_prefix()
+        // LD_LIBRARY_PATH and CXX are set only for the single build invocation
+        def command = """#!/usr/bin/env bash
+                  set -x
+                  cd ${project.paths.project_build_prefix}
+                  LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=${project.compiler.compiler_path} ${project.paths.build_command}
+                """
+
+	  sh command
+    }
+
+    // Test step, called with (platform, project): runs the UnitTests binary from
+    // build/release/test with HSA_FORCE_FINE_GRAIN_PCIE=1 and XML output enabled.
+    def testCommand =
+    {
+        platform, project->
+
+        def command = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.project_build_prefix}/build/release/test
+                HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
+            """
+
+        sh command
+        // Publishing of the XML results is currently disabled.
+        //junit "${project.paths.project_build_prefix}/build/release/*.xml"
+    }
+
+    // Package step, called with (platform, project): assembles a script that
+    // builds the package, moves the .deb files into build/package and installs them.
+    // NOTE(review): unlike compileCommand and testCommand, the script is only
+    // assigned to `command` and never passed to `sh`, and archiving is commented
+    // out, so this closure currently has no effect - confirm this is intended.
+    def packageCommand =
+    {
+        platform, project->
+
+        def command = """
+                      set -x
+                      cd ${project.paths.project_build_prefix}/build
+                      make package
+                      rm -rf package && mkdir -p package
+                      mv *.deb package/
+                      sudo dpkg -i package/*.deb
+                      """
+
+
+        //platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
+    }
+
+    buildProjectNoDocker(rccl, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
+
+}
@@ -1,5 +1,6 @@

 Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
@@ -0,0 +1,66 @@
+Notices and Licenses file
+_______________________________________________________________
+
+Dependencies on nvidia-nccl v2.3.7-1 (BSD3)
+Copyright (c) 2015-2018, NVIDIA CORPORATION.
+Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+nvidia-nccl v2.3.7-1 (BSD2)
+Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
@@ -1,92 +1,80 @@
-# NCCL
+# RCCL

-Optimized primitives for collective multi-GPU communication.
+ROCm Communication Collectives Library

 ## Introduction

-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node, and can be used in either single- or multi-process (e.g., MPI) applications. Multi node support is planned for a future release.

-For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
-
-## What's inside
-
-At present, the library implements the following collectives operations:
-
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
-
-These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
+The collective operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.

 ## Requirements

-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
+1. ROCm supported GPUs
+2. ROCm stack installed on the system (HIP runtime & HCC)
+3. For building and running the unit tests, chrpath will need to be installed on your machine first. (sudo apt-get install chrpath)

-## Build
+## Quickstart RCCL Build

-Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
+RCCL directly depends on HIP runtime & HCC C++ compiler which are part of the ROCm software stack.
+In addition, HC Direct Function call support needs to be present on your machine.  There are binaries for hcc and HIP that need to be installed to get HC Direct Function call support.  These binaries are currently packaged with roc-master, and will be included in ROCm 2.4.

-To build the library :
+The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command.  It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install.
+
+*  `./install.sh` -- builds the library (add `-t` to also build the unit tests)
+*  `./install.sh -i` -- builds and installs the library to /opt/rocm/rccl; installation path can be changed with --prefix argument (see below.)
+*  `./install.sh -h` -- shows help
+*  `./install.sh -t` -- builds library including unit tests
+*  `./install.sh -r` -- runs unit tests (must be already built)
+*  `./install.sh -p` -- builds RCCL package
+*  `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm)
+
+## Manual build
+#### To build the library :

 ```shell
-$ cd nccl
-$ make -j src.build
+$ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
+$ cd rccl
+$ mkdir build
+$ cd build
+$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+$ make -j 8
 ```
+You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.

-If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :
+#### To build the RCCL package and install package :
+
+Assuming you have already cloned this repository and built the library as shown in the previous section:

 ```shell
-$ make src.build CUDA_HOME=<path to cuda install>
+$ cd rccl/build
+$ make package
+$ sudo dpkg -i *.deb
 ```

-NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
-
-By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
-```shell
-$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
-```
-
-## Install
-
-To install NCCL on the system, create a package then install it as root.
-
-Debian/Ubuntu :
-```shell
-$ # Install tools to create debian packages
-$ sudo apt install build-essential devscripts debhelper fakeroot
-$ # Build NCCL deb package
-$ make pkg.debian.build
-$ ls build/pkg/deb/
-```
-
-RedHat/CentOS :
-```shell
-$ # Install tools to create rpm packages
-$ sudo yum install rpm-build rpmdevtools
-$ # Build NCCL rpm package
-$ make pkg.redhat.build
-$ ls build/pkg/rpm/
-```
-
-OS-agnostic tarball :
-```shell
-$ make pkg.txz.build
-$ ls build/pkg/txz/
-```
+RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so.

 ## Tests

-Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
+There are unit tests implemented with the Googletest framework in RCCL, which are currently a work-in-progress.  To invoke the unit tests, go to the rccl-install folder, then the test/ subfolder, and execute the appropriate unit test executable(s). Several notes for running the unit tests:

+1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
+2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests.
+
+An example call to the unit tests:
 ```shell
-$ git clone https://github.com/NVIDIA/nccl-tests.git
-$ cd nccl-tests
-$ make
-$ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
+$ LD_LIBRARY_PATH=rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 rccl-install/test/UnitTests
 ```

+There are also other performance and error-checking tests for RCCL.  These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests.
+See the rccl-tests README for more information on how to build and run those tests.
+
+## Library and API Documentation
+
+Please refer to the [Library documentation](http://rccl.readthedocs.io/) for current documentation.
+
 ## Copyright

-All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Build the documentation: run doxygen on a preprocessed copy of the public
+# header, then build the Sphinx HTML pages from the source/ directory.
+
+# Start from a clean doxygen output directory.
+if [ -d docBin ]; then
+    rm -rf docBin
+fi
+
+# Generate the header doxygen reads from the RCCL header template, the same
+# way the companion doxygen-only script does (the previous rocfft.h path was a
+# leftover from the rocFFT project this script was copied from).
+# NOTE(review): the ROCFFT_EXPORT substitution is kept for parity with that
+# script; it is presumably a no-op for nccl.h.in - confirm and drop if so.
+sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h
+doxygen Doxyfile
+
+cd source
+make clean
+make html
+cd ..
+
+# Remove the temporary header; -f keeps this quiet if it is already gone.
+rm -f nccl.h
+
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Run doxygen on a preprocessed copy of the public header template.
+
+# Start from a clean doxygen output directory.
+if [ -d docBin ]; then
+    rm -rf docBin
+fi
+
+# Drop a header left over from a previous run; -f so a fresh checkout (no
+# nccl.h yet) does not print an error.
+rm -f nccl.h
+
+sed -e 's/ROCFFT_EXPORT //g' ../src/nccl.h.in > nccl.h
+doxygen Doxyfile
+#rm nccl.h
+
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = RCCL
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,11 @@
+.. toctree::
+   :maxdepth: 4
+   :caption: Contents:
+
+=======
+All API
+=======
+
+.. doxygenindex::
+
+
@@ -0,0 +1,103 @@
+.. toctree::
+   :maxdepth: 4
+   :caption: Contents:
+
+===
+API
+===
+
+This section provides details of the library API
+
+Communicator Functions
+----------------------
+
+.. doxygenfunction:: ncclGetUniqueId
+
+.. doxygenfunction:: ncclCommInitRank
+
+.. doxygenfunction:: ncclCommInitAll
+
+.. doxygenfunction:: ncclCommDestroy
+
+.. doxygenfunction:: ncclCommCount
+
+.. doxygenfunction:: ncclCommCuDevice
+
+.. doxygenfunction:: ncclCommUserRank
+
+Collective Communication Operations
+-----------------------------------
+
+Collective communication operations must be called separately for each communicator in a communicator clique.
+
+They return when operations have been enqueued on the HIP stream.
+
+Since they may perform inter-CPU synchronization, each call has to be done from a different thread or process, or needs to use Group Semantics (see below).
+
+.. doxygenfunction:: ncclReduce
+
+.. doxygenfunction:: ncclBcast
+
+.. doxygenfunction:: ncclBroadcast
+
+.. doxygenfunction:: ncclAllReduce
+
+.. doxygenfunction:: ncclReduceScatter
+
+.. doxygenfunction:: ncclAllGather
+
+
+Group Semantics
+---------------
+When managing multiple GPUs from a single thread, and since NCCL collective
+calls may perform inter-CPU synchronization, we need to "group" calls for
+different ranks/devices into a single call.
+
+Grouping NCCL calls as being part of the same collective operation is done
+using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+collective calls until the ncclGroupEnd call, which will wait for all calls
+to be complete. Note that for collective communication, ncclGroupEnd only
+guarantees that the operations are enqueued on the streams, not that
+the operation is effectively done.
+
+Both collective communication and ncclCommInitRank can be used in conjunction
+with ncclGroupStart/ncclGroupEnd.
+
+.. doxygenfunction:: ncclGroupStart
+
+.. doxygenfunction:: ncclGroupEnd
+
+Library Functions
+-----------------
+
+.. doxygenfunction:: ncclGetVersion
+
+.. doxygenfunction:: ncclGetErrorString
+
+Types
+-----
+
+There are a few data structures that are internal to the library. The pointer types to these
+structures are given below. The user would need to use these types to create handles and pass them
+between different library functions.
+
+.. doxygentypedef:: ncclComm_t
+
+.. doxygenstruct:: ncclUniqueId
+
+
+
+Enumerations
+------------
+
+This section provides all the enumerations used.
+
+.. doxygenenum:: ncclResult_t
+
+.. doxygenenum:: ncclRedOp_t
+
+.. doxygenenum:: ncclDataType_t
+
+
+
+
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+#
+# RCCL documentation build configuration file, created by
+# sphinx-quickstart on Mon Jan  8 16:34:42 2018.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+import os
+import sys
+import subprocess
+
+read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+
+if read_the_docs_build:
+    subprocess.call('cd ..; ./run_doxygen.sh; cd source', shell=True)
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.mathjax', 'breathe']
+breathe_projects = { "RCCL": "../docBin/xml" }
+breathe_default_project = "RCCL"
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'RCCL'
+copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019 Advanced Mirco Devices'
+author = u'Advanced Mirco Devices'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'0.8'
+# The full version, including alpha/beta/rc tags.
+release = u'0.8'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'alabaster'
+
+if read_the_docs_build:
+    html_theme = 'default'
+else:
+    import sphinx_rtd_theme
+    html_theme = "sphinx_rtd_theme"
+    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# This is required for the alabaster theme
+# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
+# html_sidebars = {
+#     '**': [
+#         'relations.html',  # needs 'show_related': True theme option to display
+#         'searchbox.html',
+#     ]
+# }
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'RCCLdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'RCCL.tex', u'RCCL Documentation',
+     u'Advanced Mirco Devices', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'RCCL', u'RCCL Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'RCCL', u'RCCL Documentation',
+     author, 'RCCL', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+
@@ -0,0 +1,21 @@
+.. RCCL documentation master file, created by
+   sphinx-quickstart on Mon Jan  8 09:51:41 2018.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to RCCL's documentation!
+==================================
+
+.. toctree::
+   :maxdepth: 4
+   :caption: Contents:
+
+   library
+   api
+   allapi
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
@@ -0,0 +1,13 @@
+
+.. toctree::
+   :maxdepth: 4
+   :caption: Contents:
+
+======
+RCCL
+======
+
+Introduction
+------------
+
+RCCL is an AMD port of NCCL.
@@ -0,0 +1,3 @@
+
+breathe
+
@@ -0,0 +1,132 @@
+#!/bin/bash
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+# #################################################
+# helper functions
+# #################################################
+function display_help()
+{
+    # Emit the usage summary on stdout, one argument per output line.
+    printf '%s\n' \
+        "RCCL build & installation helper script" \
+        "./install [-h|--help] " \
+        "    [-h|--help] prints this help message." \
+        "    [-i|--install] install RCCL library (see --prefix argument below.)" \
+        "    [-p|--package_build] Build RCCL package." \
+        "    [-t|--tests_build] Build unit tests, but do not run." \
+        "    [-r|--run_tests] Run unit tests (must be built already.)" \
+        "    [--prefix] Specify custom directory to install RCCL to (default: /opt/rocm)."
+}
+
+# #################################################
+# global variables
+# #################################################
+default_path=/opt/rocm
+build_package=false
+install_prefix=$default_path
+build_tests=false
+run_tests=false
+build_release=true
+install_library=false
+
+# #################################################
+# Parameter parsing
+# #################################################
+
+# check if we have a modern version of getopt that can handle whitespace and long parameters
+getopt -T
+if [[ $? -eq 4 ]]; then
+    GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,package_build,tests_build,run_tests,prefix: --options hiptr -- "$@")
+else
+    echo "Need a new version of getopt"
+    exit 1
+fi
+
+if [[ $? -ne 0 ]]; then
+    echo "getopt invocation failed; could not parse the command line";
+    exit 1
+fi
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+    case "${1}" in
+    -h|--help)
+        display_help
+        exit 0
+        ;;
+    -i|--install)
+        install_library=true
+        shift ;;
+    -p|--package_build)
+        build_package=true
+        shift ;;
+    -t|--tests_build)
+        build_tests=true
+        shift ;;
+    -r|--run_tests)
+        run_tests=true
+        shift ;;
+    --prefix)
+        install_prefix=${2}
+        shift 2 ;;
+    --) shift ; break ;;
+    *)  echo "Unexpected command line parameter received; aborting";
+        exit 1
+        ;;
+    esac
+    done
+
+rocm_path=/opt/rocm/bin
+
+# #################################################
+# prep
+# #################################################
+# ensure a clean build environment
+if [[ "${build_release}" == true ]]; then
+    build_dir=build/release
+    build_type=Release
+else
+    build_dir=build/debug
+    build_type=Debug
+fi
+rm -rf "${build_dir}"
+
+# Create and go to the build directory; stop here rather than configure and
+# build in whatever directory we happen to be in if either step fails.
+mkdir -p "${build_dir}" || exit 1
+cd "${build_dir}" || exit 1
+
+# build type (appended to any cmake_common_options already set by the caller)
+cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=${build_type}"
+
+if [[ "${build_tests}" == true ]]; then
+    build_tests_option=ON
+else
+    build_tests_option=OFF
+fi
+
+# cmake_common_options is intentionally expanded unquoted so that it splits
+# into individual command-line arguments.
+CXX="${rocm_path}/hcc" cmake ${cmake_common_options} \
+    -DBUILD_TESTS="${build_tests_option}" \
+    -DCMAKE_INSTALL_PREFIX="${install_prefix}" ../../. || exit 1
+
+if [[ "${install_library}" == true ]]; then
+    make -j"$(nproc)" install || exit 1
+else
+    make -j"$(nproc)" || exit 1
+fi
+
+if [[ "${build_package}" == true ]]; then
+    make package || exit 1
+fi
+
+# Optionally, run tests if they're enabled.
+if [[ "${run_tests}" == true ]]; then
+    if [[ -f "./test/UnitTests" ]]; then
+        HSA_FORCE_FINE_GRAIN_PCIE=1 ./test/UnitTests
+    else
+        echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
+        exit 1
+    fi
+fi
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -39,7 +40,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {

  // Free Ring index to rank tables
  free(channel->ring.userRanks);
-  CUDACHECK(cudaFree(channel->ring.devUserRanks));
+  CUDACHECK(hipFree(channel->ring.devUserRanks));

  // Free transport proxy resources
  for (int r=0; r<nRanks; r++) {
@@ -49,7 +50,7 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  }

  // Free the peer structures.
-  CUDACHECK(cudaFree(channel->devPeers));
+  CUDACHECK(hipFree(channel->devPeers));
  free(channel->peers);

  return ncclSuccess;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,9 +9,9 @@
 #include "collectives.h"

 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  struct ncclInfo info = { ncclCollAllGather, "AllGather",
    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,9 +9,9 @@
 #include "collectives.h"

 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,9 +9,9 @@
 #include "collectives.h"

 NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
+    ncclComm_t comm, hipStream_t stream) {
  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
@@ -18,9 +19,9 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n
 }
 /* Deprecated original "in place" function, similar to MPI */
 NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
+    ncclComm_t comm, hipStream_t stream) {
  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }

@@ -1,5 +1,7 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,7 +9,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
+// Flat index of a collective function: ((coll, redop), dtype) ordering with the
+// LL flag as the lowest bit. `al` is still accepted because callers pass it,
+// but it does not contribute to the index. Every argument is parenthesized so
+// that expression arguments are grouped correctly.
+#define FUNC_INDEX(coll, redop, dtype, ll, al) (((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(ll))

 #define NCCL_COLL_NAME(coll, op, dtype) \
  coll##_##op##_##dtype
@@ -17,7 +19,7 @@

 /* Declare all collective operations */
 #define DECL_COLL5(coll, op, dtype) \
-  extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
+  extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \

 #define DECL_COLL4(coll, op, dtype) \
@@ -25,8 +27,7 @@
  DECL_COLL5(coll##LL, op, dtype)

 #define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##Ring, op, dtype) \
-  DECL_COLL4(coll##Tree, op, dtype)
+  DECL_COLL4(coll##Ring, op, dtype)

 #define DECL_COLL2(coll, op) \
  DECL_COLL3(coll, op, i8) \
@@ -55,12 +56,18 @@
 DECL_ALL_COLLS

 // CHUNKSIZE must be a multiple of SLICESIZE
-#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
-#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
-#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
-#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
-#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
-#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+//#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+//#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+//#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+//#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+//#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+//#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLREDUCE_SLICESTEPS 4
+#define ALLREDUCE_CHUNKSTEPS 4
+#define ALLGATHER_SLICESTEPS 4
+#define ALLGATHER_CHUNKSTEPS 4
+#define REDUCESCATTER_SLICESTEPS 4
+#define REDUCESCATTER_CHUNKSTEPS 4
 #define BROADCAST_SLICESTEPS 1
 #define BROADCAST_CHUNKSTEPS 1
 #define REDUCE_SLICESTEPS 1
@@ -1,11 +1,14 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "all_gather.h"
 #include "common.h"
+#include "all_gather.h"
 #include "collectives.h"

-IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
+#define UNROLL 4
+
+IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,9 +10,10 @@
 #include "collectives.h"

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -67,9 +69,11 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
 }

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
@@ -128,4 +132,5 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,11 +1,17 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "all_reduce.h"
 #include "common.h"
+#include "all_reduce.h"
 #include "collectives.h"

-IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
+#define UNROLL 4
+
+IMPL_COLL2(ncclAllReduce, sum,  FuncSum,  ncclCollAllReduce, ncclSum);
+IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
+IMPL_COLL2(ncclAllReduce, min,  FuncMin,  ncclCollAllReduce, ncclMin);
+IMPL_COLL2(ncclAllReduce, max,  FuncMax,  ncclCollAllReduce, ncclMax);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,9 +10,10 @@
 #include "collectives.h"

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -21,6 +23,11 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+#ifdef ENABLE_PROFILING
+  auto devProf = comm->devProf;
+  uint64_t clk, t0 = 0ULL, ws, wr;
+  if (tid == 0) clk = clock64();
+#endif

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
@@ -44,7 +51,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
    offset = chunkOffset + slice * realChunkSize;
    nelem = min(realChunkSize, size-offset);

+    INIT_COUNTER;
    prims.send(thisInput+offset, nelem);
+    ACCUMULATE_COUNTER(send);

    // k-2 steps: reduce and copy to next GPU
    for (int j=2; j<nranks; ++j) {
@@ -52,7 +61,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
      offset = chunkOffset + slice * realChunkSize;
      nelem = min(realChunkSize, size-offset);

+      INIT_COUNTER;
      prims.recvReduceSend(thisInput+offset, nelem);
+      ACCUMULATE_COUNTER(recvReduceSend);
    }

    // step k-1: reduce this buffer and data, which will produce the final
@@ -61,7 +72,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
    offset = chunkOffset + slice * realChunkSize;
    nelem = min(realChunkSize, size-offset);

+    INIT_COUNTER;
    prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
+    ACCUMULATE_COUNTER(directRecvReduceCopySend);

    // k-2 steps: copy to next GPU
    for (int j=1; j<nranks-1; ++j) {
@@ -69,7 +82,9 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
      offset = chunkOffset + slice * realChunkSize;
      nelem = min(realChunkSize, size-offset);

+      INIT_COUNTER;
      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+      ACCUMULATE_COUNTER(directRecvCopySend);
    }

    // Make final copy from buffer to dest.
@@ -78,14 +93,20 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
    nelem = min(realChunkSize, size-offset);

    // Final wait/copy.
+    INIT_COUNTER;
    prims.directRecv(thisOutput+offset, offset, nelem);
+    ACCUMULATE_COUNTER(directRecv);
  }
+#ifdef ENABLE_PROFILING
+  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
+#endif
 }

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -135,6 +156,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
@@ -210,6 +232,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = args->nThreads;
@@ -1,11 +1,14 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "broadcast.h"
 #include "common.h"
+#include "broadcast.h"
 #include "collectives.h"

-IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
+#define UNROLL 4
+
+IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,9 +10,10 @@
 #include "collectives.h"

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -23,6 +25,11 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
  const int rank = ring->devUserRanks[0];
  const int nextRank = ring->devUserRanks[1];
  const int root = args->root;
+#ifdef ENABLE_PROFILING
+  auto devProf = comm->devProf;
+  uint64_t clk, t0 = 0ULL, ws, wr;
+  if (tid == 0) clk = clock64();
+#endif

  // Compute pointers
  const T * __restrict__ thisInput = (const T*)args->ThisInput;
@@ -39,22 +46,35 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {

    if (rank == root) {
      if (thisInput == thisOutput) {
+        INIT_COUNTER;
        prims.send(thisInput+offset, nelem);
+        ACCUMULATE_COUNTER(send);
      } else {
+        INIT_COUNTER;
        prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+        ACCUMULATE_COUNTER(copySend);
      }
    } else if (nextRank == root) {
+      INIT_COUNTER;
      prims.recv(thisOutput+offset, nelem);
+      ACCUMULATE_COUNTER(recv);
    } else {
+      INIT_COUNTER;
      prims.recvCopySend(thisOutput+offset, nelem);
+      ACCUMULATE_COUNTER(recvCopySend);
    }
  }
+#ifdef ENABLE_PROFILING
+  if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), clock64() - clk, __ATOMIC_SEQ_CST);
+#endif
 }

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
@@ -99,4 +119,5 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,5 +1,7 @@
+#include "hip/hip_runtime.h"
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,11 +12,18 @@
 #include "../collectives.h"
 #include "devcomm.h"
 #include "nccl.h"
+#include <type_traits>

 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
 // Each thread sets a predicate to true if abort == 1
 // all CTA's threads enter the barrier and do a popc on their predicates being True
 // If any of the thread's predicate was True, all the threads call exit()
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define exitIfAbortBarrier(abort, abortCount) \
+  if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
+  __syncthreads(); \
+  if (LOAD(abortCount)) { asm volatile ("s_endpgm"); return; }
+#else
 static inline __device__ void exitIfAbortBarrier(int abort) {
  uint32_t popc;
  asm ("{");
@@ -24,21 +33,116 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
  asm ("}");
  if (popc) { asm volatile ("exit;"); }
 }
+#endif

-typedef void(*ncclKern_t)(struct CollectiveArgs* args);
-extern __device__ ncclKern_t ncclFuncs[];
+#define NCCL_FUNC5(coll, op, dtype) \
+  NCCL_COLL_NAME(coll, op, dtype), \
+  NCCL_COLL_NAME(coll##LL, op, dtype)

-static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
+#define NCCL_FUNC4(coll, op, dtype) \
+  NCCL_FUNC5(coll##Ring, op, dtype)
+
+// Must be consistent with ncclDataType_t
+#define NCCL_FUNCS3A(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  u8), \
+  NCCL_FUNC4(coll, op, i32), \
+  NCCL_FUNC4(coll, op, u32), \
+  NCCL_FUNC4(coll, op, i64), \
+  NCCL_FUNC4(coll, op, u64), \
+  NCCL_FUNC4(coll, op, f16), \
+  NCCL_FUNC4(coll, op, f32), \
+  NCCL_FUNC4(coll, op, f64)
+#define NCCL_FUNCS3B(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8)
+
+// Must be consistent with ncclRedOp_t
+#define NCCL_FUNCS2A(coll) \
+  NCCL_FUNCS3A(coll, sum ), \
+  NCCL_FUNCS3A(coll, prod), \
+  NCCL_FUNCS3A(coll, max ), \
+  NCCL_FUNCS3A(coll, min )
+#define NCCL_FUNCS2B(coll) \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with ncclColl_t
+#define NCCL_FUNCS() { \
+  NCCL_FUNCS2B(ncclBroadcast), \
+  NCCL_FUNCS2A(ncclReduce), \
+  NCCL_FUNCS2B(ncclAllGather), \
+  NCCL_FUNCS2A(ncclReduceScatter), \
+  NCCL_FUNCS2A(ncclAllReduce) }
+
+// Must be consistent with the ncclFuncSet enum
+using ncclFunc_t = void (*)(struct CollectiveArgs*);
+
+static const __device__ constexpr ncclFunc_t ncclFuncs[]{
+// Don't try to initialize the host shadow copy of this device-side global
+// variable. There is no host pointer to a device-side function, which
+// confuses clang. This will be fixed in the next clang release.
+#if defined(__HIP_DEVICE_COMPILE__)
+  NCCL_FUNCS2B(ncclBroadcast),
+  NCCL_FUNCS2A(ncclReduce),
+  NCCL_FUNCS2B(ncclAllGather),
+  NCCL_FUNCS2A(ncclReduceScatter),
+  NCCL_FUNCS2A(ncclAllReduce)
+#endif
+};
+
+// Compile-time binary search over the half-open index range [f, l): each level
+// compares c->funcIndex against the midpoint and recurses into the matching
+// half until the single-index specialization below is reached.
+// call() is marked __device__ because it is only invoked from device code and
+// dereferences the device-side ncclFuncs table.
+template<unsigned short f, unsigned short l>
+struct Caller {
+  static __device__
+  void call(struct ncclColl* const c) noexcept
+  {
+    constexpr unsigned short m = f + (l - f) / 2;
+
+    return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
+  }
+};
+
+// Base case: the range holds exactly one index, so invoke that table entry.
+template<unsigned short f>
+struct Caller<f, f + 1>{
+  static __device__
+  void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
+};
+
+// Dispatch the collective selected by c->funcIndex.
+// Per the NCCL_FUNCS* tables each collective occupies 72 consecutive indices
+// (4 reduction ops x 9 data types x {regular, LL}), in the order Broadcast,
+// Reduce, AllGather, ReduceScatter, AllReduce; an odd index is the LL variant.
+inline
+__device__
+void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
+  const bool lowLatency = (c->funcIndex % 2) != 0;
+
+  // [0, 72): Broadcast -- every slot maps to the int8 copy implementation.
+  if (c->funcIndex < 72) {
+    if (lowLatency) ncclBroadcastRingLL_copy_i8(&c->args);
+    else ncclBroadcastRing_copy_i8(&c->args);
+    return;
+  }
+
+  // [72, 144): Reduce -- resolved through the generic table search.
+  if (c->funcIndex < 144) {
+    Caller<72, 144>::call(c);
+    return;
+  }
+
+  // [144, 216): AllGather -- every slot maps to the int8 copy implementation.
+  if (c->funcIndex < 216) {
+    if (lowLatency) ncclAllGatherRingLL_copy_i8(&c->args);
+    else ncclAllGatherRing_copy_i8(&c->args);
+    return;
+  }
+
+  // [216, 360): ReduceScatter and AllReduce.
+  Caller<216, 360>::call(c);
+}
+
+static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
  int* d = (int*)dst;
  int* s = (int*)src;
  // When aggregation is effective, if some threads have aborted inside the LL kernel,
  // make sure the rest of the threads abort as well
-  exitIfAbortBarrier(0);
+  exitIfAbortBarrier(0, abortCount);
  for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
  __syncthreads();
 }
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
-  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, uint32_t* abortCount) {
+  load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid, abortCount);
  if (tid == 0) hostColl->active = 0;
 }

@@ -56,23 +160,27 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  int tid = threadIdx.x; \
  int bid = blockIdx.x; \
  __shared__ struct ncclColl localColl; \
+  __shared__ uint32_t abortCount; \
+  if (tid == 0) abortCount = 0; \
+  __syncthreads(); \
 \
  struct ncclDevComm* comm = firstColl.args.comm; \
  struct ncclChannel* channel = comm->channels+bid; \
  struct ncclColl* c; \
+  channel->abortCount = &abortCount; \
  if (bid == 0) { \
    /* To optimize for latency, (only) the first operation is passed as argument.*/ \
    c = &firstColl; \
  } else { \
    c = &localColl; \
-    load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
+    load_coll(c, channel->devCollectives+channel->collFifoHead, tid, &abortCount); \
  } \
  while (1) { \
    if (tid < c->args.nThreads) { \
      if (c->funcIndex == fIndex) { \
        coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
      } else { \
-        ncclFuncs[c->funcIndex](&c->args); \
+        NCCL_CALL_FUNCTIONS(c); \
      } \
    } \
    int nextIndex = c->nextIndex; \
@@ -84,7 +192,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
 \
    /* Load next collective operation*/ \
    c = &localColl; /* for bid 0 */ \
-    load_coll(c, channel->devCollectives+nextIndex, tid); \
+    load_coll(c, channel->devCollectives+nextIndex, tid, &abortCount); \
  } \
 }
 #else
@@ -98,61 +206,19 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \

 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
-  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
+  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0)

-#if NCCL_TYPE == 0
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8)
-#elif NCCL_TYPE == 1
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, u8,  uint8_t,  ncclColl, ncclOp, ncclUint8)
-#elif NCCL_TYPE == 2
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i32, int32_t,  ncclColl, ncclOp, ncclInt32)
-#elif NCCL_TYPE == 3
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32)
-#elif NCCL_TYPE == 4
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, i64, int64_t,  ncclColl, ncclOp, ncclInt64)
-#elif NCCL_TYPE == 5
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64)
-#elif NCCL_TYPE == 6
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, f16, half,     ncclColl, ncclOp, ncclFloat16)
-#elif NCCL_TYPE == 7
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
-  IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32)
-#elif NCCL_TYPE == 8
 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8) \
+  IMPL_COLL3(coll, op, ncclFunc, u8,  uint8_t,  ncclColl, ncclOp, ncclUint8) \
+  IMPL_COLL3(coll, op, ncclFunc, i32, int32_t,  ncclColl, ncclOp, ncclInt32) \
+  IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
+  IMPL_COLL3(coll, op, ncclFunc, i64, int64_t,  ncclColl, ncclOp, ncclInt64) \
+  IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
+  IMPL_COLL3(coll, op, ncclFunc, f16, half,     ncclColl, ncclOp, ncclFloat16) \
+  IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32) \
  IMPL_COLL3(coll, op, ncclFunc, f64, double,   ncclColl, ncclOp, ncclFloat64)
-#endif

-// Reduction define all functions
-#if NCCL_OP == 0
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, sum,  FuncSum,  colln, ncclSum);
-#elif NCCL_OP == 1
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd);
-#elif NCCL_OP == 2
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, min,  FuncMin,  colln, ncclMin);
-#elif NCCL_OP == 3
-#define IMPL_COLL_R(collf, colln) \
-  IMPL_COLL2(collf, max,  FuncMax,  colln, ncclMax);
-#endif
-
-// Copy primitives only define one
-#if NCCL_OP == 0 && NCCL_TYPE == 0
-#define IMPL_COLL_C(collf, colln) \
-  IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
-#else
-#define IMPL_COLL_C(collf, colln)
-#endif
-
-#define COLL_UNROLL 4
+#define COLL_UNROLL 2

 #endif
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -11,13 +12,25 @@
 #include <cstdio>
 #include <cstdint>

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>

 // Define min for ssize_t
 static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }

 typedef uint64_t PackType;

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
+template<class FUNC, typename T>  // HIP/HCC build: a single generic MULTI with no element unpacking
+struct MULTI {
+  __device__ PackType operator()(const PackType x, const PackType y) const  // combines two packed 64-bit words
+  {
+    return FUNC()(x, y);  // forwards both words unchanged to a default-constructed FUNC; T is not used in this body
+  }
+};
+
+#else
+
 // unpack x and y to elements of type T and apply FUNC to each element
 template<class FUNC, typename T>
 struct MULTI {
@@ -192,6 +205,8 @@ struct MULTI<FUNC, int64_t> {
  }
 };

+#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
 template<typename T> inline __device__
 T vFetch(const volatile T* ptr) {
  return *ptr;
@@ -202,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
  *ptr = val;
 }

-#if CUDART_VERSION < 9000
+#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
 template<> inline __device__
 half vFetch<half>(const volatile half* ptr) {
  half r;
@@ -239,14 +254,24 @@ struct MULTI128 {
 };

 inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)  // HIP/HCC build: replace the PTX load with member-wise copies
+  v.x = p->x;  // NOTE(review): ordinary (non-volatile) read, whereas the PTX path below is ld.volatile — confirm this is intended
+  v.y = p->y;  // second 64-bit half of the pack
+#else
   asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
+#endif
 }
 inline __device__ void Store128(Pack128* p, Pack128& v) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)  // HIP/HCC build: replace the PTX store with member-wise copies
+  p->x = v.x;  // NOTE(review): ordinary (non-volatile) write, whereas the PTX path below is st.volatile — confirm this is intended
+  p->y = v.y;  // second 64-bit half of the pack
+#else
   asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
+#endif
 }

 template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads,
+__device__ void ReduceCopyMulti(const int tid, const int nthreads,
    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
    const int offset, const int N) {
  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
@@ -263,10 +288,10 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread
  }
 }

-#define WARP_SIZE 32
+#define WARP_SIZE 64

 template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
    const int elemOffset, const int Npack) {
  const int inc = nw * UNROLL * WARP_SIZE;
@@ -316,7 +341,7 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
 #define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))

 template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
    int N) {
  int Nrem = N;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,69 +9,5 @@
 #include "collectives.h"
 #include "common.h"

-#define NCCL_FUNC5(coll, op, dtype) \
-  NCCL_COLL_NAME(coll, op, dtype), \
-  NCCL_COLL_NAME(coll##LL, op, dtype)
-
-#define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##Tree, op, dtype)
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  u8), \
-  NCCL_FUNC4(coll, op, i32), \
-  NCCL_FUNC4(coll, op, u32), \
-  NCCL_FUNC4(coll, op, i64), \
-  NCCL_FUNC4(coll, op, u64), \
-  NCCL_FUNC4(coll, op, f16), \
-  NCCL_FUNC4(coll, op, f32), \
-  NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8), \
-  NCCL_FUNC4(coll, op,  i8)
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
-
-// Must be consistent with ncclColl_t
-#define NCCL_FUNCS() { \
-  NCCL_FUNCS2B(ncclBroadcast), \
-  NCCL_FUNCS2A(ncclReduce), \
-  NCCL_FUNCS2B(ncclAllGather), \
-  NCCL_FUNCS2A(ncclReduceScatter), \
-  NCCL_FUNCS2A(ncclAllReduce) }
-
-// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
-// Don't try to initialize the host shadow copy of this device-side global
-// variable. There is no host pointer to a device-side function, which
-// confuses clang. This will be fixed in the next clang release.
-#if __CUDA_ARCH__
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
-#endif
-};
-
 // Workaround for https://reviews.llvm.org/D55580
 __device__ void ncclWorkaroundClangD55580() {}
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -51,24 +52,29 @@ class ncclPrimitives {
  const T* recvBuff[NRECV];
  T* sendBuff[NSEND];
  struct ncclDevComm* comm;
+  uint32_t* abortCount;

-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
-  inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
-  inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
+  __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
+  __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
+  __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
+  __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }

-  inline __device__ void barrier() {
+  __device__ void barrier() {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    __syncthreads();
+#else
    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+#endif
  }

  uint32_t mismatch = 0;
  const uint64_t opCount;

-  inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+  __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
    if (mismatch) {
      // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
-      *(comm->fatalDevError) = ncclDevAssertedMismatch;
-    } else if (remoteOpCount && *remoteOpCount > opCount) {
+      STORE(comm->fatalDevError, ncclDevAssertedMismatch);
+    } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
      mismatch += 1;
    }
  }
@@ -76,63 +82,78 @@ class ncclPrimitives {
  uint32_t spins = 0;
  uint32_t abort = 0;

-  inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+  __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
    spins++;
+    abort = LOAD(comm->abortFlag);
    if (spins == SPINS_BEFORE_CHECK_ABORT) {
-      abort = *(comm->abortFlag);
      checkMismatch(remoteOpCount);
      spins = 0;
    }
    return abort;
  }

-  inline __device__ void waitRecv(int i) {
+  __device__ void waitRecv(int i) {
    spins = 0;
    mismatch = 0;
    recvStep[i] += SLICESTEPS;
    if (tid == i) {
-      while (*(waitPtr) < recvStep[i]) {
+#ifdef ENABLE_PROFILING
+      auto devProf = comm->devProf;
+      uint64_t t0 = clock64();
+#endif
+      while (LOAD(waitPtr) < recvStep[i]) {
        if (checkAbort(recvConn[i]->opCountRem)) break;
      }
+#ifdef ENABLE_PROFILING
+      __atomic_fetch_add(&devProf->wait_recv_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
+#endif
    }
  }

-  inline __device__ void waitSend(int i) {
+  __device__ void waitSend(int i) {
    spins = 0;
    mismatch = 0;
    sendStep[i] += SLICESTEPS;
    if (tid == WARP_SIZE+i) {
+#ifdef ENABLE_PROFILING
+      auto devProf = comm->devProf;
+      uint64_t t0 = clock64();
+#endif
      while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
-        sendConnHead[i] = *waitPtr;
+        sendConnHead[i] = LOAD(waitPtr);
        if (checkAbort(sendConn[i]->opCountRem)) break;
      }
+#ifdef ENABLE_PROFILING
+      __atomic_fetch_add(&devProf->wait_send_cycle[blockIdx.x], clock64() - t0, __ATOMIC_SEQ_CST);
+#endif
    }
  }

  inline __device__ void postRecv(int i) {
-    *(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
+    STORE(recvConn[i]->head, recvStep[i]);
  }

  inline __device__ void postSend(int i) {
-    *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
+    if (sendConn[i]->next_hdp_reg) STORE(sendConn[i]->next_hdp_reg, 0x1);
+    STORE(sendConn[i]->tail, sendStep[i]);
  }

-  inline __device__ void postSendSize(int i, int size) {
-    if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
+  __device__ void postSendSize(int i, int size) {
+    if (sendConn[i]->fifo) STORE(sendConn[i]->fifo+((sendStep[i]-SLICESTEPS)%NCCL_STEPS), size);
  }

  template <int DIRECTRECV>
-  inline __device__ const T* directRecvPtr(int i, int directOffset) {
+  __device__ const T* directRecvPtr(int i, int directOffset) {
    return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
  }

  template <int DIRECTSEND>
-  inline __device__ T* directSendPtr(int i, int directOffset) {
+  __device__ T* directSendPtr(int i, int directOffset) {
    return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
  }

  template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
-  inline __device__ void
+  __device__ void
  GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
    int offset = 0;
    int sliceSize = stepSize * SLICESTEPS;
@@ -154,157 +175,155 @@ class ncclPrimitives {
    #pragma unroll 1
    for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
      int realSize = max(0, min(sliceSize, nelem-offset));
-      if (tid < nthreads) {
-        FOR_SEND(waitSend);
-        FOR_RECV(waitRecv);
-        if (realSize > 0) {
-          barrier();
-          if (DIRECTRECV && recvDirectBuff[0]) {
-            // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
-            if (SEND) {
-              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
-            }
-          } else {
-            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+      FOR_SEND(waitSend);
+      FOR_RECV(waitRecv);
+      if (realSize > 0) {
+        barrier();
+        if (DIRECTRECV && recvDirectBuff[0]) {
+          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
+          if (SEND) {
+            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
          }
+        } else {
+          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
        }
-        exitIfAbortBarrier(abort);
-      } else {
-        exitIfAbortBarrier(abort);
-        FOR_SEND(postSendSize, realSize*sizeof(T));
-        if (SEND) __threadfence_system();
-        FOR_SEND(postSend);
-        FOR_RECV(postRecv);
      }
-      for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
-      for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
-      offset += sliceSize;
+      exitIfAbortBarrier(abort, abortCount);
+      if (tid == 0) FOR_SEND(postSendSize, realSize*sizeof(T));
+      if (SEND) __threadfence_system();
+      if (tid == 0) FOR_SEND(postSend);
+      if (tid == 0) FOR_RECV(postRecv);
    }
+    for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
+    for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
+    offset += sliceSize;
  }

-  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+  __device__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
    recvConn[i] = conn;
-    recvBuff[i] = (const T*)recvConn[i]->buff;
-    recvStep[i] = recvConn[i]->step;
+    recvBuff[i] = (const T*)LOAD(&recvConn[i]->buff);
+    recvStep[i] = LOAD(&recvConn[i]->step);
    recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
    // Return credits in case we rounded up.
-    if (tid == nthreads) *recvConn[i]->head = recvStep[i];
+    if (tid == 0) STORE(recvConn[i]->head, recvStep[i]);
    if (tid == i) {
-      waitPtr = recvConn[i]->tail;
-      *(recvConn[i]->opCountLoc) = opCount;
+      waitPtr = LOAD(&recvConn[i]->tail);
+      STORE(recvConn[i]->opCountLoc, opCount);
    }
    recvDirectBuff[i] = NULL;
    if (directBuff && recvConn[i]->direct) {
      recvDirectBuff[i] = directBuff;
-      if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
+      if (tid == 0) STORE(recvConn[i]->ptrExchange, directBuff);
    }
    nrecv++;
  }

-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+  __device__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
    sendConn[i] = conn;
-    sendBuff[i] = (T*)sendConn[i]->buff;
-    sendStep[i] = sendConn[i]->step;
+    sendBuff[i] = (T*)LOAD(&sendConn[i]->buff);
+    sendStep[i] = LOAD(&sendConn[i]->step);
    sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
    if (tid == WARP_SIZE+i) {
-      waitPtr = sendConn[i]->head;
-      sendConnHead[i] = *waitPtr;
-      *(sendConn[i]->opCountLoc) = opCount;
+      waitPtr = LOAD(&sendConn[i]->head);
+      sendConnHead[i] = LOAD(waitPtr);
+      STORE(sendConn[i]->opCountLoc, opCount);
    }
    sendDirectBuff[i] = NULL;
    if (directBuff && sendConn[i]->direct) {
      void* volatile* ptr = sendConn[i]->ptrExchange;
-      while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
+      while ((sendDirectBuff[i] = (T*)(LOAD(ptr))) == NULL);
      __syncthreads();
-      if (tid == 0) *ptr = NULL;
+      if (tid == 0) STORE(ptr, NULL);
    }
    nsend++;
  }

-  __device__ __forceinline__ void saveRecvConn(int i) {
+  __device__ void saveRecvConn(int i) {
    if (tid == i) {
-      recvConn[i]->step = recvStep[i];
+      STORE(&recvConn[i]->step, recvStep[i]);
      __threadfence_system();
-      *(recvConn[i]->opCountLoc) += 1;
+      __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
    }
  }

-  __device__ __forceinline__ void saveSendConn(int i) {
+  __device__ void saveSendConn(int i) {
    if (tid == WARP_SIZE+i) {
-      sendConn[i]->step = sendStep[i];
+      STORE(&sendConn[i]->step, sendStep[i]);
      __threadfence_system();
-      *(sendConn[i]->opCountLoc) += 1;
+      __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
    }
  }

 public:
-  __device__ __forceinline__
+  __device__
  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
    : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
    // Make sure step is updated before we read it
+    abortCount = channel->abortCount;
    __syncthreads();

-    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
-    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+    // disable directBuff
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, 0);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, 0);
  }

-  __device__ __forceinline__ void
+  __device__ void
  send(const T* src, int nelem) {
    GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
  }
-  __device__ __forceinline__ void
+  __device__ void
  directSend(const T* src, int directOffset, int nelem) {
    GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
  }

-  __device__ __forceinline__ void
+  __device__ void
  recv(T* dst, int nelem) {
    GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
  }
-  __device__ __forceinline__ void
+  __device__ void
  directRecv(T* dst, int directOffset, int nelem) {
    GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
  }

-  __device__ __forceinline__ void
+  __device__ void
  copySend(const T* src, T* dst, int nelem) {
    GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
  }
-  __device__ __forceinline__ void
+  __device__ void
  directCopySend(const T* src, T* dst, int directOffset, int nelem) {
    GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
  }

-  __device__ __forceinline__ void
+  __device__ void
  recvCopySend(T* dst, int nelem) {
    GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
  }
-  __device__ __forceinline__ void
+  __device__ void
  directRecvCopySend(T* dst, int directOffset, int nelem) {
    GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
  }

-  __device__ __forceinline__ void
+  __device__ void
  recvReduceCopy(const T* src, T* dst, int nelem) {
    GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
  }

-  __device__ __forceinline__ void
+  __device__ void
  recvReduceSend(const T* src, int nelem) {
    GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
  }

-  __device__ __forceinline__ void
+  __device__ void
  recvReduceCopySend(const T* src, T* dst, int nelem) {
    GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
  }
-  __device__ __forceinline__ void
+  __device__ void
  directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
    // Direct is only for the send part
    GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
  }

-  __device__ __forceinline__ ~ncclPrimitives() {
+  __device__ ~ncclPrimitives() {
    // Save steps for next collective. Have thread 0 do it to be compatible
    // with the way LL works.
    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
@@ -330,19 +349,22 @@ class ncclLLPrimitives {
  union ncclLLFifoLine* recvBuff[NRECV];
  union ncclLLFifoLine* sendBuff[NSEND];
  struct ncclDevComm* comm;
+  uint32_t* abortCount;

-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
-  inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
-  inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
-  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
+  __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+  __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+  __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+  __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
  // Exit If Abort Barrier : make sure all threads exit consistently
  // Each thread sets a predicate to true if val == 1
  // all CTA's threads enter the barrier and do a popc on their predicates being True
  // If any of the thread's predicate was True, all the threads call exit()
-  inline __device__ void exitIfAbortLocalBarrier() {
+  __device__ void exitIfAbortLocalBarrier() {
    uint32_t popc;
    asm ("{");
    asm volatile ("   .reg .pred barr_pred;");
@@ -354,20 +376,25 @@ class ncclLLPrimitives {
      exitIfAbortBarrier(1);
    }
  }
+#endif

-  inline __device__ void barrier() {
+  __device__ void barrier() {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    __syncthreads();
+#else
    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+#endif
  }

  uint32_t mismatch = 0;
  const uint64_t opCount;

-  inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+  __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
    if (mismatch > 20) {
      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
-      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
-    } else if (remoteOpCount && *remoteOpCount > opCount) {
+      STORE(comm->fatalDevError, ncclDevSuspectedMismatch);
+    } else if (remoteOpCount && LOAD(remoteOpCount) > opCount) {
      mismatch += 1;
    }
  }
@@ -375,37 +402,37 @@ class ncclLLPrimitives {
  uint32_t spins = 0;
  uint32_t abort = 0;

-  inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+  __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
    spins++;
+    abort = LOAD(comm->abortFlag);
    if (spins == SPINS_BEFORE_CHECK_ABORT) {
-      abort = *(comm->abortFlag);
      checkMismatch(remoteOpCount);
      spins = 0;
    }
    return abort;
  }

-  inline __device__ void waitSend(int i, int nbytes) {
+  __device__ void waitSend(int i, int nbytes) {
    spins = 0;
    mismatch = 0;
    if (tid == WARP_SIZE+i) {
      while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
-        sendConnHead = *waitPtr;
+        sendConnHead = LOAD(waitPtr);
        if (checkAbort(sendConn[i]->opCountRem)) break;
      }
      if (fifoPtr) {
        int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
-        fifoPtr[sendStep[i]%NCCL_STEPS] = size;
+        STORE(fifoPtr+sendStep[i]%NCCL_STEPS, size);
      }
    }
  }

-  inline __device__ void postRecv(int i) {
+  __device__ void postRecv(int i) {
    recvStep[i]++;
-    if (tid == i) *postPtr = recvStep[i];
+    if (tid == i) STORE(postPtr, recvStep[i]);
  }

-  inline __device__ void postSend(int i, int offset) {
+  __device__ void postSend(int i, int offset) {
    // LL Cleanup : write all flags in the slice to make sure we don't have
    // data corruption when flag loops over.
    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
@@ -414,22 +441,46 @@ class ncclLLPrimitives {
    sendStep[i]++;
  }

-  __device__ uint64_t readLL(int i, int offset) {
+  __device__ __attribute__((noinline)) uint64_t readLL(int i, int offset) {
    union ncclLLFifoLine* src = recvPtr(i) + offset;
    uint32_t flag = recvFlag(i);
    uint32_t data1, flag1, data2, flag2;
    spins = 0;
    mismatch = 0;
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    using Vec = uint32_t __attribute__((ext_vector_type(4)));
+    Vec i4;
+    do {
+      asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
+        "s_waitcnt vmcnt(0)\n"
+        "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
+      if (checkAbort(recvConn[i]->opCountRem)) break;
+    } while (i4[1] != flag || i4[3] != flag);
+    uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
+#else
    do {
      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
      if (checkAbort(recvConn[i]->opCountRem)) break;
    } while ((flag1 != flag) || (flag2 != flag));
    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+#endif
    return val64;
  }

-  __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+  __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  using Vec = uint32_t __attribute__((ext_vector_type(4)));
+  Vec i4;
+  i4[0] = val & 0xffffffff;
+  i4[1] = flag;
+  i4[2] = (val >> 32);
+  i4[3] = flag;
+  asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
+    "s_waitcnt vmcnt(0)\n"
+    "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
+#else
    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+#endif
  }

  // Using memcpy handles misaligned pointers.
@@ -453,7 +504,7 @@ class ncclLLPrimitives {
    uint64_t* dstPack = (uint64_t*)dstPtr;
    int offset = tid;
    // Do multiples of 64 bits
-    #pragma unroll 2
+    #pragma unroll 1
    for (; offset<npack; offset+=nthreads) {
      // Recv : local, then intra-node, then inter-node
      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
@@ -478,56 +529,61 @@ class ncclLLPrimitives {
        }
      }
    }
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    exitIfAbortBarrier(abort, abortCount);
+#else
    exitIfAbortLocalBarrier();
+#endif
    FOR_RECV(postRecv);
    FOR_SEND(postSend, offset);
  }

-  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+  __device__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
    recvConn[i] = conn;
    recvBuff[i] = recvConn[i]->llBuff;
    recvStep[i] = recvConn[i]->step;
    if (tid == i) {
      postPtr = recvConn[i]->head;
-      *(recvConn[i]->opCountLoc) = opCount;
+      STORE(recvConn[i]->opCountLoc, opCount);
    }
    nrecv++;
  }

-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+  __device__ void loadSendConn(struct ncclConnInfo* conn, int i) {
    sendConn[i] = conn;
    sendBuff[i] = sendConn[i]->llBuff;
    sendStep[i] = sendConn[i]->step;
    if (tid == WARP_SIZE+i) {
      waitPtr = sendConn[i]->head;
      fifoPtr = sendConn[i]->fifo;
-      sendConnHead = *waitPtr;
-      *(sendConn[i]->opCountLoc) = opCount;
+      sendConnHead = LOAD(waitPtr);
+      STORE(sendConn[i]->opCountLoc, opCount);
    }
    nsend++;
  }

-  __device__ __forceinline__ void saveRecvConn(int i) {
+  __device__ void saveRecvConn(int i) {
    if (tid == i) {
      recvConn[i]->step = recvStep[i];
-      *(recvConn[i]->opCountLoc) += 1;
+      __atomic_fetch_add(recvConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
      __threadfence_block();
    }
  }

-  __device__ __forceinline__ void saveSendConn(int i) {
+  __device__ void saveSendConn(int i) {
    if (tid == WARP_SIZE+i) {
      sendConn[i]->step = sendStep[i];
-      *(sendConn[i]->opCountLoc) += 1;
+      __atomic_fetch_add(sendConn[i]->opCountLoc, 1, __ATOMIC_SEQ_CST);
      __threadfence_block();
    }
  }

 public:
-  __device__ __forceinline__
+  __device__
  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
    : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
    // Make sure step is updated before we read it.
+    abortCount = channel->abortCount;
    barrier();

    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
@@ -562,10 +618,27 @@ class ncclLLPrimitives {
    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
  }

-  __device__ __forceinline__ ~ncclLLPrimitives() {
+  __device__ ~ncclLLPrimitives() {
    // Save steps for the next operation
    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
  }
 };
+
+#ifdef ENABLE_PROFILING  // profiling helpers; both macros expand to nothing when ENABLE_PROFILING is undefined
+#define INIT_COUNTER \
+  if (tid==0) { t0 = clock64(); ws = LOAD(&(devProf->wait_send_cycle[blockIdx.x])); \
+    wr = LOAD(&(devProf->wait_recv_cycle[blockIdx.x])); }  // thread 0 snapshots the clock and this block's wait counters into t0/ws/wr, which the expansion site must declare
+
+#define ACCUMULATE_COUNTER(prim) \
+  if (tid==0) { __atomic_fetch_add(&(devProf->prim##_cycle), clock64() - t0 \
+    + ws - LOAD(&(devProf->wait_send_cycle[blockIdx.x])) \
+    + wr - LOAD(&(devProf->wait_recv_cycle[blockIdx.x])), \
+    __ATOMIC_SEQ_CST); \
+    __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }  // adds cycles elapsed since t0 minus the wait cycles accrued since INIT_COUNTER to <prim>_cycle, and nelem*sizeof(T) to <prim>_byte
+#else
+#define INIT_COUNTER  // no-op when profiling is disabled
+#define ACCUMULATE_COUNTER(prim)  // no-op when profiling is disabled
+#endif
+
 #endif
@@ -1,11 +1,17 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "reduce.h"
 #include "common.h"
+#include "reduce.h"
 #include "collectives.h"

-IMPL_COLL_R(ncclReduce, ncclCollReduce);
+#define UNROLL 4  // NOTE(review): presumably consumed by the code the IMPL_COLL* macros expand to — confirm
+
+IMPL_COLL2(ncclReduce, sum,  FuncSum,  ncclCollReduce, ncclSum);  // one IMPL_COLL2 line per reduction operator: sum
+IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);  // product
+IMPL_COLL2(ncclReduce, min,  FuncMin,  ncclCollReduce, ncclMin);  // minimum
+IMPL_COLL2(ncclReduce, max,  FuncMax,  ncclCollReduce, ncclMax);  // maximum
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,9 +10,10 @@
 #include "collectives.h"

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -48,9 +50,11 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
 }

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
@@ -92,4 +96,5 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -18,6 +19,123 @@ struct FuncNull {
  }
 };

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
+// HIP path: no per-type functor specializations and no splitting of packed values into uint32_t.
+// ncclMinFunc: the smaller of x and y by operator<; x when y is not less than x.
+template<typename T>
+__device__ inline T ncclMinFunc(T x, T y) { return y < x ? y : x; }
+// ncclMaxFunc: the larger of x and y by operator<; y when y is not less than x.
+template<typename T>
+__device__ inline T ncclMaxFunc(T x, T y) { return y < x ? x : y; }
+
+// Base for the HIP reduction functors: exposes a PackType as n lanes of T.
+template<typename T>
+class FuncBase {
+protected:
+  // Number of T lanes in one PackType.
+  static constexpr auto n = sizeof(PackType) / sizeof(T);
+  // Overlays a PackType with an n-lane vector of T so either view can be used.
+  union Cvt {
+    using Vec = T __attribute__((ext_vector_type(n)));
+    PackType data;
+    Vec vec;
+    static_assert(sizeof(Vec) == sizeof(data), "Vec must be the same size of data.");
+  };
+};
+
+// half specialization: lanes are _Float16 (presumably because half cannot be a vector element; confirm).
+template<>
+class FuncBase<half> {
+protected:
+  static constexpr auto n = sizeof(PackType) / sizeof(_Float16);
+  // Overlays a PackType with an n-lane vector of _Float16.
+  union Cvt {
+    using Vec = _Float16 __attribute__((ext_vector_type(n)));
+    PackType data;
+    Vec vec;
+    static_assert(sizeof(Vec) == sizeof(data), "Vec must be the same size of data.");
+  };
+};
+
+// Sum reduction (HIP): lane-wise add over a PackType, or a plain add of two T values.
+template<typename T>
+struct FuncSum : private FuncBase<T> {
+  // Packed form: reinterpret x and y as vectors of T and add them lane by lane.
+  __device__ PackType operator()(PackType x, PackType y) const {
+    using Cvt = typename FuncBase<T>::Cvt;
+    Cvt tmp_x{x};
+    tmp_x.vec += Cvt{y}.vec;
+    return tmp_x.data;
+  }
+  // Scalar form; dropped only when T is PackType, where the packed form already applies.
+  template<typename U = T, typename std::enable_if<!std::is_same<U, PackType>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return x + y;
+  }
+};
+
+// Product reduction (HIP): lane-wise multiply over a PackType, or a plain multiply of two T values.
+template<typename T>
+struct FuncProd : private FuncBase<T> {
+  // Packed form: reinterpret x and y as vectors of T and multiply them lane by lane.
+  __device__ PackType operator()(PackType x, PackType y) const {
+    using Cvt = typename FuncBase<T>::Cvt;
+    Cvt tmp_x{x};
+    tmp_x.vec *= Cvt{y}.vec;
+    return tmp_x.data;
+  }
+  // Scalar form; dropped only when T is PackType, where the packed form already applies.
+  template<typename U = T, typename std::enable_if<!std::is_same<U, PackType>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return x * y;
+  }
+};
+
+// Max reduction (HIP): lane-wise maximum over a PackType, or the maximum of two T values.
+template<typename T>
+struct FuncMax : private FuncBase<T> {
+  // Packed form: reinterpret x and y as vectors of T and keep the larger lane of each pair.
+  __device__ PackType operator()(PackType x, PackType y) const {
+    using Cvt = typename FuncBase<T>::Cvt;
+    Cvt tmp_x{x};
+    Cvt tmp_y{y};
+    // Lanes are compared one at a time through ncclMaxFunc.
+    for (auto i = 0u; i != FuncBase<T>::n; ++i) {
+        tmp_x.vec[i] = ncclMaxFunc(tmp_x.vec[i], tmp_y.vec[i]);
+    }
+    return tmp_x.data;
+  }
+  // Scalar form; dropped only when T is PackType, where the packed form already applies.
+  template<typename U = T, typename std::enable_if<!std::is_same<U, PackType>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return (x < y) ? y : x;
+  }
+};
+
+// Min reduction (HIP): lane-wise minimum over a PackType, or the minimum of two T values.
+template<typename T>
+struct FuncMin : private FuncBase<T> {
+  // Packed form: reinterpret x and y as vectors of T and keep the smaller lane of each pair.
+  __device__ PackType operator()(PackType x, PackType y) const {
+    using Cvt = typename FuncBase<T>::Cvt;
+    Cvt tmp_x{x};
+    Cvt tmp_y{y};
+    // Lanes are compared one at a time through ncclMinFunc.
+    for (auto i = 0u; i != FuncBase<T>::n; ++i) {
+        tmp_x.vec[i] = ncclMinFunc(tmp_x.vec[i], tmp_y.vec[i]);
+    }
+    return tmp_x.data;
+  }
+  // Scalar form; dropped only when T is PackType, where the packed form already applies.
+  template<typename U = T, typename std::enable_if<!std::is_same<U, PackType>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return (x < y) ? x : y;
+  }
+};
+
+#else
+
 template<typename T>
 struct FuncSum {
  __device__ T operator()(const T x, const T y) const {
@@ -62,12 +180,15 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
 template<>
 struct FuncSum<int8_t> {
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
 #else
    return addChar4(x, y);
+#endif
 #endif
  }
  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -77,12 +198,15 @@ struct FuncSum<int8_t> {
 template<>
 struct FuncSum<uint8_t> {
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
    return rv;
 #else
    return addChar4(x, y);
+#endif
 #endif
  }
  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -126,6 +250,8 @@ template<>
 struct FuncMax<int8_t> {
  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -139,6 +265,7 @@ struct FuncMax<int8_t> {
    cr.a.z = max(cx.a.z, cy.a.z);
    cr.a.w = max(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -149,6 +276,8 @@ template<>
 struct FuncMax<uint8_t> {
  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -162,6 +291,7 @@ struct FuncMax<uint8_t> {
    cr.a.z = max(cx.a.z, cy.a.z);
    cr.a.w = max(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -173,6 +303,8 @@ template<>
 struct FuncMin<int8_t> {
  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -186,6 +318,7 @@ struct FuncMin<int8_t> {
    cr.a.z = min(cx.a.z, cy.a.z);
    cr.a.w = min(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -196,6 +329,8 @@ template<>
 struct FuncMin<uint8_t> {
  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
@@ -209,6 +344,7 @@ struct FuncMin<uint8_t> {
    cr.a.z = min(cx.a.z, cy.a.z);
    cr.a.w = min(cx.a.w, cy.a.w);
    return cr.storage;
+#endif
 #endif
  }
  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -299,4 +435,7 @@ struct FuncMin<half> {
    return __float2half(fm);
  }
 };
+
+#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
 #endif // REDUCE_KERNEL_H_
@@ -1,11 +1,17 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "reduce_scatter.h"
 #include "common.h"
+#include "reduce_scatter.h"
 #include "collectives.h"

-IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
+#define UNROLL 4
+
+IMPL_COLL2(ncclReduceScatter, sum,  FuncSum,  ncclCollReduceScatter, ncclSum);
+IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
+IMPL_COLL2(ncclReduceScatter, min,  FuncMin,  ncclCollReduceScatter, ncclMin);
+IMPL_COLL2(ncclReduceScatter, max,  FuncMax,  ncclCollReduceScatter, ncclMax);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -9,9 +10,10 @@
 #include "collectives.h"

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
  const int bid = args->bid;
  struct ncclDevComm* comm = args->comm;
  struct ncclChannel* channel = comm->channels+blockIdx.x;
@@ -62,9 +64,11 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
 }

 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int bid = args->bid;
@@ -120,4 +124,5 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,9 +9,9 @@
 #include "collectives.h"

 NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  struct ncclInfo info = { ncclCollReduce, "Reduce",
    sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
    REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,9 +9,9 @@
 #include "collectives.h"

 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
  struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
    REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -12,34 +13,33 @@

 // Only generate inline kernels for LL
 #define NCCL_FUNC5(coll, op, dtype) \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+  NCCL_KERN_NAME(coll##LL, op, dtype), \
+  NCCL_KERN_NAME(coll##LL, op, dtype)

 #define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
-  (void*)NCCL_FUNC5(coll##Tree, op, dtype)
+  NCCL_FUNC5(coll##Ring, op, dtype)

 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  u8), \
-  (void*)NCCL_FUNC4(coll, op, i32), \
-  (void*)NCCL_FUNC4(coll, op, u32), \
-  (void*)NCCL_FUNC4(coll, op, i64), \
-  (void*)NCCL_FUNC4(coll, op, u64), \
-  (void*)NCCL_FUNC4(coll, op, f16), \
-  (void*)NCCL_FUNC4(coll, op, f32), \
-  (void*)NCCL_FUNC4(coll, op, f64)
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  u8), \
+  NCCL_FUNC4(coll, op, i32), \
+  NCCL_FUNC4(coll, op, u32), \
+  NCCL_FUNC4(coll, op, i64), \
+  NCCL_FUNC4(coll, op, u64), \
+  NCCL_FUNC4(coll, op, f16), \
+  NCCL_FUNC4(coll, op, f32), \
+  NCCL_FUNC4(coll, op, f64)
 #define NCCL_FUNCS3B(coll, op) \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8)
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8)

 // Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
 #define NCCL_FUNCS2A(coll) \
@@ -53,8 +53,9 @@
  NCCL_FUNCS3B(coll, copy), \
  NCCL_FUNCS3B(coll, copy)

+typedef void(*ncclKern_t)(struct ncclColl);
 // Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
  NCCL_FUNCS2B(ncclBroadcast),
  NCCL_FUNCS2A(ncclReduce),
  NCCL_FUNCS2B(ncclAllGather),
@@ -66,33 +67,31 @@ static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
 /*       Launch system : synchronization and CUDA kernel launch              */
 /*****************************************************************************/

-ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
-#if CUDART_VERSION >= 9000
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
  if (cgMode & 0x01) {
-    CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
+    CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
            // These flags are to reduce the latency of using this API
-            cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
+            0));
    return ncclSuccess;
  }
-#endif
  int savedDev;
-  CUDACHECK(cudaGetDevice(&savedDev));
+  CUDACHECK(hipGetDevice(&savedDev));
  for (int i = 0; i < numDevices; i++) {
-    struct cudaLaunchParams* params = paramsList+i;
-    CUDACHECK(cudaSetDevice(cudaDevs[i]));
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+    hipLaunchParams* params = paramsList+i;
+    CUDACHECK(hipSetDevice(cudaDevs[i]));
+    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
  }
-  CUDACHECK(cudaSetDevice(savedDev));
+  CUDACHECK(hipSetDevice(savedDev));
  return ncclSuccess;
 }

-ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
+ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
  params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);

  // Set active = 2 for the last operation
  for (int r=0; r<params->gridDim.x; r++) {
    struct ncclChannel* channel = comm->channels+r;
-    channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
+    STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
  }

  // Find the first operation, choose the kernel accordingly and pass it
@@ -100,15 +99,15 @@ ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params)
  struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
  memcpy(&comm->args, coll, sizeof(struct ncclColl));
  // As we pass that coll directly, we can free it immediately.
-  coll->active = 0;
+  STORE(&coll->active, 0);

-  params->func = ncclKerns[coll->funcIndex];
+  params->func = (void *)ncclKerns[coll->funcIndex];
  return ncclSuccess;
 }

 ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
+  int val = LOAD(ptr);
  bool done = false;
  while (done == false) {
    if (val >= comm->intraRanks) {
@@ -130,7 +129,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {

 ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
+  int val = LOAD(ptr);
  if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
    WARN("Trying to launch too many collectives");
    return ncclInternalError;
@@ -140,28 +139,28 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {

 ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  while (*ptr < comm->intraRanks) pthread_yield();
+  while (LOAD(ptr) < comm->intraRanks) pthread_yield();
  comm->intraPhase ^= 1;
  return ncclSuccess;
 }

 ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
  if (comm->nRanks == 1) return ncclSuccess;
-  struct cudaLaunchParams* params = comm->myParams;
+  hipLaunchParams* params = comm->myParams;

  NCCLCHECK(setupLaunch(comm, params));

  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
    // Enqueue event in user stream
-    CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
+    CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
    // Create dependency between user stream and internal NCCL stream
-    CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+    CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
    params->stream = comm->groupStream;
  } else {
    if (comm->userStream != params->stream) {
      // Stream changed from last call, create dependency against last NCCL kernel launch
-      CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+      CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
    }
    params->stream = comm->userStream;
  }
@@ -192,12 +191,12 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {

  NCCLCHECK(ncclCpuBarrierOut(comm));

-  struct cudaLaunchParams *params = comm->myParams;
+  hipLaunchParams *params = comm->myParams;
  if (comm->launchMode == ncclComm::PARALLEL) {
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+    hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
  }
  // Start the network proxies as soon as the kernel has been launched. We can't
-  // perform any CUDA call between the two or having a cudaFree between the CUDA
+  // perform any CUDA call between the two or having a hipFree between the CUDA
  // launch and the transportStartProxy call could cause a deadlock.
  // Also, starting the proxies after the CUDA launch seems to be better for
  // performance (latency).
@@ -212,13 +211,13 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
 }

 ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
-  struct cudaLaunchParams *params = comm->myParams;
+  hipLaunchParams *params = comm->myParams;
  // Enqueue event after NCCL kernel
-  CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
+  CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
    // Create dependency between NCCL internal stream and user stream
-    CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
  }
  comm->userStreamSet = false;
  return ncclSuccess;
@@ -292,7 +291,7 @@ static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* n
  } else {
    *llMode = 0;
    *nChannels = info->comm->nChannels;
-    *nThreads = info->comm->nThreads+1;
+    *nThreads = info->comm->nThreads;
  }
 }

@@ -356,7 +355,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
 static ncclResult_t saveKernel(struct ncclInfo* info) {
  if (info->comm->nRanks == 1) {
    if (info->sendbuff != info->recvbuff)
-      CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
+      CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
    return ncclSuccess;
  }

@@ -390,12 +389,12 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
    int opIndex = channel->collFifoTail;
    struct ncclColl* c = channel->collectives+opIndex;
    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-    while (activePtr[0] != 0) sched_yield();
+    while (LOAD(activePtr) != 0) sched_yield();

    memcpy(c, &coll, sizeof(struct ncclColl));

    c->args.bid = bid;
-    c->active = 1;
+    STORE(&c->active, 1);
    opIndex = (opIndex+1)%NCCL_MAX_OPS;
    c->nextIndex = opIndex;
    channel->collFifoTail = opIndex;
@@ -418,8 +417,8 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
    ncclResult_t ret = ncclSuccess;
    int savedDev = -1;
    if (info->comm->checkPointers) {
-      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
-      CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
+      CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
+      CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
    }
    // Check arguments
    NCCLCHECKGOTO(ArgsCheck(info), ret, end);
@@ -428,7 +427,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
    NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
    NCCLCHECKGOTO(saveKernel(info), ret, end);
 end:
-    if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
+    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
    ncclAsyncErrCheck(ret);
    return ret;
  } else {
@@ -12,14 +12,14 @@
 #include <sys/mman.h>

 static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
  memset(*ptr, 0, size);
  *devPtr = *ptr;
  return ncclSuccess;
 }

 static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(cudaFreeHost(ptr));
+  CUDACHECK(hipHostFree(ptr));
  return ncclSuccess;
 }

@@ -36,15 +36,18 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
 }

 template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
-  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
+  if (isFineGrain)
+    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
+  else
+    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
  return ncclSuccess;
 }

 template <typename T>
 static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
  return ncclSuccess;
 }

@@ -11,17 +11,17 @@

 // Check CUDA calls
 #define CUDACHECK(cmd) do {                                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
        return ncclUnhandledCudaError;                      \
    }                                                       \
 } while(false)

 #define CUDACHECKGOTO(cmd, res, label) do {                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
        res = ncclUnhandledCudaError;                       \
        goto label;                                         \
    }                                                       \
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,21 +8,10 @@
 #ifndef NCCL_COMM_H_
 #define NCCL_COMM_H_

-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
-  void *func;
-  dim3 gridDim;
-  dim3 blockDim;
-  void **args;
-  size_t sharedMem;
-  cudaStream_t stream;
-};
-#endif
-
 #define MAXCHANNELS 16
 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */

-#define CACHE_LINE_SIZE 128
+#define CACHE_LINE_SIZE 64
 #define MEM_ALIGN 4096
 #define CUDA_IPC_MIN 2097152UL

@@ -66,9 +56,9 @@ struct ncclComm {
  int nvmlDev; // my NVML device number

  enum { GROUP, PARALLEL } launchMode;
-  cudaStream_t userStream;
+  hipStream_t userStream;
  bool userStreamSet;
-  cudaEvent_t doneEvent;
+  hipEvent_t doneEvent;
  bool checkPointers;

  // Counter to make sure collectives match (needed for bcast/reduce
@@ -88,7 +78,7 @@ struct ncclComm {

  // An internal CUDA stream for NCCL kernel CGMD launches
  int groupCudaStream;
-  cudaStream_t groupStream;
+  hipStream_t groupStream;

  // Whether there has been a fatal error in this communicator.
  ncclResult_t fatalError;
@@ -111,13 +101,13 @@ struct ncclComm {
  int intraPhase;

  // Storage for deferred intra-process launch
-  struct cudaLaunchParams * intraParams;
-  struct cudaLaunchParams *myParams;
+  hipLaunchParams * intraParams;
+  hipLaunchParams *myParams;
  int* intraCudaDevs;
  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
  struct ncclColl args;
-  void* argsptr;
+  struct ncclColl* argsptr;

  // Global proxy thread
  pthread_t proxyThread;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,6 +11,15 @@
 #include "nccl.h"
 #include <stdint.h>

+// LOAD/STORE take a pointer: seq_cst atomics on HIP, plain dereference/assignment elsewhere.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+#else
+#define LOAD(VAR) (*(VAR))
+#define STORE(DST, SRC) (*(DST) = (SRC))
+#endif
+
 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8

@@ -73,6 +83,12 @@ struct ncclConnInfo {
  // Low latency mechanism
  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
  uint64_t llLastCleaning;
+
+  // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
+  // allows software to explicitly initiate a flush read to HDP memory. See more
+  // descriptions in primitives.h.
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 struct ncclConnector {
@@ -111,6 +127,8 @@ struct ncclPeer {

 struct ncclDevComm;

+#pragma pack(push)  /* push current alignment to stack */
+#pragma pack(4)     /* set alignment to 4 bytes boundary */
 /* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
 /* to make sure reads to host from the CUDA kernel are aligned. */
 /* Make sure to adjust padding at the end of ncclColl. */
@@ -165,14 +183,56 @@ struct ncclChannel {
      int collCount;
      int collFifoHead; // Only used by GPU
      int collFifoTail; // Only used by CPU
+
+      uint32_t* abortCount;
    };
    int data[0x80];
  };
 };
 static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+#pragma pack(pop)   /* restore original alignment from stack */

 #define MAXCHANNELS 16

+#ifdef ENABLE_PROFILING
+struct ncclProf {  // profiling counters; only declared when ENABLE_PROFILING is defined
+  union {
+    struct {
+      uint64_t total_cycle;
+      uint64_t wait_send_cycle[MAXCHANNELS];  // array sized by MAXCHANNELS
+      uint64_t wait_recv_cycle[MAXCHANNELS];  // array sized by MAXCHANNELS
+      // primitive cycles
+      uint64_t send_cycle;
+      uint64_t directSend_cycle;
+      uint64_t recv_cycle;
+      uint64_t directRecv_cycle;
+      uint64_t copySend_cycle;
+      uint64_t directCopySend_cycle;
+      uint64_t recvCopySend_cycle;
+      uint64_t directRecvCopySend_cycle;
+      uint64_t recvReduceCopy_cycle;
+      uint64_t recvReduceSend_cycle;
+      uint64_t recvReduceCopySend_cycle;
+      uint64_t directRecvReduceCopySend_cycle;
+      // primitive bytes
+      uint64_t send_byte;
+      uint64_t directSend_byte;
+      uint64_t recv_byte;
+      uint64_t directRecv_byte;
+      uint64_t copySend_byte;
+      uint64_t directCopySend_byte;
+      uint64_t recvCopySend_byte;
+      uint64_t directRecvCopySend_byte;
+      uint64_t recvReduceCopy_byte;
+      uint64_t recvReduceSend_byte;
+      uint64_t recvReduceCopySend_byte;
+      uint64_t directRecvReduceCopySend_byte;
+    };
+    int data[0x80];  // overlays the counters; makes the union at least 0x80 ints in size
+  };
+};
+#endif
+
 typedef enum {
  ncclDevSuccess,
  ncclDevAssertedMismatch,
@@ -189,6 +249,11 @@ struct ncclDevComm {

  // Channels, device side
  struct ncclChannel* channels;
+
+#ifdef ENABLE_PROFILING
+  // Profiling counters
+  struct ncclProf* devProf;
+#endif
 };

 #endif
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -12,9 +13,9 @@

 // Channels / LL tuning
 #define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64  // Per thread size before we switch to non-LL
+#define NCCL_THREAD_THRESHOLD 256  // Per thread size before we switch to non-LL
 #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MIN_NTHREADS 64
+#define NCCL_LL_MIN_NTHREADS 256

 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -18,7 +19,7 @@ typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueI
 ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);

 typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);

 ncclResult_t ncclAsyncColl(ncclComm_t comm);
 #endif
@@ -31,7 +31,7 @@ struct ncclInfo {
  ncclRedOp_t op;
  int root;
  ncclComm_t comm;
-  cudaStream_t stream;
+  hipStream_t stream;
  // Algorithm details
  int chunkSteps;
  int sliceSteps;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -58,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
 /* Get the maximum number of NVLinks based on the GPU generation */
 static ncclResult_t getMaxNvlinks(int* maxLinks) {
  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  int ccMajor;
-  CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
+  CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
  // 6 for Volta, 4 for Pascal
  *maxLinks = (ccMajor > 6) ? 6 : 4;
  // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
@@ -0,0 +1,30 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVLINK_H_
+#define NCCL_NVLINK_H_
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "nvmlwrap.h"
+#include "topo.h"
+
+#define CONNECT_NVLINK 0x10
+#define CONNECT_NVSWITCH 0x100
+
+enum ncclNvLinkDeviceType {
+  ncclNvLinkDeviceGpu,
+  ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+static int getNvlinkGpu(const char* busId1, const char* busId2) {
+  const int nvlinkCount = 0;  // stub: both bus ids are ignored and no link is ever counted
+  return nvlinkCount*CONNECT_NVLINK;
+}
+
+#endif
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,8 +9,11 @@
 #define NCCL_RINGS_H_

 static int getDefaultThreads() {
-  // On Kepler, rings are doubled later.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  return 256;
+#else  // On Kepler, rings are doubled later.
  return ncclCudaCompCap() == 3 ? 128 : 256;
+#endif
 }

 ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -39,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
  ncclResult_t res = ncclSuccess;

  NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
-  CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
-  CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+  CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError);
+  CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError);

  *shmPtr = ptr;
  return ncclSuccess;
 sysError:
  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
-cudaError:
+hipError:  // common cleanup: reached by goto on HIP failures and by fall-through from sysError
  if (fd != -1) close(fd);
  if (create) shm_unlink(shmname);
  if (ptr != MAP_FAILED) munmap(ptr, shmsize);
@@ -60,7 +61,7 @@ static ncclResult_t shmUnlink(const char* shmname) {
 }

 static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
-  CUDACHECK(cudaHostUnregister(shmPtr));
+  CUDACHECK(hipHostUnregister(shmPtr));
  if (munmap(shmPtr, shmsize) != 0) {
    WARN("munmap of shared memory failed");
    return ncclSystemError;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -11,6 +12,7 @@
 #include <stdint.h>

 ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();

@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -19,7 +20,11 @@
 #include "checks.h"
 #include "enqueue.h"
 #include "topo.h"
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#include "nvlink_stub.h"
+#else
 #include "nvlink.h"
+#endif
 #include "cpuset.h"
 #include <stdio.h>
 #include <stdlib.h>
@@ -29,7 +34,7 @@
 #include <sched.h>
 #include <fcntl.h>
 #include <unistd.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <string.h>
 #include <errno.h>
 #include <assert.h>
@@ -47,7 +52,7 @@ FILE *ncclDebugFile = stdout;
 std::chrono::high_resolution_clock::time_point ncclEpoch;
 #endif

-#if CUDART_VERSION >= 9020
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
 #else
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -63,9 +68,9 @@ ncclNet_t* ncclNet = NULL;
 #pragma weak ncclNvlinkGpu
 ncclResult_t ncclNvlinkGpu(int* nvlink) {
  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
  *nvlink = getNvlinkGpu(busId, NULL);
  return ncclSuccess;
 }
@@ -73,17 +78,17 @@ ncclResult_t ncclNvlinkGpu(int* nvlink) {
 #pragma weak ncclCudaCompCap
 int ncclCudaCompCap() {
  int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
  int ccMajor;
-  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
  return ccMajor;
 }
 int ncclCudaFullCompCap() {
  int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
  int ccMajor, ccMinor;
-  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
-  if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
+  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
+  if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
  return ccMajor*10+ccMinor;
 }

@@ -140,7 +145,7 @@ ncclResult_t initNet() {

 NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
 NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
-NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);
+NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0);

 int ncclThreadThreshold(int minCompCap, int multiNode) {
  int threshold = ncclParamThreadThreshold();
@@ -154,6 +159,22 @@ int ncclThreadThreshold(int minCompCap, int multiNode) {
  return threshold;
 }

+bool useFineGrainVramPcie = false;
+// Sets useFineGrainVramPcie from HSA_FORCE_FINE_GRAIN_PCIE (only 0 or 1 accepted); unset or empty leaves it unchanged.
+void parseHsaForceFineGrainVramPcie() {
+  const char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
+  if (str == NULL || str[0] == '\0') return;
+  char* end = NULL;
+  errno = 0;
+  int64_t v = strtoll(str, &end, 0);
+  if (errno || end == str || (v != 0 && v != 1)) {  // end == str: no digits consumed, e.g. "abc"
+    INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
+  } else {
+    useFineGrainVramPcie = (v == 1);
+    INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
+  }
+}
+
 pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
 static bool initialized = false;
 static ncclResult_t ncclInit() {
@@ -165,6 +186,8 @@ static ncclResult_t ncclInit() {
    initNet();
    initialized = true;
  }
+  // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
+  parseHsaForceFineGrainVramPcie();
  pthread_mutex_unlock(&initLock);
  return ncclSuccess;
 }
@@ -192,22 +215,51 @@ static ncclResult_t commFree(ncclComm_t comm) {
  if (comm == NULL)
    return ncclSuccess;

+#ifdef ENABLE_PROFILING
+  // Dump this rank's device profiling counters; the host snapshot lives on the stack so an early CUDACHECK return cannot leak it.
+  struct ncclProf profHost, *prof = &profHost;
+  CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost));
+  uint64_t wait_send_cycle = 0, wait_recv_cycle = 0;
+  for (int chan=0; chan<comm->nChannels; chan++) {
+    wait_send_cycle += prof->wait_send_cycle[chan];
+    wait_recv_cycle += prof->wait_recv_cycle[chan];
+  }
+  #define VEGA_GPU_RTC_FREQUENCY 2.7E7
+  if (comm->rank == 0) {
+    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", "w_send", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
+    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
+  }
+  INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
+    comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
+    (double)wait_send_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
+    (double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
+    (prof->send_cycle) ? (double)prof->send_byte*comm->nChannels/((double)prof->send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->recvReduceSend_cycle) ? (double)prof->recvReduceSend_byte*comm->nChannels/((double)prof->recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->directRecvReduceCopySend_cycle) ? (double)prof->directRecvReduceCopySend_byte*comm->nChannels/((double)prof->directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->directRecvCopySend_cycle) ? (double)prof->directRecvCopySend_byte*comm->nChannels/((double)prof->directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->directRecv_cycle) ? (double)prof->directRecv_byte*comm->nChannels/((double)prof->directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->copySend_cycle) ? (double)prof->copySend_byte*comm->nChannels/((double)prof->copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->recv_cycle) ? (double)prof->recv_byte*comm->nChannels/((double)prof->recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
+    (prof->recvCopySend_cycle) ? (double)prof->recvCopySend_byte*comm->nChannels/((double)prof->recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
+  CUDACHECK(hipFree(comm->hostDevComm.devProf));
+#endif
+
  free(comm->peerInfo);

  if (comm->bootstrap)
    NCCLCHECK(bootstrapClose(comm->bootstrap));

-  CUDACHECK(cudaFree(comm->hostDevComm.channels));
-  CUDACHECK(cudaFree(comm->devComm));
+  CUDACHECK(hipFree(comm->hostDevComm.channels));
+  CUDACHECK(hipFree(comm->devComm));

  for (int channel=0; channel<comm->nChannels; channel++)
    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));

  if (comm->doneEvent != NULL)
-    CUDACHECK(cudaEventDestroy(comm->doneEvent));
+    CUDACHECK(hipEventDestroy(comm->doneEvent));

  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(cudaStreamDestroy(comm->groupStream));
+    CUDACHECK(hipStreamDestroy(comm->groupStream));
  }

  // Last rank frees shared resources between threads
@@ -220,8 +272,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
    free(comm->intraCGMode);
    free(comm->intraCC);
  }
-  CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
-  CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
+  CUDACHECK(hipHostFree((void *)comm->abortFlag));
+  CUDACHECK(hipHostFree((void *)comm->fatalDevError));

  // Poison comm to try and catch a double free
  commPoison(comm);
@@ -242,15 +294,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {

  // Try to create a CUDA object right away. If there is something wrong with
  // the device we're on (failure cause #1) , better know it early.
-  cudaEvent_t doneEvent;
-  CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
+  hipEvent_t doneEvent;
+  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));

  struct ncclComm* comm;
  NCCLCHECK(ncclCalloc(&comm, 1));

  comm->rank = comm->hostDevComm.rank =rank;
  comm->nRanks = comm->hostDevComm.nRanks = ndev;
-  cudaGetDevice(&comm->cudaDev);
+  hipGetDevice(&comm->cudaDev);
  getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);

@@ -258,7 +310,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  comm->llThreshold = ncclParamLlThreshold();
  comm->treeThreshold = ncclParamTreeThreshold();
  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9020
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
  comm->groupCudaStream = ncclParamGroupCudaStream();
 #else
  // Don't allow the user to overload the default setting in older CUDA builds
@@ -267,12 +319,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  comm->fatalError = ncclSuccess;

  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
-  *comm->fatalDevError = ncclDevSuccess;
+  STORE(comm->fatalDevError, ncclDevSuccess);

  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
-  *comm->abortFlag = 0;
+  STORE(comm->abortFlag, 0);

  comm->argsptr = &comm->args;
+#ifdef ENABLE_PROFILING
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1));
+#endif

  *comret = comm;
  return ncclSuccess;
@@ -296,7 +351,11 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 }

 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
+#else
 #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+#endif
 static void showVersion() {
  static int shown = 0;
  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
@@ -308,26 +367,31 @@ static void showVersion() {
  }
 }

-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
  info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
+  CUDACHECK(hipGetDevice(&info->cudaDev));
  NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
+  info->hostHash=getHostHash()+commHash;
+  info->pidHash=getPidHash()+commHash;

  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
  // cudaDev is a CUDA runtime dev number which could be different from the
  // NVML device number. Then we get the busID from NVML to be sure it is
  // consistent with NVML remote PCI bus Ids.
-  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+  CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
  nvmlDevice_t nvmlDevice;
  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
  nvmlPciInfo_t pciInfo;
  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
+#endif
  return ncclSuccess;
 }

+static ncclResult_t setCpuAffinity(int cudaDev);
+
 template <int type>
 static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
  for (int t=0; t<NTRANSPORTS; t++) {
@@ -336,8 +400,14 @@ static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeer
    ncclTvalue_t ret = 0;
    NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
    if (ret > 0) {
+      cpu_set_t affinitySave;
+      sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+      int cudaDev;
+      CUDACHECK(hipGetDevice(&cudaDev));
+      setCpuAffinity(cudaDev);
      connector->transportComm = transportComm;
      NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
+      sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
      return ncclSuccess;
    }
  }
@@ -564,13 +634,13 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int

 void* waitForNonNullPtr(void* p) {
  volatile void** ptr = (volatile void**) p;
-  while (*ptr == NULL) sched_yield();
-  return (void*)*ptr;
+  while (LOAD(ptr) == NULL) sched_yield();
+  return (void*)(LOAD(ptr));
 }

 ncclResult_t initParams(struct ncclComm* comm) {
-  struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
-  params->args = &comm->argsptr;
+  hipLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
+  params->args =(void **)&comm->argsptr;
  params->stream = NULL;
  params->sharedMem = 0;
  params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -603,7 +673,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
    comm->intraCC = CC;
  } else {
    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
-    comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+    comm->intraParams = (hipLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
@@ -611,7 +681,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
  NCCLCHECK(initParams(comm));

-  int cgMdLaunch = 0;
+  int cgMdLaunch = 1;

  // Set CG Mode
  comm->launchMode = ncclComm::GROUP;
@@ -620,11 +690,11 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
    comm->launchMode = ncclComm::PARALLEL;
  }
  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
+    CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
 #if CUDART_VERSION >= 9000
    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
      // Check whether the GPU supports Cooperative Group Multi Device Launch
-      (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+      (void) hipDeviceGetAttribute(&cgMdLaunch, hipDeviceAttributeCooperativeMultiDeviceLaunch, comm->cudaDev);
    }
 #endif
  }
@@ -691,7 +761,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm

  int rank = comm->rank;
  int nranks = comm->nRanks;
-  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));

  // AllGather1 - begin
@@ -702,7 +773,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm

  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
  allGather1Data[rank].comm = comm;
-  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));

  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -945,7 +1016,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId

  // Make sure all host memory allocation are close to the GPU
  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(setCpuAffinity(cudaDev));
  ncclResult_t res;

@@ -976,7 +1047,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
  if (myrank == 0) showVersion();

  // Make sure the CUDA runtime is initialized.
-  CUDACHECK(cudaFree(NULL));
+  CUDACHECK(hipFree(NULL));

  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
  if (nranks < 1 || myrank < 0 || myrank >= nranks) {
@@ -986,7 +1057,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm

  if (ncclAsyncMode()) {
    int cudaDev;
-    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(hipGetDevice(&cudaDev));
    return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
  } else {
    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
@@ -997,8 +1068,8 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
  struct ncclPeerInfo* allInfo;
  NCCLCHECK(ncclCalloc(&allInfo, nranks));
  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(cudaSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank));
+    CUDACHECK(hipSetDevice(devs[rank]));
+    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
  }

  int* connectTransport;
@@ -1020,7 +1091,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
  int myCompCap = ncclCudaCompCap();
  int minCompCap = myCompCap;
  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(cudaSetDevice(devs[rank]));
+    CUDACHECK(hipSetDevice(devs[rank]));
    int nringsRank;
    int nthreadsRank = getDefaultThreads();
    myCompCap = ncclCudaCompCap();
@@ -1061,7 +1132,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
  for (int r=0; r<nrings; r++) {
    int* ringRanks = rings+r*nranks;
    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(cudaSetDevice(devs[rank]));
+      CUDACHECK(hipSetDevice(devs[rank]));
      struct ncclChannel* channel = comms[rank]->channels+r;
      struct ncclRing *ring = &channel->ring;
      NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
@@ -1075,7 +1146,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
      NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
    }
    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(cudaSetDevice(devs[rank]));
+      CUDACHECK(hipSetDevice(devs[rank]));
      struct ncclChannel* channel = comms[rank]->channels+r;
      struct ncclRing *ring = &channel->ring;
      struct ncclConnector* recv = &channel->peers[ring->prev].recv;
@@ -1118,7 +1189,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
    ncclDevList[i] = devlist ? devlist[i] : i;
  }

-  CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
+  CUDACHECKGOTO(hipGetDevice(&savedDevice), res, cleanup);

  for(rank=0; rank<ndev; ++rank)
    comms[rank] = NULL;
@@ -1128,7 +1199,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {

  for (rank=0; rank<ndev; ++rank) {
    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);

    NCCLCHECK(setCpuAffinity(cudaDev));

@@ -1144,7 +1215,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {

  for(rank=0; rank<ndev; ++rank) {
    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
  }

@@ -1162,7 +1233,7 @@ final:
  free(ncclDevList);
  if(wrapNvmlShutdown() != ncclSuccess)
    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
-  cudaSetDevice(savedDevice);
+  hipSetDevice(savedDevice);
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return res;
 }
@@ -1173,21 +1244,21 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
 #ifdef ENABLE_TRACE
  int rank = comm->rank;
 #endif
-  CUDACHECK(cudaGetDevice(&savedDevice));
+  CUDACHECK(hipGetDevice(&savedDevice));
  int commDevice = comm->cudaDev;

  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(commDevice));
+    CUDACHECK(hipSetDevice(commDevice));
  }

-  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, LOAD(comm->abortFlag), comm->fatalError);

-  CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+  CUDACHECK(hipStreamSynchronize(comm->groupStream));
  NCCLCHECK(transportDestroyProxy(comm));
  NCCLCHECK(commFree(comm));

  if (savedDevice != commDevice)
-    CUDACHECK(cudaSetDevice(savedDevice));
+    CUDACHECK(hipSetDevice(savedDevice));

  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);

@@ -1216,9 +1287,11 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
    return ncclSuccess;

  // Ask anything that might still be running on the device to quit
-  *comm->abortFlag = 1;
+  STORE(comm->abortFlag, 1);

-  return commDestroy(comm);
+  // Do not destroy comm here because a kernel may still be running
+  // return commDestroy(comm);
+  return ncclSuccess;
 }

 NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
@@ -1241,7 +1314,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {

  // Check device reported error
  static ncclDevError_t printedDevErr = ncclDevSuccess;
-  switch(*comm->fatalDevError) {
+  switch(LOAD(comm->fatalDevError)) {
    case ncclDevSuccess :
      break;
    case ncclDevAssertedMismatch :
@@ -7,16 +7,16 @@
 #include "argcheck.h"

 static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
-  cudaPointerAttributes attr;
-  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
-  if (err != cudaSuccess || attr.devicePointer == NULL) {
+  hipPointerAttribute_t attr;
+  hipError_t err = hipPointerGetAttributes(&attr, pointer);
+  if (err != hipSuccess || attr.devicePointer == NULL) {
    WARN("%s : %s is not a valid pointer", opname, ptrname);
    return ncclInvalidArgument;
  }
 #if CUDART_VERSION >= 10000
-  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+  if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
 #else
-  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+  if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
 #endif
    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
    return ncclInvalidArgument;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -52,7 +53,7 @@ struct ncclAsyncArgs {
 thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];

 ncclResult_t ncclSetDevice(int cudaDev) {
-  CUDACHECK(cudaSetDevice(cudaDev));
+  CUDACHECK(hipSetDevice(cudaDev));
  return ncclSuccess;
 }

@@ -116,7 +117,7 @@ ncclResult_t ncclGroupEnd() {
  ncclGroupMode--;
  if (ncclGroupMode > 0) return ncclSuccess;
  int savedDev;
-  CUDACHECK(cudaGetDevice(&savedDev));
+  CUDACHECK(hipGetDevice(&savedDev));
  int done = ncclGroupIndex;
  int doneArray[MAX_ASYNC_OPS];
  for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
@@ -129,22 +130,22 @@ ncclResult_t ncclGroupEnd() {
   * 2. Barrier Wait. No CUDA call is permitted
   * 3. Enqueue Events. CUDA event wait/enqueue.
   * This is needed because step 2 cannot call any CUDA primitive, otherwise if
-   * cudaFree happens between 1 and 3, it could block that CUDA call and
+   * hipFree happens between 1 and 3, it could block that CUDA call and
   * prevent some ranks from launching their network threads, which would
-   * prevent the NCCL call from completing, blocking the cudaFree call.
+   * prevent the NCCL call from completing, blocking the hipFree call.
   */
  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
    if (args->funcType == ASYNC_FUNC_COLL) {
      if (args->coll.comm->userStream == NULL)
-        CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+        CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
      NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end);
    }
  }
  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
    if (args->funcType == ASYNC_FUNC_COLL) {
-      CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+      CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
      NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end);
    }
  }
@@ -152,7 +153,7 @@ ncclResult_t ncclGroupEnd() {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
    if (args->funcType == ASYNC_FUNC_COLL) {
      if (args->coll.comm->userStream == NULL)
-        CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+        CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
      NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
      doneArray[i] = 1;
      done--;
@@ -182,7 +183,7 @@ group_cleanup:
    for (int c=0; c<comm->nChannels; c++) {
      struct ncclChannel* channel = comm->channels+c;
      for (int i=0; i<channel->collCount; i++) {
-        channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+        STORE(&channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active, 0);
      }
      channel->collFifoTail = channel->collStart;
      channel->collCount = 0;
@@ -193,6 +194,6 @@ group_cleanup:
 end:
  ncclGroupError = ncclSuccess;
  ncclGroupIndex = 0;
-  CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
+  CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
  return ret;
 }
@@ -0,0 +1,60 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nvmlwrap.h"
+
+// No-op stand-ins for the NVML wrapper API. Each call reports ncclSuccess
+// without contacting any driver; output parameters are value-initialized
+// (zero / null) so callers never read indeterminate data after a success.
+
+ncclResult_t wrapNvmlSymbols(void) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlInit(void) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlShutdown(void) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+  if (device) *device = nvmlDevice_t();
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+  if (index) *index = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+  if (pci) *pci = nvmlPciInfo_t();
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+  if (minorNumber) *minorNumber = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+  if (isActive) *isActive = nvmlEnableState_t();
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+  if (pci) *pci = nvmlPciInfo_t();
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+    nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+  if (capResult) *capResult = 0;
+  return ncclSuccess;
+}
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -378,7 +379,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
    *nrings = maxNrings;
  } else {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    int defaultMinNrings = 1;
+#else
    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
+#endif
    if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
    if (minNrings > 0 && minNrings > *nrings) {
      if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -12,7 +13,7 @@

 ncclResult_t getCudaPath(int cudaDev, char** path) {
  char busId[BUSID_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+  CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -20,7 +21,7 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
  nvmlDevice_t nvmlDevice;
  unsigned int dev;
  *nvmlDev = -1;
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
  NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));

@@ -50,7 +51,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  char hostname[1024];
  getHostName(hostname, 1024, '.');
  int cudaDev;
-  cudaGetDevice(&cudaDev);
+  hipGetDevice(&cudaDev);

  char buffer[1024];
  size_t len = 0;
@@ -96,6 +97,15 @@ uint64_t getHash(const char* string) {
  return result;
 }

+uint64_t getnHash(const char* string, int n) {
+  // DJB2-style hash over the first n bytes of string: hash = hash * 33 + byte.
+  // Starts from the seed 9527; when n <= 0 the seed is returned unchanged.
+  uint64_t hash = 9527;
+  int idx = 0;
+  while (idx < n) hash = hash * 33 + string[idx++];
+  return hash;
+}
+
 /* Generate a hash of the unique identifying string for this host
 * that will be unique for both bare-metal and container instances
 * Equivalent of a hash of;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,15 +8,15 @@
 #ifndef NCCL_H_
 #define NCCL_H_

-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_fp16.h>

-#define NCCL_MAJOR ${nccl:Major}
-#define NCCL_MINOR ${nccl:Minor}
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR ${NCCL_MAJOR}
+#define NCCL_MINOR ${NCCL_MINOR}
+#define NCCL_PATCH ${NCCL_PATCH}
+#define NCCL_SUFFIX "${NCCL_SUFFIX}"

-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE ${NCCL_VERSION}
 #define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))

 #ifdef __cplusplus
@@ -142,9 +143,9 @@ typedef enum { ncclInt8       = 0, ncclChar       = 0,
 * In-place operation will happen if sendbuff == recvbuff.
 */
 ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);

 /*
 * (deprecated) Broadcast (in-place)
@@ -156,9 +157,9 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc
 * This operation is implicitely in place.
 */
 ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);

 /*
 * Broadcast
@@ -170,9 +171,9 @@ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r
 * In-place operation will happen if sendbuff == recvbuff.
 */
 ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);

 /*
 * All-Reduce
@@ -183,9 +184,9 @@ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
 * In-place operation will happen if sendbuff == recvbuff.
 */
 ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);

 /*
 * Reduce-Scatter
@@ -200,10 +201,10 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
 */
 ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream);
+    hipStream_t stream);
 ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream);
+    hipStream_t stream);

 /*
 * All-Gather
@@ -216,9 +217,9 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
 ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);

 /*
 * Group semantics
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -141,7 +142,7 @@ void* persistentThread(void *comm_) {
  int idleSpin = 0;
  while (1) {
    do {
-      if (*comm->abortFlag) return NULL;
+      if (LOAD(comm->abortFlag)) return NULL;
      if (op == NULL) {
        pthread_mutex_lock(&state->mutex);
        op = state->ops;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,7 +11,7 @@
 #include "net.h"
 #include "param.h"
 #include "topo.h"
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <assert.h>

 #define NET_MAX_IFS 16
@@ -73,6 +74,7 @@ struct netRecvResources {
  struct ncclRecvMem* devRecvMem;
  uint64_t step;
  uint64_t llLastCleaning;
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
@@ -100,7 +102,7 @@ static ncclResult_t netDevices(int* ndev, short** distances) {

  // Find distance with current GPU
  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
  char line[1024];
  sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
@@ -115,7 +117,7 @@ static ncclResult_t netDevices(int* ndev, short** distances) {
 /* Determine if we can communicate with the peer */
 ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  ret[0] = ncclNetTvalues[cudaDev];
  if (ret[0] == NET_TVALUE_UNKNOWN) {
    if (cudaDev >= NET_MAX_GPUS) {
@@ -243,6 +245,8 @@ end:
  return dev;
 }

+extern bool useFineGrainVramPcie;
+
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);

@@ -250,9 +254,14 @@ static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
  *useGdr = 0;

  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))

+  if (!useFineGrainVramPcie) {
+    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
+    return ncclSuccess;
+  }
+
  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
@@ -289,7 +298,7 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
  send->transportResources = resources;

  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  resources->netDev = getDev(cudaDev, channelId);
  NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));

@@ -298,7 +307,7 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer

  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
  if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
  }
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
  resources->buffSize = buffSize;
@@ -314,7 +323,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
  recv->transportResources = resources;

  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  resources->netDev = getDev(cudaDev, channelId);
  NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));

@@ -323,7 +332,8 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer

  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
  if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, true));
+    CUDACHECK(hipDeviceGetAttribute((int*)&resources->curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));  // HDP flush register of the local (receiving) GPU, not of the network peer
  }
  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
  resources->buffSize = buffSize;
@@ -399,7 +409,7 @@ ncclResult_t netSendFree(void* transportResources) {
  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
  if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devRecvMem));
+    CUDACHECK(hipFree(resources->devRecvMem));
  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
  free(resources);
  return ncclSuccess;
@@ -412,7 +422,7 @@ ncclResult_t netRecvFree(void* transportResources) {
  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
  if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devRecvMem));
+    CUDACHECK(hipFree(resources->devRecvMem));
  NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
  free(resources);
  return ncclSuccess;
@@ -439,7 +449,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
        if (args->llMode) {
          int buffSlot = args->tail%NCCL_STEPS;
-          int size = sizesFifo[buffSlot];
+          int size = LOAD(sizesFifo+buffSlot);
          if (size != -1) {
            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
@@ -449,12 +459,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
            for (int i=0; i<nFifoLines; i++) {
              volatile uint32_t *f1 = &lines[i].flag1;
              volatile uint32_t *f2 = &lines[i].flag2;
-              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+              if (LOAD(f1) != flag || LOAD(f2) != flag) { ready = 0; break; }
            }
            if (ready) {
              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
              if (args->requests[buffSlot] != NULL) {
-                sizesFifo[buffSlot] = -1;
+                STORE(sizesFifo+buffSlot, -1);
                // Make sure size is reset to zero before we update the head.
                __sync_synchronize();
                args->tail += args->sliceSteps;
@@ -462,14 +472,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
              }
            }
          }
-        } else if (args->tail < *recvTail) {
+        } else if (args->tail < LOAD(recvTail)) {
          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
          int stepSize = args->channel->buffSize/NCCL_STEPS;
          // Send through network
          int buffSlot = args->tail%NCCL_STEPS;
-          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, LOAD(sizesFifo+buffSlot), resources->mhandle, args->requests+buffSlot));
          if (args->requests[buffSlot] != NULL) {
-            sizesFifo[buffSlot] = -1;
+            STORE(sizesFifo+buffSlot, -1);
            // Make sure size is reset to zero before we update the head.
            __sync_synchronize();
            args->tail += args->sliceSteps;
@@ -483,7 +493,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
        if (done) {
          args->head += args->sliceSteps;
-          resources->hostSendMem->head = args->head;
+          STORE(&resources->hostSendMem->head, args->head);
          args->idle = 0;
        }
      }
@@ -518,7 +528,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
      char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
      void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
      volatile uint64_t* sendHead = &resources->hostSendMem->head;
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < LOAD(sendHead) + NCCL_STEPS) && (args->tail < args->end)) {
        int buffSlot = args->tail%NCCL_STEPS;
        int sliceSize = stepSize * args->sliceSteps;
        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@@ -534,8 +544,13 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
        if (done) {
          args->head += args->sliceSteps;
          if (args->llMode == 0) {
-            if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
-            resources->hostRecvMem->tail = args->head;
+            if (resources->useGdr) {
+              ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
+              // Flush local HDP register after local read-back finishes
+              STORE(resources->curr_hdp_reg, 0x1);
+              TRACE(NCCL_NET, "Flushing GPU memory via HDP %p", resources->curr_hdp_reg);
+            }
+            STORE(&resources->hostRecvMem->tail, args->head);
          }
          args->idle = 0;
        }
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -196,12 +197,16 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
 ncclResult_t ncclIbGdrSupport(int ibDev) {
  static int moduleLoaded = -1;
  if (moduleLoaded == -1) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
+#else
    moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
+#endif
  }
  if (moduleLoaded == 0) return ncclSystemError;
  ncclResult_t ret = ncclSystemError;
  void* ptr;
-  if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
+  if (hipMalloc(&ptr, sizeof(int)) == hipSuccess) {
    struct ibv_mr* mr;
    struct ibv_pd* pd;
    if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
@@ -211,7 +216,7 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
      }
      wrap_ibv_dealloc_pd(pd);
    }
-    cudaFree(ptr);
+    hipFree(ptr);
  }
  return ret;
 }
@@ -220,7 +225,7 @@ ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
  *supportedTypes = NCCL_PTR_HOST;

  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))

  if (ncclIbGdrSupport(dev) != ncclSuccess) {
@@ -620,7 +625,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
  // Wait for the receiver to have posted the corresponding receive
  volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
  volatile uint32_t * readyPtr = &slot->ready;
-  if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
+  if (LOAD(readyPtr) == 0) { *request = NULL; return ncclSuccess; }

  struct ncclIbRequest* req;
  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
@@ -647,22 +652,22 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
  __sync_synchronize(); // order the readyPtr load against rkey load below
  // Sanity checks to catch user collective call count/size mismatches
  // plus any potential programming errors
-  if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
+  if (size > LOAD(&slot->size) || LOAD(&slot->size) <= 0 || LOAD(&slot->addr) == 0 || LOAD(&slot->rkey) == 0 || LOAD(&slot->seq) != comm->fifoHead) {
    WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
-        size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
+        size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead);
    return ncclInternalError;
  }
  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-  wr.wr.rdma.remote_addr = slot->addr;
-  wr.wr.rdma.rkey = slot->rkey;
+  wr.wr.rdma.remote_addr = LOAD(&slot->addr);
+  wr.wr.rdma.rkey = LOAD(&slot->rkey);
  wr.imm_data = size; // Send the message size via imm_data
  __sync_synchronize();
 #endif
  // We must clear slot->ready, but reset other fields to aid
  // debugging and sanity checks
-  slot->ready = 0;
-  slot->addr = 0ULL;
-  slot->rkey = slot->size = slot->seq = 0;
+  STORE(&slot->ready, 0);
+  STORE(&slot->addr, 0);
+  STORE(&slot->rkey, 0); STORE(&slot->size, 0); STORE(&slot->seq, 0);
  comm->fifoHead++;

  struct ibv_send_wr* bad_wr;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -175,12 +176,12 @@ void* persistentSocketThread(void *args_) {
    }
    if (idle) {
      pthread_mutex_lock(&resource->threadLock);
-      while (mark == myQueue->next && *state != stop) { // no new tasks, wait
+      while (mark == myQueue->next && LOAD(state) != stop) { // no new tasks, wait
        pthread_cond_wait(&resource->threadCond, &resource->threadLock);
      }
      pthread_mutex_unlock(&resource->threadLock);
    }
-    if (*state == stop) return NULL;
+    if (LOAD(state) == stop) return NULL;
  }
 }

@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,21 +11,26 @@
 #include "transport.h"
 #include "param.h"
 #include <unistd.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include <ctype.h>
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#include "nvlink_stub.h"
+#else
 #include "nvlink.h"
+#endif

 struct p2pConnectInfo {
  int direct;
  union {
    void* directPtr;
-    cudaIpcMemHandle_t devIpc;
+    hipIpcMemHandle_t devIpc;
  };
 };

 struct p2pSendResources {
  struct ncclSendMem* devMem;
  void* ipcPtr;
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
 };

 struct p2pRecvResources {
@@ -37,14 +43,16 @@ struct p2pRecvResources {
 NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);

+extern bool useFineGrainVramPcie;
+
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 static int busIdToCudaDev(const char* busId) {
  int ndev;
-  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+  if (hipGetDeviceCount(&ndev) != hipSuccess)
    return -1;
  for (int i = 0; i < ndev; i++) {
    char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+    if (hipDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != hipSuccess)
      return -1;
    if (strcmp(busId, devBusId) == 0) {
      return i;
@@ -95,15 +103,38 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc

  // See if CUDA can do P2P
  int p2p;
-  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
+  if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != hipSuccess) {
    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
         myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
    return ncclSuccess;
  }
  if (p2p == 0) return ncclSuccess;

-  // Check for NVLink/NVswitch
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  // Classify the link to the peer; peerCudaDev is the same peer index the peer-access query above used.
+  uint32_t link_type, hops;
+  if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerCudaDev, &link_type, &hops) != hipSuccess)
+    return ncclSuccess;  // query failed: bail out exactly like the p2p == 0 case above
+  static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
+  static unsigned long long link_status_print_once_mask = 0;
+  // Log each (local, peer) pair once. 1ULL keeps the shift 64 bits wide; shifting a plain int is undefined past bit 30.
+  const unsigned long long link_bit = 1ULL << (myInfo->cudaDev*8 + peerCudaDev);
+  if (!(link_status_print_once_mask & link_bit)) {
+    INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", myInfo->cudaDev, peerCudaDev, link_type_name[link_type], hops);
+    link_status_print_once_mask |= link_bit;
+  }
+  int nvlinkp2p = 0;
+  if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
+    if (hops == 1)
+      nvlinkp2p = CONNECT_NVLINK;
+  } else {
+    if (!useFineGrainVramPcie)
+      return ncclSuccess;
+  }
+#else
+  // Check for NVLink/NVswitch
  int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+#endif
  if (nvlinkp2p > 0) {
    *ret = nvlinkp2p;
    return ncclSuccess;
@@ -266,7 +297,11 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
  }

  // Duplicate the rings for direct NVLink
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
+#else
  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
+#endif

  return compNrings;
 }
@@ -464,13 +499,24 @@ end:
 /* Send: Create and return connect structures for this peer to connect to me */
 ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
-
  struct p2pSendResources* resources;
  NCCLCHECK(ncclCalloc(&resources, 1));
  send->transportResources = resources;
  int sendSize = sizeof(struct ncclSendMem);
  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize, true));
+
+  // Non-XGMI peers: record the peer GPU's HDP flush-control register address; XGMI peers: leave it NULL.
+  uint32_t linktype, hops;
+  if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
+    WARN("Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
+    return ncclInternalError;
+  }
+  resources->next_hdp_reg = NULL;
+  if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
+    CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
+  }

  struct p2pConnectInfo info;
  if (myInfo->pidHash == peerInfo->pidHash) {
@@ -480,12 +526,12 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
    } else {
      // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
+      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == hipErrorPeerAccessAlreadyEnabled) {
+        hipGetLastError();
+      } else if (err != hipSuccess) {
        WARN("failed to peer with device %d(=%d): %d %s",
-             peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+             peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
        return ncclInternalError;
      }
      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
@@ -496,10 +542,10 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
    info.direct = 0;
    // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != cudaSuccess) {
+    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
+    if (err != hipSuccess) {
      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
      return ncclInternalError;
    }
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
@@ -520,7 +566,7 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
  recv->transportResources = resources;
  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
  ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));

  struct p2pConnectInfo info;
  if (myInfo->pidHash == peerInfo->pidHash) {
@@ -530,12 +576,12 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
    } else {
      // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
+      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == hipErrorPeerAccessAlreadyEnabled) {
+        hipGetLastError();
+      } else if (err != hipSuccess) {
        WARN("failed to peer with device %d(=%d): %d %s",
-             peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+             peerInfo->cudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
        return ncclInternalError;
      }
      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
@@ -545,10 +591,10 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
    info.direct = 0;
    // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != cudaSuccess) {
+    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
+    if (err != hipSuccess) {
      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
+           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, hipGetErrorString(err));
      return ncclInternalError;
    }
    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
@@ -569,11 +615,11 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
    send->conn.direct = 1;
  } else {
    //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
-    if (err != cudaSuccess) {
+    if (err != hipSuccess) {
      WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
+          err, hipGetErrorString(err));
      return ncclUnhandledCudaError;
    }
  }
@@ -585,6 +631,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
  send->conn.head = &resources->devMem->head;
  send->conn.ptrExchange = &resources->devMem->ptrExchange;
  send->conn.opCountLoc = &resources->devMem->opCount;
+  send->conn.next_hdp_reg = resources->next_hdp_reg;
  return ncclSuccess;
 }

@@ -599,11 +646,11 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
    recv->conn.ptrExchange = &remDevMem->ptrExchange;
  } else {
    //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    hipError_t err = hipIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
-    if (err != cudaSuccess) {
+    if (err != hipSuccess) {
      WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
+          err, hipGetErrorString(err));
      return ncclUnhandledCudaError;
    }
  }
@@ -620,8 +667,8 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
 ncclResult_t p2pSendFree(void* resources) {
  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; // caller passes the send connector's transport resources as void*
  if (sendRes->ipcPtr) // an IPC mapping is closed only when one was recorded in ipcPtr
-    CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
-  CUDACHECK(cudaFree(sendRes->devMem));
+    CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
+  CUDACHECK(hipFree(sendRes->devMem)); // NOTE(review): if CUDACHECK returns on error, sendRes itself is never freed - confirm
  free(sendRes); // the resources struct itself is host memory
  return ncclSuccess;
 }
@@ -629,8 +676,8 @@ ncclResult_t p2pSendFree(void* resources) {
 ncclResult_t p2pRecvFree(void* resources) {
  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; // caller passes the recv connector's transport resources as void*
  if (recvRes->ipcPtr) // an IPC mapping is closed only when one was recorded in ipcPtr
-    CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
-  CUDACHECK(cudaFree(recvRes->devMem));
+    CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
+  CUDACHECK(hipFree(recvRes->devMem)); // NOTE(review): if CUDACHECK returns on error, recvRes itself is never freed - confirm
  free(recvRes); // the resources struct itself is host memory
  return ncclSuccess;
 }
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,7 +11,7 @@
 #include "param.h"
 #include "shm.h"
 #include <unistd.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>

 struct shmConnectInfo {
  uint64_t pidHash;
@@ -0,0 +1,72 @@
+cmake_minimum_required(VERSION 2.8.12)
+
+# Unit tests for rccl. Everything below is skipped unless BUILD_TESTS is ON.
+if(BUILD_TESTS)
+
+  message("Going to build unit tests (Installed in /test/UnitTests)")
+
+  # chrpath is required to properly set rpath for the UnitTests executable.
+  # The binary located here (CHRPATH) is the one invoked by the post-install
+  # step at the end of this file.
+  find_program(CHRPATH chrpath)
+  if(NOT CHRPATH)
+    message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
+  endif()
+
+  # OpenMP is used to drive GPUs (one per thread)
+  find_package(OpenMP REQUIRED)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+
+  # Download and unpack googletest at configure time.
+  # CMakeLists.txt.in is instantiated into the build tree and then configured
+  # and built as a stand-alone project whose only job is to fetch the sources.
+  configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+  execute_process(
+    COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/googletest-download")
+  if(result)
+    message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+  endif()
+  execute_process(
+    COMMAND "${CMAKE_COMMAND}" --build .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/googletest-download")
+  if(result)
+    message(FATAL_ERROR "Build step for googletest failed: ${result}")
+  endif()
+
+  # Add googletest directly to our build. This adds the following targets:
+  # gtest, gtest_main, gmock and gmock_main
+  add_subdirectory("${CMAKE_BINARY_DIR}/googletest-src"
+                   "${CMAKE_BINARY_DIR}/googletest-build")
+
+  # Collect source files for tests
+  set(TEST_SOURCES
+    test_AllGather.cpp
+    test_AllReduce.cpp
+    test_Broadcast.cpp
+    test_Reduce.cpp
+    test_ReduceScatter.cpp
+    test_GroupCalls.cpp
+    test_CombinedCalls.cpp
+    test_AllReduceAbort.cpp
+    test_BroadcastAbort.cpp
+  )
+
+  add_executable(UnitTests ${TEST_SOURCES})
+  target_include_directories(UnitTests PRIVATE /opt/rocm)
+  target_link_libraries(UnitTests PRIVATE gtest_main PRIVATE rccl)
+  install(TARGETS UnitTests RUNTIME DESTINATION test)
+
+  # HCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
+  # remove RPATH.  As a work-around, set the correct RPATH for the unit test executable
+  # as a post-install step.
+  # - Uses the chrpath binary found above rather than whatever is first on PATH
+  #   at install time.
+  # - A failing chrpath only produces a warning: the executable is still
+  #   installed, it just keeps the RPATH the compiler gave it.
+  install(
+    CODE
+      "execute_process(
+         COMMAND \"${CHRPATH}\" -r \"${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib\" \"${CMAKE_INSTALL_PREFIX}/test/UnitTests\"
+         RESULT_VARIABLE chrpath_result)
+       if(chrpath_result)
+         message(WARNING \"chrpath failed (\${chrpath_result}); RPATH of UnitTests was not updated\")
+       endif()"
+    )
+else()
+  message("Not building unit tests")
+endif()
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 2.8.2)
+
+project(googletest-download NONE)  # NONE: no language/compiler is enabled for this helper project
+
+include(ExternalProject)
+ExternalProject_Add(googletest  # only fetches the sources; every later step is disabled below
+  GIT_REPOSITORY https://github.com/google/googletest.git
+  GIT_TAG release-1.8.1  # pinned release tag
+  SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src"  # must match the add_subdirectory() source path in the including CMakeLists.txt
+  BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build"  # must match the add_subdirectory() binary path in the including CMakeLists.txt
+  CONFIGURE_COMMAND ""  # empty command: skip the configure step
+  BUILD_COMMAND ""  # empty command: skip the build step
+  INSTALL_COMMAND ""  # empty command: skip the install step
+  TEST_COMMAND ""  # empty command: skip the test step
+)
@@ -0,0 +1,360 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef CORRECTNESSTEST_HPP
+#define CORRECTNESSTEST_HPP
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+#include "rccl.h"
+
+#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
+#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
+
+namespace CorrectnessTests
+{
+    // Applies the reduction operator `op` to the pair (A, B) and returns the result.
+    // - Handles ncclSum, ncclProd, ncclMax and ncclMin
+    // - Any other operator is reported on stderr and terminates the process with a
+    //   failing exit status, so that the error is not mistaken for a successful run
+    template <typename T>
+    T ReduceOp(ncclRedOp_t const op, T const A, T const B)
+    {
+        switch (op)
+        {
+        case ncclSum:  return A + B;
+        case ncclProd: return A * B;
+        case ncclMax:  return std::max(A, B);
+        case ncclMin:  return std::min(A, B);
+        default:
+            fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", static_cast<int>(op));
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    // Returns the number of bytes per element for each supported datatype.
+    // An unknown datatype is reported on stderr and terminates the process with a
+    // failing exit status, so that the error is not mistaken for a successful run.
+    static int DataTypeToBytes(ncclDataType_t const dataType)
+    {
+        switch (dataType)
+        {
+        case ncclInt8:    return 1;
+        case ncclUint8:   return 1;
+        case ncclInt32:   return 4;
+        case ncclUint32:  return 4;
+        case ncclInt64:   return 8;
+        case ncclUint64:  return 8;
+        case ncclFloat16: return 2;
+        case ncclFloat32: return 4;
+        case ncclFloat64: return 8;
+        default:
+            fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", static_cast<int>(dataType));
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    // Encapsulates all the memory used per device for collectives, as well as reference results.
+    // - Initialize() allocates the buffers, Release() frees them (there is no destructor)
+    // - Datasets filled in by ExtractSubDataset() point into the parent's buffers,
+    //   so only the parent dataset may be Release()'d
+    struct Dataset
+    {
+        int                 numDevices;  // Number of devices participating
+        size_t              numElements; // Number of elements per array
+        ncclDataType_t      dataType;    // Data type of each input/output pointer
+        bool                inPlace;     // Whether or not output pointers are same as input pointers
+
+        std::vector<void *> inputs;      // Input pointers (1 per device)
+        std::vector<void *> outputs;     // Output pointers (1 per device)
+                                         // May be identical to input pointers for in-place tests
+        std::vector<void *> expected;    // Expected output (1 per device, host memory)
+
+        // Size in bytes of one per-device array
+        size_t NumBytes() const
+        {
+            return numElements * DataTypeToBytes(dataType);
+        }
+
+        // Records the dataset parameters and allocates, for each device i:
+        // - inputs[i]   via hipMalloc, after hipSetDevice(i)
+        // - outputs[i]  via hipMalloc, or aliased to inputs[i] when inPlace_ is true
+        // - expected[i] via malloc on the host
+        // A failing HIP call records a fatal gtest failure and returns early.
+        void Initialize(int            const numDevices_,
+                        size_t         const numElements_,
+                        ncclDataType_t const dataType_,
+                        bool           const inPlace_)
+        {
+            numDevices  = numDevices_;
+            numElements = numElements_;
+            dataType    = dataType_;
+            inPlace     = inPlace_;
+
+            inputs.resize(numDevices);
+            outputs.resize(numDevices);
+            expected.resize(numDevices);
+
+            // Allocate per-device memory
+            size_t const numBytes = NumBytes();
+
+            for (int i = 0; i < numDevices; i++)
+            {
+                HIP_CALL(hipSetDevice(i));
+                HIP_CALL(hipMalloc((void **)&inputs[i], numBytes));
+                if (inPlace)
+                {
+                    outputs[i] = inputs[i];
+                }
+                else
+                {
+                    HIP_CALL(hipMalloc((void **)&outputs[i], numBytes));
+                }
+
+                expected[i] = malloc(numBytes);
+            }
+        }
+
+        // Explicit memory release to avoid double-free from subDatasets
+        // All three pointer tables are emptied afterwards, so no stale pointer
+        // survives and a second Release() does nothing.
+        void Release()
+        {
+            for (size_t i = 0; i < outputs.size(); i++)
+            {
+                // For in-place datasets outputs[i] aliases inputs[i]; free it only once
+                if (!inPlace) hipFree(outputs[i]);
+                hipFree(inputs[i]);
+                free(expected[i]);
+            }
+
+            inputs.clear();
+            outputs.clear();
+            expected.clear();
+        }
+
+        // Creates a dataset by pointing to an existing dataset
+        // Primarily to allow for testing with different starting byte-alignments
+        // - Covers elements [startElement, lastElement] (both inclusive) of this dataset
+        // - subDataset owns no memory: its pointers are this dataset's pointers plus a byte offset
+        // - An invalid range records a fatal gtest failure and leaves subDataset untouched
+        void ExtractSubDataset(size_t const startElement,
+                               size_t const lastElement,
+                               Dataset& subDataset)
+        {
+            ASSERT_LE(startElement, lastElement);
+            ASSERT_LT(lastElement, numElements);
+
+            subDataset.numDevices  = numDevices;
+            subDataset.numElements = lastElement - startElement + 1;
+            subDataset.dataType    = dataType;
+            subDataset.inPlace     = inPlace;
+
+            subDataset.inputs.resize(numDevices);
+            subDataset.outputs.resize(numDevices);
+            subDataset.expected.resize(numDevices);
+
+            size_t const byteOffset = (startElement * DataTypeToBytes(dataType));
+            for (int i = 0; i < numDevices; i++)
+            {
+                subDataset.inputs[i]   = (int8_t *)inputs[i] + byteOffset;
+                subDataset.outputs[i]  = (int8_t *)outputs[i] + byteOffset;
+                subDataset.expected[i] = (int8_t *)expected[i] + byteOffset;
+            }
+        }
+    };
+
+    typedef std::tuple<ncclRedOp_t    /* op          */,
+                       ncclDataType_t /* dataType    */,
+                       size_t         /* numElements */,
+                       int            /* numDevices  */,
+                       bool           /* inPlace     */> TestTuple;
+
+    // Base class for each collective test
+    // - Each test is instantiated with a different TestTuple
+    class CorrectnessTest : public testing::TestWithParam<TestTuple>
+    {
+    protected:
+
+        // This code is called per test-tuple
+        // - Refuses to run (failing exit status) when HSA_FORCE_FINE_GRAIN_PCIE is not set
+        // - Unpacks the TestTuple into the fixture members
+        // - Marks the test as skipped when it needs more GPUs than are available
+        // - Otherwise creates one communicator and one stream per participating device
+        void SetUp() override
+        {
+            // Check for fine-grained env variable (otherwise will hang)
+            if (getenv("HSA_FORCE_FINE_GRAIN_PCIE") == nullptr)
+            {
+                printf("Must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
+                // A run that executed no tests must not report success
+                exit(EXIT_FAILURE);
+            }
+
+            // Make the test tuple parameters accessible
+            std::tie(op, dataType, numElements, numDevices, inPlace) = GetParam();
+
+            // Collect the number of available GPUs
+            HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
+
+            // Only proceed with testing if there are enough GPUs
+            if (numDevices > numDevicesAvailable)
+            {
+                fprintf(stdout, "[  SKIPPED ] Test requires %d devices (only %d available)\n",
+                        numDevices, numDevicesAvailable);
+
+                // Modify the number of devices so that tear-down doesn't occur
+                // This is temporary until GTEST_SKIP() becomes available
+                // (0 > -1 keeps the "numDevices > numDevicesAvailable" guard in the test bodies true)
+                numDevices = 0;
+                numDevicesAvailable = -1;
+                return;
+            }
+
+            // Initialize communicators
+            comms.resize(numDevices);
+            NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));
+
+            // Create streams
+            streams.resize(numDevices);
+            for (int i = 0; i < numDevices; i++)
+            {
+                HIP_CALL(hipSetDevice(i));
+                HIP_CALL(hipStreamCreate(&streams[i]));
+            }
+        }
+
+        // Clean up per TestTuple
+        // - Bounded by what SetUp() actually stored in comms / streams rather than by
+        //   numDevices, so an aborted SetUp() (e.g. communicator creation failed before
+        //   streams was resized) cannot index past the end of either container
+        // - Failures are reported with EXPECT_EQ so that one failing destroy call does
+        //   not prevent the remaining communicators and streams from being released
+        void TearDown() override
+        {
+            size_t const count = (comms.size() > streams.size()) ? comms.size() : streams.size();
+
+            // Release communicators and streams
+            for (size_t i = 0; i < count; i++)
+            {
+                if (i < comms.size())
+                {
+                    EXPECT_EQ(ncclCommDestroy(comms[i]), ncclSuccess);
+                }
+                if (i < streams.size())
+                {
+                    EXPECT_EQ(hipStreamDestroy(streams[i]), hipSuccess);
+                }
+            }
+        }
+
+        // Uploads a deterministic pattern to every device's input buffer and zeroes the
+        // output buffers of out-of-place datasets.
+        // - Element j of device i is set to (i + j) % 6
+        // - Datatypes without a case below (e.g. ncclFloat16) terminate the process with a
+        //   failing exit status
+        // - A failing HIP call records a fatal gtest failure and returns early
+        void FillDatasetWithPattern(Dataset& dataset)
+        {
+            // Host staging buffer, owned by a vector so it is released on every exit path
+            // (HIP_CALL returns from this function on failure)
+            std::vector<int8_t> staging(dataset.NumBytes());
+
+            int8_t*   arrayI1 = staging.data();
+            uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+            int32_t*  arrayI4 = (int32_t  *)arrayI1;
+            uint32_t* arrayU4 = (uint32_t *)arrayI1;
+            int64_t*  arrayI8 = (int64_t  *)arrayI1;
+            uint64_t* arrayU8 = (uint64_t *)arrayI1;
+            float*    arrayF4 = (float    *)arrayI1;
+            double*   arrayF8 = (double   *)arrayI1;
+
+            // NOTE: Currently half-precision float tests are unsupported due to half being supported
+            //       on GPU only and not host
+
+            // Fills input  data[i][j] with (i + j) % 6
+            // - Keeping range small to reduce likelihood of overflow
+            // - Sticking with floating points values that are perfectly representable
+            for (int i = 0; i < dataset.numDevices; i++)
+            {
+                for (size_t j = 0; j < dataset.numElements; j++)
+                {
+                    int    valueI = static_cast<int>((i + j) % 6);
+                    float  valueF = (float)valueI;
+
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    arrayI1[j] = valueI; break;
+                    case ncclUint8:   arrayU1[j] = valueI; break;
+                    case ncclInt32:   arrayI4[j] = valueI; break;
+                    case ncclUint32:  arrayU4[j] = valueI; break;
+                    case ncclInt64:   arrayI8[j] = valueI; break;
+                    case ncclUint64:  arrayU8[j] = valueI; break;
+                    case ncclFloat32: arrayF4[j] = valueF; break;
+                    case ncclFloat64: arrayF8[j] = valueF; break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(EXIT_FAILURE);
+                    }
+                }
+
+                HIP_CALL(hipSetDevice(i));
+                HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(), hipMemcpyHostToDevice));
+
+                // Fills output data[i][j] with 0 (if not inplace)
+                if (!dataset.inPlace)
+                {
+                    HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes()));
+                }
+            }
+        }
+
+        // Blocks the host until the stream of every participating device has drained.
+        // A failing HIP call records a fatal gtest failure and returns early.
+        void Synchronize() const
+        {
+            for (int device = 0; device < numDevices; ++device)
+            {
+                // Select the device first, then wait on the stream created for it
+                HIP_CALL(hipSetDevice(device));
+                HIP_CALL(hipStreamSynchronize(streams[device]));
+            }
+        }
+
+        // Copies each device's output buffer back to the host and compares it, element
+        // by element, with dataset.expected[i].
+        // - Stops at the first mismatch, prints the expected and the actual value together
+        //   with the device and element index, and fails the test through ASSERT_EQ
+        // - Datatypes without a case below (e.g. ncclFloat16) terminate the process with a
+        //   failing exit status
+        void ValidateResults(Dataset const& dataset) const
+        {
+            // Host staging buffer, owned by a vector so it is released on every exit path
+            // (HIP_CALL / ASSERT_EQ return from this function on failure)
+            std::vector<int8_t> hostOutput(dataset.NumBytes());
+
+            int8_t*   outputI1 = hostOutput.data();
+            uint8_t*  outputU1 = (uint8_t  *)outputI1;
+            int32_t*  outputI4 = (int32_t  *)outputI1;
+            uint32_t* outputU4 = (uint32_t *)outputI1;
+            int64_t*  outputI8 = (int64_t  *)outputI1;
+            uint64_t* outputU8 = (uint64_t *)outputI1;
+            float*    outputF4 = (float    *)outputI1;
+            double*   outputF8 = (double   *)outputI1;
+
+            bool isMatch = true;
+
+            // Loop over each device's output and compare it to the expected output
+            // (Each collective operation computes its own expected results)
+            for (int i = 0; i < dataset.numDevices && isMatch; i++)
+            {
+                HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+                int8_t*   expectedI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  expectedU1 = (uint8_t  *)expectedI1;
+                int32_t*  expectedI4 = (int32_t  *)expectedI1;
+                uint32_t* expectedU4 = (uint32_t *)expectedI1;
+                int64_t*  expectedI8 = (int64_t  *)expectedI1;
+                uint64_t* expectedU8 = (uint64_t *)expectedI1;
+                float*    expectedF4 = (float    *)expectedI1;
+                double*   expectedF8 = (double   *)expectedI1;
+
+                for (size_t j = 0; j < dataset.numElements && isMatch; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    isMatch &= (outputI1[j] == expectedI1[j]); break;
+                    case ncclUint8:   isMatch &= (outputU1[j] == expectedU1[j]); break;
+                    case ncclInt32:   isMatch &= (outputI4[j] == expectedI4[j]); break;
+                    case ncclUint32:  isMatch &= (outputU4[j] == expectedU4[j]); break;
+                    case ncclInt64:   isMatch &= (outputI8[j] == expectedI8[j]); break;
+                    case ncclUint64:  isMatch &= (outputU8[j] == expectedU8[j]); break;
+                    case ncclFloat32: isMatch &= (outputF4[j] == expectedF4[j]); break;
+                    case ncclFloat64: isMatch &= (outputF8[j] == expectedF8[j]); break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(EXIT_FAILURE);
+                    }
+
+                    if (!isMatch)
+                    {
+                        // Report the reference value as "Expected" and the device result as "Output".
+                        // 64-bit values are cast to (unsigned) long long to match the format specifier.
+                        switch (dataset.dataType)
+                        {
+                        case ncclInt8:
+                            printf("Expected %d.  Output %d on device %d[%zu]\n", expectedI1[j], outputI1[j], i, j); break;
+                        case ncclUint8:
+                            printf("Expected %u.  Output %u on device %d[%zu]\n", (unsigned)expectedU1[j], (unsigned)outputU1[j], i, j); break;
+                        case ncclInt32:
+                            printf("Expected %d.  Output %d on device %d[%zu]\n", expectedI4[j], outputI4[j], i, j); break;
+                        case ncclUint32:
+                            printf("Expected %u.  Output %u on device %d[%zu]\n", expectedU4[j], outputU4[j], i, j); break;
+                        case ncclInt64:
+                            printf("Expected %lld.  Output %lld on device %d[%zu]\n", (long long)expectedI8[j], (long long)outputI8[j], i, j); break;
+                        case ncclUint64:
+                            printf("Expected %llu.  Output %llu on device %d[%zu]\n", (unsigned long long)expectedU8[j], (unsigned long long)outputU8[j], i, j); break;
+                        case ncclFloat32:
+                            printf("Expected %f.  Output %f on device %d[%zu]\n", expectedF4[j], outputF4[j], i, j); break;
+                        case ncclFloat64:
+                            printf("Expected %lf.  Output %lf on device %d[%zu]\n", expectedF8[j], outputF8[j], i, j); break;
+                        default:
+                            fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                            exit(EXIT_FAILURE);
+                        }
+                    }
+                }
+                ASSERT_EQ(isMatch, true);
+            }
+        }
+
+        // Passed in parameters from TestTuple
+        ncclRedOp_t              op;
+        ncclDataType_t           dataType;
+        size_t                   numElements;
+        int                      numDevices;
+        bool                     inPlace;
+
+        int                      numDevicesAvailable;
+        std::vector<ncclComm_t>  comms;
+        std::vector<hipStream_t> streams;
+    };
+
+}
+
+#endif
@@ -0,0 +1,111 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "test_AllGather.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Runs ncclAllGather once over the whole dataset and checks every device's output.
+    // Device i contributes the i-th slice of its own input buffer.
+    TEST_P(AllGatherCorrectnessTest, Correctness)
+    {
+        // Skipped tests have numDevices == 0; this guard must stay ahead of the modulo below
+        if (numDevices > numDevicesAvailable) return;
+        if (numElements % numDevices != 0) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+        ComputeExpectedResults(dataset);
+
+        size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+        size_t const sendCount = dataset.numElements / dataset.numDevices;
+
+        // gtest assertions cannot be used inside the parallel region, so the status of
+        // each launch is recorded there and checked afterwards
+        std::vector<ncclResult_t> launchStatus(numDevices, ncclSuccess);
+
+        // Launch the reduction (1 thread per GPU)
+        #pragma omp parallel for num_threads(numDevices)
+        for (int i = 0; i < numDevices; i++)
+        {
+            launchStatus[i] = ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
+                                            dataset.outputs[i], sendCount,
+                                            dataType, comms[i], streams[i]);
+        }
+
+        // Non-fatal check so that the dataset is still released below
+        for (int i = 0; i < numDevices; i++)
+            EXPECT_EQ(launchStatus[i], ncclSuccess) << "ncclAllGather failed on device " << i;
+
+        // Wait for reduction to complete
+        Synchronize();
+
+        // Check results
+        ValidateResults(dataset);
+        dataset.Release();
+    }
+
+    // Runs ncclAllGather on sub-ranges of one dataset that start at odd element
+    // offsets (1, 3, ..., 11), so that the device pointers handed to the collective
+    // are offset from the start of the allocation.
+    TEST_P(AllGatherCorrectnessTest, Alignment)
+    {
+        // Skipped tests have numDevices == 0; this guard must stay ahead of the modulo below
+        if (numDevices > numDevicesAvailable) return;
+        if (numElements % numDevices != 0) return;
+
+        // Allocate dataset
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+
+        // Loop over several offsets (so that device pointers are not aligned)
+        for (size_t firstElement = 1; firstElement <= 11; firstElement += 2)
+        {
+            // Offsets only grow, so once one is out of range all later ones are too
+            if (firstElement >= numElements) break;
+
+            // Select last element so that total number of elements is multiple of numDevices
+            size_t const usableElements = ((numElements - firstElement) / numDevices) * numDevices;
+
+            // Fewer than numDevices elements remain: there is no valid sub-range to test
+            if (usableElements == 0) break;
+
+            size_t const lastElement = firstElement + usableElements - 1;
+
+            Dataset subDataset;
+            dataset.ExtractSubDataset(firstElement, lastElement, subDataset);
+
+            // Compute reference results for sub-dataset
+            FillDatasetWithPattern(subDataset);
+            ComputeExpectedResults(subDataset);
+
+            size_t const byteCount = subDataset.NumBytes() / subDataset.numDevices;
+            size_t const sendCount = subDataset.numElements / subDataset.numDevices;
+
+            // gtest assertions cannot be used inside the parallel region, so the status
+            // of each launch is recorded there and checked afterwards
+            std::vector<ncclResult_t> launchStatus(numDevices, ncclSuccess);
+
+            // Launch the reduction (1 thread per GPU)
+            #pragma omp parallel for num_threads(numDevices)
+            for (int i = 0; i < numDevices; i++)
+            {
+                launchStatus[i] = ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
+                                                subDataset.outputs[i], sendCount,
+                                                dataType, comms[i], streams[i]);
+            }
+
+            // Non-fatal check so that the parent dataset is still released below
+            for (int i = 0; i < numDevices; i++)
+                EXPECT_EQ(launchStatus[i], ncclSuccess) << "ncclAllGather failed on device " << i;
+
+            // Wait for reduction to complete
+            Synchronize();
+
+            // Check results
+            ValidateResults(subDataset);
+        }
+
+        // Only the parent owns memory; sub-datasets merely point into it
+        dataset.Release();
+    }
+
+
+    INSTANTIATE_TEST_CASE_P(AllGatherCorrectnessSweep,
+                            AllGatherCorrectnessTest,
+                            testing::Combine( // cartesian product; argument order must match TestTuple
+                                // Reduction operator (not used by all-gather; a single placeholder value)
+                                testing::Values(ncclSum),
+                                // Data types
+                                testing::Values(ncclInt8,
+                                                ncclUint8,
+                                                ncclInt32,
+                                                ncclUint32,
+                                                ncclInt64,
+                                                ncclUint64,
+                                                //ncclFloat16,  (disabled: the fixture has no host-side half-precision support)
+                                                ncclFloat32,
+                                                ncclFloat64),
+                                // Number of elements (both values are divisible by 2, 3 and 4)
+                                testing::Values(3072, 3145728),
+                                // Number of devices
+                                testing::Values(2,3,4),
+                                // In-place or not
+                                testing::Values(false, true)));
+} // namespace
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLGATHER_HPP
+#define TEST_ALLGATHER_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the ncclAllGather tests; adds the host-side reference computation
+    class AllGatherCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        // Fills dataset.expected[i] for every device with the all-gather reference result:
+        // the concatenation, in device order, of slice i of device i's input buffer.
+        // Each slice is NumBytes() / numDevices bytes long.
+        // A failing HIP call records a fatal gtest failure and returns early.
+        static void ComputeExpectedResults(Dataset& dataset)
+        {
+            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+
+            // Host copy of the gathered result, owned by a vector so it is released on
+            // every exit path (HIP_CALL returns from this function on failure)
+            std::vector<int8_t> gathered(dataset.NumBytes());
+
+            for (int i = 0; i < dataset.numDevices; i++)
+            {
+                HIP_CALL(hipMemcpy(gathered.data() + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount),
+                                   byteCount, hipMemcpyDeviceToHost));
+            }
+
+            // Every device is expected to end up with the same gathered buffer
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy(dataset.expected[i], gathered.data(), dataset.NumBytes());
+        }
+    };
+}
+
+#endif
@@ -0,0 +1,60 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_AllReduce.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Runs ncclAllReduce once over the whole dataset and checks every device's output
+    // against the host-side reduction.
+    TEST_P(AllReduceCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+        ComputeExpectedResults(dataset, op);
+
+        // gtest assertions cannot be used inside the parallel region, so the status of
+        // each launch is recorded there and checked afterwards
+        std::vector<ncclResult_t> launchStatus(numDevices, ncclSuccess);
+
+        // Launch the reduction (1 thread per GPU)
+        #pragma omp parallel for num_threads(numDevices)
+        for (int i = 0; i < numDevices; i++)
+        {
+            launchStatus[i] = ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
+                                            numElements, dataType, op, comms[i], streams[i]);
+        }
+
+        // Non-fatal check so that the dataset is still released below
+        for (int i = 0; i < numDevices; i++)
+            EXPECT_EQ(launchStatus[i], ncclSuccess) << "ncclAllReduce failed on device " << i;
+
+        // Wait for reduction to complete
+        Synchronize();
+
+        // Check results
+        ValidateResults(dataset);
+
+        dataset.Release();
+    }
+
+    INSTANTIATE_TEST_CASE_P(AllReduceCorrectnessSweep,
+                            AllReduceCorrectnessTest,
+                            testing::Combine( // cartesian product; argument order must match TestTuple
+                                // Reduction operator
+                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+                                // Data types
+                                testing::Values(ncclInt8,
+                                                ncclUint8,
+                                                ncclInt32,
+                                                ncclUint32,
+                                                ncclInt64,
+                                                ncclUint64,
+                                                //ncclFloat16,  (disabled: the fixture has no host-side half-precision support)
+                                                ncclFloat32,
+                                                ncclFloat64),
+                                // Number of elements
+                                testing::Values(1024, 1048576),
+                                // Number of devices
+                                testing::Values(2,3,4),
+                                // In-place or not
+                                testing::Values(false, true)));
+} // namespace
@@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLREDUCE_HPP
+#define TEST_ALLREDUCE_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the ncclAllReduce correctness tests. ComputeExpectedResults is
+    // static so that other fixtures can reuse the host reference reduction.
+    class AllReduceCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        // Overwrites every dataset.expected[i] with the element-wise reduction (under
+        // op) of all devices' inputs, computed on the host. Exits with EXIT_FAILURE
+        // if the scratch buffer cannot be allocated or the data type is unhandled.
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
+        {
+            // Stage each device's input in its expected[] array
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Scratch accumulator, seeded with the first device's data
+            void* result = malloc(dataset.NumBytes());
+            if (result == NULL && dataset.NumBytes() > 0)
+            {
+                fprintf(stderr, "[ERROR] Unable to allocate host buffer\n");
+                exit(EXIT_FAILURE);
+            }
+            memcpy(result, dataset.expected[0], dataset.NumBytes());
+
+            // Fold the remaining devices into the accumulator
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                void* array = dataset.expected[i];
+                switch (dataset.dataType)
+                {
+                case ncclInt8:    Accumulate<int8_t  >(op, result, array, dataset.numElements); break;
+                case ncclUint8:   Accumulate<uint8_t >(op, result, array, dataset.numElements); break;
+                case ncclInt32:   Accumulate<int32_t >(op, result, array, dataset.numElements); break;
+                case ncclUint32:  Accumulate<uint32_t>(op, result, array, dataset.numElements); break;
+                case ncclInt64:   Accumulate<int64_t >(op, result, array, dataset.numElements); break;
+                case ncclUint64:  Accumulate<uint64_t>(op, result, array, dataset.numElements); break;
+                case ncclFloat32: Accumulate<float   >(op, result, array, dataset.numElements); break;
+                case ncclFloat64: Accumulate<double  >(op, result, array, dataset.numElements); break;
+                default:
+                    fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                    exit(EXIT_FAILURE);
+                }
+            }
+
+            // Copy results into expected arrays
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy(dataset.expected[i], result, dataset.NumBytes());
+
+            free(result);
+        }
+
+    private:
+        // result[j] = ReduceOp(op, result[j], array[j]) for the first count elements,
+        // with both buffers interpreted as arrays of T.
+        template <typename T>
+        static void Accumulate(ncclRedOp_t const op, void* result, void* array, size_t const count)
+        {
+            T* dst = (T*)result; T* src = (T*)array;
+            for (size_t j = 0; j < count; j++) dst[j] = ReduceOp(op, dst[j], src[j]);
+        }
+    };
+}
+
+#endif
@@ -0,0 +1,150 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_AllReduceAbort.hpp"
+#include "../include/core.h"
+#include <omp.h>
+
+#define NUM_ITER 8                  // number of grouped collective launches the test queues
+#define FAKE_OP_COUNT (NUM_ITER+1)  // parenthesized so it expands safely inside larger expressions
+
+namespace CorrectnessTests
+{
+    // Run a HIP call once; on failure print the code and call site to stderr, then exit(-1).
+    #define HIPCHECK(cmd)                                                          \
+    do {                                                                           \
+      hipError_t const hipStatus_ = (cmd);                                         \
+      if (hipStatus_ != hipSuccess) {                                              \
+        std::cerr << "Encountered HIP error (" << hipStatus_ << ") at line "       \
+                  << __LINE__ << " in file " << __FILE__ << "\n";                  \
+        exit(-1);                                                                  \
+      }                                                                            \
+    } while (0)
+    #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)               // seq-cst atomic read through pointer VAR
+    #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) // seq-cst atomic write of SRC through pointer DST
+
+    // Abort-path test for ncclAllReduce: repoints GPU 0's send-connection
+    // opCountRem/head at host-mapped fakes, queues NUM_ITER grouped collectives,
+    // then polls the streams, calling ncclCommAbort on async error or timeout.
+    TEST_P(AllReduceAbortTest, Correctness) {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+
+        int gpu = 0; // GPU number to trigger abort
+        ncclComm_t comm = comms[gpu];
+
+        HIPCHECK(hipSetDevice(gpu));
+        hipStream_t stream;
+        HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+        struct ncclChannel* channel = comm->channels;
+        struct ncclRing *ring = &channel->ring;
+        struct ncclConnector* send = &channel->peers[ring->next].send;
+        size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers;
+        size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers;
+        uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset;
+        uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset;
+        uint64_t *real_opCount, *fake_opCount, *fake_o;
+        uint64_t *real_head, *fake_head, *fake_h;
+
+        // get original opCount and head
+        HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+        // allocate and install fakes (each fake is a single host-mapped uint64_t)
+        HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_opCount = FAKE_OP_COUNT;
+        HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_head = 0;
+        HIPCHECK(hipStreamSynchronize(stream));
+        // read back fakes to confirm
+        HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+        EXPECT_EQ(fake_o, fake_opCount);
+        EXPECT_EQ(fake_h, fake_head);
+
+        // Perform a number of iterations and introduce abort
+        for (int j = 0; j < NUM_ITER; j++) {
+            // Start a group call
+            ncclGroupStart();
+            for (int i = 0; i < numDevices; i++) {
+                ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
+                              numElements, dataType, op, comms[i], streams[i]);
+            }
+            // Signal end of group call
+            ncclGroupEnd();
+        }
+
+        // Wait for the streams to drain, aborting the communicators if needed
+        auto start = std::chrono::high_resolution_clock::now();
+        int remaining = numDevices;
+        int* done = (int*)malloc(sizeof(int)*numDevices);
+        memset(done, 0, sizeof(int)*numDevices);
+        bool timeout = false, abort_called = false;
+        while (remaining) {
+            int idle = 1;
+            for (int i=0; i<numDevices; i++) {
+                if (done[i]) continue;
+                hipError_t hipErr = hipStreamQuery(streams[i]);
+                if (hipErr == hipSuccess) {
+                    done[i] = 1;
+                    remaining--;
+                    idle = 0;
+                    continue;
+                }
+ #if NCCL_MAJOR >= 2
+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+                if (abort_called) continue; // communicators must not be used after ncclCommAbort
+                auto delta = std::chrono::high_resolution_clock::now() - start;
+                double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+                if (deltaSec > 10.0 && !timeout) {
+                    std::cerr << "[          ] timeout condition, calling ncclCommAbort ... " << std::endl;
+                    timeout = true;
+                }
+                ncclResult_t ncclAsyncErr;
+                ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
+                if (ncclAsyncErr != ncclSuccess || timeout) {
+                    // Async error or timeout: abort every communicator exactly once
+                    std::cerr << "[          ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
+                    for (int k=0; k<numDevices; k++)
+                      ncclCommAbort(comms[k]);
+                    abort_called = true;
+                    break;
+                }
+#endif
+#endif
+            }
+            // We might want to let other threads (including NCCL threads) use the CPU.
+            if (idle) pthread_yield();
+        }
+
+        free(done);
+        HIPCHECK(hipHostFree(fake_opCount));
+        HIPCHECK(hipHostFree(fake_head));
+        HIPCHECK(hipStreamDestroy(stream));
+        dataset.Release();
+    }
+
+    // Parameter sweep for AllReduceAbortTest: one test instance is generated
+    // for every combination of the value lists below (float32 sum,
+    // out-of-place buffers only).
+    INSTANTIATE_TEST_CASE_P(
+        AllReduceAbortSweep,
+        AllReduceAbortTest,
+        testing::Combine(
+            testing::Values(ncclSum),          // reduction operator
+            testing::Values(ncclFloat32),      // data type
+            testing::Values(1024, 1048576),    // element counts
+            testing::Values(2, 4),             // device counts
+            testing::Values(false)             // in-place flag
+        ));
+} // namespace
@@ -0,0 +1,20 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLREDUCEABORT_HPP
+#define TEST_ALLREDUCEABORT_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the ncclAllReduce abort test; adds nothing to CorrectnessTest.
+    // The include guard is unique to this header so it can coexist with test_AllReduce.hpp.
+    class AllReduceAbortTest : public CorrectnessTest
+    {
+    };
+}
+
+#endif
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_Broadcast.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Runs ncclBroadcast once per possible root and validates the results of
+    // each run against the host-computed expectation.
+    TEST_P(BroadcastCorrectnessTest, Correctness)
+    {
+        // Skip when the sweep asks for more devices than are available
+        if (numDevices > numDevicesAvailable) return;
+
+        // One dataset is allocated up front and refilled for every root
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+
+        for (int root = 0; root < numDevices; ++root)
+        {
+            // Fresh inputs and expected results for this root
+            FillDatasetWithPattern(dataset);
+            ComputeExpectedResults(dataset, root);
+
+            // Issue the broadcast on every device (1 thread per GPU)
+            #pragma omp parallel for num_threads(numDevices)
+            for (int rank = 0; rank < numDevices; ++rank)
+            {
+                ncclBroadcast(dataset.inputs[rank], dataset.outputs[rank],
+                              numElements, dataType, root,
+                              comms[rank], streams[rank]);
+            }
+
+            // Wait for broadcast to complete
+            Synchronize();
+
+            // Check results
+            ValidateResults(dataset);
+        }
+
+        dataset.Release();
+    }
+
+    // Parameter sweep for BroadcastCorrectnessTest: one test instance is
+    // generated for every combination of the five value lists below
+    // (ncclFloat16 is currently not part of the sweep).
+    INSTANTIATE_TEST_CASE_P(
+        BroadcastCorrectnessSweep,
+        BroadcastCorrectnessTest,
+        testing::Combine(
+            // Reduction operator (not used by broadcast; single placeholder)
+            testing::Values(ncclSum),
+            // Element data types
+            testing::Values(ncclInt8,    ncclUint8,
+                            ncclInt32,   ncclUint32,
+                            ncclInt64,   ncclUint64,
+                            ncclFloat32, ncclFloat64),
+            // Element counts
+            testing::Values(1024, 1048576),
+            // Device counts
+            testing::Values(2, 3, 4),
+            // Out-of-place (false) and in-place (true)
+            testing::Values(false, true)
+        ));
+} // namespace
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_BROADCAST_HPP
+#define TEST_BROADCAST_HPP
+
+#include "CorrectnessTest.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    class BroadcastCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        // Sets every device's expected array to a copy of the root device's input buffer.
+        static void ComputeExpectedResults(Dataset& dataset, int const root)
+        {
+            for (int dev = 0; dev < dataset.numDevices; ++dev)
+                HIP_CALL(hipMemcpy(dataset.expected[dev], dataset.inputs[root], dataset.NumBytes(), hipMemcpyDeviceToHost));
+        }
+    };
+}
+
+#endif
@@ -0,0 +1,153 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_BroadcastAbort.hpp"
+#include "../include/core.h"
+#include <omp.h>
+
+#define NUM_ITER 8                  // number of grouped collective launches the test queues
+#define FAKE_OP_COUNT (NUM_ITER+1)  // parenthesized so it expands safely inside larger expressions
+
+namespace CorrectnessTests
+{
+    // Run a HIP call once; on failure print the code and call site to stderr, then exit(-1).
+    #define HIPCHECK(cmd)                                                          \
+    do {                                                                           \
+      hipError_t const hipStatus_ = (cmd);                                         \
+      if (hipStatus_ != hipSuccess) {                                              \
+        std::cerr << "Encountered HIP error (" << hipStatus_ << ") at line "       \
+                  << __LINE__ << " in file " << __FILE__ << "\n";                  \
+        exit(-1);                                                                  \
+      }                                                                            \
+    } while (0)
+    #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)               // seq-cst atomic read through pointer VAR
+    #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) // seq-cst atomic write of SRC through pointer DST
+
+    // Abort-path test for ncclBroadcast: repoints GPU 0's send-connection
+    // opCountRem/head at host-mapped fakes, queues NUM_ITER grouped collectives,
+    // then polls the streams, calling ncclCommAbort on async error or timeout.
+    TEST_P(BroadcastAbortTest, Correctness) {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+
+        int root = 0;
+        int gpu = 0; // GPU number to trigger abort
+        ncclComm_t comm = comms[gpu];
+
+        HIPCHECK(hipSetDevice(gpu));
+        hipStream_t stream;
+        HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+        struct ncclChannel* channel = comm->channels;
+        struct ncclRing *ring = &channel->ring;
+        struct ncclConnector* send = &channel->peers[ring->next].send;
+        size_t op_offset = &(send->conn.opCountRem) - (uint64_t **)channel->peers;
+        size_t head_offset = &(send->conn.head) - (uint64_t **)channel->peers;
+        uint64_t **p_dev_opCount = (uint64_t **)(channel->devPeers) + op_offset;
+        uint64_t **p_dev_head = (uint64_t **)(channel->devPeers) + head_offset;
+        uint64_t *real_opCount, *fake_opCount, *fake_o;
+        uint64_t *real_head, *fake_head, *fake_h;
+
+        // get original opCount and head
+        HIPCHECK(hipMemcpyAsync(&real_opCount, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+        // allocate and install fakes (each fake is a single host-mapped uint64_t)
+        HIPCHECK(hipHostMalloc(&fake_opCount, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_opCount, &fake_opCount, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_opCount = FAKE_OP_COUNT;
+        HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t), hipHostMallocMapped));
+        HIPCHECK(hipMemcpyAsync(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyHostToDevice, stream));
+        *fake_head = 0;
+        HIPCHECK(hipStreamSynchronize(stream));
+        // read back fakes to confirm
+        HIPCHECK(hipMemcpyAsync(&fake_o, p_dev_opCount, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipMemcpyAsync(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDeviceToHost, stream));
+        HIPCHECK(hipStreamSynchronize(stream));
+        EXPECT_EQ(fake_o, fake_opCount);
+        EXPECT_EQ(fake_h, fake_head);
+
+        // Perform a number of iterations and introduce abort
+        for (int j = 0; j < NUM_ITER; j++) {
+            // Start a group call
+            ncclGroupStart();
+            for (int i = 0; i < numDevices; i++) {
+                ncclBroadcast(dataset.inputs[i],
+                              dataset.outputs[i],
+                              numElements, dataType,
+                              root, comms[i], streams[i]);
+            }
+            // Signal end of group call
+            ncclGroupEnd();
+        }
+
+        // Wait for the streams to drain, aborting the communicators if needed
+        auto start = std::chrono::high_resolution_clock::now();
+        int remaining = numDevices;
+        int* done = (int*)malloc(sizeof(int)*numDevices);
+        memset(done, 0, sizeof(int)*numDevices);
+        bool timeout = false, abort_called = false;
+        while (remaining) {
+            int idle = 1;
+            for (int i=0; i<numDevices; i++) {
+                if (done[i]) continue;
+                hipError_t hipErr = hipStreamQuery(streams[i]);
+                if (hipErr == hipSuccess) {
+                    done[i] = 1;
+                    remaining--;
+                    idle = 0;
+                    continue;
+                }
+ #if NCCL_MAJOR >= 2
+ #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
+                if (abort_called) continue; // communicators must not be used after ncclCommAbort
+                auto delta = std::chrono::high_resolution_clock::now() - start;
+                double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+                if (deltaSec > 10.0 && !timeout) {
+                    std::cerr << "[          ] timeout condition, calling ncclCommAbort ... " << std::endl;
+                    timeout = true;
+                }
+                ncclResult_t ncclAsyncErr;
+                ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
+                if (ncclAsyncErr != ncclSuccess || timeout) {
+                    // Async error or timeout: abort every communicator exactly once
+                    std::cerr << "[          ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
+                    for (int k=0; k<numDevices; k++)
+                      ncclCommAbort(comms[k]);
+                    abort_called = true;
+                    break;
+                }
+#endif
+#endif
+            }
+            // We might want to let other threads (including NCCL threads) use the CPU.
+            if (idle) pthread_yield();
+        }
+
+        free(done);
+        HIPCHECK(hipHostFree(fake_opCount));
+        HIPCHECK(hipHostFree(fake_head));
+        HIPCHECK(hipStreamDestroy(stream));
+        dataset.Release();
+    }
+
+    // Parameter sweep for BroadcastAbortTest: one test instance is generated
+    // for every combination of the value lists below (float32,
+    // out-of-place buffers only).
+    INSTANTIATE_TEST_CASE_P(
+        BroadcastAbortSweep,
+        BroadcastAbortTest,
+        testing::Combine(
+            testing::Values(ncclSum),          // reduction operator
+            testing::Values(ncclFloat32),      // data type
+            testing::Values(1048576),          // element count
+            testing::Values(2, 4),             // device counts
+            testing::Values(false)             // in-place flag
+        ));
+} // namespace
@@ -0,0 +1,20 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_BROADCASTABORT_HPP
+#define TEST_BROADCASTABORT_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the ncclBroadcast abort test; adds nothing to CorrectnessTest.
+    // The include guard is unique to this header so it can coexist with test_AllReduce.hpp.
+    class BroadcastAbortTest : public CorrectnessTest
+    {
+    };
+}
+
+#endif
@@ -0,0 +1,99 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "test_CombinedCalls.hpp"
+
+#include "test_AllGather.hpp"
+#include "test_AllReduce.hpp"
+#include "test_Broadcast.hpp"
+#include "test_Reduce.hpp"
+#include "test_ReduceScatter.hpp"
+
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Issues all five collectives back-to-back on each device's stream (one
+    // OpenMP thread per GPU, no ncclGroupStart/End) and validates every result.
+    TEST_P(CombinedCallsCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Dataset slots, one per collective
+        enum { kAllGather, kAllReduce, kBroadcast, kReduce, kReduceScatter, kNumCollectives };
+        std::vector<Dataset> datasets(kNumCollectives);
+        for (Dataset& ds : datasets)
+        {
+            ds.Initialize(numDevices, numElements, dataType, inPlace);
+            FillDatasetWithPattern(ds);
+        }
+
+        // Host-side reference results, one helper per collective
+        int const root = 0;
+        AllGatherCorrectnessTest::ComputeExpectedResults(datasets[kAllGather]);
+        AllReduceCorrectnessTest::ComputeExpectedResults(datasets[kAllReduce], op);
+        BroadcastCorrectnessTest::ComputeExpectedResults(datasets[kBroadcast], root);
+        ReduceCorrectnessTest::ComputeExpectedResults(datasets[kReduce], op, root);
+        ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[kReduceScatter], op);
+
+        // AllGather / ReduceScatter operate on one 1/numDevices slice per device
+        size_t const byteCount = datasets[kAllGather].NumBytes() / numDevices;
+        size_t const elemCount = numElements / numDevices;
+
+        #pragma omp parallel for num_threads(numDevices)
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            size_t const sliceOffset = rank * byteCount;
+            ncclAllGather((int8_t *)datasets[kAllGather].inputs[rank] + sliceOffset,
+                          datasets[kAllGather].outputs[rank],
+                          elemCount, dataType, comms[rank], streams[rank]);
+
+            ncclAllReduce(datasets[kAllReduce].inputs[rank], datasets[kAllReduce].outputs[rank],
+                          numElements, dataType, op, comms[rank], streams[rank]);
+
+            ncclBroadcast(datasets[kBroadcast].inputs[rank], datasets[kBroadcast].outputs[rank],
+                          numElements, dataType, root, comms[rank], streams[rank]);
+
+            ncclReduce(datasets[kReduce].inputs[rank], datasets[kReduce].outputs[rank],
+                       numElements, dataType, op, root, comms[rank], streams[rank]);
+
+            ncclReduceScatter(datasets[kReduceScatter].inputs[rank],
+                              (int8_t *)datasets[kReduceScatter].outputs[rank] + sliceOffset,
+                              elemCount, dataType, op, comms[rank], streams[rank]);
+        }
+
+        // Wait for all queued collectives to complete
+        Synchronize();
+
+        // Check, then release, each collective's dataset
+        for (Dataset& ds : datasets)
+        {
+            ValidateResults(ds);
+            ds.Release();
+        }
+    }
+
+    // Parameter sweep for CombinedCallsCorrectnessTest: one test instance is
+    // generated for every combination of the five value lists below
+    // (ncclFloat16 is currently not part of the sweep).
+    INSTANTIATE_TEST_CASE_P(
+        CombinedCallsCorrectnessSweep,
+        CombinedCallsCorrectnessTest,
+        testing::Combine(
+            // Reduction operator (passed to AllReduce, Reduce and ReduceScatter)
+            testing::Values(ncclSum),
+            // Element data types
+            testing::Values(ncclInt8,    ncclUint8,
+                            ncclInt32,   ncclUint32,
+                            ncclInt64,   ncclUint64,
+                            ncclFloat32, ncclFloat64),
+            // Element counts
+            testing::Values(3072, 3145728),
+            // Device counts
+            testing::Values(2, 3, 4),
+            // Out-of-place (false) and in-place (true)
+            testing::Values(false, true)
+        ));
+} // namespace
@@ -0,0 +1,17 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TEST_COMBINEDCALLS_HPP
+#define TEST_COMBINEDCALLS_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class CombinedCallsCorrectnessTest : public CorrectnessTest {}; // fixture for the combined-calls test; adds nothing to CorrectnessTest
+}
+
+#endif
@@ -0,0 +1,120 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "test_GroupCalls.hpp"
+
+#include "test_AllGather.hpp"
+#include "test_AllReduce.hpp"
+#include "test_Broadcast.hpp"
+#include "test_Reduce.hpp"
+#include "test_ReduceScatter.hpp"
+
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Queues all five collectives for every device inside a single
+    // ncclGroupStart/ncclGroupEnd pair, then validates every result.
+    TEST_P(GroupCallsCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Dataset slots, one per collective
+        enum { kAllGather, kAllReduce, kBroadcast, kReduce, kReduceScatter, kNumCollectives };
+        std::vector<Dataset> datasets(kNumCollectives);
+        for (Dataset& ds : datasets)
+        {
+            ds.Initialize(numDevices, numElements, dataType, inPlace);
+            FillDatasetWithPattern(ds);
+        }
+
+        // Host-side reference results, one helper per collective
+        int const root = 0;
+        AllGatherCorrectnessTest::ComputeExpectedResults(datasets[kAllGather]);
+        AllReduceCorrectnessTest::ComputeExpectedResults(datasets[kAllReduce], op);
+        BroadcastCorrectnessTest::ComputeExpectedResults(datasets[kBroadcast], root);
+        ReduceCorrectnessTest::ComputeExpectedResults(datasets[kReduce], op, root);
+        ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[kReduceScatter], op);
+
+        // AllGather / ReduceScatter operate on one 1/numDevices slice per device
+        size_t const byteCount = datasets[kAllGather].NumBytes() / numDevices;
+        size_t const elemCount = numElements / numDevices;
+
+        // Start a group call
+        ncclGroupStart();
+
+        // AllGather
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            ncclAllGather((int8_t *)datasets[kAllGather].inputs[rank] + (rank * byteCount),
+                          datasets[kAllGather].outputs[rank],
+                          elemCount, dataType, comms[rank], streams[rank]);
+        }
+
+        // AllReduce
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            ncclAllReduce(datasets[kAllReduce].inputs[rank], datasets[kAllReduce].outputs[rank],
+                          numElements, dataType, op, comms[rank], streams[rank]);
+        }
+
+        // Broadcast
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            ncclBroadcast(datasets[kBroadcast].inputs[rank], datasets[kBroadcast].outputs[rank],
+                          numElements, dataType, root, comms[rank], streams[rank]);
+        }
+
+        // Reduce
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            ncclReduce(datasets[kReduce].inputs[rank], datasets[kReduce].outputs[rank],
+                       numElements, dataType, op, root, comms[rank], streams[rank]);
+        }
+
+        // ReduceScatter
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            ncclReduceScatter(datasets[kReduceScatter].inputs[rank],
+                              (int8_t *)datasets[kReduceScatter].outputs[rank] + (rank * byteCount),
+                              elemCount, dataType, op, comms[rank], streams[rank]);
+        }
+
+        // Signal end of group call
+        ncclGroupEnd();
+
+        // Wait for all queued collectives to complete
+        Synchronize();
+
+        // Check, then release, each collective's dataset
+        for (Dataset& ds : datasets)
+        {
+            ValidateResults(ds);
+            ds.Release();
+        }
+    }
+
+    // Parameter sweep for GroupCallsCorrectnessTest: one test instance is
+    // generated for every combination of the five value lists below
+    // (ncclFloat16 is currently not part of the sweep).
+    INSTANTIATE_TEST_CASE_P(
+        GroupCallsCorrectnessSweep,
+        GroupCallsCorrectnessTest,
+        testing::Combine(
+            // Reduction operator (passed to AllReduce, Reduce and ReduceScatter)
+            testing::Values(ncclSum),
+            // Element data types
+            testing::Values(ncclInt8,    ncclUint8,
+                            ncclInt32,   ncclUint32,
+                            ncclInt64,   ncclUint64,
+                            ncclFloat32, ncclFloat64),
+            // Element counts
+            testing::Values(3072, 3145728),
+            // Device counts
+            testing::Values(2, 3, 4),
+            // Out-of-place (false) and in-place (true)
+            testing::Values(false, true)
+        ));
+} // namespace
@@ -0,0 +1,17 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TEST_GROUPCALLS_HPP
+#define TEST_GROUPCALLS_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture type for the grouped-collectives test. It adds no members of its
+    // own; everything comes from the CorrectnessTest base class.
+    class GroupCallsCorrectnessTest : public CorrectnessTest
+    {
+    };
+}
+
+#endif
@@ -0,0 +1,68 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_Reduce.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Runs ncclReduce once per possible root device and compares the device
+    // results with the host-computed expectation.
+    TEST_P(ReduceCorrectnessTest, Correctness)
+    {
+        // Nothing to do when the machine has fewer GPUs than this case needs
+        if (numDevices > numDevicesAvailable)
+        {
+            return;
+        }
+
+        // Buffers are allocated once and reused for every root
+        Dataset data;
+        data.Initialize(numDevices, numElements, dataType, inPlace);
+
+        for (int rootRank = 0; rootRank < numDevices; ++rootRank)
+        {
+            // Refresh inputs and recompute what each device should end up with
+            FillDatasetWithPattern(data);
+            ComputeExpectedResults(data, op, rootRank);
+
+            // One host thread per GPU issues that GPU's reduce call
+            #pragma omp parallel for num_threads(numDevices)
+            for (int rank = 0; rank < numDevices; ++rank)
+            {
+                ncclReduce(data.inputs[rank], data.outputs[rank],
+                           numElements, dataType, op,
+                           rootRank, comms[rank], streams[rank]);
+            }
+
+            // Block until the reduction has finished, then verify it
+            Synchronize();
+            ValidateResults(data);
+        }
+
+        data.Release();
+    }
+
+    // Parameter sweep: testing::Combine builds the cross product of the five
+    // value lists below.
+    INSTANTIATE_TEST_CASE_P(
+        ReduceCorrectnessSweep,
+        ReduceCorrectnessTest,
+        testing::Combine(
+            // Reduction operator
+            testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+            // Data types (ncclFloat16 is currently left out of the sweep)
+            testing::Values(ncclInt8,  ncclUint8,
+                            ncclInt32, ncclUint32,
+                            ncclInt64, ncclUint64,
+                            ncclFloat32, ncclFloat64),
+            // Number of elements
+            testing::Values(1024, 1048576),
+            // Number of devices
+            testing::Values(2, 3, 4),
+            // In-place or not
+            testing::Values(false, true)));
+} // namespace
@@ -0,0 +1,80 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_REDUCE_HPP
+#define TEST_REDUCE_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the ncclReduce correctness test.
+    class ReduceCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        // Fills dataset.expected[] (host memory) with the values every device's
+        // output buffer should hold after a reduce to `root`.
+        //
+        //   dataset - device inputs/outputs to read and host expected arrays to fill
+        //   op      - reduction operator applied element-wise across all devices
+        //   root    - index of the device that receives the reduced result
+        //
+        // The root's expected array receives the host-computed reduction; every
+        // other device's expected array receives a copy of its current output
+        // buffer. Terminates the process with a non-zero status if the scratch
+        // buffer cannot be allocated or the data type is not handled.
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root)
+        {
+            // Copy all inputs to expected arrays temporarily to perform reduction on host
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Allocate temporary host array to accumulate results
+            // (one allocation, viewed through a typed alias per supported data type)
+            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+            if (resultI1 == NULL)
+            {
+                fprintf(stderr, "[ERROR] Unable to allocate host memory for expected results\n");
+                exit(1);
+            }
+            uint8_t*  resultU1 = (uint8_t  *)resultI1;
+            int32_t*  resultI4 = (int32_t  *)resultI1;
+            uint32_t* resultU4 = (uint32_t *)resultI1;
+            int64_t*  resultI8 = (int64_t  *)resultI1;
+            uint64_t* resultU8 = (uint64_t *)resultI1;
+            float*    resultF4 = (float    *)resultI1;
+            double*   resultF8 = (double   *)resultI1;
+
+            // Initialize the result with the first device's array
+            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+            // Perform reduction on the other device arrays
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                float*    arrayF4 = (float    *)arrayI1;
+                double*   arrayF8 = (double   *)arrayI1;
+
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                    default:
+                        // This is an error path: report failure to the caller/CI
+                        // instead of exiting with a success status
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(1);
+                    }
+                }
+            }
+
+            // Copy results into expected arrays
+            for (int i = 0; i < dataset.numDevices; i++)
+            {
+                if (i == root)
+                    memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
+                else
+                    // Non-root devices are expected to keep whatever their output buffer holds now
+                    HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
+            }
+            free(resultI1);
+        }
+    };
+}
+
+#endif
@@ -0,0 +1,67 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_ReduceScatter.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    // Runs ncclReduceScatter across all devices and compares the device results
+    // with the host-computed expectation.
+    TEST_P(ReduceScatterCorrectnessTest, Correctness)
+    {
+        // Nothing to do when the machine has fewer GPUs than this case needs,
+        // or when the elements cannot be split evenly across the devices
+        if (numDevices > numDevicesAvailable)
+        {
+            return;
+        }
+        if (numElements % numDevices != 0)
+        {
+            return;
+        }
+
+        // Set up buffers, inputs and the expected outcome
+        Dataset data;
+        data.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(data);
+        ComputeExpectedResults(data, op);
+
+        // Each device receives an equal share of the reduced array
+        size_t const elemsPerRank = data.numElements / data.numDevices;
+        size_t const bytesPerRank = data.NumBytes() / data.numDevices;
+
+        // One host thread per GPU issues that GPU's reduce-scatter call;
+        // device `rank` writes its share at byte offset rank * bytesPerRank
+        #pragma omp parallel for num_threads(numDevices)
+        for (int rank = 0; rank < numDevices; ++rank)
+        {
+            int8_t* const recvPtr = (int8_t *)data.outputs[rank] + (rank * bytesPerRank);
+            ncclReduceScatter(data.inputs[rank], recvPtr,
+                              elemsPerRank, dataType, op,
+                              comms[rank], streams[rank]);
+        }
+
+        // Block until the collective has finished, then verify it
+        Synchronize();
+        ValidateResults(data);
+
+        data.Release();
+    }
+
+    // Parameter sweep: testing::Combine builds the cross product of the five
+    // value lists below.
+    INSTANTIATE_TEST_CASE_P(
+        ReduceScatterCorrectnessSweep,
+        ReduceScatterCorrectnessTest,
+        testing::Combine(
+            // Reduction operator
+            testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+            // Data types (ncclFloat16 is currently left out of the sweep)
+            testing::Values(ncclInt8,  ncclUint8,
+                            ncclInt32, ncclUint32,
+                            ncclInt64, ncclUint64,
+                            ncclFloat32, ncclFloat64),
+            // Number of elements
+            testing::Values(3072, 3145728),
+            // Number of devices
+            testing::Values(2, 3, 4),
+            // In-place or not
+            testing::Values(false, true)));
+} // namespace
@@ -0,0 +1,83 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_REDUCE_SCATTER_HPP
+#define TEST_REDUCE_SCATTER_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    // Fixture for the ncclReduceScatter correctness test.
+    class ReduceScatterCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        // Fills dataset.expected[] (host memory) with the values every device's
+        // output buffer should hold after a reduce-scatter.
+        //
+        //   dataset - device inputs/outputs to read and host expected arrays to fill
+        //   op      - reduction operator applied element-wise across all devices
+        //
+        // Each expected array starts as a copy of that device's current output
+        // buffer; device i's share (byte range [i * byteCount, (i + 1) * byteCount)
+        // with byteCount = NumBytes() / numDevices) is then overwritten with the
+        // matching slice of the host-computed reduction. Terminates the process
+        // with a non-zero status if the scratch buffer cannot be allocated or the
+        // data type is not handled.
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
+        {
+            // Copy all inputs to expected arrays temporarily to perform reduction on host
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Allocate temporary host array to accumulate results
+            // (one allocation, viewed through a typed alias per supported data type)
+            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+            if (resultI1 == NULL)
+            {
+                fprintf(stderr, "[ERROR] Unable to allocate host memory for expected results\n");
+                exit(1);
+            }
+            uint8_t*  resultU1 = (uint8_t  *)resultI1;
+            int32_t*  resultI4 = (int32_t  *)resultI1;
+            uint32_t* resultU4 = (uint32_t *)resultI1;
+            int64_t*  resultI8 = (int64_t  *)resultI1;
+            uint64_t* resultU8 = (uint64_t *)resultI1;
+            float*    resultF4 = (float    *)resultI1;
+            double*   resultF8 = (double   *)resultI1;
+
+            // Initialize the result with the first device's array
+            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+            // Perform reduction on the other device arrays
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                float*    arrayF4 = (float    *)arrayI1;
+                double*   arrayF8 = (double   *)arrayI1;
+
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                    default:
+                        // This is an error path: report failure to the caller/CI
+                        // instead of exiting with a success status
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(1);
+                    }
+                }
+            }
+
+            // Copy results into expected arrays
+            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+
+            // Start from the current output contents so that bytes outside each
+            // device's share are expected to stay as they are
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Overlay device i's share with slice i of the reduced array
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
+                       resultI1 + (i * byteCount), byteCount);
+
+            free(resultI1);
+        }
+    };
+}
+
+#endif
@@ -0,0 +1,16 @@
+# Locate the HIP installation: a caller-supplied HIP_PATH wins, otherwise
+# /opt/rocm/hip is used when it exists, otherwise fall back to a relative path.
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+EXE=TransferBench
+CXXFLAGS = -O3 -fopenmp -I../../src/include -I.
+
+# Headers found under this directory; touching any of them rebuilds $(EXE).
+# An explicit start path keeps this working with non-GNU find, and .h files
+# are tracked alongside .hpp files.
+HEADERS := $(shell find . -name "*.hpp" -o -name "*.h")
+
+# all/clean are commands, not files
+.PHONY: all clean
+
+all: $(EXE)
+
+# Only the .cpp ($<) is handed to the compiler; headers are prerequisites only
+$(EXE): $(EXE).cpp $(HEADERS)
+	$(HIPCC) $(CXXFLAGS) $< -o $@
+
+clean:
+	rm -f *.o $(EXE)
@@ -0,0 +1,313 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// This program measures simultaneous copy performance across multiple GPUs
+// on the same node
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <set>
+
+#include <hip/hip_runtime.h>
+#include "copy_kernel.h"
+#include "TransferBench.hpp"
+
+// Entry point. For every topology line of the configuration file this times
+// simultaneous GPU-to-GPU copies over the listed links, validates the copied
+// data and prints per-GPU and aggregate bandwidth; afterwards it prints the
+// link type / hop count of every (src, dst) pair that was used.
+//
+//   argv[1] - configuration file, one topology per line
+//   argv[2] - (optional) number of bytes per link, must be a multiple of 128
+//
+// Returns 0 on success. Exits with status 1 on invalid arguments or
+// configuration; any failing HIP call wrapped in HIP_CALL also ends the process.
+int main(int argc, char **argv)
+{
+    // Display usage
+    if (argc <= 1)
+    {
+        printf("Usage: %s configFile <N>\n", argv[0]);
+        printf("- configFile: file describing topologies to test\n");
+        printf("  Each line should contain a single topology\n");
+        printf("    L - number of links followed by L white-space separated triples (src, dst, # blocks)\n");
+        printf("    For example:\n");
+        printf("      2 0 1 3  1 0 3\n");
+        printf("      would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n");
+        printf("- N: (Optional) Number of bytes to transfer per link.\n");
+        printf("     If not specified, defaults to 2^28=256MB. Must be a multiple of 128 bytes\n");
+        printf("Set env var USE_MEMCPY_ASYNC to use hipMemcpyAsync instead of copy kernel\n");
+        exit(0);
+    }
+
+    // Parse number of bytes to use (or use default if not specified)
+    size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : (1<<28);
+    size_t N = numBytesPerLink / sizeof(float);
+    if (numBytesPerLink % 128)
+    {
+        printf("[ERROR] numBytesPerLink (%zu) must be a multiple of 128\n", numBytesPerLink);
+        exit(1);
+    }
+
+    // Currently an environment variable is required in order to enable fine-grained VRAM allocations
+    if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
+    {
+        printf("[ERROR] Currently you must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
+        exit(1);
+    }
+
+    // Only the presence of USE_MEMCPY_ASYNC matters, not its value
+    bool useMemcpy = getenv("USE_MEMCPY_ASYNC");
+    printf("Using %s\n", useMemcpy ? "hipMemcpyAsync (USE_MEMCPY_ASYNC found) [# of blocks to use will be ignored]" : "copy kernel (USE_MEMCPY_ASYNC not found)");
+
+    // Collect the number of available GPUs on this machine
+    int numDevices;
+    HIP_CALL(hipGetDeviceCount(&numDevices));
+    if (numDevices < 1)
+    {
+        printf("[ERROR] No GPU devices found\n");
+        exit(1);
+    }
+
+    // Print header
+    printf("%-*s(GB/s)", MAX_NAME_LEN - 6, "Configuration");
+    for (int i = 0; i < numDevices; i++)
+        printf("  GPU %02d", i);
+    printf("   Total\n");
+    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
+    printf("\n");
+
+    // Read configuration file
+    FILE* fp = fopen(argv[1], "r");
+    if (!fp)
+    {
+        printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
+        exit(1);
+    }
+
+    // Track links that get used (key: (src, dst) pair, value: times seen)
+    std::map<std::pair<int, int>, int> linkMap;
+
+    char line[2048];
+    while(fgets(line, 2048, fp))
+    {
+        // Parse links from configuration file
+        std::vector<Link> links;
+        ParseLinks(line, links);
+
+        int const numLinks = links.size();
+        if (numLinks == 0) continue;
+
+        // Clear counters
+        int linkCount[numDevices];
+        for (int i = 0; i < numDevices; i++)
+            linkCount[i] = 0;
+
+        float* linkSrcMem[numLinks];
+        float* linkDstMem[numLinks];
+        hipStream_t streams[numLinks];
+        hipEvent_t startEvents[numLinks];
+        hipEvent_t stopEvents[numLinks];
+        std::vector<BlockParam> cpuBlockParams[numLinks];
+        BlockParam* gpuBlockParams[numLinks];
+
+        // Label for this topology line, built up link by link
+        char name[MAX_NAME_LEN+1] = {};
+
+        for (int i = 0; i < numLinks; i++)
+        {
+            int const src = links[i].srcGpu;
+            int const dst = links[i].dstGpu;
+            if (src < 0 || src >= numDevices ||
+                dst < 0 || dst >= numDevices)
+            {
+                printf("[ERROR] Invalid link (%d to %d). Total devices: %d\n", src, dst, numDevices);
+                exit(1);
+            }
+
+            // At least one threadblock is needed; zero would divide by zero when
+            // the work is split across blocks further down
+            if (links[i].numBlocksToUse < 1)
+            {
+                printf("[ERROR] Invalid number of blocks (%d) for link %d to %d\n",
+                       links[i].numBlocksToUse, src, dst);
+                exit(1);
+            }
+
+            // Append to the label, never writing past the end of name
+            // (snprintf truncates once the remaining space is used up)
+            size_t const nameLen = strlen(name);
+            snprintf(name + nameLen, sizeof(name) - nameLen, "%d->%d:%d ", src, dst, links[i].numBlocksToUse);
+
+            // Enable peer-to-peer access if this is the first time seeing this pair
+            auto linkPair = std::make_pair(src, dst);
+            linkMap[linkPair]++;
+            if (linkMap[linkPair] == 1)
+            {
+                int canAccess;
+                HIP_CALL(hipDeviceCanAccessPeer(&canAccess, src, dst));
+                if (!canAccess)
+                {
+                    printf("[ERROR] Unable to enable peer access between device %d and %d\n", src, dst);
+                    exit(1);
+                }
+                HIP_CALL(hipSetDevice(src));
+                HIP_CALL(hipDeviceEnablePeerAccess(dst, 0));
+            }
+
+            // Count # of links / total blocks each GPU will be working on
+            linkCount[src]++;
+
+            // Allocate GPU memory on source GPU / streams / events
+            HIP_CALL(hipSetDevice(links[i].srcGpu));
+            HIP_CALL(hipStreamCreate(&streams[i]));
+            HIP_CALL(hipEventCreate(&startEvents[i]));
+            HIP_CALL(hipEventCreate(&stopEvents[i]));
+            HIP_CALL(hipMalloc((void **)&linkSrcMem[i], numBytesPerLink));
+            // One BlockParam per threadblock of this link (this is the amount
+            // copied to the device below)
+            HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * links[i].numBlocksToUse));
+            CheckOrFill(N, linkSrcMem[i], false);
+
+            // Allocate fine-grained GPU memory on destination GPU
+            HIP_CALL(hipSetDevice(links[i].dstGpu));
+            HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));
+
+            // Each block needs to know src/dst pointers and how many elements to transfer
+            // Figure out the sub-array each block does for this link
+            // NOTE: Have each sub-array to work on multiple of 32-floats (128-bytes),
+            //       but divide as evenly as possible
+            // NOTE: N is always a multiple of 32
+            int blocksWithExtra = (N / 32) % links[i].numBlocksToUse;
+            int perBlockBaseN   = (N / 32) / links[i].numBlocksToUse * 32;
+            for (int j = 0; j < links[i].numBlocksToUse; j++)
+            {
+                // The first blocksWithExtra blocks each take one extra chunk of 32 floats
+                BlockParam param;
+                param.N   = perBlockBaseN + ((j < blocksWithExtra) ? 32 : 0);
+                param.src = linkSrcMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
+                                                                    j : blocksWithExtra) * 32);
+                param.dst = linkDstMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
+                                                                    j : blocksWithExtra) * 32);
+                cpuBlockParams[i].push_back(param);
+            }
+
+            HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(),
+                               sizeof(BlockParam) * links[i].numBlocksToUse, hipMemcpyHostToDevice));
+        }
+
+        // Launch kernels (warmup iterations are not counted)
+        int numWarmups = 3;
+        int numIterations = 10;
+        double totalCpuTime = 0;
+        double totalGpuTime[numDevices];
+        for (int i = 0; i < numDevices; i++) totalGpuTime[i] = 0.0;
+
+        // Negative iteration numbers are the warmup runs
+        for (int iteration = -numWarmups; iteration < numIterations; iteration++)
+        {
+            auto cpuStart = std::chrono::high_resolution_clock::now();
+            #pragma omp parallel for num_threads(numLinks)
+            for (int i = 0; i < numLinks; i++)
+            {
+                HIP_CALL(hipSetDevice(links[i].srcGpu));
+                HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
+                if (useMemcpy)
+                {
+                    HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i],
+                                            numBytesPerLink, hipMemcpyDeviceToDevice,
+                                            streams[i]));
+                }
+                else
+                {
+                    hipLaunchKernelGGL(CopyKernel,
+                                       dim3(links[i].numBlocksToUse, 1, 1),
+                                       dim3(BLOCKSIZE, 1, 1),
+                                       0,
+                                       streams[i],
+                                       gpuBlockParams[i]);
+                }
+                HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
+            }
+
+            // Wait for every link to finish; a failure here is reported, not ignored
+            for (int i = 0; i < numLinks; i++)
+                HIP_CALL(hipStreamSynchronize(streams[i]));
+
+            auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+            double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+
+            if (iteration >= 0)
+            {
+                totalCpuTime += deltaSec;
+
+                for (int i = 0; i < numDevices; i++)
+                {
+                    // Multiple links running on the same device may be running simultaneously
+                    // so try to figure out the first/last event across all links
+                    float maxTime = 0.0f;
+                    for (int j = 0; j < numLinks; j++)
+                    {
+                        if (links[j].srcGpu != i) continue;
+                        for (int k = 0; k < numLinks; k++)
+                        {
+                            if (links[k].srcGpu != i) continue;
+
+                            float gpuDeltaMsec;
+                            HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
+                            maxTime = std::max(maxTime, gpuDeltaMsec);
+                        }
+                    }
+                    // Events report milliseconds; totals are kept in seconds
+                    totalGpuTime[i] += maxTime / 1000.0;
+                }
+            }
+        }
+
+        // Validate that each link has transferred correctly
+        for (int i = 0; i < numLinks; i++)
+            CheckOrFill(N, linkDstMem[i], true);
+
+        // Report timings
+        printf("%-*s", MAX_NAME_LEN, name);
+        for (int i = 0; i < numDevices; i++)
+        {
+            if (linkCount[i] == 0)
+            {
+                printf("%8.3f", 0.0);
+            }
+            else
+            {
+                totalGpuTime[i] /= (1.0 * numIterations);
+                printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
+            }
+        }
+
+        // Print off bandwidth (based on CPU wall-time timer)
+        totalCpuTime /= numIterations;
+        printf("%8.3f\n", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);
+
+        // Release GPU memory
+        for (int i = 0; i < numLinks; i++)
+        {
+            HIP_CALL(hipFree(linkSrcMem[i]));
+            HIP_CALL(hipFree(linkDstMem[i]));
+            HIP_CALL(hipFree(gpuBlockParams[i]));
+            HIP_CALL(hipStreamDestroy(streams[i]));
+            HIP_CALL(hipEventDestroy(startEvents[i]));
+            HIP_CALL(hipEventDestroy(stopEvents[i]));
+
+        }
+    }
+    fclose(fp);
+
+    // Print link information
+    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
+    printf("\n");
+    printf("Link topology:\n");
+    uint32_t linkType;
+    uint32_t hopCount;
+    for (auto mapPair : linkMap)
+    {
+        int src = mapPair.first.first;
+        int dst = mapPair.first.second;
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(src, dst, &linkType, &hopCount));
+        printf("%d -> %d: %s [%u hop(s)]\n", src, dst,
+               linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "HYPERTRANSPORT" :
+               linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? "QPI" :
+               linkType == HSA_AMD_LINK_INFO_TYPE_PCIE           ? "PCIE" :
+               linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND      ? "INFINIBAND" :
+               linkType == HSA_AMD_LINK_INFO_TYPE_XGMI           ? "XGMI" : "UNKNOWN",
+               hopCount);
+    }
+    return 0;
+}
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Helper macro for catching HIP errors.
+// Evaluates `cmd` exactly once; if the result is anything other than
+// hipSuccess, prints the HIP error string together with the line and file of
+// the call site to std::cerr and terminates the process with exit(-1).
+// The do { ... } while (0) wrapper lets the macro be used as a single statement.
+#define HIP_CALL(cmd)                                                   \
+    do {                                                                \
+        hipError_t error = (cmd);                                       \
+        if (error != hipSuccess)                                        \
+        {                                                               \
+            std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \
+                      << __LINE__ << " in file " << __FILE__ << "\n";   \
+            exit(-1);                                                   \
+        }                                                               \
+    } while (0)
+
+// Maximum length (and printed column width) of the per-topology label
+#define MAX_NAME_LEN 64
+// Number of threads per threadblock used when launching CopyKernel
+#define BLOCKSIZE 256
+// First template argument handed to Copy<> by CopyKernel
+// NOTE(review): presumably the unroll factor of the copy loop - confirm against Copy<>'s definition
+#define COPY_UNROLL 4
+
+// Each link is defined between a source GPU and destination GPU.
+// One Link is filled in per (src, dst, # blocks) triple of a configuration line.
+struct Link
+{
+    int srcGpu;         // Source GPU      (global memory source)
+    int dstGpu;         // Destination GPU (fine-grained memory destination)
+    int numBlocksToUse; // Number of threadblocks to use for this link
+};
+
+// Each threadblock copies N floats from src to dst.
+// CopyKernel reads one BlockParam per threadblock, indexed by blockIdx.x.
+struct BlockParam
+{
+    int N;      // Number of floats this threadblock copies
+    float* src; // Start of this threadblock's part of the source buffer
+    float* dst; // Start of this threadblock's part of the destination buffer
+};
+
+// GPU copy kernel: every threadblock picks up its own BlockParam entry
+// (indexed by blockIdx.x) and forwards that entry's pointers and element
+// count to Copy<>, which is declared outside this header.
+__global__ void __launch_bounds__(BLOCKSIZE)
+CopyKernel(BlockParam* blockParams)
+{
+    // Work descriptor belonging to this threadblock
+    BlockParam const& work = blockParams[blockIdx.x];
+
+    int count = work.N;
+    const float* __restrict__ from = (float* )work.src;
+    float* __restrict__ to = (float* )work.dst;
+
+    Copy<COPY_UNROLL, BLOCKSIZE>(to, from, count);
+}
+
+// Helper function to parse one line of link definitions.
+//
+//   line  - text of the form "L  src dst blocks  src dst blocks ..." with L triples
+//   links - output; replaced by the L parsed links
+//
+// `links` is left empty when the line does not start with a positive link
+// count (e.g. a blank line), or when any of the L triples is missing or not
+// numeric; the latter case also prints a warning to stderr.
+void ParseLinks(char const* line, std::vector<Link>& links)
+{
+    links.clear();
+    int numLinks = 0;
+
+    std::istringstream iss;
+    iss.clear();
+    iss.str(line);
+    iss >> numLinks;
+
+    // Validate the count before sizing the vector: a negative count would
+    // otherwise be converted into an enormous resize request
+    if (iss.fail() || numLinks <= 0) return;
+
+    links.resize(numLinks);
+    for (int i = 0; i < numLinks; i++)
+        iss >> links[i].srcGpu >> links[i].dstGpu >> links[i].numBlocksToUse;
+
+    // A short or malformed line would leave zero-filled links behind; drop the
+    // whole line rather than benchmark links the user never asked for
+    if (iss.fail())
+    {
+        fprintf(stderr, "[WARN] Skipping malformed topology line (expected %d link triples): %s\n",
+                numLinks, line);
+        links.clear();
+    }
+}
+
+// Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches.
+//
+//   N       - number of floats in the buffer
+//   devPtr  - device buffer to fill (doCheck == false) or to verify (doCheck == true)
+//   doCheck - false: upload the reference pattern; true: download and compare
+//
+// The reference pattern is element i == (i % 383) + 31. On the first mismatch,
+// or if host memory cannot be allocated, an error is printed and the process
+// exits with status 1.
+void CheckOrFill(int N, float* devPtr, bool doCheck)
+{
+    float* refBuffer = (float*)malloc(N * sizeof(float));
+    if (N > 0 && !refBuffer)
+    {
+        printf("[ERROR] Unable to allocate host memory for %d elements\n", N);
+        exit(1);
+    }
+
+    for (int i = 0; i < N; i++)
+        refBuffer[i] = i % 383 + 31;
+
+    if (doCheck)
+    {
+        float* hostBuffer = (float*) malloc(N * sizeof(float));
+        if (N > 0 && !hostBuffer)
+        {
+            printf("[ERROR] Unable to allocate host memory for %d elements\n", N);
+            exit(1);
+        }
+        HIP_CALL(hipMemcpy(hostBuffer, devPtr, N * sizeof(float), hipMemcpyDeviceToHost));
+        for (int i = 0; i < N; i++)
+        {
+            if (refBuffer[i] != hostBuffer[i])
+            {
+                printf("[ERROR] Mismatch at element %d Ref: %f Actual: %f\n", i, refBuffer[i], hostBuffer[i]);
+                exit(1);
+            }
+        }
+        // Release the download buffer (it used to leak on every check)
+        free(hostBuffer);
+    }
+    else
+    {
+        HIP_CALL(hipMemcpy(devPtr, refBuffer, N * sizeof(float), hipMemcpyHostToDevice));
+    }
+    free(refBuffer);
+}
@@ -0,0 +1,310 @@
+/*************************************************************************
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+
+#ifndef COPY_KERNEL_H_
+#define COPY_KERNEL_H_
+#include <cstdio>
+#include <cstdint>
+
+// Overload of min for mixed int / ssize_t operands; the result is returned as int.
+static __device__ int min(int a, ssize_t b) {
+  if (a < b) return a;
+  return b;
+}
+
+typedef uint64_t PackType;
+
+// Generic packed reducer: applies FUNC directly to two PackType words.
+// T is not used by this primary template; it only selects specializations.
+template<class FUNC, typename T>
+struct MULTI {
+    __device__ PackType operator()(const PackType x, const PackType y) const
+    {
+        FUNC reduceOp;
+        return reduceOp(x, y);
+    }
+};
+
+// Round x up to a multiple of a; values that are already aligned are unchanged.
+// The mask arithmetic assumes a is a power of two.
+#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
+
+// Return ptr rounded up to the next align-byte boundary (see ALIGNUP).
+template<typename T>
+__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+  const size_t rawAddr = reinterpret_cast<size_t>(ptr);
+  const size_t alignedAddr = ALIGNUP(rawAddr, align);
+  return reinterpret_cast<volatile T*>(alignedAddr);
+}
+
+// Read one element through a volatile pointer and return it by value.
+template<typename T> inline __device__
+T vFetch(const volatile T* ptr) {
+  const T loaded = *ptr;
+  return loaded;
+}
+
+// Write val through a volatile pointer.
+template<typename T> inline __device__
+void vStore(volatile T* ptr, const T val) {
+  ptr[0] = val;
+}
+
+// Element-wise strided loop: thread tid handles indices tid, tid+nthreads, ...
+// below N. Each element is read from src0 (and combined with src1 through FUNC
+// when TWO_INPUTS is set), then written to dest0 (and to dest1 when
+// TWO_OUTPUTS is set).
+template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
+__attribute__((noinline))
+__device__ inline void ReduceCopy(
+    const int tid, const int nthreads,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1,
+    volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1, const int N) {
+  int idx = tid;
+  while (idx < N) {
+    T val = vFetch(src0+idx);
+    if (TWO_INPUTS) val = FUNC()(val, vFetch(src1+idx));
+
+    vStore(dest0+idx, val);
+    if (TWO_OUTPUTS) vStore(dest1+idx, val);
+
+    idx += nthreads;
+  }
+}
+
+// Binary functor that passes its first operand through and ignores the second.
+template<typename T>
+struct FuncPassA {
+  __device__ T operator()(const T x, const T /*y*/) const {
+    return x;
+  }
+};
+
+// Binary functor returning x + y, using T's own operator+.
+template<typename T>
+struct FuncSum {
+  __device__ T operator()(const T x, const T y) const {
+    return x + y;
+  }
+};
+
+// float specialization: a PackType word carries two floats, and FUNC is applied
+// to each of them separately.
+template<class FUNC>
+struct MULTI<FUNC, float> {
+  static_assert(sizeof(PackType) == 2 * sizeof(float),
+      "PackType must be twice the size of float.");
+  // Views the same 64 bits either as one PackType or as a pair of floats.
+  union converter {
+    PackType storage;
+    struct {
+      float a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    FUNC reduceOp;
+    out.a = reduceOp(lhs.a, rhs.a);
+    out.b = reduceOp(lhs.b, rhs.b);
+
+    return out.storage;
+  }
+};
+
+
+// Pack of two words (.x and .y), moved around as one unit by Fetch128/Store128.
+typedef ulong2 Pack128;
+
+// Reduces y into x in place: each of the two words goes through MULTI<FUNC, T>.
+template<class FUNC, typename T>
+struct MULTI128 {
+  __device__ void operator()(Pack128& x, Pack128& y) {
+    MULTI<FUNC, T> wordOp;
+    x.x = wordOp(x.x, y.x);
+    x.y = wordOp(x.y, y.y);
+  }
+};
+
+// Load both words of the pack at p into v.
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
+  v.x = p->x;
+  v.y = p->y;
+}
+// Store both words of v into the pack at p.
+inline __device__ void Store128(Pack128* p, Pack128& v) {
+  p->x = v.x;
+  p->y = v.y;
+}
+
+// Element-wise reduce/copy over the index range [offset, offset+N):
+// thread tid starts at offset+tid and advances by nthreads. For each index,
+// srcs[0] is loaded and folded with the other sources through FUNC, and the
+// result is stored to every destination. Pointers below MINSRCS/MINDSTS are
+// always used; those from MIN up to MAX are used only while their index is
+// below the run-time nsrcs/ndsts.
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    const int offset, const int N) {
+  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+    T val = vFetch(srcs[0]+idx);
+    // Sources guaranteed at compile time
+    #pragma unroll
+    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+    // Optional sources, bounded by the run-time count
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+
+    // Destinations guaranteed at compile time
+    #pragma unroll
+    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+    // Optional destinations, bounded by the run-time count
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+  }
+}
+
+#define WARP_SIZE 64
+
+// Reduce/copy using Pack128-sized loads and stores.
+//   w / nw / t  - index of this warp, number of warps, lane inside the warp
+//   nsrcs/ndsts - run-time source/destination counts (bounded by MAXSRCS/MAXDSTS)
+//   elemOffset  - starting offset into s[]/d[], in units of T
+//   Npack       - amount of work, in units of Pack128
+// Per loop iteration a warp covers UNROLL*WARP_SIZE packs (each lane handles
+// UNROLL packs spaced WARP_SIZE apart); warps then advance by nw*UNROLL*WARP_SIZE.
+// NOTE(review): the loop only tests the first pack of each iteration against
+// Npack, so Npack is presumably expected to be a multiple of UNROLL*WARP_SIZE
+// (or UNROLL to be 1) - confirm against callers.
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+    const int elemOffset, const int Npack) {
+  const int inc = nw * UNROLL * WARP_SIZE;
+  int offset = w * UNROLL * WARP_SIZE + t;
+
+  // Every slot up to MAXSRCS/MAXDSTS is converted to a Pack128 cursor here, even
+  // slots at or beyond nsrcs/ndsts; the loops below only dereference cursors
+  // with i < MINSRCS/MINDSTS or i < nsrcs/ndsts.
+  const Pack128* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
+  Pack128* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
+
+  while (offset < Npack) {
+    Pack128 vals[UNROLL];
+    // Load and reduce
+    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+    for (int i=1; i<MINSRCS; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+
+    // Store
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    // Advance every cursor to this warp's next tile
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
+  }
+}
+
+// Byte remainder of ptr's address modulo alignof(Pack128); 0 means aligned.
+template <typename T>
+__device__ int ptrAlign128(T* ptr) {
+  const uint64_t addr = (uint64_t)ptr;
+  return addr % alignof(Pack128);
+}
+
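+// Try to limit consecutive loads/stores to 8.
+// AUTOUNROLL is 2*UNROLL with a single source and a single destination
+// (MINSRCS + MINDSTS == 2) and 1*UNROLL with three or four in total (integer
+// division); e.g. UNROLL 4 gives 8 and 4 respectively.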
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+// Reduces nsrcs sources into ndsts destinations over N elements of T, split
+// into stages:
+//   1.  preamble - element-wise, up to the point where the pointers are
+//                  treated as Pack128-aligned (or everything, if the pointers
+//                  do not share the same misalignment)
+//   2a. main     - Pack128 accesses with AUTOUNROLL unrolling
+//   2b. rest     - Pack128 accesses without unrolling
+//   2c. tail     - element-wise, for what does not fill a whole Pack128
+// tid/nthreads identify the calling thread and the number of cooperating threads.
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    int N) {
+  int Nrem = N;
+  if (Nrem <= 0) return;
+
+  // alignDiff becomes non-zero as soon as any used pointer has a different
+  // byte misalignment than srcs[0]
+  int alignDiff = 0;
+  int align = ptrAlign128(srcs[0]);
+  #pragma unroll
+  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+
+  // NOTE(review): align and alignof(Pack128) are byte quantities, while
+  // Npreamble is consumed below as a count of T elements. For sizeof(T) > 1 and
+  // a non-zero align these differ - confirm callers only pass pointers for
+  // which this is harmless (e.g. already Pack128-aligned buffers).
+  int Npreamble = alignDiff ? Nrem :
+    N < alignof(Pack128) ? N :
+    (alignof(Pack128) - align) % alignof(Pack128);
+
+  // stage 1: preamble: handle any elements up to the point of everything coming
+  // into alignment
+  if (Npreamble) {
+    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+    Nrem -= Npreamble;
+    if (Nrem == 0) return;
+  }
+  int offset = Npreamble;
+
+  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
+  // assuming the pointers we have are all 128-bit alignable.
+  int w = tid / WARP_SIZE;       // Warp number
+  int nw = nthreads / WARP_SIZE; // Number of warps
+  int t = tid % WARP_SIZE;       // Thread (inside the warp)
+
+  // Number of T elements held by one Pack128
+  const int packFactor = sizeof(Pack128) / sizeof(T);
+
+  // stage 2a: main loop
+  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+      * (AUTOUNROLL * WARP_SIZE); // round down
+  int Nelem2a = Npack2a * packFactor;
+
+  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
+
+  Nrem -= Nelem2a;
+  if (Nrem == 0) return;
+  offset += Nelem2a;
+
+  // stage 2b: slightly less optimized for section when we don't have full
+  // unrolling
+
+  int Npack2b = Nrem / packFactor;
+  int Nelem2b = Npack2b * packFactor;
+
+  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
+
+  Nrem -= Nelem2b;
+  if (Nrem == 0) return;
+  offset += Nelem2b;
+
+  // stage 2c: tail
+  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
+}
+
+// Copies N elements from src to dest (one source, one destination).
+// Assumptions:
+// - there is exactly 1 block
+// - THREADS is the number of producer threads
+// - this function is called by all producer threads
+template<int UNROLL, int THREADS, typename T>
+__device__ void Copy(volatile T * __restrict__ const dest,
+    const volatile T * __restrict__ const src, const int N) {
+  // Two-slot tables as required by the <1, 2, 1, 2> instantiation; only slot 0
+  // of each is filled in.
+  const T* inputs[2];
+  T* outputs[2];
+  inputs[0] = (const T*)src;
+  outputs[0] = (T*)dest;
+  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      /*nsrcs=*/1, inputs, /*ndsts=*/1, outputs, N);
+}
+
+// Copies N elements from src to both dest0 and dest1 (one source, two
+// destinations). Called by all THREADS threads of the block.
+template<int UNROLL, int THREADS, typename T>
+__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1,
+    const volatile T * __restrict__ const src, const int N) {
+  const T* inputs[2];
+  T* outputs[2];
+  inputs[0] = (const T*)src;
+  outputs[0] = (T*)dest0;
+  outputs[1] = (T*)dest1;
+  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      /*nsrcs=*/1, inputs, /*ndsts=*/2, outputs, N);
+}
+
+// Writes the element-wise sum of src0 and src1 to dest (N elements).
+// FuncSum is used so that the second source actually contributes; with
+// FuncPassA the src1 operand was ignored and the call degenerated into a copy
+// of src0. Called by all THREADS threads of the block.
+template<int UNROLL, int THREADS, typename T>
+__device__ void Reduce(volatile T * __restrict__ const dest,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1, const int N) {
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src0;
+  srcs[1] = (const T*)src1;
+  dsts[0] = (T*)dest;
+  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      2, srcs, 1, dsts, N);
+}
+
+// Writes the element-wise sum of src0 and src1 to both dest0 and dest1
+// (N elements). FuncSum is used so that the second source actually
+// contributes; with FuncPassA the src1 operand was ignored and the call
+// degenerated into a double copy of src0. Called by all THREADS threads of the
+// block.
+template<int UNROLL, int THREADS, typename T>
+__device__ void ReduceCopy(volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1, const int N) {
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src0;
+  srcs[1] = (const T*)src1;
+  dsts[0] = (T*)dest0;
+  dsts[1] = (T*)dest1;
+  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      2, srcs, 2, dsts, N);
+}
+#endif // COPY_KERNEL_H_
@@ -0,0 +1,4 @@
+# Each line consists of L (# of links) followed by L white-space-separated triples of (srcGpu, dstGpu, #blocks)
+
+# Single link between GPUs 0 and 1
+1  0 1 1
@@ -0,0 +1,16 @@
+# Locate HIP: prefer the default ROCm install, otherwise fall back to a path
+# relative to this directory. HIP_PATH may also be preset by the caller.
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+EXE=rccl_prim_test
+CXXFLAGS = -O3 -g -I/opt/rocm/rocrand/include
+
+# all and clean are commands, not files
+.PHONY: all clean
+
+all: $(EXE)
+
+# copy_kernel.h is listed as a prerequisite so that header edits trigger a
+# rebuild; only the .cpp ($<) is handed to the compiler.
+$(EXE): rccl_prim_test.cpp copy_kernel.h
+	$(HIPCC) $(CXXFLAGS) $< -o $@
+
+clean:
+	rm -f *.o $(EXE)
@@ -0,0 +1,310 @@
+/*************************************************************************
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+
+#ifndef COPY_KERNEL_H_
+#define COPY_KERNEL_H_
+#include <cstdio>
+#include <cstdint>
+
+// Overload of min for mixed int / ssize_t operands; the result is returned as int.
+static __device__ int min(int a, ssize_t b) {
+  if (a < b) return a;
+  return b;
+}
+
+typedef uint64_t PackType;
+
+// Generic packed reducer: applies FUNC directly to two PackType words.
+// T is not used by this primary template; it only selects specializations.
+template<class FUNC, typename T>
+struct MULTI {
+    __device__ PackType operator()(const PackType x, const PackType y) const
+    {
+        FUNC reduceOp;
+        return reduceOp(x, y);
+    }
+};
+
+// Round x up to a multiple of a; values that are already aligned are unchanged.
+// The mask arithmetic assumes a is a power of two.
+#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
+
+// Return ptr rounded up to the next align-byte boundary (see ALIGNUP).
+template<typename T>
+__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+  const size_t rawAddr = reinterpret_cast<size_t>(ptr);
+  const size_t alignedAddr = ALIGNUP(rawAddr, align);
+  return reinterpret_cast<volatile T*>(alignedAddr);
+}
+
+// Read one element through a volatile pointer and return it by value.
+template<typename T> inline __device__
+T vFetch(const volatile T* ptr) {
+  const T loaded = *ptr;
+  return loaded;
+}
+
+// Write val through a volatile pointer.
+template<typename T> inline __device__
+void vStore(volatile T* ptr, const T val) {
+  ptr[0] = val;
+}
+
+// Element-wise strided loop: thread tid handles indices tid, tid+nthreads, ...
+// below N. Each element is read from src0 (and combined with src1 through FUNC
+// when TWO_INPUTS is set), then written to dest0 (and to dest1 when
+// TWO_OUTPUTS is set).
+template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
+__attribute__((noinline))
+__device__ inline void ReduceCopy(
+    const int tid, const int nthreads,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1,
+    volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1, const int N) {
+  int idx = tid;
+  while (idx < N) {
+    T val = vFetch(src0+idx);
+    if (TWO_INPUTS) val = FUNC()(val, vFetch(src1+idx));
+
+    vStore(dest0+idx, val);
+    if (TWO_OUTPUTS) vStore(dest1+idx, val);
+
+    idx += nthreads;
+  }
+}
+
+// Binary functor that passes its first operand through and ignores the second.
+template<typename T>
+struct FuncPassA {
+  __device__ T operator()(const T x, const T /*y*/) const {
+    return x;
+  }
+};
+
+// Binary functor returning x + y, using T's own operator+.
+template<typename T>
+struct FuncSum {
+  __device__ T operator()(const T x, const T y) const {
+    return x + y;
+  }
+};
+
+// float specialization: a PackType word carries two floats, and FUNC is applied
+// to each of them separately.
+template<class FUNC>
+struct MULTI<FUNC, float> {
+  static_assert(sizeof(PackType) == 2 * sizeof(float),
+      "PackType must be twice the size of float.");
+  // Views the same 64 bits either as one PackType or as a pair of floats.
+  union converter {
+    PackType storage;
+    struct {
+      float a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    FUNC reduceOp;
+    out.a = reduceOp(lhs.a, rhs.a);
+    out.b = reduceOp(lhs.b, rhs.b);
+
+    return out.storage;
+  }
+};
+
+
+// Pack of two words (.x and .y), moved around as one unit by Fetch128/Store128.
+typedef ulong2 Pack128;
+
+// Reduces y into x in place: each of the two words goes through MULTI<FUNC, T>.
+template<class FUNC, typename T>
+struct MULTI128 {
+  __device__ void operator()(Pack128& x, Pack128& y) {
+    MULTI<FUNC, T> wordOp;
+    x.x = wordOp(x.x, y.x);
+    x.y = wordOp(x.y, y.y);
+  }
+};
+
+// Load both words of the pack at p into v.
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
+  v.x = p->x;
+  v.y = p->y;
+}
+// Store both words of v into the pack at p.
+inline __device__ void Store128(Pack128* p, Pack128& v) {
+  p->x = v.x;
+  p->y = v.y;
+}
+
+// Element-wise reduce/copy over the index range [offset, offset+N):
+// thread tid starts at offset+tid and advances by nthreads. For each index,
+// srcs[0] is loaded and folded with the other sources through FUNC, and the
+// result is stored to every destination. Pointers below MINSRCS/MINDSTS are
+// always used; those from MIN up to MAX are used only while their index is
+// below the run-time nsrcs/ndsts.
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    const int offset, const int N) {
+  for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+    T val = vFetch(srcs[0]+idx);
+    // Sources guaranteed at compile time
+    #pragma unroll
+    for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+    // Optional sources, bounded by the run-time count
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+
+    // Destinations guaranteed at compile time
+    #pragma unroll
+    for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+    // Optional destinations, bounded by the run-time count
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+  }
+}
+
+#define WARP_SIZE 64
+
+// Reduce/copy using Pack128-sized loads and stores.
+//   w / nw / t  - index of this warp, number of warps, lane inside the warp
+//   nsrcs/ndsts - run-time source/destination counts (bounded by MAXSRCS/MAXDSTS)
+//   elemOffset  - starting offset into s[]/d[], in units of T
+//   Npack       - amount of work, in units of Pack128
+// Per loop iteration a warp covers UNROLL*WARP_SIZE packs (each lane handles
+// UNROLL packs spaced WARP_SIZE apart); warps then advance by nw*UNROLL*WARP_SIZE.
+// NOTE(review): the loop only tests the first pack of each iteration against
+// Npack, so Npack is presumably expected to be a multiple of UNROLL*WARP_SIZE
+// (or UNROLL to be 1) - confirm against callers.
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+    int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+    const int elemOffset, const int Npack) {
+  const int inc = nw * UNROLL * WARP_SIZE;
+  int offset = w * UNROLL * WARP_SIZE + t;
+
+  // Every slot up to MAXSRCS/MAXDSTS is converted to a Pack128 cursor here, even
+  // slots at or beyond nsrcs/ndsts; the loops below only dereference cursors
+  // with i < MINSRCS/MINDSTS or i < nsrcs/ndsts.
+  const Pack128* srcs[MAXSRCS];
+  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
+  Pack128* dsts[MAXDSTS];
+  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
+
+  while (offset < Npack) {
+    Pack128 vals[UNROLL];
+    // Load and reduce
+    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+    for (int i=1; i<MINSRCS; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
+      Pack128 vals2[UNROLL];
+      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+    }
+
+    // Store
+    for (int i = 0; i < MINDSTS; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    #pragma unroll 1
+    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+    }
+    // Advance every cursor to this warp's next tile
+    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+    offset += inc;
+  }
+}
+
+// Byte remainder of ptr's address modulo alignof(Pack128); 0 means aligned.
+template <typename T>
+__device__ int ptrAlign128(T* ptr) {
+  const uint64_t addr = (uint64_t)ptr;
+  return addr % alignof(Pack128);
+}
+
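+// Try to limit consecutive loads/stores to 8.
+// AUTOUNROLL is 2*UNROLL with a single source and a single destination
+// (MINSRCS + MINDSTS == 2) and 1*UNROLL with three or four in total (integer
+// division); e.g. UNROLL 4 gives 8 and 4 respectively.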
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+// Reduces nsrcs sources into ndsts destinations over N elements of T, split
+// into stages:
+//   1.  preamble - element-wise, up to the point where the pointers are
+//                  treated as Pack128-aligned (or everything, if the pointers
+//                  do not share the same misalignment)
+//   2a. main     - Pack128 accesses with AUTOUNROLL unrolling
+//   2b. rest     - Pack128 accesses without unrolling
+//   2c. tail     - element-wise, for what does not fill a whole Pack128
+// tid/nthreads identify the calling thread and the number of cooperating threads.
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+    int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+    int N) {
+  int Nrem = N;
+  if (Nrem <= 0) return;
+
+  // alignDiff becomes non-zero as soon as any used pointer has a different
+  // byte misalignment than srcs[0]
+  int alignDiff = 0;
+  int align = ptrAlign128(srcs[0]);
+  #pragma unroll
+  for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+  #pragma unroll
+  for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+
+  // NOTE(review): align and alignof(Pack128) are byte quantities, while
+  // Npreamble is consumed below as a count of T elements. For sizeof(T) > 1 and
+  // a non-zero align these differ - confirm callers only pass pointers for
+  // which this is harmless (e.g. already Pack128-aligned buffers).
+  int Npreamble = alignDiff ? Nrem :
+    N < alignof(Pack128) ? N :
+    (alignof(Pack128) - align) % alignof(Pack128);
+
+  // stage 1: preamble: handle any elements up to the point of everything coming
+  // into alignment
+  if (Npreamble) {
+    ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+    Nrem -= Npreamble;
+    if (Nrem == 0) return;
+  }
+  int offset = Npreamble;
+
+  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
+  // assuming the pointers we have are all 128-bit alignable.
+  int w = tid / WARP_SIZE;       // Warp number
+  int nw = nthreads / WARP_SIZE; // Number of warps
+  int t = tid % WARP_SIZE;       // Thread (inside the warp)
+
+  // Number of T elements held by one Pack128
+  const int packFactor = sizeof(Pack128) / sizeof(T);
+
+  // stage 2a: main loop
+  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+      * (AUTOUNROLL * WARP_SIZE); // round down
+  int Nelem2a = Npack2a * packFactor;
+
+  ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
+
+  Nrem -= Nelem2a;
+  if (Nrem == 0) return;
+  offset += Nelem2a;
+
+  // stage 2b: slightly less optimized for section when we don't have full
+  // unrolling
+
+  int Npack2b = Nrem / packFactor;
+  int Nelem2b = Npack2b * packFactor;
+
+  ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
+
+  Nrem -= Nelem2b;
+  if (Nrem == 0) return;
+  offset += Nelem2b;
+
+  // stage 2c: tail
+  ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
+}
+
+// Copies N elements from src to dest (one source, one destination).
+// Assumptions:
+// - there is exactly 1 block
+// - THREADS is the number of producer threads
+// - this function is called by all producer threads
+template<int UNROLL, int THREADS, typename T>
+__device__ void Copy(volatile T * __restrict__ const dest,
+    const volatile T * __restrict__ const src, const int N) {
+  // Two-slot tables as required by the <1, 2, 1, 2> instantiation; only slot 0
+  // of each is filled in.
+  const T* inputs[2];
+  T* outputs[2];
+  inputs[0] = (const T*)src;
+  outputs[0] = (T*)dest;
+  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      /*nsrcs=*/1, inputs, /*ndsts=*/1, outputs, N);
+}
+
+// Copies N elements from src to both dest0 and dest1 (one source, two
+// destinations). Called by all THREADS threads of the block.
+template<int UNROLL, int THREADS, typename T>
+__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1,
+    const volatile T * __restrict__ const src, const int N) {
+  const T* inputs[2];
+  T* outputs[2];
+  inputs[0] = (const T*)src;
+  outputs[0] = (T*)dest0;
+  outputs[1] = (T*)dest1;
+  ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      /*nsrcs=*/1, inputs, /*ndsts=*/2, outputs, N);
+}
+
+// Writes the element-wise sum of src0 and src1 to dest (N elements).
+// FuncSum is used so that the second source actually contributes; with
+// FuncPassA the src1 operand was ignored and the call degenerated into a copy
+// of src0. Called by all THREADS threads of the block.
+template<int UNROLL, int THREADS, typename T>
+__device__ void Reduce(volatile T * __restrict__ const dest,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1, const int N) {
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src0;
+  srcs[1] = (const T*)src1;
+  dsts[0] = (T*)dest;
+  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      2, srcs, 1, dsts, N);
+}
+
+// Writes the element-wise sum of src0 and src1 to both dest0 and dest1
+// (N elements). FuncSum is used so that the second source actually
+// contributes; with FuncPassA the src1 operand was ignored and the call
+// degenerated into a double copy of src0. Called by all THREADS threads of the
+// block.
+template<int UNROLL, int THREADS, typename T>
+__device__ void ReduceCopy(volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1, const int N) {
+  const T* srcs[2];
+  T* dsts[2];
+  srcs[0] = (const T*)src0;
+  srcs[1] = (const T*)src1;
+  dsts[0] = (T*)dest0;
+  dsts[1] = (T*)dest1;
+  ReduceOrCopyMulti<UNROLL, FuncSum<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
+      2, srcs, 2, dsts, N);
+}
+#endif // COPY_KERNEL_H_
@@ -0,0 +1,454 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file rccl_prim_test.cpp
+ *
+ * test performance of individual rccl primitives
+ */
+#include <algorithm> //std::find
+#include <cstdio>  //fprintf
+#include <cstdlib> //atol, exit, malloc
+#include <cstring>
+#include <iostream> //cerr
+#include <string> //std::string
+#include <unistd.h> //usleep
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>
+#include "copy_kernel.h"
+
+#define MAX_GPU 8
+#define MAX_WORKGROUPS 8
+#define THREADS 256
+
+#define COPY_UNROLL       4
+#define REDUCE_UNROLL     2
+#define DOUBLECOPY_UNROLL 2
+#define REDUCECOPY_UNROLL 2
+
+// Per-GPU kernel arguments: one set of source/destination buffers per
+// workgroup (indexed by blockIdx.x in flag_sync_kernel), plus the shared
+// synchronization state.
+struct transfer_data_t {
+  float *dest0[MAX_WORKGROUPS]; //remote fine grain
+  float *src0[MAX_WORKGROUPS];  //local fine grain
+  float *dest1[MAX_WORKGROUPS]; //local coarse grain
+  float *src1[MAX_WORKGROUPS];  //local coarse grain
+  int N;                        //number of floats processed per workgroup
+  int gpu;                      //this GPU's slot in remOpCount
+  int ngpu;                     //number of remOpCount slots waited on when syncing
+  uint64_t *remOpCount;         //per-GPU operation counters used as ready flags
+};
+
+// Counters accumulated atomically by thread 0 of every block in flag_sync_kernel.
+struct profiling_data_t {
+  uint64_t write_cycles;       // sum of clock64() deltas measured around the operation
+  uint64_t bytes_transferred;  // sum of N * sizeof(float) over all blocks
+};
+
+
+// Sequentially-consistent atomic load from / store to the given address.
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+
+// Primitive selected through flag_sync_kernel's `op` template argument.
+// The order here matches the order of the flagSyncKerns table entries.
+enum Ops {
+  OP_COPY,        // Copy:       src0 -> dest0
+  OP_LOCALCOPY,   // Copy:       src0 -> dest1
+  OP_DOUBLECOPY,  // DoubleCopy: src0 -> dest0 and dest1
+  OP_REDUCE,      // Reduce:     src0, src1 -> dest0
+  OP_REDUCECOPY,  // ReduceCopy: src0, src1 -> dest0 and dest1
+  NUM_OPS,        // number of operations (not an operation itself)
+};
+
+// Benchmark kernel: runs one primitive per block and records its duration.
+//   op   - Ops value selecting the primitive
+//   sync - when non-zero, thread 0 of every block spins until all ngpu entries
+//          of remOpCount have reached opCount before the operation starts
+//   transfer_data  - buffers and sync state for this GPU; block b uses slot b
+//   profiling_data - receives the elapsed clock64() cycles and the bytes
+//                    processed, added atomically once per block
+//   opCount        - value published to remOpCount[gpu] by block 0
+// NOTE(review): the primitives are instantiated with THREADS, so the kernel is
+// presumably launched with THREADS threads per block - confirm at the launch site.
+template<int op, int sync>
+__global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct profiling_data_t* profiling_data, uint64_t opCount) {
+  size_t idx = threadIdx.x;
+  uint64_t curr_time, next_time;
+  int bid = blockIdx.x;
+  int n = transfer_data->N;
+
+  // signal self ready and wait until all GPUs are ready
+  if (idx == 0) {
+    if (bid == 0)
+      STORE(&transfer_data->remOpCount[transfer_data->gpu], opCount);
+    if (sync) {
+      for (int i = 0; i < transfer_data->ngpu; i++) {
+        while (LOAD(&transfer_data->remOpCount[i]) < opCount) {};
+      }
+    }
+  }
+  // Hold the other threads of the block until thread 0 is done waiting
+  __syncthreads();
+
+  // Only thread 0 takes timestamps; curr_time/next_time stay unused elsewhere
+  if (idx == 0) {
+    curr_time = clock64();
+  }
+
+  // op is a template argument, so exactly one of these calls is live per instantiation
+  if (op == OP_COPY) Copy<COPY_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->src0[bid], n);
+  if (op == OP_LOCALCOPY) Copy<COPY_UNROLL, THREADS, float>(transfer_data->dest1[bid], transfer_data->src0[bid], n);
+  if (op == OP_DOUBLECOPY) DoubleCopy<DOUBLECOPY_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->dest1[bid], transfer_data->src0[bid], n);
+  if (op == OP_REDUCE) Reduce<REDUCE_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->src0[bid], transfer_data->src1[bid], n);
+  if (op == OP_REDUCECOPY) ReduceCopy<REDUCECOPY_UNROLL, THREADS, float>(transfer_data->dest0[bid], transfer_data->dest1[bid], transfer_data->src0[bid], transfer_data->src1[bid], n);
+
+  // Wait for every thread of the block to finish before stopping the clock
+  __syncthreads();
+  if (idx == 0) {
+    next_time = clock64();
+    __atomic_fetch_add(&(profiling_data->write_cycles), next_time - curr_time, __ATOMIC_SEQ_CST);
+    __atomic_fetch_add(&(profiling_data->bytes_transferred), n * sizeof(float), __ATOMIC_SEQ_CST);
+  }
+}
+
+// Signature shared by every flag_sync_kernel instantiation.
+typedef void(*flag_sync_kernel_t)(struct transfer_data_t* transfer_data, struct profiling_data_t* profiling_data, uint64_t opCount);
+
+// Kernel table: two entries per op in Ops order, sync = 0 first, then sync = 1.
+// NOTE(review): presumably indexed as op*2 + sync by the launcher - confirm at
+// the call site.
+static flag_sync_kernel_t const flagSyncKerns[NUM_OPS*2] = {
+  flag_sync_kernel<OP_COPY, 0>,
+  flag_sync_kernel<OP_COPY, 1>,
+  flag_sync_kernel<OP_LOCALCOPY, 0>,
+  flag_sync_kernel<OP_LOCALCOPY, 1>,
+  flag_sync_kernel<OP_DOUBLECOPY, 0>,
+  flag_sync_kernel<OP_DOUBLECOPY, 1>,
+  flag_sync_kernel<OP_REDUCE, 0>,
+  flag_sync_kernel<OP_REDUCE, 1>,
+  flag_sync_kernel<OP_REDUCECOPY, 0>,
+  flag_sync_kernel<OP_REDUCECOPY, 1>,
+};
+
+// Fills data[0..N) with a deterministic, finite test pattern that depends on
+// the element index and on gpu: data[i] = 1 / (gpu*17 + i%77 + 1).
+//   data - device buffer of at least N floats
+//   N    - number of floats to initialize
+//   gpu  - seed that offsets the pattern (expected to be >= 0)
+// Works for any grid size: each thread starts at its global index and
+// advances by the total number of launched threads.
+__global__ void initTestDataKernel(float* data, const size_t N, const int gpu) {
+  // size_t index and stride: an int index would be compared against the
+  // unsigned N and could overflow for large buffers.
+  const size_t stride = (size_t)blockDim.x * gridDim.x;
+  size_t tid = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
+  while (tid < N) {
+    // The +1 keeps the denominator non-zero; without it gpu == 0 together with
+    // tid % 77 == 0 produced 1/0 = inf in the test data.
+    data[tid] = 1.0f/(float)(gpu*17 + (int)(tid%77) + 1);
+    tid += stride;
+  }
+}
+
+// Evaluates a HIP call once; on any result other than hipSuccess, prints the
+// numeric error code with the source line/file to stderr and exits with -1.
+// Wrapped in do { } while (0) so it can be used as a single statement.
+#define HIPCHECK(cmd)                                                          \
+do {                                                                           \
+  hipError_t error = (cmd);                                                    \
+  if (error != hipSuccess)                                                     \
+  {                                                                            \
+    std::cerr << "Encountered HIP error (" << error << ") at line "            \
+              << __LINE__ << " in file " << __FILE__ << "\n";                  \
+    exit(-1);                                                                  \
+  }                                                                            \
+} while (0)
+
+// Enables peer access between every ordered pair of visible GPUs and records
+// the hop count between them.
+//   info - output matrix of deviceCnt x deviceCnt hop counts, row-major
+//          (info[i*deviceCnt+j] is the hop count from GPU i to GPU j, 0 on the
+//          diagonal). The caller provides room for MAX_GPU*MAX_GPU entries.
+// Exits the process when more than MAX_GPU devices are visible, when a pair
+// cannot access each other, or when a HIP call fails. The current device is
+// restored before returning.
+static void setupPeers(uint32_t *info) {
+  int deviceCnt, dev;
+
+  HIPCHECK(hipGetDeviceCount(&deviceCnt));
+  // info (and the ring tables built from it) are sized for MAX_GPU devices
+  if (deviceCnt > MAX_GPU) {
+    printf("Found %d devices but at most %d are supported. You may use HIP_VISIBLE_DEVICES to limit GPUs.\n",
+     deviceCnt, MAX_GPU);
+    exit(-1);
+  }
+  HIPCHECK(hipGetDevice(&dev));
+  //! If gpus are not peer enabled, enable them
+  for (int i = 0; i < deviceCnt; i++) {
+    HIPCHECK(hipSetDevice(i));
+    for (int j = 0; j < deviceCnt; j++) {
+      if (i == j) {
+        info[i*deviceCnt+j] = 0;
+        continue;
+      }
+      int p2p;
+      HIPCHECK(hipDeviceCanAccessPeer(&p2p, i, j));
+      if (!p2p) {
+        printf("Cannot enable peer access between device %d and %d. You may use HIP_VISIBLE_DEVICES to limit GPUs.\n",
+         i, j);
+        exit(-1);
+      }
+      // Peer access that is already enabled is not an error for our purposes
+      hipError_t peerStatus = hipDeviceEnablePeerAccess(j, 0);
+      if (peerStatus == hipErrorPeerAccessAlreadyEnabled)
+        (void)hipGetLastError();  // clear the recorded error
+      else
+        HIPCHECK(peerStatus);
+      uint32_t linktype;
+      HIPCHECK(hipExtGetLinkTypeAndHopCount(i, j, &linktype, &info[i*deviceCnt+j]));
+    }
+  }
+  HIPCHECK(hipSetDevice(dev));
+}
+
+// Prints "Ring <id>: " followed by the first deviceCnt entries of ring on one line.
+static void printRing(int id, int *ring, int deviceCnt) {
+  printf("Ring %d: ", id);
+  const int *cursor = ring;
+  const int *const stop = ring + deviceCnt;
+  while (cursor < stop) {
+    printf("%1d ", *cursor);
+    ++cursor;
+  }
+  printf("\n");
+}
+
+// Builds a ring order greedily: starting at GPU 0, repeatedly move to the
+// not-yet-visited GPU with the fewest hops from the current one.
+//   info      - deviceCnt x deviceCnt hop-count matrix, row-major
+//   ring      - output, receives deviceCnt GPU indices in visiting order
+//   deviceCnt - number of GPUs (the scratch table holds at most MAX_GPU)
+static void findConnect(uint32_t *info, int *ring, int deviceCnt) {
+  // best starts equal to curr so that it is never read uninitialized when no
+  // candidate is found (e.g. with a single device).
+  int n = 0, curr = 0, best = 0;
+  // temp[r*deviceCnt+c] != 0 means GPU c has already been placed in the ring
+  uint32_t temp[MAX_GPU*MAX_GPU];
+  for (int i = 0; i < deviceCnt*deviceCnt; i++) temp[i] = 0;
+  for (int i = 0; i < deviceCnt; i++) {
+    // Mark curr as visited in every row
+    for (int j = 0; j < deviceCnt; j++) temp[j*deviceCnt+curr] = 1;
+    ring[n] = curr;
+    n++;
+    // Only links with fewer than 99 hops are considered
+    uint32_t hops = 99;
+    for (int j = 0; j < deviceCnt; j++) {
+      if (temp[curr*deviceCnt+j]) continue;
+      if (info[curr*deviceCnt+j] < hops) {
+        best = j;
+        hops = info[curr*deviceCnt+j];
+      }
+    }
+    curr = best;
+  }
+}
+
+// Returns the ring entry that follows gpu, wrapping around at the end.
+// When gpu is not in the ring, the search index ends at deviceCnt and the
+// entry at (deviceCnt+1) % deviceCnt is returned.
+static int findNextGpu(int *ring, int gpu, int deviceCnt) {
+  int pos = 0;
+  while (pos < deviceCnt && ring[pos] != gpu)
+    pos++;
+  return ring[(pos+1)%deviceCnt];
+}
+
+// Prints the deviceCnt x deviceCnt connection matrix held in info, then builds
+// and prints two rings:
+//   ring_0 - the order produced by findConnect()
+//   ring_1 - GPU 0 followed by the remaining ring_0 entries back to front,
+//            i.e. the same ring walked in the opposite direction
+static void setupRings(uint32_t *info, int *ring_0, int *ring_1) {
+  int deviceCnt;
+  HIPCHECK(hipGetDeviceCount(&deviceCnt));
+  printf("Connection matrix:\n");
+  for (int i = 0; i < deviceCnt; i++) {
+    for (int j = 0; j < deviceCnt; j++)
+      printf("%2u ", info[i*deviceCnt+j]);  // entries are uint32_t, so print as unsigned
+    printf("\n");
+  }
+  findConnect(info, ring_0, deviceCnt);
+  printRing(0, ring_0, deviceCnt);
+  ring_1[0] =0;
+  for (int i = 1; i < deviceCnt; i++)
+    ring_1[i] = ring_0[deviceCnt-i];
+  printRing(1, ring_1, deviceCnt);
+}
+
+// Returns the argument that follows `option` in [begin, end), or a null
+// pointer when the option is absent or is the last argument.
+char* getCmdOption(char ** begin, char ** end, const std::string & option) {
+    char ** match = std::find(begin, end, option);
+    if (match == end)
+        return nullptr;
+    ++match;
+    if (match == end)
+        return nullptr;
+    return *match;
+}
+
+// True when `option` appears anywhere in [begin, end).
+bool cmdOptionExists(char** begin, char** end, const std::string& option) {
+    char** const hit = std::find(begin, end, option);
+    return hit != end;
+}
+
+
+static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
+
+
+int main(int argc,char* argv[])
+{
+  if (cmdOptionExists(argv, argv + argc, "-h")) {
+    printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|reduce|reducecopy|all -i iterations -n bytes -s 0|1\n");
+    exit(0);
+  }
+
+  int workgroups = 1;
+  char *wg = getCmdOption(argv, argv + argc, "-w");
+  if (wg)
+    workgroups = atol(wg);
+  printf("Benchmarking using %d workgroups\n", workgroups);
+
+  int iters = 10;
+  char *it = getCmdOption(argv, argv + argc, "-i");
+  if (it)
+    iters = atol(it);
+  printf("Benchmarking using %d iterations\n", iters);
+
+  uint64_t nBytes = 2097152;
+  char *nb = getCmdOption(argv, argv + argc, "-n");
+  if (nb)
+    nBytes = atol(nb);
+  printf("Benchmarking using %ld bytes\n", nBytes);
+  uint64_t N = nBytes/sizeof(float);
+
+  int sync = 0;
+  char *s = getCmdOption(argv, argv + argc, "-s");
+  if (s)
+    sync = atol(s);
+  if (sync) printf("Sync all GPUs before operation\n");
+
+  const char *ops[] = {"copy", "localcopy", "doublecopy", "reduce", "reducecopy", "all"};
+  char *prim = getCmdOption(argv, argv + argc, "-p");
+  int op = 5, begin_op, end_op;
+  if (prim) {
+    for (op = 0; op < sizeof(ops); op++)
+      if (!strcmp((const char *)prim, ops[op]))
+        break;
+  }
+  if (op < NUM_OPS ) {
+    begin_op = op;
+    end_op = op + 1;
+  } else {
+    begin_op = 0;
+    end_op = NUM_OPS;
+    printf("Benchmarking all ops\n");
+  }
+
+  uint32_t connection_info[MAX_GPU*MAX_GPU];
+  // Enable peer access
+  setupPeers(connection_info);
+  // clockwise and counter clockwise rings
+  int ring_0[MAX_GPU] = {-1, -1, -1, -1};
+  int ring_1[MAX_GPU] = {-1, -1, -1, -1};
+  setupRings(connection_info, ring_0, ring_1);
+
+  // data buffers
+  float *buff[MAX_GPU*MAX_WORKGROUPS], *buff_coarse[MAX_GPU*MAX_WORKGROUPS];
+  struct transfer_data_t h_transfer_data[MAX_GPU], *transfer_data[MAX_GPU];
+  struct profiling_data_t *profiling_data[MAX_GPU], *d_profiling_data[MAX_GPU];
+  hipStream_t stream[MAX_GPU];
+
+  int nGpu = 1;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  uint64_t *remOpCount, *d_remOpCount;
+  HIPCHECK(hipHostMalloc((void**)&remOpCount, sizeof(uint64_t)*MAX_GPU, hipHostMallocMapped));
+  HIPCHECK(hipHostGetDevicePointer((void**)&d_remOpCount, (void*)remOpCount, 0));
+
+
+  for (int i = 0; i < nGpu; i ++) {
+    HIPCHECK(hipSetDevice(i));
+    hipDeviceProp_t prop;
+    HIPCHECK(hipGetDeviceProperties(&prop, i));
+    printf("#   device %d [0x%02x] %s\n",
+                    i, prop.pciBusID, prop.name);
+    //create stream
+    HIPCHECK(hipStreamCreate(&stream[i]));
+    profiling_data[i] = (struct profiling_data_t *)malloc(sizeof(struct profiling_data_t));
+    HIPCHECK(hipMalloc((void**) &d_profiling_data[i], sizeof(struct profiling_data_t)));
+
+    HIPCHECK(hipExtMallocWithFlags((void**) &transfer_data[i], sizeof(struct transfer_data_t), hipDeviceMallocFinegrained));
+    for (int j = 0; j < workgroups; j++) {
+      HIPCHECK(hipExtMallocWithFlags((void**) &buff[i*MAX_WORKGROUPS+j], 2*N*sizeof(float), hipDeviceMallocFinegrained));
+      HIPCHECK(hipMalloc((void**) &buff_coarse[i*MAX_WORKGROUPS+j], 2*N*sizeof(float)));
+      //randomize test data
+      hipLaunchKernelGGL(initTestDataKernel,
+          /*grid dim x,y,z*/        dim3(32, 1, 1),
+          /*block dim x,y,z*/       dim3(THREADS, 1, 1),
+          /*dynamic shared mem*/    0,
+          /*stream*/                stream[i],
+          /*kernel args*/           buff[i*MAX_WORKGROUPS+j], 2*N, 0);
+      hipLaunchKernelGGL(initTestDataKernel,
+          /*grid dim x,y,z*/        dim3(32, 1, 1),
+          /*block dim x,y,z*/       dim3(THREADS, 1, 1),
+          /*dynamic shared mem*/    0,
+          /*stream*/                stream[i],
+          /*kernel args*/           buff_coarse[i*MAX_WORKGROUPS+j], 2*N, 0);
+    }
+  }
+
+  for (int i = 0; i < nGpu; i ++) {
+    for (int j = 0; j < workgroups; j++) {
+      int next_gpu;
+      if (j%2)
+        next_gpu = findNextGpu(ring_1, i, nGpu);
+      else
+        next_gpu = findNextGpu(ring_0, i, nGpu);
+      //printf("GPU %d Ring %d -> Next GPU %d\n", i, j, next_gpu);
+      h_transfer_data[i].dest0[j] = buff[next_gpu*MAX_WORKGROUPS+j] + N;
+      h_transfer_data[i].dest1[j] = buff_coarse[i*MAX_WORKGROUPS+j] + N;
+      h_transfer_data[i].src0[j] = buff[i*MAX_WORKGROUPS+j];
+      h_transfer_data[i].src1[j] = buff_coarse[i*MAX_WORKGROUPS+j];
+    }
+    h_transfer_data[i].N = N;
+    h_transfer_data[i].gpu = i;
+    h_transfer_data[i].ngpu = nGpu;
+    h_transfer_data[i].remOpCount = d_remOpCount;
+  }
+
+  for (int i = 0; i < nGpu; i ++) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyAsync(transfer_data[i], &h_transfer_data[i],
+                            sizeof(struct transfer_data_t), hipMemcpyHostToDevice,
+                            stream[i]));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+  }
+
+  uint64_t opCount = 0;
+  for (int op = begin_op; op < end_op; op ++) {
+    const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "Reduce", "ReduceCopy"};
+    printf("[Testing %s]: \n", OpsName[op]);
+    // 2 warm up cycles
+    for (int i = 0; i < 2; i ++) {
+      for (int i = 0; i < nGpu; i ++) {
+        HIPCHECK(hipSetDevice(i));
+        //launch the kernel
+        hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
+            /*grid dim x,y,z*/        dim3(workgroups, 1, 1),
+            /*block dim x,y,z*/       dim3(THREADS, 1, 1),
+            /*dynamic shared mem*/    0,
+            /*stream*/                stream[i],
+            /*kernel args*/           transfer_data[i], d_profiling_data[i], opCount);
+      }
+      opCount++;
+    }
+
+    for (int i = 0; i < nGpu; i ++) {
+      HIPCHECK(hipSetDevice(i));
+      HIPCHECK(hipStreamSynchronize(stream[i]));
+      HIPCHECK(hipMemset(d_profiling_data[i], 0, sizeof(struct profiling_data_t)));
+    }
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < iters; i ++) {
+      for (int i = 0; i < nGpu; i ++) {
+        HIPCHECK(hipSetDevice(i));
+        //launch the kernel
+        hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
+            /*grid dim x,y,z*/        dim3(workgroups, 1, 1),
+            /*block dim x,y,z*/       dim3(THREADS, 1, 1),
+            /*dynamic shared mem*/    0,
+            /*stream*/                stream[i],
+            /*kernel args*/           transfer_data[i], d_profiling_data[i], opCount);
+      }
+      opCount++;
+    }
+
+    for (int i = 0; i < nGpu; i ++) {
+      HIPCHECK(hipSetDevice(i));
+      HIPCHECK(hipStreamSynchronize(stream[i]));
+    }
+
+    auto delta = std::chrono::high_resolution_clock::now() - start;
+    double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
+
+    std::cout<<"***GPU to GPU Transfer Profiling Data***"<<std::endl;
+    for (int i = 0; i < nGpu; i ++) {
+      HIPCHECK(hipMemcpyAsync(profiling_data[i], d_profiling_data[i],
+                              sizeof(struct profiling_data_t), hipMemcpyDeviceToHost,
+                              stream[i]));
+      HIPCHECK(hipStreamSynchronize(stream[i]));
+#define RTC_CLOCK_FREQ 2.7E07
+      int next_gpu = findNextGpu(ring_0, i, nGpu);
+      uint32_t linktype;
+      uint32_t hopcount;
+      HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu , &linktype, &hopcount));
+
+      double t0 = (double)profiling_data[i]->write_cycles/((double)RTC_CLOCK_FREQ)/(double)workgroups;
+      fprintf(stderr, "[GPU %d -> GPU %d][%s]:time %.4fs bytes_transferred %lu kernel throughput %.2f GB/s\n",
+        i, next_gpu,link_type_name[linktype],t0, profiling_data[i]->bytes_transferred, (double)profiling_data[i]->bytes_transferred/(t0*1.0E9));
+    }
+    std::cout<<"***Application Level Transfer Profiling Data***"<<std::endl;
+    double speed = (double)(profiling_data[0]->bytes_transferred) / (deltaSec*1.0E9);
+    printf("Transfered %lu bytes in %f s. Throughput %f GB/s\n", profiling_data[0]->bytes_transferred, deltaSec, speed);
+  }
+
+  for (int i = 0; i < nGpu; i ++) {
+    HIPCHECK(hipStreamDestroy(stream[i]));
+    HIPCHECK(hipFree((void*) transfer_data[i]));
+    for (int j = 0; j < workgroups; j++) {
+      HIPCHECK(hipFree((void*) buff[i*MAX_WORKGROUPS+j]));
+      HIPCHECK(hipFree((void*) buff_coarse[i*MAX_WORKGROUPS+j]));
+    }
+    HIPCHECK(hipFree((void*) d_profiling_data[i]));
+    free(profiling_data[i]);
+  }
+
+  printf("opCount: ");
+  for (int i = 0; i < nGpu; i++)
+    printf("%ld ", remOpCount[i]);
+  printf("\n");
+  HIPCHECK(hipHostFree((void*)remOpCount));
+}