diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
new file mode 100644
index 0000000000..bc0b59fb5d
--- /dev/null
+++ b/projects/rccl/CMakeLists.txt
@@ -0,0 +1,180 @@
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+cmake_minimum_required(VERSION 2.8.12)
+
+set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+
+project(rccl CXX)
+
+find_package(ROCM
+             REQUIRED
+             PATHS
+             /opt/rocm)
+
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
+
+option(BUILD_TESTS "Build test programs" ON)
+
+# In order to support function calls within the kernel, we must use hcc as the
+# compiler
+set(CMAKE_CXX_COMPILER "/opt/rocm/bin/hcc")
+
+# parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
+# NCCL_SUFFIX is optional NCCL_VERSION formatting is ((X) * 1000 + (Y) * 100 +
+# (Z)) so we must first detect one or two digits first
+file(READ makefiles/version.mk version_mk_text)
+if("${version_mk_text}" MATCHES "NCCL_MAJOR *:= *([0-9]*)")
+  set(NCCL_MAJOR ${CMAKE_MATCH_1})
+else()
+  message(FATAL_ERROR "Failed to parse NCCL_MAJOR")
+endif()
+if("${version_mk_text}" MATCHES "NCCL_MINOR *:= *([0-9]*)")
+  set(NCCL_MINOR ${CMAKE_MATCH_1})
+else()
+  message(FATAL_ERROR "Failed to parse NCCL_MINOR")
+endif()
+if("${version_mk_text}" MATCHES "NCCL_PATCH *:= *([0-9]*)")
+  set(NCCL_PATCH ${CMAKE_MATCH_1})
+else()
+  message(FATAL_ERROR "Failed to parse NCCL_PATCH")
+endif()
+if("${version_mk_text}" MATCHES "NCCL_SUFFIX *:= *([0-9]*)")
+  set(NCCL_SUFFIX ${CMAKE_MATCH_1})
+else()
+  set(NCCL_SUFFIX)
+endif()
+if("${version_mk_text}" MATCHES "PKG_REVISION *:= *([0-9]*)")
+  set(PKG_REVISION ${CMAKE_MATCH_1})
+else()
+  message(FATAL_ERROR "Failed to parse PKG_REVISION")
+endif()
+if("${NCCL_PATCH}" MATCHES "[0-9][0-9]")
+  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}${NCCL_PATCH}")
+else()
+  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}0${NCCL_PATCH}")
+endif()
+
+rocm_setup_version(VERSION
+                   "${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}-${PKG_REVISION}")
+
+list(APPEND CMAKE_PREFIX_PATH
+            /opt/rocm
+            /opt/rocm/hip
+            /opt/rocm/hcc)
+
+find_package(hip REQUIRED)
+
+link_libraries(-amdgpu-target=gfx803
+               -amdgpu-target=gfx900
+               -amdgpu-target=gfx906
+               -hc-function-calls)
+
+option(BUILD_SHARED_LIBS "Build as a shared library" ON)
+
+configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/rccl.h)
+configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/nccl.h)
+
+include_directories(${PROJECT_BINARY_DIR}) # for generated rccl.h header
+include_directories(src)
+include_directories(src/include)
+include_directories(src/collectives)
+include_directories(src/collectives/device)
+
+set(CU_SOURCES
+    src/bootstrap.cu
+    src/collectives/all_gather.cu
+    src/collectives/all_reduce.cu
+    src/collectives/broadcast.cu
+    src/collectives/reduce.cu
+    src/collectives/reduce_scatter.cu
+    src/collectives/device/functions.cu
+    src/init.cu
+    src/misc/enqueue.cu
+    src/misc/group.cu
+    src/misc/ibvwrap.cu
+    src/misc/nvmlwrap_stub.cu
+    src/misc/rings.cu
+    src/misc/utils.cu
+    src/ring.cu
+    src/transport.cu
+    src/transport/net.cu
+    src/transport/net_ib.cu
+    src/transport/net_socket.cu
+    src/transport/p2p.cu
+    src/transport/shm.cu)
+
+set(CPP_SOURCES)
+foreach(filename ${CU_SOURCES})
+  string(REPLACE ".cu"
+                 ".cpp"
+                 cpp_filename
+                 ${filename})
+  configure_file(${filename} ${cpp_filename} COPYONLY)
+  list(APPEND CPP_SOURCES ${cpp_filename})
+endforeach(filename)
+
+list(APPEND CPP_SOURCES src/collectives/device/all_gather_0.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/all_reduce_0.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/all_reduce_1.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/all_reduce_2.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/all_reduce_3.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/broadcast_0.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_0.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_1.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_2.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_3.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_0.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_1.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_2.cpp)
+list(APPEND CPP_SOURCES src/collectives/device/reduce_scatter_3.cpp)
+
+add_library(rccl ${CPP_SOURCES})
+
+if(TRACE)
+  add_definitions(-DENABLE_TRACE)
+endif()
+
+if(TARGET hip::device)
+  target_link_libraries(rccl PRIVATE hip::device)
+  target_link_libraries(rccl INTERFACE hip::host)
+else()
+  target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
+endif()
+
+rocm_install_targets(TARGETS
+                     rccl
+                     PREFIX
+                     rccl)
+install(FILES ${PROJECT_BINARY_DIR}/rccl.h
+        DESTINATION rccl/${CMAKE_INSTALL_INCLUDEDIR})
+
+rocm_export_targets(NAMESPACE
+                    roc::
+                    PREFIX
+                    rccl
+                    TARGETS
+                    rccl
+                    DEPENDS
+                    hip)
+
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
+
+rocm_create_package(
+  NAME
+  rccl
+  DESCRIPTION
+  "Optimized primitives for collective multi-GPU communication"
+  MAINTAINER
+  "Jeff Daily <jeff.daily@amd.com>"
+  LDCONFIG)
+
+rocm_install_symlink_subdir(rccl)
+
+if(BUILD_TESTS)
+  add_subdirectory(test)
+endif()
diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt
index 3593a7aa69..6b9c6a3138 100644
--- a/projects/rccl/LICENSE.txt
+++ b/projects/rccl/LICENSE.txt
@@ -1,5 +1,6 @@
 
  Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
diff --git a/projects/rccl/NOTICES.txt b/projects/rccl/NOTICES.txt
new file mode 100644
index 0000000000..1b9bcc8eec
--- /dev/null
+++ b/projects/rccl/NOTICES.txt
@@ -0,0 +1,66 @@
+Notices and Licenses file
+_______________________________________________________________
+
+Dependencies on nvidia-nccl v2.3.7-1 (BSD3)
+Copyright (c) 2015-2018, NVIDIA CORPORATION.
+Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+
+nvidia-nccl v2.3.7-1 (BSD2)
+Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+   Laboratory, the U.S. Department of Energy, nor the names of their
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The U.S. Department of Energy funded the development of this software
+under subcontract 7078610 with Lawrence Berkeley National Laboratory.
\ No newline at end of file
diff --git a/projects/rccl/README.md b/projects/rccl/README.md
index fa5145323b..fc3b738227 100644
--- a/projects/rccl/README.md
+++ b/projects/rccl/README.md
@@ -1,12 +1,10 @@
-# NCCL
+# RCCL
 
-Optimized primitives for collective multi-GPU communication.
+ROCm Communication Collectives Library
 
 ## Introduction
 
-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
-
-For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
+RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
 
 ## What's inside
 
@@ -22,71 +20,62 @@ These operations are implemented using ring algorithms and have been optimized f
 
 ## Requirements
 
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
+1. ROCm supported GPUs
+2. ROCm stack installed on the system (HIP runtime & HCC)
 
-## Build
+## Quickstart RCCL Build
 
-Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
+RCCL directly depends on HIP runtime & HCC C++ compiler which are part of the ROCm software stack.
+In addition, HC Direct Function call support needs to be present on your machine.  There are binaries for hcc and HIP that need to be installed to get HC Direct Function call support.  These binaries are currently packaged with roc-master, and will be included in ROCm 2.4.
 
-To build the library :
+The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command.  It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install.
+
+*  `./install.sh` -- builds library including unit tests
+*  `./install.sh -h` -- shows help
+*  `./install.sh -t` -- builds library including unit tests, and also runs unit tests
+
+## Manual build
+#### To build the library :
 
 ```shell
-$ cd nccl
-$ make -j src.build
+$ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
+$ cd rccl
+$ mkdir build
+$ cd build
+$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+$ make -j 8 install
 ```
+You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX.
 
-If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :
+#### To build the RCCL package and install package :
+
+Assuming you have already cloned this repository and built the library as shown in the previous section:
 
 ```shell
-$ make src.build CUDA_HOME=<path to cuda install>
+$ cd rccl/build
+$ make package
+$ sudo dpkg -i *.deb
 ```
 
-NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
-
-By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
-```shell
-$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
-```
-
-## Install
-
-To install NCCL on the system, create a package then install it as root.
-
-Debian/Ubuntu :
-```shell
-$ # Install tools to create debian packages
-$ sudo apt install build-essential devscripts debhelper
-$ # Build NCCL deb package
-$ make pkg.debian.build
-$ ls build/pkg/deb/
-```
-
-RedHat/CentOS :
-```shell
-$ # Install tools to create rpm packages
-$ sudo yum install rpm-build rpmdevtools
-$ # Build NCCL rpm package
-$ make pkg.redhat.build
-$ ls build/pkg/rpm/
-```
-
-OS-agnostic tarball :
-```shell
-$ make pkg.txz.build
-$ ls build/pkg/txz/
-```
+RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so.
 
 ## Tests
 
-Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
+There are unit tests implemented with the Googletest framework in RCCL, which are currently a work-in-progress.  To invoke the unit tests, go to the rccl-install folder, then the test/ subfolder, and execute the appropriate unit test executable(s). Several notes for running the unit tests:
 
+1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
+2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests.
+
+An example call to the unit tests:
 ```shell
-$ git clone https://github.com/NVIDIA/nccl-tests.git
-$ cd nccl-tests
-$ make
-$ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
+$ LD_LIBRARY_PATH=rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 rccl-install/test/UnitTests
 ```
 
+There are also other performance and error-checking tests for RCCL.  These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests.
+See the rccl-tests README for more information on how to build and run those tests.
+
 ## Copyright
 
 All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
diff --git a/projects/rccl/hipify.sh b/projects/rccl/hipify.sh
new file mode 100755
index 0000000000..e389fb519e
--- /dev/null
+++ b/projects/rccl/hipify.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+FILES="
+./src/nccl.h.in
+./src/bootstrap.cu
+./src/collectives/all_gather.cu
+./src/collectives/all_reduce.cu
+./src/collectives/broadcast.cu
+./src/collectives/collectives.h
+./src/collectives/device/all_gather.cu
+./src/collectives/device/all_gather.h
+./src/collectives/device/all_reduce.cu
+./src/collectives/device/all_reduce.h
+./src/collectives/device/broadcast.cu
+./src/collectives/device/broadcast.h
+./src/collectives/device/common.h
+./src/collectives/device/common_kernel.h
+./src/collectives/device/functions.cu
+./src/collectives/device/ll_kernel.h
+./src/collectives/device/primitives.h
+./src/collectives/device/reduce.cu
+./src/collectives/device/reduce.h
+./src/collectives/device/reduce_kernel.h
+./src/collectives/device/reduce_scatter.cu
+./src/collectives/device/reduce_scatter.h
+./src/collectives/reduce.cu
+./src/collectives/reduce_scatter.cu
+./src/include/bootstrap.h
+./src/include/common_coll.h
+./src/include/core.h
+./src/include/debug.h
+./src/include/enqueue.h
+./src/include/group.h
+./src/include/ibvwrap.h
+./src/include/nccl_net.h
+./src/include/net.h
+./src/include/nvlink.h
+./src/include/nvmlwrap.h
+./src/include/param.h
+./src/include/ring.h
+./src/include/rings.h
+./src/include/shm.h
+./src/include/socket.h
+./src/include/topo.h
+./src/include/transport.h
+./src/include/utils.h
+./src/init.cu
+./src/misc/enqueue.cu
+./src/misc/group.cu
+./src/misc/ibvwrap.cu
+./src/misc/nvmlwrap.cu
+./src/misc/rings.cu
+./src/misc/utils.cu
+./src/ring.cu
+./src/transport.cu
+./src/transport/net.cu
+./src/transport/net_ib.cu
+./src/transport/net_socket.cu
+./src/transport/p2p.cu
+./src/transport/shm.cu
+"
+
+for f in $FILES
+do
+    sed -i \
+        -e 's@cuda_runtime.h@hip/hip_runtime_api.h@g' \
+        -e 's@cuda_fp16.h@hip/hip_fp16.h@g' \
+        -e 's/cudaDeviceCanAccessPeer/hipDeviceCanAccessPeer/g' \
+        -e 's/cudaDeviceEnablePeerAccess/hipDeviceEnablePeerAccess/g' \
+        -e 's/cudaDeviceGetPCIBusId/hipDeviceGetPCIBusId/g' \
+        -e 's/cudaErrorPeerAccessAlreadyEnabled/hipErrorPeerAccessAlreadyEnabled/g' \
+        -e 's/cudaError_t/hipError_t/g' \
+        -e 's/cudaEventCreateWithFlags/hipEventCreateWithFlags/g' \
+        -e 's/cudaEventDestroy/hipEventDestroy/g' \
+        -e 's/cudaEventDisableTiming/hipEventDisableTiming/g' \
+        -e 's/cudaEventRecord/hipEventRecord/g' \
+        -e 's/cudaEvent_t/hipEvent_t/g' \
+        -e 's/cudaFree/hipFree/g' \
+        -e 's/cudaFreeHost/hipHostFree/g' \
+        -e 's/cudaGetDevice/hipGetDevice/g' \
+        -e 's/cudaGetErrorString/hipGetErrorString/g' \
+        -e 's/cudaGetLastError/hipGetLastError/g' \
+        -e 's/cudaHostAlloc/hipHostMalloc/g' \
+        -e 's/cudaHostAllocMapped/hipHostMallocMapped/g' \
+        -e 's/cudaHostGetDevicePointer/hipHostGetDevicePointer/g' \
+        -e 's/cudaHostRegister/hipHostRegister/g' \
+        -e 's/cudaHostRegisterMapped/hipHostRegisterMapped/g' \
+        -e 's/cudaHostUnregister/hipHostUnregister/g' \
+        -e 's/cudaIpcCloseMemHandle/hipIpcCloseMemHandle/g' \
+        -e 's/cudaIpcGetMemHandle/hipIpcGetMemHandle/g' \
+        -e 's/cudaIpcMemHandle_t/hipIpcMemHandle_t/g' \
+        -e 's/cudaIpcMemLazyEnablePeerAccess/hipIpcMemLazyEnablePeerAccess/g' \
+        -e 's/cudaIpcOpenMemHandle/hipIpcOpenMemHandle/g' \
+        -e 's/cudaMalloc/hipMalloc/g' \
+        -e 's/cudaMemcpy/hipMemcpy/g' \
+        -e 's/cudaMemcpyAsync/hipMemcpyAsync/g' \
+        -e 's/cudaMemcpyDefault/hipMemcpyDefault/g' \
+        -e 's/cudaMemcpyDeviceToDevice/hipMemcpyDeviceToDevice/g' \
+        -e 's/cudaMemoryTypeDevice/hipMemoryTypeDevice/g' \
+        -e 's/cudaMemset/hipMemset/g' \
+        -e 's/cudaPointerAttributes/hipPointerAttribute_t/g' \
+        -e 's/cudaPointerGetAttributes/hipPointerGetAttributes/g' \
+        -e 's/cudaSetDevice/hipSetDevice/g' \
+        -e 's/cudaStreamCreateWithFlags/hipStreamCreateWithFlags/g' \
+        -e 's/cudaStreamDestroy/hipStreamDestroy/g' \
+        -e 's/cudaStreamNonBlocking/hipStreamNonBlocking/g' \
+        -e 's/cudaStreamWaitEvent/hipStreamWaitEvent/g' \
+        -e 's/cudaStream_t/hipStream_t/g' \
+        -e 's/cudaSuccess/hipSuccess/g' \
+        $f
+done
diff --git a/projects/rccl/install.sh b/projects/rccl/install.sh
new file mode 100755
index 0000000000..0069c5bce5
--- /dev/null
+++ b/projects/rccl/install.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+export RCCL_DIR=$PWD/rccl-internal
+export RCCL_INSTALL=$PWD/rccl-install
+export ROCM_PATH=/opt/rocm/bin
+
+# #################################################
+# helper functions
+# #################################################
+function display_help()
+{
+    echo "RCCL build & installation helper script"
+    echo "./install [-h|--help] "
+    echo "    [-h|--help] prints this help message"
+    echo "    [-t|--test] run RCCL unit tests too"
+}
+
+# #################################################
+# Parameter parsing
+# #################################################
+
+# check if we have a modern version of getopt that can handle whitespace and long parameters
+getopt -T
+if [[ $? -eq 4 ]]; then
+    GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,install,clients,debug,test --options hicdt -- "$@")
+else
+    echo "Need a new version of getopt"
+    exit 1
+fi
+
+if [[ $? -ne 0 ]]; then
+    echo "getopt invocation failed; could not parse the command line";
+    exit 1
+fi
+
+eval set -- "${GETOPT_PARSE}"
+
+run_tests=false
+
+while true; do
+    case "${1}" in
+	-h|--help)
+	    display_help
+	    exit 0
+	    ;;
+	-t|--test)
+	    run_tests=true
+	    shift ;;
+	--) shift ; break ;;
+	*)  echo "Unexpected command line parameter received; aborting";
+	    exit 1
+	    ;;
+    esac
+    done
+
+# Install the pre-commit hook
+#bash ./githooks/install
+
+rm -rf build
+mkdir build
+cd build
+CXX=$ROCM_PATH/hcc cmake -DCMAKE_INSTALL_PREFIX=$RCCL_INSTALL ..
+make -j 8 install
+
+if ($run_tests); then
+# Optionally, run tests if they're enabled.
+HSA_FORCE_FINE_GRAIN_PCIE=1 $RCCL_INSTALL/test/UnitTests
+fi
diff --git a/projects/rccl/jenkinsfile b/projects/rccl/jenkinsfile
new file mode 100644
index 0000000000..655f9dc7e0
--- /dev/null
+++ b/projects/rccl/jenkinsfile
@@ -0,0 +1,89 @@
+#!/usr/bin/env groovy
+// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+// This shared library is available at https://github.com/ROCmSoftwarePlatform/rccl
+@Library('rocJenkins') _
+
+// This is file for internal AMD use.
+// If you are interested in running your own Jenkins, please raise a github issue for assistance.
+
+import com.amd.project.*
+import com.amd.docker.*
+
+////////////////////////////////////////////////////////////////////////
+// Mostly generated from snippet generator 'properties; set job properties'
+// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
+properties([
+    pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
+    buildDiscarder(logRotator(
+      artifactDaysToKeepStr: '',
+      artifactNumToKeepStr: '',
+      daysToKeepStr: '',
+      numToKeepStr: '10')),
+    disableConcurrentBuilds(),
+    [$class: 'CopyArtifactPermissionProperty', projectNames: '*']
+   ])
+
+
+////////////////////////////////////////////////////////////////////////
+import java.nio.file.Path;
+
+rcclCI:
+{
+
+    def rccl = new rocProject('rccl')
+    // customize for project
+    rccl.paths.build_command = './install.sh'
+
+    // Define test architectures, optional rocm version argument is available
+    def nodes = new dockerNodes(['gfx906'], rccl)
+
+    boolean formatCheck = false
+
+    def compileCommand =
+    {
+        platform, project->
+
+        project.paths.construct_build_prefix()
+        def command = """#!/usr/bin/env bash
+                  set -x
+                  cd ${project.paths.project_build_prefix}
+                  LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=${project.compiler.compiler_path} ${project.paths.build_command}
+                """
+
+        platform.runCommand(this, command)
+    }
+
+    def testCommand =
+    {
+        platform, project->
+
+        def command = """#!/usr/bin/env bash
+                set -x
+                cd ${project.paths.project_build_prefix}/rccl-install/test
+                ./UnitTest --gtest_output=xml --gtest_color=yes
+            """
+
+        platform.runCommand(this, command)
+        junit "${project.paths.project_build_prefix}/rccl-install/*.xml"
+    }
+
+    def packageCommand =
+    {
+        platform, project->
+
+        def command = """
+                      set -x
+                      cd ${project.paths.project_build_prefix}/build
+                      make package
+                      rm -rf package && mkdir -p package
+                      mv *.deb package/
+                      sudo dpkg -i package/*.deb
+                      """
+
+        platform.runCommand(this, command)
+        platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""")
+    }
+
+    buildProject(rccl, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
+
+}
\ No newline at end of file
diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cu
index 8dec28e63b..7ad36c777b 100644
--- a/projects/rccl/src/collectives/all_gather.cu
+++ b/projects/rccl/src/collectives/all_gather.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,12 +11,12 @@
 #include "collectives.h"
 
 ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
   INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
   } else {
     NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
     NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
@@ -24,9 +25,9 @@ ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t coun
 }
 
 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
   return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
           ncclSum, 0, comm, stream);
 }
diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cu
index cc14083ab7..234af2c898 100644
--- a/projects/rccl/src/collectives/all_reduce.cu
+++ b/projects/rccl/src/collectives/all_reduce.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,12 +11,12 @@
 #include "collectives.h"
 
 ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
   INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
   } else {
     NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
     NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
@@ -24,9 +25,9 @@ ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t coun
 }
 
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
   return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
           op, 0, comm, stream);
 }
diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cu
index 91ce905440..a2b65995f8 100644
--- a/projects/rccl/src/collectives/broadcast.cu
+++ b/projects/rccl/src/collectives/broadcast.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,12 +11,12 @@
 #include "collectives.h"
 
 ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
   INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
   } else {
     NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
     NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
@@ -26,17 +27,17 @@ ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_
 
 /* Deprecated original "in place" function, similar to MPI */
 NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
+    ncclComm_t comm, hipStream_t stream) {
   return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
           ncclSum, root, comm, stream);
 }
 
 NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
+    ncclComm_t comm, hipStream_t stream) {
   return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
           ncclSum, root, comm, stream);
 }
diff --git a/projects/rccl/src/collectives/collectives.h b/projects/rccl/src/collectives/collectives.h
index 4a5cb7a98d..5b2f0f13f4 100644
--- a/projects/rccl/src/collectives/collectives.h
+++ b/projects/rccl/src/collectives/collectives.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -19,7 +20,7 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
 
 /* Declare all collective operations */
 #define DECL_COLL4(coll, op, dtype) \
-  extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
+  extern __device__ __attribute__((noinline)) void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
   extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
 
 #define DECL_COLL3(coll, op, dtype) \
diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h
index a30e575570..677435071e 100644
--- a/projects/rccl/src/collectives/device/all_gather.h
+++ b/projects/rccl/src/collectives/device/all_gather.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,15 +17,16 @@
   if (noffset == buffSize) noffset = 0;
 
 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
   const int bid = args->bid;
   __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
   struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
+  int prevdirect = 0;
+  int nextdirect = 0;
 
   WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
@@ -41,7 +43,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
+    STORE(ring->recv.conn.opCount, args->opCount);
     // Wait for next to be ready
     WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
     waitOpCountNext.wait(args->opCount);
@@ -50,9 +52,9 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
     }
     if (nextdirect) {
       void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
+      while (LOAD(ptr) == nullptr);
+      sharedNextOutput = (T*)LOAD(ptr);
+      STORE(ptr, nullptr);
     }
   }
   __syncthreads();
@@ -158,10 +160,10 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
+    STORE(ring->send.conn.head, 0ULL);
+    STORE(ring->recv.conn.tail, 0ULL);
     __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    STORE(ring->recv.conn.opCount, args->opCount+1);
   }
 }
 
@@ -176,6 +178,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
   step++;
 
 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/all_gather_0.cpp b/projects/rccl/src/collectives/device/all_gather_0.cpp
new file mode 100644
index 0000000000..75f90ca5e6
--- /dev/null
+++ b/projects/rccl/src/collectives/device/all_gather_0.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 0
+#include "device/all_gather.cu"
diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h
index d7abc6445b..a323d4a70b 100644
--- a/projects/rccl/src/collectives/device/all_reduce.h
+++ b/projects/rccl/src/collectives/device/all_reduce.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,15 +17,16 @@
   if (noffset == buffSize) noffset = 0;
 
 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
   const int bid = args->bid;
   __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
   struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
+  int prevdirect = 0;
+  int nextdirect = 0;
 
   WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
@@ -42,7 +44,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
+    STORE(ring->recv.conn.opCount, args->opCount);
     // Wait for next to be ready
     WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
     waitOpCountNext.wait(args->opCount);
@@ -51,9 +53,9 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
     }
     if (nextdirect) {
       void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
+      while (LOAD(ptr) == nullptr);
+      sharedNextOutput = (T*)LOAD(ptr);
+      STORE(ptr, nullptr);
     }
   }
   __syncthreads();
@@ -189,10 +191,10 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
   if (tid == 0) {
     // Wait for next to have consumed all data before we reset the flag
     waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
+    STORE(ring->send.conn.head, 0ULL);
+    STORE(ring->recv.conn.tail, 0ULL);
     __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    STORE(ring->recv.conn.opCount, args->opCount+1);
   }
 }
 
@@ -207,6 +209,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
   step++;
 
 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/all_reduce_0.cpp b/projects/rccl/src/collectives/device/all_reduce_0.cpp
new file mode 100644
index 0000000000..235005af1a
--- /dev/null
+++ b/projects/rccl/src/collectives/device/all_reduce_0.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 0
+#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/all_reduce_1.cpp b/projects/rccl/src/collectives/device/all_reduce_1.cpp
new file mode 100644
index 0000000000..dda4b5d517
--- /dev/null
+++ b/projects/rccl/src/collectives/device/all_reduce_1.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 1
+#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/all_reduce_2.cpp b/projects/rccl/src/collectives/device/all_reduce_2.cpp
new file mode 100644
index 0000000000..745435b60f
--- /dev/null
+++ b/projects/rccl/src/collectives/device/all_reduce_2.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 2
+#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/all_reduce_3.cpp b/projects/rccl/src/collectives/device/all_reduce_3.cpp
new file mode 100644
index 0000000000..d7f45f03dd
--- /dev/null
+++ b/projects/rccl/src/collectives/device/all_reduce_3.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 3
+#include "device/all_reduce.cu"
diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h
index c2f6d001e1..0baef3f476 100644
--- a/projects/rccl/src/collectives/device/broadcast.h
+++ b/projects/rccl/src/collectives/device/broadcast.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -15,15 +16,16 @@
   if (boffset == buffSize) boffset = 0;
 
 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
   const int bid = args->bid;
   __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
   struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
+  int prevdirect = 0;
+  int nextdirect = 0;
 
   WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
@@ -42,7 +44,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
+    STORE(ring->recv.conn.opCount, args->opCount);
     if (nextRank != root) {
       // Wait for next to be ready
       WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
@@ -53,9 +55,9 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
     }
     if (nextRank != root && nextdirect) {
       void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
+      while (LOAD(ptr) == nullptr);
+      sharedNextOutput = (T*)LOAD(ptr);
+      STORE(ptr, nullptr);
     }
   }
   __syncthreads();
@@ -130,11 +132,11 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
     if (nextRank != root) {
       // Wait for next to have consumed data before resetting the flag
       waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
-      *ring->send.conn.head = 0ULL;
+      STORE(ring->send.conn.head, 0ULL);
     }
-    *ring->recv.conn.tail = 0ULL;
+    STORE(ring->recv.conn.tail, 0ULL);
     __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    STORE(ring->recv.conn.opCount, args->opCount+1);
   }
 }
 
@@ -147,6 +149,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
   step++;
 
 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/broadcast_0.cpp b/projects/rccl/src/collectives/device/broadcast_0.cpp
new file mode 100644
index 0000000000..75b75ad9cf
--- /dev/null
+++ b/projects/rccl/src/collectives/device/broadcast_0.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 0
+#include "device/broadcast.cu"
diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h
index c9889133eb..2f230c6b4b 100644
--- a/projects/rccl/src/collectives/device/common.h
+++ b/projects/rccl/src/collectives/device/common.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,12 +8,49 @@
 #ifndef NCCL_DEVICE_COMMON_H_
 #define NCCL_DEVICE_COMMON_H_
 
+#include <hip/hip_runtime.h>
+
 #include "../collectives.h"
 #include "core.h"
 #include "nccl.h"
 
+#include <type_traits>
+
 typedef void(*ncclKern_t)(struct CollectiveArgs* args);
-extern __device__ ncclKern_t ncclFuncs[];
+extern __device__ const ncclKern_t ncclFuncs[];
+
+template<unsigned short f, unsigned short l>
+struct Caller {
+  static
+  void call(ncclColl* const c) noexcept
+  {
+    constexpr unsigned short m = f + (l - f) / 2;
+
+    return (c->funcIndex < m) ? Caller<f, m>::call(c) : Caller<m, l>::call(c);
+  }
+};
+
+template<unsigned short f>
+struct Caller<f, f + 1>{
+  static
+  void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
+};
+
+inline
+__device__
+void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept
+{
+  if (c->funcIndex < 72) {
+    if (c->funcIndex % 2) ncclBroadcastLL_copy_i8(&c->args);
+    else ncclBroadcast_copy_i8(&c->args);
+  }
+  else if (c->funcIndex < 144) Caller<72, 144>::call(c);
+  else if (c->funcIndex < 216) {
+    if (c->funcIndex % 2) ncclAllGatherLL_copy_i8(&c->args);
+    else ncclAllGather_copy_i8(&c->args);
+  }
+  else Caller<216, 360>::call(c);
+}
 
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
   int* d = (int*)dst;
@@ -54,7 +92,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
       if (c->funcIndex == fIndex) { \
         coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
       } else { \
-        ncclFuncs[c->funcIndex](&c->args); \
+        NCCL_CALL_FUNCTIONS(c); \
       } \
     } \
     int nextIndex = c->nextIndex; \
diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h
index 0eaa0610d0..e8194bf4e3 100644
--- a/projects/rccl/src/collectives/device/common_kernel.h
+++ b/projects/rccl/src/collectives/device/common_kernel.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,13 +12,25 @@
 #include <cstdio>
 #include <cstdint>
 
-#include <cuda_runtime.h>
+#include <hip/hip_runtime_api.h>
 
 // Define min for ssize_t
 static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
 
 typedef uint64_t PackType;
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
+template<class FUNC, typename T>
+struct MULTI {
+    __device__ PackType operator()(const PackType x, const PackType y) const
+    {
+        return FUNC()(x, y);
+    }
+};
+
+#else
+
 // unpack x and y to elements of type T and apply FUNC to each element
 template<class FUNC, typename T>
 struct MULTI {
@@ -192,6 +205,8 @@ struct MULTI<FUNC, int64_t> {
   }
 };
 
+#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
 #define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
 
 template<typename T>
@@ -210,7 +225,7 @@ void vStore(volatile T* ptr, const T val) {
   *ptr = val;
 }
 
-#if CUDART_VERSION < 9000
+#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
 template<> inline __device__
 half vFetch<half>(const volatile half* ptr) {
   half r;
@@ -237,6 +252,7 @@ void vStore<half>(volatile half* ptr, const half val) {
 #endif
 
 template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
+__attribute__((noinline))
 __device__ inline void ReduceCopy(
     const int tid, const int nthreads,
     const volatile T * __restrict__ const src0,
@@ -266,14 +282,25 @@ struct MULTI128 {
 };
 
 inline __device__ void Fetch128(Pack128& v, Pack128* p) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  v.x = p->x;
+  v.y = p->y;
+#else
   asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
+#endif
 }
 inline __device__ void Store128(Pack128* p, Pack128& v) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  p->x = v.x;
+  p->y = v.y;
+#else
   asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
+#endif
 }
 
 #define WARP_SIZE 32
 template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
+__attribute__((noinline))
 __device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
     Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
     const int N) {
@@ -303,6 +330,7 @@ __device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
 }
 
 template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
+__attribute__((noinline))
 __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
     volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
     const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
diff --git a/projects/rccl/src/collectives/device/functions.cu b/projects/rccl/src/collectives/device/functions.cu
index 1fb8108166..9226c576d4 100644
--- a/projects/rccl/src/collectives/device/functions.cu
+++ b/projects/rccl/src/collectives/device/functions.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -55,7 +56,15 @@
   NCCL_FUNCS2A(ncclAllReduce) }
 
 // Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+using ncclKern_t = void (*)(struct CollectiveArgs*);
+__device__ constexpr ncclKern_t ncclFuncs[]{
+#if defined(__HIP_DEVICE_COMPILE__)
+  NCCL_FUNCS2B(ncclBroadcast),
+  NCCL_FUNCS2A(ncclReduce),
+  NCCL_FUNCS2B(ncclAllGather),
+  NCCL_FUNCS2A(ncclReduceScatter),
+  NCCL_FUNCS2A(ncclAllReduce)
+#endif
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
diff --git a/projects/rccl/src/collectives/device/ll_kernel.h b/projects/rccl/src/collectives/device/ll_kernel.h
index 5ec3c9a871..ca7e4d63e5 100644
--- a/projects/rccl/src/collectives/device/ll_kernel.h
+++ b/projects/rccl/src/collectives/device/ll_kernel.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,17 +8,41 @@
 #ifndef NCCL_LL_KERNEL_H_
 #define NCCL_LL_KERNEL_H_
 
-static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
+static __device__ __attribute__((noinline)) uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  using Vec = uint32_t __attribute__((ext_vector_type(4)));
+  Vec i4;
+  do {
+    asm volatile ("flat_load_dwordx4 %0, %1, glc\n"
+      "s_waitcnt vmcnt(0)\n"
+      "buffer_wbinvl1_vol\n" : "=v"(i4) : "v"(src));
+  } while (i4[1] != flag || i4[3] != flag);
+  uint64_t val64 = (uint64_t)(i4[0]) + (((uint64_t)i4[2]) << 32);
+  return val64;
+#else
   uint32_t data1, flag1, data2, flag2;
   do {
     asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
   } while ((flag1 != flag) || (flag2 != flag));
   uint64_t val64 = data1 + (((uint64_t)data2) << 32);
   return val64;
+#endif
 }
 
-static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+static __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  using Vec = uint32_t __attribute__((ext_vector_type(4)));
+  Vec i4;
+  i4[0] = val & 0xffffffff;
+  i4[1] = flag;
+  i4[2] = (val >> 32);
+  i4[3] = flag;
+  asm volatile ("flat_store_dwordx4 %0, %1, glc\n"
+    "s_waitcnt vmcnt(0)\n"
+    "buffer_wbinvl1_vol\n" : : "v"(dst), "v"(i4));
+#else
   asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+#endif
 }
 
 // Using memcpy handles misaligned pointers.
@@ -34,6 +59,7 @@ template <typename T, class FUNC>
 class LLPrimitives {
  private:
   template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
+  __attribute__((noinline))
   static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
     if (size <= 0) return;
     size_t size64 = size * sizeof(T) / sizeof(uint64_t);
@@ -117,23 +143,29 @@ class LLPrimitives {
 #define STEP_TO_SLOT(step) \
   (step % NCCL_LL_CHUNKS)
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define SYNC __syncthreads()
+#else
+#define SYNC asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads))
+#endif
+
 #define WAIT_NEXT \
   if (tid == 0) { \
     while (sendHead + NCCL_LL_CHUNKS <= step) { \
-      sendHead = sendHeadPtr[0]; \
+      sendHead = LOAD(sendHeadPtr); \
     } \
   } \
-  asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
+  SYNC;
 
 #define POST_SIZE \
-  if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
+  if (tid == 0 && sizesFifo) { STORE(sizesFifo + step % NCCL_LL_CHUNKS, (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T))); }
 
 #define ACK_PREV \
-  asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
-  if (tid == 0) recvHeadPtr[0] = step;
+  SYNC; \
+  if (tid == 0) STORE(recvHeadPtr,step);
 
 #define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
-  if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
+  if (step > LOAD(&ring->send.conn.llLastCleaning) + NCCL_LL_CLEAN_FREQ) { \
     /* Reset all flags */ \
     static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
     static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
@@ -145,10 +177,10 @@ class LLPrimitives {
     /* Restart from the same slot, only make sure sender waits for data to be reset */ \
     step += NCCL_LL_CHUNKS; \
     ACK_PREV; \
-    while (sendHeadPtr[0] < step); \
-    if (tid == 0) ring->send.conn.llLastCleaning = step; \
+    while (LOAD(sendHeadPtr) < step); \
+    { if (tid == 0) STORE(&ring->send.conn.llLastCleaning, step); }\
   } \
-  ring->send.conn.llStep = step; \
+  STORE(&ring->send.conn.llStep, step); \
 } while (0);
 
 #endif
diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h
index e2baa4b301..1ef5ac802e 100644
--- a/projects/rccl/src/collectives/device/primitives.h
+++ b/projects/rccl/src/collectives/device/primitives.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -31,10 +32,10 @@ class WaitFlag {
   volatile uint64_t * const flag;
   const int shift;
  public:
-  __device__ __forceinline__
+  __device__
   WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__ __forceinline__
-  void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; }
+  __device__
+  void wait(uint64_t val) { while ((LOAD(flag) + shift) < val) /*SPIN*/; }
 };
 
 
@@ -44,83 +45,83 @@ class PostFlag {
   volatile int * const fifo;
   const int fifo_size;
  public:
-  __device__ __forceinline__
+  __device__
   PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
-  __device__ __forceinline__
-  void post(uint64_t val) { *flag = (val - shift); }
-  __device__ __forceinline__
-  void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; };
+  __device__
+  void post(uint64_t val) { STORE(flag, (val - shift)); }
+  __device__
+  void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
 };
 
 
 // Helper to check if any argument is of type T.
 // e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__ __forceinline__
+template<typename T> __device__
 bool AnyAre() { return false; }
 
 template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__ __forceinline__
+__device__
 bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
   return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
 }
 
 
 // Wait on all WaitFlags, ignore PostFlags
-__device__ __forceinline__
-void WaitOnFlags(uint64_t val) { }
+__device__
+static void WaitOnFlags(uint64_t val) { }
 
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
+template <typename... TAIL_Ts> __device__
+static void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
   flag.wait(val);
   WaitOnFlags(val, tail...);
 }
 
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
+template <typename... TAIL_Ts> __device__
+static void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
   WaitOnFlags(val, tail...);
 }
 
 
 // Post all PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostToFlags(uint64_t val) { }
+__device__
+static void PostToFlags(uint64_t val) { }
 
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
+template <typename... TAIL_Ts> __device__
+static void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
   PostToFlags(val, tail...);
 }
 
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
+template <typename... TAIL_Ts> __device__
+static void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
   flag.post(val);
   PostToFlags(val, tail...);
 }
 
 
 // Post sizes for PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size) { }
+__device__
+static void PostSizeToFlags(uint64_t step, int size) { }
 
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
+template <typename... TAIL_Ts> __device__
+static void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
   PostSizeToFlags(step, size, tail...);
 }
 
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
+template <typename... TAIL_Ts> __device__
+static void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
   flag.postSize(step, size);
   PostSizeToFlags(step, size, tail...);
 }
 
 
 // Create pointer arithmetic syntax that doesn't break for std::nullptr_t
-template <typename Tptr> __device__ __forceinline__
-Tptr ptradd(Tptr ptr, int i) {
+template <typename Tptr> __device__
+static Tptr ptradd(Tptr ptr, int i) {
   return ptr + i;
 }
 
-__device__ __forceinline__
-std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
+__device__
+static std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
   return nullptr;
 }
 
@@ -132,7 +133,7 @@ class Primitives {
   template <typename SRC2_T, // either T* or std::nullptr_t
       typename DST2_T, // either T* or std::nullptr_t
       typename... SYNC_Ts> // either WaitFunc or PostFunc
-  static __device__ __forceinline__ void
+  static __device__ __attribute__((noinline)) void
   GenericOp(const int tid, const int nthreads,
       const T*     src1,
       const SRC2_T src2,
@@ -160,7 +161,11 @@ class Primitives {
           if (tid == 0) {
             WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
           }
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+          __syncthreads();
+#else
           asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+#endif
         }
         ReduceOrCopy
         <
@@ -180,13 +185,11 @@ class Primitives {
         );
         if (AnyAre<PostFlag>(flags...)) {
           __syncthreads();
-        }
-      } else {
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-          PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
+          if(tid == 0)
+            PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
           __threadfence_system();
-          PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+          if(tid == 0)
+            PostToFlags(SUBSTEPS*step + sub + 1, flags...);
         }
       }
       sliceOffset += sliceSize;
@@ -195,28 +198,28 @@ class Primitives {
 
  public:
   template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
+  static __device__ void
   Copy(const int tid, const int nthreads, const T* src, T* dst,
       int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
     GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
   }
 
   template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
+  static __device__ void
   DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
       int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
     GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
   }
 
   template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
+  static __device__ void
   Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
       int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
     GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
   }
 
   template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
+  static __device__ void
   ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
       int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
     GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h
index f5694b1456..91eb83f60d 100644
--- a/projects/rccl/src/collectives/device/reduce.h
+++ b/projects/rccl/src/collectives/device/reduce.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -15,9 +16,10 @@
   if (boffset == buffSize) boffset = 0;
 
 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
   const int bid = args->bid;
   struct ncclComm* comm = args->comm;
   struct ncclRing* ring = comm->rings+blockIdx.x;
@@ -40,7 +42,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
+    STORE(ring->recv.conn.opCount, args->opCount);
 
     if (rank != root) {
       // Wait for next to be ready
@@ -98,11 +100,11 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
     if (rank != root) {
       // Wait for next to have consumed data before resetting the flag
       waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
-      *ring->send.conn.head = 0ULL;
+      STORE(ring->send.conn.head, 0ULL);
     }
-    *ring->recv.conn.tail = 0ULL;
+    STORE(ring->recv.conn.tail, 0ULL);
     __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    STORE(ring->recv.conn.opCount, args->opCount+1);
   }
 }
 
@@ -115,6 +117,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
   step++;
 
 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/reduce_0.cpp b/projects/rccl/src/collectives/device/reduce_0.cpp
new file mode 100644
index 0000000000..f1b83bc655
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_0.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 0
+#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_1.cpp b/projects/rccl/src/collectives/device/reduce_1.cpp
new file mode 100644
index 0000000000..63b157075e
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_1.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 1
+#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_2.cpp b/projects/rccl/src/collectives/device/reduce_2.cpp
new file mode 100644
index 0000000000..7c84b0ada3
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_2.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 2
+#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_3.cpp b/projects/rccl/src/collectives/device/reduce_3.cpp
new file mode 100644
index 0000000000..c590bdd3c6
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_3.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 3
+#include "device/reduce.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_kernel.h b/projects/rccl/src/collectives/device/reduce_kernel.h
index 0cb8f139f7..86e0f56a12 100644
--- a/projects/rccl/src/collectives/device/reduce_kernel.h
+++ b/projects/rccl/src/collectives/device/reduce_kernel.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,6 +19,123 @@ struct FuncNull {
   }
 };
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
+//we really don't need any specializations and we don't need
+//to break things into uint32_t
+template<typename T>
+__device__ inline T ncclMinFunc(T x, T y) { return y < x ? y : x; }
+
+template<typename T>
+__device__ inline T ncclMaxFunc(T x, T y) { return y < x ? x : y; }
+
+template<typename T>
+class FuncBase {
+protected:
+  static constexpr auto n = sizeof(PackType) / sizeof(T);
+
+  union Cvt {
+    using Vec = T __attribute__((ext_vector_type(n)));
+
+    PackType data;
+    Vec vec;
+
+    static_assert(sizeof(Vec) == sizeof(data), "Vec must be the same size of data.");
+  };
+};
+
+template<>
+class FuncBase<half> {
+protected:
+  static constexpr auto n = sizeof(PackType) / sizeof(_Float16);
+  union Cvt {
+    using Vec = _Float16 __attribute__((ext_vector_type(n)));
+
+    PackType data;
+    Vec vec;
+
+    static_assert(sizeof(Vec) == sizeof(data), "Vec must be the same size of data.");
+  };
+};
+
+template<typename T>
+struct FuncSum : private FuncBase<T> {
+  __device__ PackType operator()(PackType x, PackType y) const
+  {
+    using Cvt = typename FuncBase<T>::Cvt;
+
+    Cvt tmp_x{x};
+    tmp_x.vec += Cvt{y}.vec;
+
+    return tmp_x.data;
+  }
+  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return x + y;
+  }
+};
+
+template<typename T>
+struct FuncProd : private FuncBase<T> {
+  __device__ PackType operator()(PackType x, PackType y) const
+  {
+    using Cvt = typename FuncBase<T>::Cvt;
+
+    Cvt tmp_x{x};
+    tmp_x.vec *= Cvt{y}.vec;
+
+    return tmp_x.data;
+  }
+  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return x * y;
+  }
+};
+
+template<typename T>
+struct FuncMax : private FuncBase<T> {
+  __device__ PackType operator()(PackType x, PackType y) const
+  {
+    using Cvt = typename FuncBase<T>::Cvt;
+
+    Cvt tmp_x{x};
+    Cvt tmp_y{y};
+
+    for (auto i = 0u; i != FuncBase<T>::n; ++i) {
+        tmp_x.vec[i] = ncclMaxFunc(tmp_x.vec[i], tmp_y.vec[i]);
+    }
+
+    return tmp_x.data;
+  }
+  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return (x < y) ? y : x;
+  }
+};
+
+template<typename T>
+struct FuncMin : private FuncBase<T> {
+  __device__ PackType operator()(PackType x, PackType y) const
+  {
+    using Cvt = typename FuncBase<T>::Cvt;
+
+    Cvt tmp_x{x};
+    Cvt tmp_y{y};
+
+    for (auto i = 0u; i != FuncBase<T>::n; ++i) {
+        tmp_x.vec[i] = ncclMinFunc(tmp_x.vec[i], tmp_y.vec[i]);
+    }
+
+    return tmp_x.data;
+  }
+  template<typename U = T, typename std::enable_if<!std::is_same<T, U>{}>* = nullptr>
+  __device__ T operator()(const T x, const T y) const {
+    return (x < y) ? x : y;
+  }
+};
+
+#else
+
 template<typename T>
 struct FuncSum {
   __device__ T operator()(const T x, const T y) const {
@@ -361,4 +479,7 @@ struct FuncMin<half> {
     return __float2half(fm);
   }
 };
+
+#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+
 #endif // REDUCE_KERNEL_H_
diff --git a/projects/rccl/src/collectives/device/reduce_scatter.cu b/projects/rccl/src/collectives/device/reduce_scatter.cu
index b16053c41b..efff65deba 100644
--- a/projects/rccl/src/collectives/device/reduce_scatter.cu
+++ b/projects/rccl/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h
index cad011b22e..aad151211d 100644
--- a/projects/rccl/src/collectives/device/reduce_scatter.h
+++ b/projects/rccl/src/collectives/device/reduce_scatter.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,9 +17,10 @@
   if (noffset == buffSize) noffset = 0;
 
 template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = blockDim.x;
   const int bid = args->bid;
   struct ncclComm* comm = args->comm;
   struct ncclRing* ring = comm->rings+blockIdx.x;
@@ -38,7 +40,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
+    STORE(ring->recv.conn.opCount, args->opCount);
     // Wait for next to be ready
     WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
     waitOpCountNext.wait(args->opCount);
@@ -112,10 +114,10 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
 
   if (tid == 0) {
     waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
+    STORE(ring->send.conn.head, 0ULL);
+    STORE(ring->recv.conn.tail, 0ULL);
     __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    STORE(ring->recv.conn.opCount, args->opCount+1);
   }
 }
 
@@ -130,6 +132,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
   step++;
 
 template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
 __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_0.cpp b/projects/rccl/src/collectives/device/reduce_scatter_0.cpp
new file mode 100644
index 0000000000..936f164605
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_scatter_0.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 0
+#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_1.cpp b/projects/rccl/src/collectives/device/reduce_scatter_1.cpp
new file mode 100644
index 0000000000..3dbd2466d7
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_scatter_1.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 1
+#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_2.cpp b/projects/rccl/src/collectives/device/reduce_scatter_2.cpp
new file mode 100644
index 0000000000..7302f55739
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_scatter_2.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 2
+#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/device/reduce_scatter_3.cpp b/projects/rccl/src/collectives/device/reduce_scatter_3.cpp
new file mode 100644
index 0000000000..95a2fc93b7
--- /dev/null
+++ b/projects/rccl/src/collectives/device/reduce_scatter_3.cpp
@@ -0,0 +1,8 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NCCL_OP 3
+#include "device/reduce_scatter.cu"
diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cu
index d8fde80baa..89dc804b7f 100644
--- a/projects/rccl/src/collectives/reduce.cu
+++ b/projects/rccl/src/collectives/reduce.cu
@@ -10,12 +10,12 @@
 #include "collectives.h"
 
 ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
   INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
   } else {
     NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
     NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
@@ -25,9 +25,9 @@ ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t c
 }
 
 NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
           op, root, comm, stream);
 }
diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cu
index 1447d4a91b..f73d50948d 100644
--- a/projects/rccl/src/collectives/reduce_scatter.cu
+++ b/projects/rccl/src/collectives/reduce_scatter.cu
@@ -10,12 +10,12 @@
 #include "collectives.h"
 
 ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
   INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+      CUDACHECK(hipMemcpyAsync(recvbuff, sendbuff, nbytes, hipMemcpyDeviceToDevice, stream));
   } else {
     NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
     NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
@@ -24,9 +24,9 @@ ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t
 }
 
 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, hipStream_t stream) {
   return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
           op, 0, comm, stream);
 }
diff --git a/projects/rccl/src/include/common_coll.h b/projects/rccl/src/include/common_coll.h
index 3ec7354f5d..be9aa0023f 100644
--- a/projects/rccl/src/include/common_coll.h
+++ b/projects/rccl/src/include/common_coll.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -12,16 +13,16 @@
 #include "collectives/collectives.h"
 
 static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
-  cudaPointerAttributes attr;
-  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
-  if (err != cudaSuccess || attr.devicePointer == NULL) {
+  hipPointerAttribute_t attr;
+  hipError_t err = hipPointerGetAttributes(&attr, pointer);
+  if (err != hipSuccess || attr.devicePointer == NULL) {
     WARN("%s : %s is not a valid pointer", opname, ptrname);
     return ncclInvalidArgument;
   }
 #if CUDART_VERSION >= 10000
-  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+  if (attr.type == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
 #else
-  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+  if (attr.memoryType == hipMemoryTypeDevice && attr.device != comm->cudaDev) {
 #endif
     WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
     return ncclInvalidArgument;
@@ -99,7 +100,7 @@ static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrin
   int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
   if (comm->llThreshold >= 0) { /* user sets total LL threshold */
     if (nbytes > comm->llThreshold) { /* non-LL */
-      *nthreads = comm->nThreads+1;
+      *nthreads = comm->nThreads;
       *nrings = comm->nRings;
       return;
     } else {
@@ -129,12 +130,12 @@ static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrin
   nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
   nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
   *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
-  *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
+  *nthreads = *ll ? ll_max_nthreads : comm->nThreads;
   *nrings = *ll ? (int)nr : comm->nRings;
 }
 
 static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {
+    ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream, size_t nbytes, int loopFactor) {
   int llMode, nBlocks, nThreads;
   ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
   comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
@@ -164,7 +165,7 @@ static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, s
     int opIndex = ring->collFifoTail;
     struct ncclColl* c = ring->collectives+opIndex;
     volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
-    while (activePtr[0] != 0) sched_yield();
+    while (LOAD(activePtr) != 0) sched_yield();
 
     struct CollectiveArgs* args = &c->args;
     args->root = root;
@@ -180,7 +181,7 @@ static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, s
 
     c->nThreads = nThreads;
     c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
-    c->active = 1;
+    STORE(&c->active, 1);
     opIndex = (opIndex+1)%NCCL_MAX_OPS;
     c->nextIndex = opIndex;
     ring->collFifoTail = opIndex;
diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h
index 8285df5d45..1e5950553b 100644
--- a/projects/rccl/src/include/core.h
+++ b/projects/rccl/src/include/core.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,16 +17,17 @@
 #include <algorithm> // std::min/std::max
 #include <unistd.h>
 #include <stdlib.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>
 
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
-  void *func;
+  void (*func)(struct ncclColl);
   dim3 gridDim;
   dim3 blockDim;
-  void **args;
+  struct ncclColl **args;
   size_t sharedMem;
-  cudaStream_t stream;
+  hipStream_t stream;
 };
 #endif
 
@@ -35,10 +37,10 @@ struct cudaLaunchParams {
 
 // Rings / LL tuning
 #define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above
+#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL for Volta and above
 #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
 #define NCCL_LL_MAX_NTHREADS 256
-#define NCCL_LL_MIN_NTHREADS 64
+#define NCCL_LL_MIN_NTHREADS 256
 
 #define DIVUP(x, y) \
     (((x)+(y)-1)/(y))
@@ -90,9 +92,9 @@ struct ncclConnector {
   struct ncclConnInfo conn;
 };
 
-#define CACHE_LINE_SIZE 128
+#define CACHE_LINE_SIZE 64
 #define MEM_ALIGN 4096
-#define SIZES_FIFO_SIZE 32
+#define SIZES_FIFO_SIZE 16
 #define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
 
 #define NCCL_LL_CHUNKS 8
@@ -164,6 +166,8 @@ struct ncclRing {
 };
 static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
 
+#pragma pack(push)  /* push current alignment to stack */
+#pragma pack(4)     /* set alignment to 4 bytes boundary */
 /* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
 /* to make sure reads to host from the CUDA kernel are aligned. */
 /* Make sure to adjust padding at the end of ncclColl. */
@@ -197,6 +201,7 @@ struct ncclColl {
   };
 };
 static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+#pragma pack(pop)   /* restore original alignment from stack */
 
 struct ncclComm {
   struct ncclRing rings[MAXRINGS];
@@ -206,9 +211,9 @@ struct ncclComm {
   int cudaDev; // my cuda device index
 
   enum { GROUP, PARALLEL } launchMode;
-  cudaStream_t userStream;
+  hipStream_t userStream;
   bool userStreamSet;
-  cudaEvent_t doneEvent;
+  hipEvent_t doneEvent;
   bool checkPointers;
 
   // Counter to make sure collectives match (needed for bcast/reduce
@@ -225,7 +230,7 @@ struct ncclComm {
 
   // An internal CUDA stream for NCCL kernel CGMD launches
   int groupCudaStream;
-  cudaStream_t groupStream;
+  hipStream_t groupStream;
 
   // Device copy of the communicator
   struct ncclComm *devComm;
@@ -243,22 +248,31 @@ struct ncclComm {
   int* intraCGMode; // Whether we can use CUDA9 CGMD or not
   int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
   struct ncclColl args;
-  void* argsptr;
+  struct ncclColl* argsptr;
 };
 
+// Convert volatile access to atomic
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+#else
+#define LOAD(VAR) *(VAR)
+#define STORE(DST, SRC) *(DST) = (SRC)
+#endif
+
 // Check CUDA calls
 #define CUDACHECK(cmd) do {                                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
         return ncclUnhandledCudaError;                      \
     }                                                       \
 } while(false)
 
 #define CUDACHECKGOTO(cmd, res, label) do {                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
         res = ncclUnhandledCudaError;                       \
         goto label;                                         \
     }                                                       \
@@ -327,14 +341,14 @@ int ncclCudaCompCap();
 
 #include <sys/mman.h>
 static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
   memset(*ptr, 0, size);
   *devPtr = *ptr;
   return ncclSuccess;
 }
 
 static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(cudaFreeHost(ptr));
+  CUDACHECK(hipHostFree(ptr));
   return ncclSuccess;
 }
 
@@ -351,15 +365,18 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
 }
 
 template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
-  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
+  if (isFineGrain)
+    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
+  else
+    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
   return ncclSuccess;
 }
 
 template <typename T>
 static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h
index 55dee1838c..1ef87d9f6a 100644
--- a/projects/rccl/src/include/debug.h
+++ b/projects/rccl/src/include/debug.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/src/include/enqueue.h b/projects/rccl/src/include/enqueue.h
index 69d0463d99..f17639826e 100644
--- a/projects/rccl/src/include/enqueue.h
+++ b/projects/rccl/src/include/enqueue.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,11 +12,11 @@
 #include "group.h"
 
 typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 
 ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
     void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
 ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
 ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
diff --git a/projects/rccl/src/include/group.h b/projects/rccl/src/include/group.h
index 76da30f8c0..183e823516 100644
--- a/projects/rccl/src/include/group.h
+++ b/projects/rccl/src/include/group.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,7 +19,7 @@ typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueI
 ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
 
 typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 
 ncclResult_t ncclAsyncColl(ncclComm_t comm);
 #endif
diff --git a/projects/rccl/src/include/nvlink.h b/projects/rccl/src/include/nvlink.h
index 7eb74c9f91..28976386bb 100644
--- a/projects/rccl/src/include/nvlink.h
+++ b/projects/rccl/src/include/nvlink.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/src/include/nvlink_stub.h b/projects/rccl/src/include/nvlink_stub.h
new file mode 100644
index 0000000000..9ee176edf0
--- /dev/null
+++ b/projects/rccl/src/include/nvlink_stub.h
@@ -0,0 +1,20 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVLINK_H_
+#define NCCL_NVLINK_H_
+
+#include "topo.h"
+
+#define CONNECT_NVLINK 0x10
+#define CONNECT_NVSWITCH 0x100
+
+static int getNumNvlinks(const char* busId) {
+  return 0;
+}
+
+#endif
diff --git a/projects/rccl/src/include/rings.h b/projects/rccl/src/include/rings.h
index 751846c63a..3b4c311102 100644
--- a/projects/rccl/src/include/rings.h
+++ b/projects/rccl/src/include/rings.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,7 +10,11 @@
 
 static int getDefaultThreads() {
   // On Kepler, rings are doubled later.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  return 256;
+#else
   return ncclCudaCompCap() == 3 ? 128 : 256;
+#endif
 }
 
 ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
diff --git a/projects/rccl/src/include/shm.h b/projects/rccl/src/include/shm.h
index 4fb49cbb8f..850ecae5ce 100644
--- a/projects/rccl/src/include/shm.h
+++ b/projects/rccl/src/include/shm.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -39,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
   ncclResult_t res = ncclSuccess;
 
   NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
-  CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
-  CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+  CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError_t);
+  CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
 
   *shmPtr = ptr;
   return ncclSuccess;
 sysError:
   WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
-cudaError:
+hipError_t:
   if (fd != -1) close(fd);
   if (create) shm_unlink(shmname);
   if (ptr != MAP_FAILED) munmap(ptr, shmsize);
@@ -60,7 +61,7 @@ static ncclResult_t shmUnlink(const char* shmname) {
 }
 
 static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
-  CUDACHECK(cudaHostUnregister(shmPtr));
+  CUDACHECK(hipHostUnregister(shmPtr));
   if (munmap(shmPtr, shmsize) != 0) {
     WARN("munmap of shared memory failed");
     return ncclSystemError;
diff --git a/projects/rccl/src/include/topo.h b/projects/rccl/src/include/topo.h
index e824a81023..d14e38690e 100644
--- a/projects/rccl/src/include/topo.h
+++ b/projects/rccl/src/include/topo.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,13 +12,41 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <ctype.h>
+#include <iostream>
+#include <fstream>
+#include <string>
 
 #define BUSID_SIZE (sizeof("0000:00:00.0"))
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))
 
+static bool isEPYC() {
+  std::ifstream cpuinfo("/proc/cpuinfo");
+  std::string line;
+  int needed = 2;
+  static bool vendor_id = true, cpu_family = false, initialized = false;
+  if (initialized) return (vendor_id && cpu_family);
+  while (std::getline(cpuinfo, line)) {
+    if (line.compare(0, 9, "vendor_id") == 0) {
+      if(line.find("AuthenticAMD") == std::string::npos)
+        vendor_id = false;
+      needed --;
+    }
+    if (line.compare(0, 10, "cpu family") == 0) {
+      std::string family_str = line.substr(line.find(": ") + 2);
+      if (std::stoi(family_str) >= 23)
+        cpu_family = true;
+      needed --;
+    }
+    if (!needed)
+      break;
+  }
+  initialized = true;
+  return (vendor_id && cpu_family);
+}
+
 static ncclResult_t getCudaPath(int cudaDev, char** path) {
   char busId[BUSID_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+  CUDACHECK(hipDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
   for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
   char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
   memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
@@ -50,6 +79,7 @@ static int pciDistance(char* path1, char* path2) {
       if (same == 1) score++;
     }
   }
+  if (isEPYC() && score <= 3) return PATH_PHB;
   if (score <= 3) return PATH_SOC;
   if (score == 4) return PATH_PHB;
   if (score == depth-1) return PATH_PIX;
diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu
index 9d0188edd7..1c878e37eb 100644
--- a/projects/rccl/src/init.cu
+++ b/projects/rccl/src/init.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,6 +17,8 @@
 #include "group.h"
 #include "utils.h"
 #include "net.h"
+#include "topo.h"
+#include <numa.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>
@@ -24,7 +27,7 @@
 #include <sched.h>
 #include <fcntl.h>
 #include <unistd.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime_api.h>
 #include <string.h>
 #include <errno.h>
 #include <assert.h>
@@ -58,17 +61,17 @@ ncclNet_t* ncclNet = NULL;
 #pragma weak ncclCudaCompCap
 int ncclCudaCompCap() {
   int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
   int ccMajor;
-  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
   return ccMajor;
 }
 int ncclCudaFullCompCap() {
   int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+  if (hipGetDevice(&cudaDev) != hipSuccess) return 0;
   int ccMajor, ccMinor;
-  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
-  if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
+  if (hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev) != hipSuccess) return 0;
+  if (hipDeviceGetAttribute(&ccMinor, hipDeviceAttributeComputeCapabilityMinor, cudaDev) != hipSuccess) return 0;
   return ccMajor*10+ccMinor;
 }
 
@@ -144,6 +147,22 @@ int ncclThreadThreshold(int minCompCap, int multiNode) {
   return threshold;
 }
 
+bool useFineGrainVramPcie = false;
+
+void parseHsaForceFineGrainVramPcie() {
+  char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
+  if (str && strlen(str) > 0) {
+    errno = 0;
+    int64_t v = strtoll(str, NULL, 0);
+    if (errno || (v != 0 && v != 1)) {
+      INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie); \
+    } else {
+      useFineGrainVramPcie = v;
+      INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);  \
+    }
+  }
+}
+
 pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
 static bool initialized = false;
 static ncclResult_t ncclInit() {
@@ -153,6 +172,8 @@ static ncclResult_t ncclInit() {
     initEnv();
     initDebug();
     initNet();
+    // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
+    parseHsaForceFineGrainVramPcie();
     initialized = true;
   }
   pthread_mutex_unlock(&initLock);
@@ -177,16 +198,16 @@ static ncclResult_t commFree(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
-  CUDACHECK(cudaFree(comm->devComm));
+  CUDACHECK(hipFree(comm->devComm));
 
   for (int ring=0; ring<comm->nRings; ring++)
     NCCLCHECK(freeRing(comm->rings+ring));
 
   if (comm->doneEvent != NULL)
-    CUDACHECK(cudaEventDestroy(comm->doneEvent));
+    CUDACHECK(hipEventDestroy(comm->doneEvent));
 
   if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(cudaStreamDestroy(comm->groupStream));
+    CUDACHECK(hipStreamDestroy(comm->groupStream));
   }
 
   // Last rank frees shared resources between threads
@@ -216,8 +237,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 
   // Try to create a CUDA object right away. If there is something wrong with
   // the device we're on (failure cause #1) , better know it early.
-  cudaEvent_t doneEvent;
-  CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
+  hipEvent_t doneEvent;
+  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
 
   struct ncclComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
@@ -225,7 +246,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
   comm->rank = rank;
   comm->nRanks = ndev;
-  cudaGetDevice(&comm->cudaDev);
+  hipGetDevice(&comm->cudaDev);
   comm->doneEvent = doneEvent;
   comm->llThreshold = ncclParamLlThreshold();
   comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
@@ -255,7 +276,11 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 }
 
 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
+#else
 #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+#endif
 static void showVersion() {
   static int shown = 0;
   if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
@@ -416,8 +441,8 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int
 
 void* waitForNonNullPtr(void* p) {
   volatile void** ptr = (volatile void**) p;
-  while (*ptr == NULL) sched_yield();
-  return (void*)*ptr;
+  while (LOAD(ptr) == NULL) sched_yield();
+  return (void*)LOAD(ptr);
 }
 
 ncclResult_t initParams(struct ncclComm* comm) {
@@ -472,11 +497,11 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
     comm->launchMode = ncclComm::PARALLEL;
   }
   if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
+    CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
 #if CUDART_VERSION >= 9000
     if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
       // Check whether the GPU supports Cooperative Group Multi Device Launch
-      (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+      (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
     }
 #endif
   }
@@ -614,14 +639,41 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 }
 
 bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  if (numa_available() < 0) {
+    WARN("System does not support NUMA API!");
+    return false;
+  }
+  char* cudaPath;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  strcat(cudaPath, "/numa_node");
+  int fd;
+  SYSCHECKVAL(open(cudaPath, O_RDONLY), "open", fd);
+  char numa_node[5];
+  int len;
+  SYSCHECKVAL(read(fd, numa_node, 4), "read", len);
+  SYSCHECK(close(fd), "close");
+  errno = 0;
+  long node = strtol(numa_node, NULL, 10);
+  if (errno == ERANGE || errno == EINVAL) {
+    INFO(NCCL_ALL,"%s: Call to strtol returned %s", __func__, strerror(errno));
+    free(cudaPath);
+    return false;
+  }
+  numa_run_on_node(node);
+  numa_set_preferred(node);
+  free(cudaPath);
+  return true;
+#else
   char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
+  if (hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != hipSuccess) return false;
   if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
   if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
     WARN("Failed to set CPU affinity");
     return false;
   }
   return true;
+#endif
 }
 
 ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
@@ -634,7 +686,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   // Make sure all host memory allocation are close to the GPU
   int cudaDev;
   nvmlDevice_t nvmlDevice;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
   SetCpuAffinity(cudaDev, &nvmlDevice);
   ncclResult_t res;
 
@@ -667,7 +719,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
 
   // Make sure the CUDA runtime is initialized.
-  CUDACHECK(cudaFree(NULL));
+  CUDACHECK(hipFree(NULL));
 
   NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
   if (nranks < 1 || myrank < 0 || myrank >= nranks) {
@@ -677,7 +729,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
 
   if (ncclAsyncMode()) {
     int cudaDev;
-    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(hipGetDevice(&cudaDev));
     return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
   } else {
     return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
@@ -688,7 +740,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   struct ncclInfo* allInfo;
   NCCLCHECK(ncclCalloc(&allInfo, nranks));
   for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(cudaSetDevice(devs[rank]));
+    CUDACHECK(hipSetDevice(devs[rank]));
     NCCLCHECK(fillInfo(allInfo+rank, rank));
   }
 
@@ -709,7 +761,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   int myCompCap = ncclCudaCompCap();
   int minCompCap = myCompCap;
   for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(cudaSetDevice(devs[rank]));
+    CUDACHECK(hipSetDevice(devs[rank]));
     int nringsRank;
     int nthreadsRank = getDefaultThreads();
     myCompCap = ncclCudaCompCap();
@@ -750,7 +802,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
     struct ncclConnect connect[2*nranks];
     int* ringRanks = rings+r*nranks;
     for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(cudaSetDevice(devs[rank]));
+      CUDACHECK(hipSetDevice(devs[rank]));
       NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
     }
     // RingExchange connect information
@@ -763,7 +815,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
       swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
     }
     for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(cudaSetDevice(devs[rank]));
+      CUDACHECK(hipSetDevice(devs[rank]));
       struct ncclRing *ring = comms[rank]->rings+r;
       NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
       NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
@@ -800,7 +852,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
     ncclDevList[i] = devlist ? devlist[i] : i;
   }
 
-  cudaGetDevice(&savedDevice);
+  hipGetDevice(&savedDevice);
 
   for(rank=0; rank<ndev; ++rank)
     comms[rank] = NULL;
@@ -810,7 +862,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
 
   for (rank=0; rank<ndev; ++rank) {
     cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
 
     SetCpuAffinity(cudaDev, &nvmlDevice);
 
@@ -826,7 +878,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
 
   for(rank=0; rank<ndev; ++rank) {
     cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+    CUDACHECKGOTO(hipSetDevice(cudaDev), res, cleanup);
     NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
   }
 
@@ -843,7 +895,7 @@ cleanup:
 final:
   if(wrapNvmlShutdown() != ncclSuccess)
     INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
-  cudaSetDevice(savedDevice);
+  hipSetDevice(savedDevice);
   sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   return res;
 }
@@ -854,17 +906,17 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
   int savedDevice;
-  CUDACHECK(cudaGetDevice(&savedDevice));
+  CUDACHECK(hipGetDevice(&savedDevice));
   int commDevice = comm->cudaDev;
 
   if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(commDevice));
+    CUDACHECK(hipSetDevice(commDevice));
   }
 
   NCCLCHECK(commFree(comm));
 
   if (savedDevice != commDevice)
-    CUDACHECK(cudaSetDevice(savedDevice));
+    CUDACHECK(hipSetDevice(savedDevice));
 
   return ncclSuccess;
 }
diff --git a/projects/rccl/src/misc/enqueue.cu b/projects/rccl/src/misc/enqueue.cu
index 80846dd656..be5daf3926 100644
--- a/projects/rccl/src/misc/enqueue.cu
+++ b/projects/rccl/src/misc/enqueue.cu
@@ -1,9 +1,12 @@
 /*************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
+#include <hip/hip_runtime.h>
+
 #include "enqueue.h"
 #include "common_coll.h"
 #include "param.h"
@@ -11,30 +14,30 @@
 #include "collectives/collectives.h"
 
 #define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_KERN_NAME(coll, op, dtype), \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+  NCCL_KERN_NAME(coll, op, dtype), \
+  NCCL_KERN_NAME(coll##LL, op, dtype)
 
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  u8), \
-  (void*)NCCL_FUNC4(coll, op, i32), \
-  (void*)NCCL_FUNC4(coll, op, u32), \
-  (void*)NCCL_FUNC4(coll, op, i64), \
-  (void*)NCCL_FUNC4(coll, op, u64), \
-  (void*)NCCL_FUNC4(coll, op, f16), \
-  (void*)NCCL_FUNC4(coll, op, f32), \
-  (void*)NCCL_FUNC4(coll, op, f64)
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  u8), \
+  NCCL_FUNC4(coll, op, i32), \
+  NCCL_FUNC4(coll, op, u32), \
+  NCCL_FUNC4(coll, op, i64), \
+  NCCL_FUNC4(coll, op, u64), \
+  NCCL_FUNC4(coll, op, f16), \
+  NCCL_FUNC4(coll, op, f32), \
+  NCCL_FUNC4(coll, op, f64)
 #define NCCL_FUNCS3B(coll, op) \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8), \
-  (void*)NCCL_FUNC4(coll, op,  i8)
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8)
 
 // Must be consistent with ncclRedOp_t
 #define NCCL_FUNCS2A(coll) \
@@ -48,8 +51,9 @@
   NCCL_FUNCS3B(coll, copy), \
   NCCL_FUNCS3B(coll, copy)
 
+typedef void(*ncclKern_t)(struct ncclColl);
 // Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
   NCCL_FUNCS2B(ncclBroadcast),
   NCCL_FUNCS2A(ncclReduce),
   NCCL_FUNCS2B(ncclAllGather),
@@ -67,13 +71,13 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
   }
 #endif
   int savedDev;
-  CUDACHECK(cudaGetDevice(&savedDev));
+  CUDACHECK(hipGetDevice(&savedDev));
   for (int i = 0; i < numDevices; i++) {
     struct cudaLaunchParams* params = paramsList+i;
-    CUDACHECK(cudaSetDevice(cudaDevs[i]));
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+    CUDACHECK(hipSetDevice(cudaDevs[i]));
+    hipLaunchKernelGGL(params->func, params->gridDim, params->blockDim, params->sharedMem, params->stream, **params->args);
   }
-  CUDACHECK(cudaSetDevice(savedDev));
+  CUDACHECK(hipSetDevice(savedDev));
   return ncclSuccess;
 }
 
@@ -83,7 +87,7 @@ ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params)
   // Set active = 2 for the last operation
   for (int r=0; r<params->gridDim.x; r++) {
     struct ncclRing* ring = comm->rings+r;
-    ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
+    STORE(&ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active, 2);
   }
 
   // Find the first operation, choose the kernel accordingly and pass it
@@ -91,7 +95,7 @@ ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params)
   struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
   memcpy(&comm->args, coll, sizeof(struct ncclColl));
   // As we pass that coll directly, we can free it immediately.
-  coll->active = 0;
+  STORE(&coll->active, 0);
 
   params->func = ncclKerns[coll->funcIndex];
   return ncclSuccess;
@@ -99,7 +103,7 @@ ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params)
 
 ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
   volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
+  int val = LOAD(ptr);
   bool done = false;
   while (done == false) {
     if (val >= comm->intraRanks) {
@@ -121,7 +125,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
 
 ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
   volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
+  int val = LOAD(ptr);
   if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
     WARN("Trying to launch too many collectives");
     return ncclInternalError;
@@ -131,7 +135,7 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
 
 ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
   volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  while (*ptr < comm->intraRanks) pthread_yield();
+  while (LOAD(ptr) < comm->intraRanks) pthread_yield();
   comm->intraPhase ^= 1;
   return ncclSuccess;
 }
@@ -145,14 +149,14 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
   // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
   if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
     // Enqueue event in user stream
-    CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
+    CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
     // Create dependency between user stream and internal NCCL stream
-    CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+    CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
     params->stream = comm->groupStream;
   } else {
     if (comm->userStream != params->stream) {
       // Stream changed from last call, create dependency against last NCCL kernel launch
-      CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+      CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
     }
     params->stream = comm->userStream;
   }
@@ -185,10 +189,10 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
 
   struct cudaLaunchParams *params = comm->myParams;
   if (comm->launchMode == ncclComm::PARALLEL) {
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+    hipLaunchKernelGGL(params->func, params->gridDim, params->blockDim, params->sharedMem, params->stream, **params->args);
   }
   // Start the network proxies as soon as the kernel has been launched. We can't
-  // perform any CUDA call between the two or having a cudaFree between the CUDA
+  // perform any CUDA call between the two or having a hipFree between the CUDA
   // launch and the transportStartProxies call could cause a deadlock.
   // Also, starting the proxies after the CUDA launch seems to be better for
   // performance (latency).
@@ -205,11 +209,11 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
 ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
   struct cudaLaunchParams *params = comm->myParams;
   // Enqueue event after NCCL kernel
-  CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
+  CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
   // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
   if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
     // Create dependency between NCCL internal stream and user stream
-    CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+    CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
   }
   comm->userStreamSet = false;
   return ncclSuccess;
@@ -217,15 +221,15 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
 
 ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
     void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, cudaStream_t stream) {
+    ncclComm_t comm, hipStream_t stream) {
   if (comm == NULL) return ncclInvalidArgument;
   // Launch asynchronously if needed
   if (ncclAsyncMode()) {
     ncclResult_t ret = ncclSuccess;
     int savedDev = -1;
     if (comm->checkPointers) {
-      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
-      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
+      CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
+      CUDACHECKGOTO(hipSetDevice(comm->cudaDev), ret, end);
     }
     // Check arguments
     NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
@@ -234,7 +238,7 @@ ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void*
     NCCLCHECK(ncclAsyncColl(comm));
     NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
 end:
-    if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
+    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
     ncclAsyncErrCheck(ret);
     return ret;
   } else {
diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cu
index 1716a75643..0144bee78d 100644
--- a/projects/rccl/src/misc/group.cu
+++ b/projects/rccl/src/misc/group.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -52,7 +53,7 @@ struct ncclAsyncArgs {
 thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
 
 ncclResult_t ncclSetDevice(int cudaDev) {
-  CUDACHECK(cudaSetDevice(cudaDev));
+  CUDACHECK(hipSetDevice(cudaDev));
   return ncclSuccess;
 }
 
@@ -116,7 +117,7 @@ ncclResult_t ncclGroupEnd() {
   ncclGroupMode--;
   if (ncclGroupMode > 0) return ncclSuccess;
   int savedDev;
-  CUDACHECK(cudaGetDevice(&savedDev));
+  CUDACHECK(hipGetDevice(&savedDev));
   int done = ncclGroupIndex;
   int doneArray[ncclGroupIndex];
   for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
@@ -129,22 +130,22 @@ ncclResult_t ncclGroupEnd() {
    * 2. Barrier Wait. No CUDA call is permitted
    * 3. Enqueue Events. CUDA event wait/enqueue.
    * This is needed because step 2 cannot call any CUDA primitive, otherwise if
-   * cudaFree happens between 1 and 3, it could block that CUDA call and
+   * hipFree happens between 1 and 3, it could block that CUDA call and
    * prevent some ranks from launching their network threads, which would
-   * prevent the NCCL call from completing, blocking the cudaFree call.
+   * prevent the NCCL call from completing, blocking the hipFree call.
    */
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
     if (args->funcType == ASYNC_FUNC_COLL) {
       if (args->coll.comm->userStream == NULL)
-        CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+        CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
       NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end);
     }
   }
   for (int i=0; i<ncclGroupIndex; i++) {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
     if (args->funcType == ASYNC_FUNC_COLL) {
-      CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+      CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
       NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end);
     }
   }
@@ -152,7 +153,7 @@ ncclResult_t ncclGroupEnd() {
     struct ncclAsyncArgs* args = ncclGroupArgs+i;
     if (args->funcType == ASYNC_FUNC_COLL) {
       if (args->coll.comm->userStream == NULL)
-        CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+        CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
       NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
       doneArray[i] = 1;
       done--;
@@ -182,7 +183,7 @@ group_cleanup:
     for (int r=0; r<comm->nRings; r++) {
       struct ncclRing* ring = comm->rings+r;
       for (int i=0; i<ring->collCount; i++) {
-        ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0;
+        STORE(&ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active, 0);
       }
       ring->collFifoTail = ring->collStart;
       ring->collCount = 0;
@@ -193,6 +194,6 @@ group_cleanup:
 end:
   ncclGroupError = ncclSuccess;
   ncclGroupIndex = 0;
-  CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
+  CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
   return ret;
 }
diff --git a/projects/rccl/src/misc/nvmlwrap.cu b/projects/rccl/src/misc/nvmlwrap.cu
index d9407f4686..f3ee2ac9ae 100644
--- a/projects/rccl/src/misc/nvmlwrap.cu
+++ b/projects/rccl/src/misc/nvmlwrap.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/projects/rccl/src/misc/nvmlwrap_stub.cu b/projects/rccl/src/misc/nvmlwrap_stub.cu
new file mode 100644
index 0000000000..85a389a1a9
--- /dev/null
+++ b/projects/rccl/src/misc/nvmlwrap_stub.cu
@@ -0,0 +1,53 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nvmlwrap.h"
+
+ncclResult_t wrapNvmlSymbols(void) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlInit(void) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlShutdown(void) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+  return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+    nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+  return ncclSuccess;
+}
\ No newline at end of file
diff --git a/projects/rccl/src/misc/rings.cu b/projects/rccl/src/misc/rings.cu
index a5d4616019..359e26b359 100644
--- a/projects/rccl/src/misc/rings.cu
+++ b/projects/rccl/src/misc/rings.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -337,7 +338,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
     *nrings = maxNrings;
   } else {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    int defaultMinNrings = 1;
+#else
     int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
+#endif
     if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
     if (minNrings > 0 && minNrings > *nrings) {
       if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
diff --git a/projects/rccl/src/misc/utils.cu b/projects/rccl/src/misc/utils.cu
index d8e3aec5f5..f5b61c5498 100644
--- a/projects/rccl/src/misc/utils.cu
+++ b/projects/rccl/src/misc/utils.cu
@@ -32,7 +32,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
   char hostname[1024];
   getHostName(hostname, 1024);
   int cudaDev;
-  cudaGetDevice(&cudaDev);
+  hipGetDevice(&cudaDev);
 
   char buffer[1024];
   size_t len = 0;
diff --git a/projects/rccl/src/nccl.h.in b/projects/rccl/src/nccl.h.in
index 72276254cd..be5f095541 100644
--- a/projects/rccl/src/nccl.h.in
+++ b/projects/rccl/src/nccl.h.in
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,15 +8,15 @@
 #ifndef NCCL_H_
 #define NCCL_H_
 
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_fp16.h>
 
-#define NCCL_MAJOR ${nccl:Major}
-#define NCCL_MINOR ${nccl:Minor}
-#define NCCL_PATCH ${nccl:Patch}
-#define NCCL_SUFFIX "${nccl:Suffix}"
+#define NCCL_MAJOR ${NCCL_MAJOR}
+#define NCCL_MINOR ${NCCL_MINOR}
+#define NCCL_PATCH ${NCCL_PATCH}
+#define NCCL_SUFFIX "${NCCL_SUFFIX}"
 
-#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION_CODE ${NCCL_VERSION}
 #define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
@@ -132,9 +133,9 @@ typedef enum { ncclInt8       = 0, ncclChar       = 0,
  * In-place operation will happen if sendbuff == recvbuff.
  */
 ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
 
 /*
  * (deprecated) Broadcast (in-place)
@@ -146,9 +147,9 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc
  * This operation is implicitely in place.
  */
 ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 
 /*
  * Broadcast
@@ -160,9 +161,9 @@ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int r
  * In-place operation will happen if sendbuff == recvbuff.
  */
 ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
+    ncclComm_t comm, hipStream_t stream);
 
 /*
  * All-Reduce
@@ -173,9 +174,9 @@ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
  * In-place operation will happen if sendbuff == recvbuff.
  */
 ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream);
 
 /*
  * Reduce-Scatter
@@ -190,10 +191,10 @@ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
  */
 ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream);
+    hipStream_t stream);
 ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream);
+    hipStream_t stream);
 
 /*
  * All-Gather
@@ -206,9 +207,9 @@ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
  * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
  */
 ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 
 /*
  * Group semantics
diff --git a/projects/rccl/src/ring.cu b/projects/rccl/src/ring.cu
index fede79387f..23e27571f0 100644
--- a/projects/rccl/src/ring.cu
+++ b/projects/rccl/src/ring.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,6 +8,8 @@
 #include "ring.h"
 #include "param.h"
 
+extern bool useFineGrainVramPcie;
+
 NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
 
 ncclResult_t initRing(struct ncclComm* comm, int ringid) {
@@ -18,12 +21,12 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
 
   const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
   struct ncclSendMem* sendMem;
-  NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
+  NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize, useFineGrainVramPcie));
   ring->devMemSend = sendMem;
 
   const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
   struct ncclRecvMem* recvMem;
-  NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
+  NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize, useFineGrainVramPcie));
   ring->devMemRecv = recvMem;
 
   TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
@@ -51,12 +54,12 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
 
 ncclResult_t freeRing(struct ncclRing* ring) {
   // Intermediate buffering
-  CUDACHECK(cudaFree(ring->devMemSend));
-  CUDACHECK(cudaFree(ring->devMemRecv));
+  CUDACHECK(hipFree(ring->devMemSend));
+  CUDACHECK(hipFree(ring->devMemRecv));
 
   // Index to rank table
   free(ring->userRanks);
-  CUDACHECK(cudaFree(ring->devUserRanks));
+  CUDACHECK(hipFree(ring->devUserRanks));
 
   // Operation list
   NCCLCHECK(ncclCudaHostFree(ring->collectives));
diff --git a/projects/rccl/src/transport.cu b/projects/rccl/src/transport.cu
index 7c13d5c351..c78adcc1de 100644
--- a/projects/rccl/src/transport.cu
+++ b/projects/rccl/src/transport.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,35 +19,35 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
 };
 
 static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
-  struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);
+  struct ncclProxyArgs *fifoArgs = info->argsFifo + (LOAD(&info->argsFifoHead) % TRANSPORT_PROXY_FIFO_SIZE);
   pthread_mutex_lock(&info->mutex);
-  while (fifoArgs->active == 0)
+  while (LOAD(&fifoArgs->active) == 0)
     pthread_cond_wait(&info->cond, &info->mutex);
   __sync_synchronize();
   memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
   __sync_synchronize();
-  fifoArgs->active = 0;
+  STORE(&fifoArgs->active, 0);
   pthread_cond_signal(&info->cond);
   pthread_mutex_unlock(&info->mutex);
-  info->argsFifoHead++;
+  __atomic_fetch_add(&info->argsFifoHead, 1, __ATOMIC_SEQ_CST);
 }
 
 static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
   if (info == NULL) return NULL;
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);
+  struct ncclProxyArgs* fifoArgs = info->argsFifo + (LOAD(&info->argsFifoTail) % TRANSPORT_PROXY_FIFO_SIZE);
   pthread_mutex_lock(&info->mutex);
-  while (fifoArgs->active == 1)
+  while (LOAD(&fifoArgs->active) == 1)
     pthread_cond_wait(&info->cond, &info->mutex);
   pthread_mutex_unlock(&info->mutex);
-  info->argsFifoTail++;
+  __atomic_fetch_add(&info->argsFifoTail, 1, __ATOMIC_SEQ_CST);
   return fifoArgs;
 }
 
 static void FifoPushArgs(struct transportProxyInfo* info) {
   if (info == NULL) return;
 
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);
-  if (fifoArgs->active == 0) return;
+  struct ncclProxyArgs* fifoArgs = info->argsFifo + ((LOAD(&info->argsFifoTail)-1) % TRANSPORT_PROXY_FIFO_SIZE);
+  if (LOAD(&fifoArgs->active) == 0) return;
 
   pthread_mutex_lock(&info->mutex);
   pthread_cond_signal(&info->cond);
@@ -55,21 +56,21 @@ static void FifoPushArgs(struct transportProxyInfo* info) {
 
 static void WaitProxyReady(struct transportProxyInfo* info) {
   pthread_mutex_lock(&info->mutex);
-  while (info->proxyReady == 0)
+  while (LOAD(&info->proxyReady) == 0)
     pthread_cond_wait(&info->cond, &info->mutex);
   pthread_mutex_unlock(&info->mutex);
 }
 
 static void SetProxyReady(struct transportProxyInfo* info) {
   pthread_mutex_lock(&info->mutex);
-  info->proxyReady = 1;
+  STORE(&info->proxyReady, 1);
   pthread_cond_signal(&info->cond);
   pthread_mutex_unlock(&info->mutex);
 }
 
 static void StopProxy(struct transportProxyInfo* info) {
   struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  fifoArgs->active = -1;
+  STORE(&fifoArgs->active, -1);
   FifoPushArgs(info);
 }
 
@@ -100,7 +101,7 @@ static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* arg
   __sync_synchronize();
   memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
   __sync_synchronize();
-  fifoArgs->active = 1;
+  STORE(&fifoArgs->active, 1);
 }
 
 ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
@@ -136,9 +137,9 @@ ncclResult_t transportStartProxies(ncclComm* comm) {
 void* persistentThread(void *opaqueInfo) {
   struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
   // We need to initialize the context before launching any NCCL cuda kernel,
-  // otherwise we would create it during the first cudaMemcpyAsync inside the
+  // otherwise we would create it during the first hipMemcpyAsync inside the
   // proxy function and that would cause a deadlock
-  cudaSetDevice(info->comm->cudaDev);
+  hipSetDevice(info->comm->cudaDev);
   // Signal the main thread the context is created and it can proceed.
   SetProxyReady(info);
   while (1) {
@@ -167,8 +168,8 @@ ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclCo
     info->cond = PTHREAD_COND_INITIALIZER;
     info->mutex = PTHREAD_MUTEX_INITIALIZER;
     info->func = proxyfunc;
-    info->argsFifoHead = info->argsFifoTail = 0;
-    info->proxyReady = 0;
+    STORE(&info->argsFifoHead, 0); STORE(&info->argsFifoTail, 0);
+    STORE(&info->proxyReady, 0);
     pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
     // Wait for thread to initialize its CUDA context.
     WaitProxyReady(info);
diff --git a/projects/rccl/src/transport/net.cu b/projects/rccl/src/transport/net.cu
index 9c366b32f5..fe3a7f7c56 100644
--- a/projects/rccl/src/transport/net.cu
+++ b/projects/rccl/src/transport/net.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,9 +10,13 @@
 #include "nvmlwrap.h"
 #include "net.h"
 #include "param.h"
-#include "nvlink.h"
-#include <cuda_runtime.h>
+#include <hip/hip_runtime_api.h>
 #include <assert.h>
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#include "nvlink_stub.h"
+#else
+#include "nvlink.h"
+#endif
 
 #define NET_MAX_IFS 16
 
@@ -83,7 +88,7 @@ ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
 
   // Find distance with current GPU
   int cudaDev;
-  cudaGetDevice(&cudaDev);
+  hipGetDevice(&cudaDev);
   char* cudaPath;
   NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
 
@@ -214,18 +219,25 @@ int getDev(int ringId, int nDev, short* distances) {
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 
+extern bool useFineGrainVramPcie;
+
 static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
   *useGdr = 0;
 
   int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
+
+  if (!useFineGrainVramPcie) {
+    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
+    return ncclSuccess;
+  }
 
   if (read) { // For reads (sends) only enable under certain conditions
     int gdrReadParam = ncclParamNetGdrRead();
     if (gdrReadParam == 0) return ncclSuccess;
     else if (gdrReadParam < 0) { // default : enable only on DGX2
       char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+      CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
       int nvlinks = getNumNvlinks(busId);
       if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
     }
@@ -260,7 +272,7 @@ ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 
   int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
   if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size, true));
   }
 
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
@@ -351,7 +363,7 @@ ncclResult_t netSendFree(void* transportResources) {
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
   if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devNetMem));
+    CUDACHECK(hipFree(resources->devNetMem));
   NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
   return ncclSuccess;
@@ -403,7 +415,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
     if (llMode) {
       if (tail < end && tail < head + args->substeps) {
         int slot = tail%args->substeps;
-        int size = sizesFifo[slot];
+        int size = LOAD(&sizesFifo[slot]);
         if (size != 0) {
           if (size == -1) size = 0;
           uint32_t flag = tail + 1;
@@ -413,20 +425,21 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
           for (int i=0; i<nFifoLines; i++) {
             volatile uint32_t *f1 = &lines[i].flag1;
             volatile uint32_t *f2 = &lines[i].flag2;
-            while (f1[0] != flag || f2[0] != flag);
+            while (LOAD(&f1[0]) != flag || LOAD(&f2[0]) != flag);
           }
           NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
           if (requests[slot] != NULL) {
-            sizesFifo[slot] = size;
+            STORE(&sizesFifo[slot], size);
             tail++;
             idle = 0;
           }
         }
       }
-    } else while (tail < *prevTail) {
+    } else while (tail < LOAD(prevTail)) {
         // Send through network
         int slot = tail%args->substeps;
-        NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
+        //TRACE(NCCL_NET,"head %d tail %d prevTail %d slot %d size %d ptrType %d", head, tail, LOAD(prevTail), slot, LOAD(&sizesFifo[slot]), ptrType);
+        NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, LOAD(&sizesFifo[slot]), ptrType, requests+slot));
         if (requests[slot] != NULL) {
           tail++;
           idle = 0;
@@ -438,12 +451,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
       NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
       if (done) {
         if (llMode) {
-          sizesFifo[slot] = 0;
+          STORE(&sizesFifo[slot], 0);
           // Make sure size is reset to zero before we update the head.
           __sync_synchronize();
         }
         head++;
-        *prevHead = head;
+        STORE(prevHead, head);
         idle = 0;
       }
     }
@@ -451,17 +464,17 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   }
 
   // Reset
-  if (llMode == 0) *prevTail = 0;
+  if (llMode == 0) STORE(prevTail, 0);
 
 nextColl:
   if (llMode) {
     resources->llStep += args->nsteps;
     // Don't forget to ack otherwise the GPU won't be able to push data.
-    *prevHead = resources->llStep;
+    STORE(prevHead, resources->llStep);
     if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
       memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
       resources->llStep += NCCL_LL_CHUNKS;
-      *prevHead = resources->llStep;
+      STORE(prevHead, resources->llStep);
       resources->llLastCleaning = resources->llStep;
     }
   }
@@ -503,7 +516,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
 
   while (head < end) {
     idle++;
-    if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
+    if ((tail < head + args->substeps) && (tail < LOAD(nextHead) + args->substeps) && (tail < end)) {
       int slot = tail%args->substeps;
       NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
       if (requests[slot] != NULL) {
@@ -521,7 +534,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
         head++;
         if (llMode == 0) {
           if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
-          *nextTail = head;
+          //TRACE(NCCL_NET,"head %d tail %d slot %d size %d ptrType %d", head, tail, slot, size, ptrType);
+          STORE(nextTail, head);
         }
         idle = 0;
       }
@@ -531,8 +545,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
 
   // Wait for last ack and reset
   if (llMode == 0) {
-    transportProxyWait([=] { return *nextHead == head; });
-    *nextHead = 0;
+    transportProxyWait([=] { return LOAD(nextHead) == head; });
+    STORE(nextHead, 0);
   }
 
 nextColl:
@@ -540,7 +554,7 @@ nextColl:
     resources->llStep += args->nsteps;
     if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
       resources->llStep += NCCL_LL_CHUNKS;
-      while (*nextHead < resources->llStep);
+      while (LOAD(nextHead) < resources->llStep);
       resources->llLastCleaning = resources->llStep;
     }
   }
diff --git a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cu
index 18e158df7c..cbe2f9c45f 100644
--- a/projects/rccl/src/transport/net_ib.cu
+++ b/projects/rccl/src/transport/net_ib.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -182,12 +183,16 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
 ncclResult_t ncclIbGdrSupport(int ibDev) {
   static int moduleLoaded = -1;
   if (moduleLoaded == -1) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+    moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
+#else
     moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
+#endif
   }
   if (moduleLoaded == 0) return ncclSystemError;
   ncclResult_t ret = ncclSystemError;
   void* ptr;
-  if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
+  if (hipMalloc(&ptr, sizeof(int)) == hipSuccess) {
     struct ibv_mr* mr;
     struct ibv_pd* pd;
     if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
@@ -197,7 +202,7 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
       }
       wrap_ibv_dealloc_pd(pd);
     }
-    cudaFree(ptr);
+    hipFree(ptr);
   }
   return ret;
 }
@@ -206,7 +211,7 @@ ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
   *supportedTypes = NCCL_PTR_HOST;
 
   int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
 
   if (ncclIbGdrSupport(dev) != ncclSuccess) {
     INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
@@ -637,7 +642,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
   // Wait for the receiver to have posted the corresponding receive
   volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
   volatile uint32_t * readyPtr = &slot->ready;
-  if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
+  if (LOAD(readyPtr) == 0) { *request = NULL; return ncclSuccess; }
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
@@ -679,7 +684,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
 #endif
   // We must clear slot->ready, but reset other fields to aid
   // debugging and sanity checks
-  slot->ready = 0;
+  STORE(&slot->ready, 0);
   slot->addr = 0ULL;
   slot->rkey = slot->size = slot->seq = 0;
   comm->fifoHead++;
diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cu
index 1efee15dda..b09e2e7234 100644
--- a/projects/rccl/src/transport/net_socket.cu
+++ b/projects/rccl/src/transport/net_socket.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -50,7 +51,12 @@ ncclResult_t ncclSocketDevices(int* ndev) {
 
 ncclResult_t ncclSocketPciPath(int dev, char** path) {
   char devicepath[PATH_MAX];
-  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+  *path = realpath(devicepath, NULL);
+  const char* string_virual_network_device_path="/sys/devices/virtual/net/";
+  if (*path && !strncmp(*path, string_virual_network_device_path, strlen(string_virual_network_device_path)))
+    return ncclSuccess;
+  free(*path);
   *path = realpath(devicepath, NULL);
   if (*path == NULL) {
     INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cu
index 6c4626a77c..301ad3e90d 100644
--- a/projects/rccl/src/transport/p2p.cu
+++ b/projects/rccl/src/transport/p2p.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,10 +11,16 @@
 #include "transport.h"
 #include "param.h"
 #include <unistd.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime_api.h>
 #include "nvmlwrap.h"
 #include <ctype.h>
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#include "nvlink_stub.h"
+#else
 #include "nvlink.h"
+#endif
+
+extern bool useFineGrainVramPcie;
 
 struct p2pInfo {
   int rank;
@@ -27,7 +34,7 @@ struct p2pConnectInfo {
   int direct;
   union {
     void* directPtr;
-    cudaIpcMemHandle_t devIpc;
+    hipIpcMemHandle_t devIpc;
   };
 };
 
@@ -39,7 +46,7 @@ ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
   struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
   static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
   info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
+  CUDACHECK(hipGetDevice(&info->cudaDev));
   info->hostHash=getHostHash();
   info->pidHash=getPidHash();
 
@@ -47,12 +54,15 @@ ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
   // cudaDev is a CUDA runtime dev number which could be different from the
   // NVML device number. Then we get the busID from NVML to be sure it is
   // consistent with NVML remote PCI bus Ids.
-  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+  CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#else
   nvmlDevice_t nvmlDevice;
   NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
   nvmlPciInfo_t pciInfo;
   NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
   strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
+#endif
   return ncclSuccess;
 }
 
@@ -84,15 +94,41 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
 
   // See if CUDA can do P2P
   int p2p;
-  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
+  if (hipDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != hipSuccess) {
     INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
         myInfo->cudaDev, peerInfo->cudaDev);
     return ncclSuccess;
   }
+
+  if (!useFineGrainVramPcie) p2p = 0;
+
   if (p2p == 0) return ncclSuccess;
 
-  // Check for NVLink/NVswitch
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  uint32_t link_type, hops;
+  if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
+    p2p = 0;
+    return ncclSuccess;
+  }
+  static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
+  static unsigned long long link_status_print_once_mask = 0;
+  if (!(link_status_print_once_mask & (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev)))) {
+    INFO(NCCL_INIT, "%d -> %d: link type %s hops %d", myInfo->cudaDev, peerInfo->cudaDev,
+      link_type_name[link_type], hops);
+    link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
+  }
+  if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) {
+  // enable below lines on release only: disable PCIe P2P until HDP flush is implemented.
+  //  p2p = 0;
+  //  return ncclSuccess;
+  }
+  int nvlinkp2p = 0;
+  if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
+    nvlinkp2p = CONNECT_NVLINK;
+#else
+ // Check for NVLink/NVswitch
   int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+#endif
   if (nvlinkp2p > 0) {
     *ret = nvlinkp2p;
     return ncclSuccess;
@@ -457,12 +493,12 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
       INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
+      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == hipErrorPeerAccessAlreadyEnabled) {
+        hipGetLastError();
+      } else if (err != hipSuccess) {
         WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, cudaGetErrorString(err));
+            peerInfo->cudaDev, err, hipGetErrorString(err));
         return ncclInternalError;
       }
       INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
@@ -471,10 +507,10 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   } else {
     info.direct = 0;
     // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
-    if (err != cudaSuccess) {
+    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+    if (err != hipSuccess) {
       WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+          myInfo->rank, peerInfo->cudaDev, err, hipGetErrorString(err));
       return ncclInternalError;
     }
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
@@ -498,12 +534,12 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
       TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
+      hipError_t err = hipDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == hipErrorPeerAccessAlreadyEnabled) {
+        hipGetLastError();
+      } else if (err != hipSuccess) {
         WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, cudaGetErrorString(err));
+            peerInfo->cudaDev, err, hipGetErrorString(err));
         return ncclInternalError;
       }
       TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
@@ -511,10 +547,10 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   } else {
     info.direct = 0;
     // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
-    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+    hipError_t err = hipIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+    if (err != hipSuccess) {
+      WARN("rank %d failed to get HIP IPC handle to device %d : %d %s",
+          myInfo->rank, peerInfo->cudaDev, err, hipGetErrorString(err));
       return ncclInternalError;
     }
     TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
@@ -537,15 +573,15 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
   } else {
     void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    hipError_t err = hipIpcOpenMemHandle(&remPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
     void** ipcPtrSave;
     NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
     *resources = ipcPtrSave;
     *ipcPtrSave = remPtr;
     remDevMem = (struct ncclRecvMem*)remPtr;
-    if (err != cudaSuccess) {
+    if (err != hipSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
+          err, hipGetErrorString(err));
       return ncclUnhandledCudaError;
     }
   }
@@ -571,15 +607,15 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   } else {
     void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    hipError_t err = hipIpcOpenMemHandle(&remPtr, info->devIpc, hipIpcMemLazyEnablePeerAccess);
     void** ipcPtrSave;
     NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
     *resources = ipcPtrSave;
     *ipcPtrSave = remPtr;
     remDevMem = (struct ncclSendMem*)remPtr;
-    if (err != cudaSuccess) {
+    if (err != hipSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
+          err, hipGetErrorString(err));
       return ncclUnhandledCudaError;
     }
   }
@@ -595,7 +631,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
 ncclResult_t p2pFree(void* resources) {
   if (resources != NULL) {
     void** ipcPtrSave = (void**) resources;
-    CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave));
+    CUDACHECK(hipIpcCloseMemHandle(*ipcPtrSave));
     free(resources);
   }
   return ncclSuccess;
diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cu
index 317f652dac..557a32c86b 100644
--- a/projects/rccl/src/transport/shm.cu
+++ b/projects/rccl/src/transport/shm.cu
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,7 +11,7 @@
 #include "param.h"
 #include "shm.h"
 #include <unistd.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime_api.h>
 
 struct shmInfo {
   int rank;
@@ -57,7 +58,7 @@ ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
   struct shmInfo* info = (struct shmInfo*)opaqueInfo;
   static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
   info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
+  CUDACHECK(hipGetDevice(&info->cudaDev));
   info->hostHash=getHostHash();
   info->pidHash=getPidHash();
   return ncclSuccess;
diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt
new file mode 100644
index 0000000000..1af7886276
--- /dev/null
+++ b/projects/rccl/test/CMakeLists.txt
@@ -0,0 +1,63 @@
+cmake_minimum_required(VERSION 2.8.12)
+
+if(BUILD_TESTS)
+
+  message("Going to build unit tests (Installed in /test/UnitTests)")
+
+  # OpenMP is used to drive GPUs (one per thread)
+  find_package(OpenMP REQUIRED)
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+
+  # Download and unpack googletest at configure time
+  configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download)
+  if(result)
+    message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+  endif()
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} --build .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download)
+  if(result)
+    message(FATAL_ERROR "Build step for googletest failed: ${result}")
+  endif()
+
+  # Add googletest directly to our build. This adds the following targets:
+  # gtest, gtest_main, gmock and gmock_main
+  add_subdirectory("${CMAKE_BINARY_DIR}/googletest-src"
+                   "${CMAKE_BINARY_DIR}/googletest-build")
+
+  # Add googletest directly to our build. This defines the gtest and gtest_main
+  # targets. add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
+  # ${CMAKE_CURRENT_BINARY_DIR}/googletest-build EXCLUDE_FROM_ALL)
+
+  # Collect source files for tests
+  set(TEST_SOURCES
+    test_AllGather.cpp
+    test_AllReduce.cpp
+    test_Broadcast.cpp
+    test_Reduce.cpp
+    test_ReduceScatter.cpp
+    test_GroupCalls.cpp
+  )
+
+  add_executable(UnitTests ${TEST_SOURCES})
+  target_include_directories(UnitTests PRIVATE /opt/rocm)
+  target_link_libraries(UnitTests PRIVATE gtest_main PRIVATE rccl)
+  install(TARGETS UnitTests RUNTIME DESTINATION test)
+
+  # HCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
+  # remove RPATH.  As a work-around, set the correct RPATH for the unit test executable
+  # as a post-install step
+  install(
+    CODE
+      "execute_process(COMMAND chrpath -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)"
+    )
+else()
+  message("Not building unit tests")
+endif()
diff --git a/projects/rccl/test/CMakeLists.txt.in b/projects/rccl/test/CMakeLists.txt.in
new file mode 100644
index 0000000000..128d29e7c2
--- /dev/null
+++ b/projects/rccl/test/CMakeLists.txt.in
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 2.8.2)
+
+project(googletest-download NONE)
+
+include(ExternalProject)
+ExternalProject_Add(googletest
+  GIT_REPOSITORY https://github.com/google/googletest.git
+  GIT_TAG master
+  SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src"
+  BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND ""
+)
diff --git a/projects/rccl/test/CorrectnessTest.hpp b/projects/rccl/test/CorrectnessTest.hpp
new file mode 100644
index 0000000000..638c028788
--- /dev/null
+++ b/projects/rccl/test/CorrectnessTest.hpp
@@ -0,0 +1,311 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef CORRECTNESSTEST_HPP
+#define CORRECTNESSTEST_HPP
+
+#include <cstdio>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+#include "rccl.h"
+
+#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
+#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
+
+namespace CorrectnessTests
+{
+    // Performs the various basic reduction operations
+    template <typename T>
+    T ReduceOp(ncclRedOp_t const op, T const A, T const B)
+    {
+        switch (op)
+        {
+        case ncclSum:  return A + B;
+        case ncclProd: return A * B;
+        case ncclMax:  return std::max(A, B);
+        case ncclMin:  return std::min(A, B);
+        default:
+            fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", op);
+            exit(0);
+        }
+    }
+
+    // Returns the number of bytes per element for each supported datatype
+    static int DataTypeToBytes(ncclDataType_t const dataType)
+    {
+        switch (dataType)
+        {
+        case ncclInt8:   return 1;
+        case ncclUint8:  return 1;
+        case ncclInt32:  return 4;
+        case ncclUint32: return 4;
+        case ncclInt64:  return 8;
+        case ncclUint64: return 8;
+        case ncclFloat16: return 2;
+        case ncclFloat32: return 4;
+        case ncclFloat64: return 8;
+        default:
+            fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", dataType);
+            exit(0);
+        }
+    }
+
+    // Encapsulates all the memory used per devices for collectives, as well as reference results
+    struct Dataset
+    {
+        int                 numDevices;  // Number of devices participating
+        size_t              numElements; // Number of elements per array
+        ncclDataType_t      dataType;    // Data type of each input/output pointer
+        bool                inPlace;     // Whether or not output pointers are same as input pointers
+        std::vector<void *> inputs;      // Input pointers (1 per device)
+        std::vector<void *> outputs;     // Output pointers (1 per device)
+                                         // May be identical to input pointers for in-place tests
+        std::vector<void *> expected;    // Expected output (1 per device)
+
+        size_t NumBytes() const
+        {
+            return numElements * DataTypeToBytes(dataType);
+        }
+
+        void Initialize(int            const numDevices_,
+                        size_t         const numElements_,
+                        ncclDataType_t const dataType_,
+                        bool           const inPlace_)
+        {
+            numDevices  = numDevices_;
+            numElements = numElements_;
+            dataType    = dataType_;
+            inPlace     = inPlace_;
+
+            inputs.resize(numDevices);
+            outputs.resize(numDevices);
+            expected.resize(numDevices);
+
+            // Allocate per-device memory
+            size_t const numBytes = NumBytes();
+
+            for (int i = 0; i < numDevices; i++)
+            {
+                HIP_CALL(hipSetDevice(i));
+                HIP_CALL(hipMalloc((void **)&inputs[i], numBytes));
+                if (inPlace)
+                    outputs[i] = inputs[i];
+                else
+                    HIP_CALL(hipMalloc((void **)&outputs[i], numBytes));
+
+                expected[i] = malloc(numBytes);
+            }
+        }
+
+        ~Dataset()
+        {
+            for (int i = 0; i < outputs.size(); i++)
+            {
+                if (!inPlace) hipFree(outputs[i]);
+                hipFree(inputs[i]);
+                free(expected[i]);
+            }
+        }
+    };
+
+    typedef std::tuple<ncclRedOp_t    /* op          */,
+                       ncclDataType_t /* dataType    */,
+                       size_t         /* numElements */,
+                       int            /* numDevices  */,
+                       bool           /* inPlace     */> TestTuple;
+
+    // Base class for each collective test
+    // - Each test is instantiated with a different TestTuple
+    class CorrectnessTest : public testing::TestWithParam<TestTuple>
+    {
+    protected:
+
+        // This code is called per test-tuple
+        void SetUp() override
+        {
+            // Check for fine-grained env variable (otherwise will hang)
+            if (!getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
+            {
+                printf("Must set HSA_FORCE_FINE_GRAIN_PCIE=1 prior to execution\n");
+                exit(0);
+            }
+
+            // Make the test tuple parameters accessible
+            std::tie(op, dataType, numElements, numDevices, inPlace) = GetParam();
+
+            // Collect the number of available GPUs
+            HIP_CALL(hipGetDeviceCount(&numDevicesAvailable));
+
+            // Only proceed with testing if there are enough GPUs
+            if (numDevices > numDevicesAvailable)
+            {
+                fprintf(stdout, "Skipping test requring %d devices (only %d available)\n",
+                        numDevices, numDevicesAvailable);
+                return;
+            }
+
+            // Initialize communicators
+            comms.resize(numDevices);
+            NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL));
+
+            // Create streams
+            streams.resize(numDevices);
+            for (int i = 0; i < numDevices; i++)
+                HIP_CALL(hipStreamCreate(&streams[i]));
+        }
+
+        // Clean up per TestTuple
+        void TearDown() override
+        {
+            // Release communicators and streams
+            for (int i = 0; i < numDevices; i++)
+            {
+                NCCL_CALL(ncclCommDestroy(comms[i]));
+                HIP_CALL(hipStreamDestroy(streams[i]));
+            }
+        }
+
+        void FillDatasetWithPattern(Dataset& dataset)
+        {
+            int8_t*   arrayI1 = (int8_t   *)malloc(dataset.NumBytes());
+            uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+            int32_t*  arrayI4 = (int32_t  *)arrayI1;
+            uint32_t* arrayU4 = (uint32_t *)arrayI1;
+            int64_t*  arrayI8 = (int64_t  *)arrayI1;
+            uint64_t* arrayU8 = (uint64_t *)arrayI1;
+            float*    arrayF4 = (float    *)arrayI1;
+            double*   arrayF8 = (double   *)arrayI1;
+
+            // NOTE: Currently half-precision float tests are unsupported due to half being supported
+            //       on GPU only and not host
+
+            // Fills input  data[i][j] with (i + j) % 6
+            // - Keeping range small to reduce likelihood of overflow
+            // - Sticking with floating points values that are perfectly representable
+            for (int i = 0; i < dataset.numDevices; i++)
+            {
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    int    valueI = (i + j) % 6;
+                    float  valueF = (float)valueI;
+
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    arrayI1[j] = valueI; break;
+                    case ncclUint8:   arrayU1[j] = valueI; break;
+                    case ncclInt32:   arrayI4[j] = valueI; break;
+                    case ncclUint32:  arrayU4[j] = valueI; break;
+                    case ncclInt64:   arrayI8[j] = valueI; break;
+                    case ncclUint64:  arrayU8[j] = valueI; break;
+                    case ncclFloat32: arrayF4[j] = valueF; break;
+                    case ncclFloat64: arrayF8[j] = valueF; break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(0);
+                    }
+                }
+
+                HIP_CALL(hipSetDevice(i));
+                HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(), hipMemcpyHostToDevice));
+
+                // Fills output data[i][j] with 0 (if not inplace)
+                if (!dataset.inPlace)
+                    HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes()));
+            }
+
+            free(arrayI1);
+        }
+
+        void ValidateResults(Dataset const& dataset) const
+        {
+            int8_t*   outputI1 = (int8_t   *)malloc(dataset.NumBytes());
+            uint8_t*  outputU1 = (uint8_t  *)outputI1;
+            int32_t*  outputI4 = (int32_t  *)outputI1;
+            uint32_t* outputU4 = (uint32_t *)outputI1;
+            int64_t*  outputI8 = (int64_t  *)outputI1;
+            uint64_t* outputU8 = (uint64_t *)outputI1;
+            float*    outputF4 = (float    *)outputI1;
+            double*   outputF8 = (double   *)outputI1;
+
+            bool isMatch = true;
+
+            // Loop over each device's output and compare it to the expected output
+            // (Each collective operation computes its own expected results)
+            for (int i = 0; i < dataset.numDevices && isMatch; i++)
+            {
+                HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+                int8_t*   expectedI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  expectedU1 = (uint8_t  *)expectedI1;
+                int32_t*  expectedI4 = (int32_t  *)expectedI1;
+                uint32_t* expectedU4 = (uint32_t *)expectedI1;
+                int64_t*  expectedI8 = (int64_t  *)expectedI1;
+                uint64_t* expectedU8 = (uint64_t *)expectedI1;
+                float*    expectedF4 = (float    *)expectedI1;
+                double*   expectedF8 = (double   *)expectedI1;
+
+                for (int j = 0; j < dataset.numElements && isMatch; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    isMatch &= (outputI1[j] == expectedI1[j]); break;
+                    case ncclUint8:   isMatch &= (outputU1[j] == expectedU1[j]); break;
+                    case ncclInt32:   isMatch &= (outputI4[j] == expectedI4[j]); break;
+                    case ncclUint32:  isMatch &= (outputU4[j] == expectedU4[j]); break;
+                    case ncclInt64:   isMatch &= (outputI8[j] == expectedI8[j]); break;
+                    case ncclUint64:  isMatch &= (outputU8[j] == expectedU8[j]); break;
+                    case ncclFloat32: isMatch &= (outputF4[j] == expectedF4[j]); break;
+                    case ncclFloat64: isMatch &= (outputF8[j] == expectedF8[j]); break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(0);
+                    }
+
+                    if (!isMatch)
+                    {
+                        switch (dataset.dataType)
+                        {
+                        case ncclInt8:
+                            printf("Expected %d.  Output %d on device %d[%d]\n", outputI1[j], expectedI1[j], i, j); break;
+                        case ncclUint8:
+                            printf("Expected %u.  Output %u on device %d[%d]\n", outputU1[j], expectedU1[j], i, j); break;
+                        case ncclInt32:
+                            printf("Expected %d.  Output %d on device %d[%d]\n", outputI4[j], expectedI4[j], i, j); break;
+                        case ncclUint32:
+                            printf("Expected %u.  Output %u on device %d[%d]\n", outputU4[j], expectedU4[j], i, j); break;
+                        case ncclInt64:
+                            printf("Expected %ld.  Output %ld on device %d[%d]\n", outputI8[j], expectedI8[j], i, j); break;
+                        case ncclUint64:
+                            printf("Expected %lu.  Output %lu on device %d[%d]\n", outputU8[j], expectedU8[j], i, j); break;
+                        case ncclFloat32:
+                            printf("Expected %f.  Output %f on device %d[%d]\n", outputF4[j], expectedF4[j], i, j); break;
+                        case ncclFloat64:
+                            printf("Expected %lf.  Output %lf on device %d[%d]\n", outputF8[j], expectedF8[j], i, j); break;
+                        default:
+                            fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                            exit(0);
+                        }
+                    }
+                }
+                ASSERT_EQ(isMatch, true);
+            }
+        }
+
+        // Passed in parameters from TestTuple
+        ncclRedOp_t              op;
+        ncclDataType_t           dataType;
+        size_t                   numElements;
+        int                      numDevices;
+        bool                     inPlace;
+
+        int                      numDevicesAvailable;
+        std::vector<ncclComm_t>  comms;
+        std::vector<hipStream_t> streams;
+    };
+
+}
+
+#endif
diff --git a/projects/rccl/test/test_AllGather.cpp b/projects/rccl/test/test_AllGather.cpp
new file mode 100644
index 0000000000..d65a45a28e
--- /dev/null
+++ b/projects/rccl/test/test_AllGather.cpp
@@ -0,0 +1,63 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "test_AllGather.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    TEST_P(AllGatherCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+        if (numElements % numDevices != 0) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+        ComputeExpectedResults(dataset);
+
+        size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+        size_t const sendCount = dataset.numElements / dataset.numDevices;
+
+        // Launch the reduction (1 thread per GPU)
+        #pragma omp parallel for num_threads(numDevices)
+        for (int i = 0; i < numDevices; i++)
+        {
+            ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
+                          dataset.outputs[i], sendCount,
+                          dataType, comms[i], streams[i]);
+        }
+
+        // Wait for reduction to complete
+        for (int i = 0; i < numDevices; i++)
+            hipStreamSynchronize(streams[i]);
+
+        // Check results
+        ValidateResults(dataset);
+    }
+
+    INSTANTIATE_TEST_SUITE_P(AllGatherCorrectnessSweep,
+                             AllGatherCorrectnessTest,
+                             testing::Combine(
+                                 // Reduction operator (not used)
+                                 testing::Values(ncclSum),
+                                 // Data types
+                                 testing::Values(ncclInt8,
+                                                 ncclUint8,
+                                                 ncclInt32,
+                                                 ncclUint32,
+                                                 ncclInt64,
+                                                 ncclUint64,
+                                                 //ncclFloat16,
+                                                 ncclFloat32,
+                                                 ncclFloat64),
+                                 // Number of elements
+                                 testing::Values(3072, 3145728),
+                                 // Number of devices
+                                 testing::Values(2,3,4),
+                                 // In-place or not
+                                 testing::Values(false, true)));
+} // namespace
diff --git a/projects/rccl/test/test_AllGather.hpp b/projects/rccl/test/test_AllGather.hpp
new file mode 100644
index 0000000000..3146fbe287
--- /dev/null
+++ b/projects/rccl/test/test_AllGather.hpp
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLGATHER_HPP
+#define TEST_ALLGATHER_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class AllGatherCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        static void ComputeExpectedResults(Dataset& dataset)
+        {
+            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+
+            int8_t* result = (int8_t *)malloc(dataset.NumBytes());
+
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(result + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount),
+                                   byteCount, hipMemcpyDeviceToHost));
+
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy(dataset.expected[i], result, dataset.NumBytes());
+        }
+    };
+}
+
+#endif
diff --git a/projects/rccl/test/test_AllReduce.cpp b/projects/rccl/test/test_AllReduce.cpp
new file mode 100644
index 0000000000..d4b35b6890
--- /dev/null
+++ b/projects/rccl/test/test_AllReduce.cpp
@@ -0,0 +1,59 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_AllReduce.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    TEST_P(AllReduceCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+        ComputeExpectedResults(dataset, op);
+
+        // Launch the reduction (1 thread per GPU)
+        #pragma omp parallel for num_threads(numDevices)
+        for (int i = 0; i < numDevices; i++)
+        {
+            ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
+                          numElements, dataType, op, comms[i], streams[i]);
+        }
+
+        // Wait for reduction to complete
+        for (int i = 0; i < numDevices; i++)
+            hipStreamSynchronize(streams[i]);
+
+        // Check results
+        ValidateResults(dataset);
+    }
+
+    INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
+                             AllReduceCorrectnessTest,
+                             testing::Combine(
+                                 // Reduction operator
+                                 testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+                                 // Data types
+                                 testing::Values(ncclInt8,
+                                                 ncclUint8,
+                                                 ncclInt32,
+                                                 ncclUint32,
+                                                 ncclInt64,
+                                                 ncclUint64,
+                                                 //ncclFloat16,
+                                                 ncclFloat32,
+                                                 ncclFloat64),
+                                 // Number of elements
+                                 testing::Values(1024, 1048576),
+                                 // Number of devices
+                                 testing::Values(2,3,4),
+                                 // In-place or not
+                                 testing::Values(false, true)));
+} // namespace
diff --git a/projects/rccl/test/test_AllReduce.hpp b/projects/rccl/test/test_AllReduce.hpp
new file mode 100644
index 0000000000..d8867cb649
--- /dev/null
+++ b/projects/rccl/test/test_AllReduce.hpp
@@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLREDUCE_HPP
+#define TEST_ALLREDUCE_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class AllReduceCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
+        {
+            // Copy all inputs to expected arrays temporarily to perform reduction on host
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Allocate temporary host array to accumulate results
+            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+            uint8_t*  resultU1 = (uint8_t  *)resultI1;
+            int32_t*  resultI4 = (int32_t  *)resultI1;
+            uint32_t* resultU4 = (uint32_t *)resultI1;
+            int64_t*  resultI8 = (int64_t  *)resultI1;
+            uint64_t* resultU8 = (uint64_t *)resultI1;
+            float*    resultF4 = (float    *)resultI1;
+            double*   resultF8 = (double   *)resultI1;
+
+            // Initialize the result with the first device's array
+            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+            // Perform reduction on the other device arrays
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                float*    arrayF4 = (float    *)arrayI1;
+                double*   arrayF8 = (double   *)arrayI1;
+
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(0);
+                    }
+                }
+            }
+
+            // Copy results into expected arrays
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
+
+            free(resultI1);
+        }
+    };
+}
+
+#endif
diff --git a/projects/rccl/test/test_Broadcast.cpp b/projects/rccl/test/test_Broadcast.cpp
new file mode 100644
index 0000000000..c2f47b30ad
--- /dev/null
+++ b/projects/rccl/test/test_Broadcast.cpp
@@ -0,0 +1,67 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_Broadcast.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    TEST_P(BroadcastCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Allocate data
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+
+        // Test each possible root
+        for (int root = 0; root < numDevices; root++)
+        {
+            // Prepare input / output / expected results
+            FillDatasetWithPattern(dataset);
+            ComputeExpectedResults(dataset, root);
+
+            // Launch the reduction (1 thread per GPU)
+            #pragma omp parallel for num_threads(numDevices)
+            for (int i = 0; i < numDevices; i++)
+            {
+                ncclBroadcast(dataset.inputs[i],
+                              dataset.outputs[i],
+                              numElements, dataType,
+                              root, comms[i], streams[i]);
+            }
+
+            // Wait for reduction to complete
+            for (int i = 0; i < numDevices; i++)
+                hipStreamSynchronize(streams[i]);
+
+            // Check results
+            ValidateResults(dataset);
+        }
+    }
+
+    INSTANTIATE_TEST_SUITE_P(BroadcastCorrectnessSweep,
+                             BroadcastCorrectnessTest,
+                             testing::Combine(
+                                 // Reduction operator is not used
+                                 testing::Values(ncclSum),
+                                 // Data types
+                                 testing::Values(ncclInt8,
+                                                 ncclUint8,
+                                                 ncclInt32,
+                                                 ncclUint32,
+                                                 ncclInt64,
+                                                 ncclUint64,
+                                                 //ncclFloat16,
+                                                 ncclFloat32,
+                                                 ncclFloat64),
+                                 // Number of elements
+                                 testing::Values(1024, 1048576),
+                                 // Number of devices
+                                 testing::Values(2,3,4),
+                                 // In-place or not
+                                 testing::Values(false, true)));
+} // namespace
diff --git a/projects/rccl/test/test_Broadcast.hpp b/projects/rccl/test/test_Broadcast.hpp
new file mode 100644
index 0000000000..5ccf351592
--- /dev/null
+++ b/projects/rccl/test/test_Broadcast.hpp
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_BROADCAST_HPP
+#define TEST_BROADCAST_HPP
+
+#include "CorrectnessTest.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    class BroadcastCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        static void ComputeExpectedResults(Dataset& dataset, int const root)
+        {
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[root],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+        }
+    };
+}
+
+#endif
diff --git a/projects/rccl/test/test_GroupCalls.cpp b/projects/rccl/test/test_GroupCalls.cpp
new file mode 100644
index 0000000000..9bf0dd5497
--- /dev/null
+++ b/projects/rccl/test/test_GroupCalls.cpp
@@ -0,0 +1,125 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "test_GroupCalls.hpp"
+
+#include "test_AllGather.hpp"
+#include "test_AllReduce.hpp"
+#include "test_Broadcast.hpp"
+#include "test_Reduce.hpp"
+#include "test_ReduceScatter.hpp"
+
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    TEST_P(GroupCallsCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Create multiple datasets for group operation
+        std::vector<Dataset> datasets(5);
+        for (int i = 0; i < datasets.size(); i++)
+        {
+            datasets[i].Initialize(numDevices, numElements, dataType, inPlace);
+            FillDatasetWithPattern(datasets[i]);
+        }
+
+        // Compute expected results for each dataset in group
+        int const root = 0;
+        AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
+        AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
+        BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
+        ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
+        ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
+
+        // Start a group call
+        ncclGroupStart();
+
+        // AllGather
+        size_t const byteCount = datasets[0].NumBytes() / numDevices;
+        size_t const elemCount = numElements / numDevices;
+        for (int i = 0; i < numDevices; i++)
+        {
+            HIP_CALL(hipSetDevice(i));
+            ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
+                          datasets[0].outputs[i], elemCount,
+                          dataType, comms[i], streams[i]);
+        }
+
+        // AllReduce
+        for (int i = 0; i < numDevices; i++)
+        {
+            HIP_CALL(hipSetDevice(i));
+            ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
+                          numElements, dataType, op, comms[i], streams[i]);
+        }
+
+        // Broadcast
+        for (int i = 0; i < numDevices; i++)
+        {
+            HIP_CALL(hipSetDevice(i));
+            ncclBroadcast(datasets[2].inputs[i],
+                          datasets[2].outputs[i],
+                          numElements, dataType,
+                          root, comms[i], streams[i]);
+        }
+
+        // Reduce
+        for (int i = 0; i < numDevices; i++)
+        {
+            HIP_CALL(hipSetDevice(i));
+            ncclReduce(datasets[3].inputs[i],
+                       datasets[3].outputs[i],
+                       numElements, dataType, op,
+                       root, comms[i], streams[i]);
+        }
+
+        // ReduceScatter
+        for (int i = 0; i < numDevices; i++)
+        {
+            ncclReduceScatter(datasets[4].inputs[i],
+                              (int8_t *)datasets[4].outputs[i] + (i * byteCount),
+                              elemCount, dataType, op,
+                              comms[i], streams[i]);
+            HIP_CALL(hipSetDevice(i));
+        }
+
+        // Signal end of group call
+        ncclGroupEnd();
+
+        // Wait for reduction to complete
+        for (int i = 0; i < numDevices; i++)
+            hipStreamSynchronize(streams[i]);
+
+        // Check results for each collective in the group
+        for (int i = 0; i < 5; i++)
+        {
+            ValidateResults(datasets[i]);
+        }
+    }
+
+    INSTANTIATE_TEST_SUITE_P(GroupCallsCorrectnessSweep,
+                             GroupCallsCorrectnessTest,
+                             testing::Combine(
+                                 // Reduction operator (not used)
+                                 testing::Values(ncclSum),
+                                 // Data types
+                                 testing::Values(ncclInt8,
+                                                 ncclUint8,
+                                                 ncclInt32,
+                                                 ncclUint32,
+                                                 ncclInt64,
+                                                 ncclUint64,
+                                                 //ncclFloat16,
+                                                 ncclFloat32,
+                                                 ncclFloat64),
+                                 // Number of elements
+                                 testing::Values(3072, 3145728),
+                                 // Number of devices
+                                 testing::Values(2,3,4),
+                                 // In-place or not
+                                 testing::Values(false, true)));
+} // namespace
diff --git a/projects/rccl/test/test_GroupCalls.hpp b/projects/rccl/test/test_GroupCalls.hpp
new file mode 100644
index 0000000000..00f84ce6a9
--- /dev/null
+++ b/projects/rccl/test/test_GroupCalls.hpp
@@ -0,0 +1,17 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TEST_GROUPCALLS_HPP
+#define TEST_GROUPCALLS_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class GroupCallsCorrectnessTest : public CorrectnessTest {};
+}
+
+#endif
diff --git a/projects/rccl/test/test_Reduce.cpp b/projects/rccl/test/test_Reduce.cpp
new file mode 100644
index 0000000000..089cc97593
--- /dev/null
+++ b/projects/rccl/test/test_Reduce.cpp
@@ -0,0 +1,67 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_Reduce.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    TEST_P(ReduceCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Allocate data
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+
+        // Test each possible root
+        for (int root = 0; root < numDevices; root++)
+        {
+            // Prepare input / output / expected results
+            FillDatasetWithPattern(dataset);
+            ComputeExpectedResults(dataset, op, root);
+
+            // Launch the reduction (1 thread per GPU)
+            #pragma omp parallel for num_threads(numDevices)
+            for (int i = 0; i < numDevices; i++)
+            {
+                ncclReduce(dataset.inputs[i],
+                           dataset.outputs[i],
+                           numElements, dataType, op,
+                           root, comms[i], streams[i]);
+            }
+
+            // Wait for reduction to complete
+            for (int i = 0; i < numDevices; i++)
+                hipStreamSynchronize(streams[i]);
+
+            // Check results
+            ValidateResults(dataset);
+        }
+    }
+
+    INSTANTIATE_TEST_SUITE_P(ReduceCorrectnessSweep,
+                             ReduceCorrectnessTest,
+                             testing::Combine(
+                                 // Reduction operator
+                                 testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+                                 // Data types
+                                 testing::Values(ncclInt8,
+                                                 ncclUint8,
+                                                 ncclInt32,
+                                                 ncclUint32,
+                                                 ncclInt64,
+                                                 ncclUint64,
+                                                 //ncclFloat16,
+                                                 ncclFloat32,
+                                                 ncclFloat64),
+                                 // Number of elements
+                                 testing::Values(1024, 1048576),
+                                 // Number of devices
+                                 testing::Values(2,3,4),
+                                 // In-place or not
+                                 testing::Values(false, true)));
+} // namespace
diff --git a/projects/rccl/test/test_Reduce.hpp b/projects/rccl/test/test_Reduce.hpp
new file mode 100644
index 0000000000..3ab9d66b44
--- /dev/null
+++ b/projects/rccl/test/test_Reduce.hpp
@@ -0,0 +1,80 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_REDUCE_HPP
+#define TEST_REDUCE_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class ReduceCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root)
+        {
+            // Copy all inputs to expected arrays temporarily to perform reduction on host
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Allocate temporary host array to accumulate results
+            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+            uint8_t*  resultU1 = (uint8_t  *)resultI1;
+            int32_t*  resultI4 = (int32_t  *)resultI1;
+            uint32_t* resultU4 = (uint32_t *)resultI1;
+            int64_t*  resultI8 = (int64_t  *)resultI1;
+            uint64_t* resultU8 = (uint64_t *)resultI1;
+            float*    resultF4 = (float    *)resultI1;
+            double*   resultF8 = (double   *)resultI1;
+
+            // Initialize the result with the first device's array
+            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+            // Perform reduction on the other device arrays
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                float*    arrayF4 = (float    *)arrayI1;
+                double*   arrayF8 = (double   *)arrayI1;
+
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(0);
+                    }
+                }
+            }
+
+            // Copy results into expected arrays
+            for (int i = 0; i < dataset.numDevices; i++)
+            {
+                if (i == root)
+                    memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
+                else
+                    HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
+            }
+            free(resultI1);
+        }
+    };
+}
+
+#endif
diff --git a/projects/rccl/test/test_ReduceScatter.cpp b/projects/rccl/test/test_ReduceScatter.cpp
new file mode 100644
index 0000000000..10ae2affc8
--- /dev/null
+++ b/projects/rccl/test/test_ReduceScatter.cpp
@@ -0,0 +1,65 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_ReduceScatter.hpp"
+#include <omp.h>
+
+namespace CorrectnessTests
+{
+    TEST_P(ReduceScatterCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+        if (numElements % numDevices != 0) return;
+
+        // Prepare input / output / expected results
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace);
+        FillDatasetWithPattern(dataset);
+        ComputeExpectedResults(dataset, op);
+
+        size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+        size_t const recvCount = dataset.numElements / dataset.numDevices;
+
+        // Launch the reduction (1 thread per GPU)
+        #pragma omp parallel for num_threads(numDevices)
+        for (int i = 0; i < numDevices; i++)
+        {
+            ncclReduceScatter(dataset.inputs[i],
+                              (int8_t *)dataset.outputs[i] + (i * byteCount),
+                              recvCount, dataType, op,
+                              comms[i], streams[i]);
+        }
+
+        // Wait for reduction to complete
+        for (int i = 0; i < numDevices; i++)
+            hipStreamSynchronize(streams[i]);
+
+        // Check results
+        ValidateResults(dataset);
+    }
+
+    INSTANTIATE_TEST_SUITE_P(ReduceScatterCorrectnessSweep,
+                             ReduceScatterCorrectnessTest,
+                             testing::Combine(
+                                 // Reduction operator
+                                 testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
+                                 // Data types
+                                 testing::Values(ncclInt8,
+                                                 ncclUint8,
+                                                 ncclInt32,
+                                                 ncclUint32,
+                                                 ncclInt64,
+                                                 ncclUint64,
+                                                 //ncclFloat16,
+                                                 ncclFloat32,
+                                                 ncclFloat64),
+                                 // Number of elements
+                                 testing::Values(3072, 3145728),
+                                 // Number of devices
+                                 testing::Values(2,3,4),
+                                 // In-place or not
+                                 testing::Values(false, true)));
+} // namespace
diff --git a/projects/rccl/test/test_ReduceScatter.hpp b/projects/rccl/test/test_ReduceScatter.hpp
new file mode 100644
index 0000000000..a1731f13b2
--- /dev/null
+++ b/projects/rccl/test/test_ReduceScatter.hpp
@@ -0,0 +1,83 @@
+/*************************************************************************
+ * Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_REDUCE_SCATTER_HPP
+#define TEST_REDUCE_SCATTER_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class ReduceScatterCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
+        {
+            // Copy all inputs to expected arrays temporarily to perform reduction on host
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            // Allocate temporary host array to accumulate results
+            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
+            uint8_t*  resultU1 = (uint8_t  *)resultI1;
+            int32_t*  resultI4 = (int32_t  *)resultI1;
+            uint32_t* resultU4 = (uint32_t *)resultI1;
+            int64_t*  resultI8 = (int64_t  *)resultI1;
+            uint64_t* resultU8 = (uint64_t *)resultI1;
+            float*    resultF4 = (float    *)resultI1;
+            double*   resultF8 = (double   *)resultI1;
+
+            // Initialize the result with the first device's array
+            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
+
+            // Perform reduction on the other device arrays
+            for (int i = 1; i < dataset.numDevices; i++)
+            {
+                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
+                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
+                int32_t*  arrayI4 = (int32_t  *)arrayI1;
+                uint32_t* arrayU4 = (uint32_t *)arrayI1;
+                int64_t*  arrayI8 = (int64_t  *)arrayI1;
+                uint64_t* arrayU8 = (uint64_t *)arrayI1;
+                float*    arrayF4 = (float    *)arrayI1;
+                double*   arrayF8 = (double   *)arrayI1;
+
+                for (int j = 0; j < dataset.numElements; j++)
+                {
+                    switch (dataset.dataType)
+                    {
+                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
+                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
+                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
+                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
+                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
+                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
+                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
+                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
+                    default:
+                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
+                        exit(0);
+                    }
+                }
+            }
+
+            // Copy results into expected arrays
+            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
+
+            for (int i = 0; i < dataset.numDevices; i++)
+                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
+                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
+
+            for (int i = 0; i < dataset.numDevices; i++)
+                memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
+                       resultI1 + (i * byteCount), byteCount);
+
+            free(resultI1);
+        }
+    };
+}
+
+#endif