From 00a910c2dad9995b72d24ee1520d47bc9b57375d Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 26 Nov 2019 12:40:26 -0800
Subject: [PATCH 1/8] Change manual build instructions to fit most common usage

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 56eca69c52..405b69127b 100644
--- a/README.md
+++ b/README.md
@@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
 $ cd rccl
 $ mkdir build
 $ cd build
-$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+$ CXX=/opt/rocm/bin/hcc cmake ..
 $ make -j 8
 ```
-You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
+You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
+```shell
+$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+```
+Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
 
 #### To build the RCCL package and install package :
 

From 90e928bcd5075e47648d913a7888909ba3499732 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Wed, 4 Dec 2019 21:05:10 +0000
Subject: [PATCH 2/8] Change default P2P level

---
 src/transport/p2p.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 7045ee8abc..7ce944fef4 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -78,7 +78,7 @@ static int busIdToCudaDev(const char* busId) {
 /* Determine if we can communicate with the peer through p2p */
 ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   // Do not use P2P across root complexes by default (provided CUDA permits it)
-  int p2pLevel = PATH_NODE;
+  int p2pLevel = PATH_SYS;
   if (ncclParamP2pDisable() == 1) p2pLevel = 0;
   if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
 

From bd59b6f880b8dd6593808b18946a3d4e569b7f60 Mon Sep 17 00:00:00 2001
From: saadrahim <44449863+saadrahim@users.noreply.github.com>
Date: Fri, 6 Dec 2019 14:00:25 -0700
Subject: [PATCH 3/8] Changing package dependency to rocm-dev (#160)

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6ff4847ba..b7dcf55942 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,8 +186,8 @@ rocm_export_targets(NAMESPACE
                     DEPENDS
                     hip)
 
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
-set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)")
+set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev (>= 2.5.27)")
 
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
 
@@ -197,7 +197,7 @@ rocm_create_package(
   DESCRIPTION
   "Optimized primitives for collective multi-GPU communication"
   MAINTAINER
-  "<no-reply@amd.com>"
+  "<rccl-maintainer@amd.com>"
   LDCONFIG)
 
 rocm_install_symlink_subdir(rccl)

From 0092b3513278bd39cae91ebb675a628968ccfa38 Mon Sep 17 00:00:00 2001
From: saadrahim <44449863+saadrahim@users.noreply.github.com>
Date: Fri, 6 Dec 2019 16:06:50 -0700
Subject: [PATCH 4/8] Package fix (#161)

* Fixing RHEL dependency on rocm-dev
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7dcf55942..a68b2836c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,7 +187,7 @@ rocm_export_targets(NAMESPACE
                     hip)
 
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)")
-set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev (>= 2.5.27)")
+set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27")
 
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
 

From 2f4269d06d803541057c6d4ef9ec192660d22fbc Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Thu, 12 Dec 2019 15:20:54 -0700
Subject: [PATCH 5/8] Adding new sleep after sync capability for data fabric
 profiling (#162)

Fixing missing header include for ROCM 3.0 changes
---
 tools/TransferBench/TransferBench.cpp | 18 +++++++++++++-----
 tools/TransferBench/TransferBench.hpp |  2 ++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index afb548ff58..016ff80ed4 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -27,6 +27,7 @@ THE SOFTWARE.
 #include <cstdio>
 #include <cstdlib>
 #include <set>
+#include <unistd.h>
 
 #include <hip/hip_runtime.h>
 #include "copy_kernel.h"
@@ -49,12 +50,13 @@ int main(int argc, char **argv)
     printf("\n");
     printf("Environment variables:\n");
     printf("======================\n");
-    printf(" USE_HIP_CALL    - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
-    printf(" USE_MEMSET      - Write constant value (instead of doing a copy)\n");
-    printf(" USE_COARSE_MEM  - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
-    printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
-    printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
+    printf(" USE_HIP_CALL     - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
+    printf(" USE_MEMSET       - Write constant value (instead of doing a copy)\n");
+    printf(" USE_COARSE_MEM   - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+    printf(" USE_SINGLE_SYNC  - Only synchronize once at end of iterations (disables GPU times)\n");
+    printf(" USE_INTERACTIVE  - Waits for user-input prior to start and after transfer loop (for profiling)\n");
     printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n");
+    printf(" USE_SLEEP        - Adds a 100ms sleep after sync (for profiling)\n");
     exit(0);
   }
 
@@ -74,6 +76,7 @@ int main(int argc, char **argv)
   bool useCoarseMem = getenv("USE_COARSE_MEM");
   bool useSingleSync = getenv("USE_SINGLE_SYNC");
   bool useInteractive = getenv("USE_INTERACTIVE");
+  bool useSleep = getenv("USE_SLEEP");
 
   int numWarmups = 3;
   int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10;
@@ -99,6 +102,10 @@ int main(int argc, char **argv)
     printf("Running in interactive mode (USE_INTERACTIVE)\n");
   else
     printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n");
+  if (useSleep)
+    printf("Adding 100ms sleep after sync (USE_SLEEP)\n");
+  else
+    printf("No sleep per sync (enable sleep via USE_SLEEP)\n");
 
   printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATION=#)\n",
          numWarmups, numIterations);
@@ -314,6 +321,7 @@ int main(int argc, char **argv)
 
       auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
       double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+      if (useSleep) usleep(100000);
 
       if (iteration >= 0)
       {
diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp
index 5fde8f58fe..92c596d672 100644
--- a/tools/TransferBench/TransferBench.hpp
+++ b/tools/TransferBench/TransferBench.hpp
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+#include <sstream>
+
 // Helper macro for catching HIP errors
 #define HIP_CALL(cmd)                                                   \
     do {                                                                \

From 000bce6f27b0689ce7f5d2d9b22d9b718548aee9 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Fri, 20 Dec 2019 11:41:56 -0700
Subject: [PATCH 6/8] Removing OpenMP from unit tests (#163)

---
 test/CMakeLists.txt          | 7 -------
 test/test_AllGather.cpp      | 7 ++++---
 test/test_AllReduce.cpp      | 4 ++--
 test/test_AllReduceAbort.cpp | 1 -
 test/test_Broadcast.cpp      | 5 ++---
 test/test_Broadcast.hpp      | 1 -
 test/test_BroadcastAbort.cpp | 1 -
 test/test_CombinedCalls.cpp  | 5 ++---
 test/test_GroupCalls.cpp     | 2 --
 test/test_Reduce.cpp         | 4 ++--
 test/test_ReduceScatter.cpp  | 5 ++---
 11 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 66236c2f18..ee542902b6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -10,13 +10,6 @@ if(BUILD_TESTS)
       message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
   endif()
 
-  # OpenMP is used to drive GPUs (one per thread)
-  if(EXISTS /etc/redhat-release)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
-  endif()    
-
   # Download and unpack googletest at configure time
   configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
   execute_process(
diff --git a/test/test_AllGather.cpp b/test/test_AllGather.cpp
index b28df48e87..df5aea23e3 100644
--- a/test/test_AllGather.cpp
+++ b/test/test_AllGather.cpp
@@ -4,7 +4,6 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 #include "test_AllGather.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -23,13 +22,14 @@ namespace CorrectnessTests
         size_t const sendCount = dataset.numElements / dataset.numDevices;
 
         // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
                           dataset.outputs[i], sendCount,
                           dataType, comms[i], streams[i]);
         }
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
@@ -68,13 +68,14 @@ namespace CorrectnessTests
                 size_t const sendCount = subDataset.numElements / subDataset.numDevices;
 
                 // Launch the reduction (1 thread per GPU)
-                #pragma omp parallel for num_threads(numDevices)
+                ncclGroupStart();
                 for (int i = 0; i < numDevices; i++)
                 {
                     ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
                                   subDataset.outputs[i], sendCount,
                                   dataType, comms[i], streams[i]);
                 }
+                ncclGroupEnd();
 
                 // Wait for reduction to complete
                 Synchronize();
diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp
index 0fd5eedf91..86d4654b7b 100644
--- a/test/test_AllReduce.cpp
+++ b/test/test_AllReduce.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_AllReduce.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -20,12 +19,13 @@ namespace CorrectnessTests
         ComputeExpectedResults(dataset, op);
 
         // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
                           numElements, dataType, op, comms[i], streams[i]);
         }
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp
index 9400bd84fc..cccca4a547 100644
--- a/test/test_AllReduceAbort.cpp
+++ b/test/test_AllReduceAbort.cpp
@@ -6,7 +6,6 @@
 
 #include "test_AllReduceAbort.hpp"
 #include "../include/core.h"
-#include <omp.h>
 
 #define NUM_ITER 8
 #define FAKE_OP_COUNT NUM_ITER+1
diff --git a/test/test_Broadcast.cpp b/test/test_Broadcast.cpp
index 0e728b0153..a98d5884cb 100644
--- a/test/test_Broadcast.cpp
+++ b/test/test_Broadcast.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_Broadcast.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -25,7 +24,7 @@ namespace CorrectnessTests
             ComputeExpectedResults(dataset, root);
 
             // Launch the reduction (1 thread per GPU)
-            #pragma omp parallel for num_threads(numDevices)
+            ncclGroupStart();
             for (int i = 0; i < numDevices; i++)
             {
                 ncclBroadcast(dataset.inputs[i],
@@ -33,7 +32,7 @@ namespace CorrectnessTests
                               numElements, dataType,
                               root, comms[i], streams[i]);
             }
-
+            ncclGroupEnd();
 
             // Wait for reduction to complete
             Synchronize();
diff --git a/test/test_Broadcast.hpp b/test/test_Broadcast.hpp
index 5ccf351592..358d8cf09a 100644
--- a/test/test_Broadcast.hpp
+++ b/test/test_Broadcast.hpp
@@ -7,7 +7,6 @@
 #define TEST_BROADCAST_HPP
 
 #include "CorrectnessTest.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp
index 28596cc52a..824bced360 100644
--- a/test/test_BroadcastAbort.cpp
+++ b/test/test_BroadcastAbort.cpp
@@ -6,7 +6,6 @@
 
 #include "test_BroadcastAbort.hpp"
 #include "../include/core.h"
-#include <omp.h>
 
 #define NUM_ITER 8
 #define FAKE_OP_COUNT NUM_ITER+1
diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp
index 4b51ab5375..41e93c78a2 100644
--- a/test/test_CombinedCalls.cpp
+++ b/test/test_CombinedCalls.cpp
@@ -11,8 +11,6 @@
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"
 
-#include <omp.h>
-
 namespace CorrectnessTests
 {
     TEST_P(CombinedCallsCorrectnessTest, Correctness)
@@ -38,7 +36,7 @@ namespace CorrectnessTests
         size_t const byteCount = datasets[0].NumBytes() / numDevices;
         size_t const elemCount = numElements / numDevices;
 
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
@@ -63,6 +61,7 @@ namespace CorrectnessTests
                               elemCount, dataType, op,
                               comms[i], streams[i]);
         }
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp
index 77780b633d..3cfd55c6f5 100644
--- a/test/test_GroupCalls.cpp
+++ b/test/test_GroupCalls.cpp
@@ -11,8 +11,6 @@
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"
 
-#include <omp.h>
-
 namespace CorrectnessTests
 {
     TEST_P(GroupCallsCorrectnessTest, Correctness)
diff --git a/test/test_Reduce.cpp b/test/test_Reduce.cpp
index 9844e928c1..8234f2ab05 100644
--- a/test/test_Reduce.cpp
+++ b/test/test_Reduce.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_Reduce.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -25,7 +24,7 @@ namespace CorrectnessTests
             ComputeExpectedResults(dataset, op, root);
 
             // Launch the reduction (1 thread per GPU)
-            #pragma omp parallel for num_threads(numDevices)
+            ncclGroupStart();
             for (int i = 0; i < numDevices; i++)
             {
                 ncclReduce(dataset.inputs[i],
@@ -33,6 +32,7 @@ namespace CorrectnessTests
                            numElements, dataType, op,
                            root, comms[i], streams[i]);
             }
+            ncclGroupEnd();
 
             // Wait for reduction to complete
             Synchronize();
diff --git a/test/test_ReduceScatter.cpp b/test/test_ReduceScatter.cpp
index 11007732b5..f6461f20b8 100644
--- a/test/test_ReduceScatter.cpp
+++ b/test/test_ReduceScatter.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_ReduceScatter.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -24,7 +23,7 @@ namespace CorrectnessTests
         size_t const recvCount = dataset.numElements / dataset.numDevices;
 
         // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclReduceScatter(dataset.inputs[i],
@@ -32,7 +31,7 @@ namespace CorrectnessTests
                               recvCount, dataType, op,
                               comms[i], streams[i]);
         }
-
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();

From e5074ce94de0df40c460ab611df1ee11eca6344f Mon Sep 17 00:00:00 2001
From: Gilbert Lee <gilbert.lee@amd.com>
Date: Fri, 20 Dec 2019 17:08:39 -0800
Subject: [PATCH 7/8] Changing single sync mode to time all iterations instead
 of just last

---
 tools/TransferBench/TransferBench.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index 016ff80ed4..5bdccdf807 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -28,7 +28,9 @@ THE SOFTWARE.
 #include <cstdlib>
 #include <set>
 #include <unistd.h>
-
+#include <map>
+#include <iostream>
+#include <sstream>
 #include <hip/hip_runtime.h>
 #include "copy_kernel.h"
 #include "TransferBench.hpp"
@@ -272,7 +274,8 @@ int main(int argc, char **argv)
       {
         HIP_CALL(hipSetDevice(links[i].srcGpu));
 
-        HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
+        if (!useSingleSync || iteration == 0)
+          HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
 
         if (useHipCall)
         {
@@ -308,7 +311,8 @@ int main(int argc, char **argv)
                                gpuBlockParams[i]);
           }
         }
-        HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
+        if (!useSingleSync || iteration == numIterations - 1)
+          HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
       }
 
       // Synchronize per iteration, unless in single sync mode, in which case
@@ -378,8 +382,7 @@ int main(int argc, char **argv)
       }
       else
       {
-        if (!useSingleSync)
-          totalGpuTime[i] /= (1.0 * numIterations);
+        totalGpuTime[i] /= (1.0 * numIterations);
         printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
       }
     }

From 15c917244d4a3c6992751aa30d38a61b3f109f83 Mon Sep 17 00:00:00 2001
From: paulfreddy <52053501+paulfreddy@users.noreply.github.com>
Date: Wed, 8 Jan 2020 21:28:16 -0800
Subject: [PATCH 8/8] Changes for multiple ROCm installation (#164)

* Changes for multiple ROCm installation

   1. Set version to 2.10.1
   2. Add CMAKE_INSTALL_PREFIX to neccessary places
   3. Cleanup, fix rpath, use prefix in install.sh

* Changes for multiple ROCm installation

   1. Set soversion to match release version
   2. Add CMAKE_INSTALL_PREFIX to neccessary places
   3. Cleanup, fix rpath, use prefix in install.sh

* Changes for multiple ROCm installation

1. Set soversion to match release version
2. Add CMAKE_INSTALL_PREFIX to neccessary places
3. Cleanup, fix rpath, use prefix in install.sh
---
 CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a68b2836c6..b946053e47 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -170,6 +170,9 @@ else()
   target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
 endif()
 
+#Setup librccl.so version
+rocm_set_soversion(rccl "1.0")
+
 rocm_install_targets(TARGETS
                      rccl
                      PREFIX