Merge remote-tracking branch 'remotes/rccl/master' into rccl_2.5.6_cleanup

2020-01-29 15:28:03 -08:00
commit fe6d012eb0
@@ -181,6 +181,9 @@ else()
  target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
 endif()

+# Set up the librccl.so version
+rocm_set_soversion(rccl "1.0")
+
 rocm_install_targets(TARGETS
                     rccl
                     PREFIX
@@ -197,8 +200,8 @@ rocm_export_targets(NAMESPACE
                    DEPENDS
                    hip)

-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
-set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)")
+set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27")

 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")

@@ -208,7 +211,7 @@ rocm_create_package(
  DESCRIPTION
  "Optimized primitives for collective multi-GPU communication"
  MAINTAINER
-  "<no-reply@amd.com>"
+  "<rccl-maintainer@amd.com>"
  LDCONFIG)

 rocm_install_symlink_subdir(rccl)
@@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
 $ cd rccl
 $ mkdir build
 $ cd build
-$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+$ CXX=/opt/rocm/bin/hcc cmake ..
 $ make -j 8
 ```
-You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
+To install to a location of your own choosing, pass CMAKE_INSTALL_PREFIX to cmake. For example:
+```shell
+$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+```
+Note: ensure rocm-cmake is installed (e.g. `apt install rocm-cmake`).

 #### To build the RCCL package and install package :

@@ -10,13 +10,6 @@ if(BUILD_TESTS)
      message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
  endif()

-  # OpenMP is used to drive GPUs (one per thread)
-  if(EXISTS /etc/redhat-release)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
-  endif()    
-
  # Download and unpack googletest at configure time
  configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
  execute_process(
@@ -4,7 +4,6 @@
 * See LICENSE.txt for license information
 ************************************************************************/
 #include "test_AllGather.hpp"
-#include <omp.h>

 namespace CorrectnessTests
 {
@@ -23,13 +22,14 @@ namespace CorrectnessTests
        size_t const sendCount = dataset.numElements / dataset.numDevices;

        // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
        for (int i = 0; i < numDevices; i++)
        {
            ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
                          dataset.outputs[i], sendCount,
                          dataType, comms[i], streams[i]);
        }
+        ncclGroupEnd();

        // Wait for reduction to complete
        Synchronize();
@@ -68,13 +68,14 @@ namespace CorrectnessTests
                size_t const sendCount = subDataset.numElements / subDataset.numDevices;

                // Launch the reduction (1 thread per GPU)
-                #pragma omp parallel for num_threads(numDevices)
+                ncclGroupStart();
                for (int i = 0; i < numDevices; i++)
                {
                    ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
                                  subDataset.outputs[i], sendCount,
                                  dataType, comms[i], streams[i]);
                }
+                ncclGroupEnd();

                // Wait for reduction to complete
                Synchronize();
@@ -5,7 +5,6 @@
 ************************************************************************/

 #include "test_AllReduce.hpp"
-#include <omp.h>

 namespace CorrectnessTests
 {
@@ -20,12 +19,13 @@ namespace CorrectnessTests
        ComputeExpectedResults(dataset, op);

        // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
        for (int i = 0; i < numDevices; i++)
        {
            ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
                          numElements, dataType, op, comms[i], streams[i]);
        }
+        ncclGroupEnd();

        // Wait for reduction to complete
        Synchronize();
@@ -6,7 +6,6 @@

 #include "test_AllReduceAbort.hpp"
 #include "../include/comm.h"
-#include <omp.h>

 #define NUM_ITER 8
 #define FAKE_OP_COUNT NUM_ITER+1
@@ -5,7 +5,6 @@
 ************************************************************************/

 #include "test_Broadcast.hpp"
-#include <omp.h>

 namespace CorrectnessTests
 {
@@ -25,7 +24,7 @@ namespace CorrectnessTests
            ComputeExpectedResults(dataset, root);

            // Launch the reduction (1 thread per GPU)
-            #pragma omp parallel for num_threads(numDevices)
+            ncclGroupStart();
            for (int i = 0; i < numDevices; i++)
            {
                ncclBroadcast(dataset.inputs[i],
@@ -33,7 +32,7 @@ namespace CorrectnessTests
                              numElements, dataType,
                              root, comms[i], streams[i]);
            }
-
+            ncclGroupEnd();

            // Wait for reduction to complete
            Synchronize();
@@ -7,7 +7,6 @@
 #define TEST_BROADCAST_HPP

 #include "CorrectnessTest.hpp"
-#include <omp.h>

 namespace CorrectnessTests
 {
@@ -6,7 +6,6 @@

 #include "test_BroadcastAbort.hpp"
 #include "../include/comm.h"
-#include <omp.h>

 #define NUM_ITER 8
 #define FAKE_OP_COUNT NUM_ITER+1
@@ -11,8 +11,6 @@
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"

-#include <omp.h>
-
 namespace CorrectnessTests
 {
    TEST_P(CombinedCallsCorrectnessTest, Correctness)
@@ -38,7 +36,7 @@ namespace CorrectnessTests
        size_t const byteCount = datasets[0].NumBytes() / numDevices;
        size_t const elemCount = numElements / numDevices;

-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
        for (int i = 0; i < numDevices; i++)
        {
            ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
@@ -63,6 +61,7 @@ namespace CorrectnessTests
                              elemCount, dataType, op,
                              comms[i], streams[i]);
        }
+        ncclGroupEnd();

        // Wait for reduction to complete
        Synchronize();
@@ -11,8 +11,6 @@
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"

-#include <omp.h>
-
 namespace CorrectnessTests
 {
    TEST_P(GroupCallsCorrectnessTest, Correctness)
@@ -5,7 +5,6 @@
 ************************************************************************/

 #include "test_Reduce.hpp"
-#include <omp.h>

 namespace CorrectnessTests
 {
@@ -25,7 +24,7 @@ namespace CorrectnessTests
            ComputeExpectedResults(dataset, op, root);

            // Launch the reduction (1 thread per GPU)
-            #pragma omp parallel for num_threads(numDevices)
+            ncclGroupStart();
            for (int i = 0; i < numDevices; i++)
            {
                ncclReduce(dataset.inputs[i],
@@ -33,6 +32,7 @@ namespace CorrectnessTests
                           numElements, dataType, op,
                           root, comms[i], streams[i]);
            }
+            ncclGroupEnd();

            // Wait for reduction to complete
            Synchronize();
@@ -5,7 +5,6 @@
 ************************************************************************/

 #include "test_ReduceScatter.hpp"
-#include <omp.h>

 namespace CorrectnessTests
 {
@@ -24,7 +23,7 @@ namespace CorrectnessTests
        size_t const recvCount = dataset.numElements / dataset.numDevices;

        // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
        for (int i = 0; i < numDevices; i++)
        {
            ncclReduceScatter(dataset.inputs[i],
@@ -32,7 +31,7 @@ namespace CorrectnessTests
                              recvCount, dataType, op,
                              comms[i], streams[i]);
        }
-
+        ncclGroupEnd();

        // Wait for reduction to complete
        Synchronize();
@@ -27,7 +27,10 @@ THE SOFTWARE.
 #include <cstdio>
 #include <cstdlib>
 #include <set>
-
+#include <unistd.h>
+#include <map>
+#include <iostream>
+#include <sstream>
 #include <hip/hip_runtime.h>
 #include "copy_kernel.h"
 #include "TransferBench.hpp"
@@ -49,12 +52,13 @@ int main(int argc, char **argv)
    printf("\n");
    printf("Environment variables:\n");
    printf("======================\n");
-    printf(" USE_HIP_CALL    - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
-    printf(" USE_MEMSET      - Write constant value (instead of doing a copy)\n");
-    printf(" USE_COARSE_MEM  - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
-    printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
-    printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
+    printf(" USE_HIP_CALL     - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
+    printf(" USE_MEMSET       - Write constant value (instead of doing a copy)\n");
+    printf(" USE_COARSE_MEM   - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+    printf(" USE_SINGLE_SYNC  - Only synchronize once at end of iterations (disables GPU times)\n");
+    printf(" USE_INTERACTIVE  - Waits for user-input prior to start and after transfer loop (for profiling)\n");
    printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n");
+    printf(" USE_SLEEP        - Adds a 100ms sleep after sync (for profiling)\n");
    exit(0);
  }

@@ -74,6 +78,7 @@ int main(int argc, char **argv)
  bool useCoarseMem = getenv("USE_COARSE_MEM");
  bool useSingleSync = getenv("USE_SINGLE_SYNC");
  bool useInteractive = getenv("USE_INTERACTIVE");
+  bool useSleep = getenv("USE_SLEEP");

  int numWarmups = 3;
  int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10;
@@ -99,6 +104,10 @@ int main(int argc, char **argv)
    printf("Running in interactive mode (USE_INTERACTIVE)\n");
  else
    printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n");
+  if (useSleep)
+    printf("Adding 100ms sleep after sync (USE_SLEEP)\n");
+  else
+    printf("No sleep per sync (enable sleep via USE_SLEEP)\n");

  printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATION=#)\n",
         numWarmups, numIterations);
@@ -265,7 +274,8 @@ int main(int argc, char **argv)
      {
        HIP_CALL(hipSetDevice(links[i].srcGpu));

-        HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
+        if (!useSingleSync || iteration == 0)
+          HIP_CALL(hipEventRecord(startEvents[i], streams[i]));

        if (useHipCall)
        {
@@ -301,7 +311,8 @@ int main(int argc, char **argv)
                               gpuBlockParams[i]);
          }
        }
-        HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
+        if (!useSingleSync || iteration == numIterations - 1)
+          HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
      }

      // Synchronize per iteration, unless in single sync mode, in which case
@@ -314,6 +325,7 @@ int main(int argc, char **argv)

      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
      double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+      if (useSleep) usleep(100000);

      if (iteration >= 0)
      {
@@ -370,8 +382,7 @@ int main(int argc, char **argv)
      }
      else
      {
-        if (!useSingleSync)
-          totalGpuTime[i] /= (1.0 * numIterations);
+        totalGpuTime[i] /= (1.0 * numIterations);
        printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
      }
    }
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

+#include <sstream>
+
 // Helper macro for catching HIP errors
 #define HIP_CALL(cmd)                                                   \
    do {                                                                \