From 00a910c2dad9995b72d24ee1520d47bc9b57375d Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Tue, 26 Nov 2019 12:40:26 -0800 Subject: [PATCH 1/8] Change manual build instructions to fit most common usage --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 56eca69c52..405b69127b 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git $ cd rccl $ mkdir build $ cd build -$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. +$ CXX=/opt/rocm/bin/hcc cmake .. $ make -j 8 ``` -You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. +You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example: +```shell +$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. +``` +Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. #### To build the RCCL package and install package : From 90e928bcd5075e47648d913a7888909ba3499732 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 4 Dec 2019 21:05:10 +0000 Subject: [PATCH 2/8] Change default P2P level --- src/transport/p2p.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 7045ee8abc..7ce944fef4 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -78,7 +78,7 @@ static int busIdToCudaDev(const char* busId) { /* Determine if we can communicate with the peer through p2p */ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { // Do not use P2P across root complexes by default (provided CUDA permits it) - int p2pLevel = PATH_NODE; + int p2pLevel = PATH_SYS; if (ncclParamP2pDisable() == 1) p2pLevel = 0; if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); From bd59b6f880b8dd6593808b18946a3d4e569b7f60 Mon Sep 17 00:00:00 2001 From: saadrahim <44449863+saadrahim@users.noreply.github.com> Date: Fri, 6 Dec 2019 14:00:25 -0700 Subject: [PATCH 3/8] Changing package dependency to rocm-dev (#160) --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b6ff4847ba..b7dcf55942 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,8 +186,8 @@ rocm_export_targets(NAMESPACE DEPENDS hip) -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc") -set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)") +set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev (>= 2.5.27)") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm") @@ -197,7 +197,7 @@ rocm_create_package( DESCRIPTION "Optimized primitives for collective multi-GPU communication" MAINTAINER - "" + "" LDCONFIG) rocm_install_symlink_subdir(rccl) From 0092b3513278bd39cae91ebb675a628968ccfa38 Mon Sep 17 00:00:00 2001 From: saadrahim <44449863+saadrahim@users.noreply.github.com> Date: Fri, 6 Dec 2019 16:06:50 -0700 Subject: [PATCH 4/8] Package fix (#161) * Fixing RHEL dependency on rocm-dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b7dcf55942..a68b2836c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,7 +187,7 @@ rocm_export_targets(NAMESPACE hip) set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)") -set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev (>= 2.5.27)") +set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm") From 2f4269d06d803541057c6d4ef9ec192660d22fbc Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Thu, 12 Dec 2019 15:20:54 -0700 Subject: [PATCH 5/8] Adding new sleep after sync capability for data fabric profiling (#162) Fixing missing header include for ROCM 3.0 changes --- tools/TransferBench/TransferBench.cpp | 18 +++++++++++++----- tools/TransferBench/TransferBench.hpp | 2 ++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp index afb548ff58..016ff80ed4 100644 --- a/tools/TransferBench/TransferBench.cpp +++ b/tools/TransferBench/TransferBench.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include #include +#include #include #include "copy_kernel.h" @@ -49,12 +50,13 @@ int main(int argc, char **argv) printf("\n"); printf("Environment variables:\n"); printf("======================\n"); - printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); - printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); - printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); - printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n"); - printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n"); + printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); + printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); + printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); + printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n"); + printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n"); printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n"); + printf(" USE_SLEEP - Adds a 100ms sleep after sync (for profiling)\n"); exit(0); } @@ -74,6 +76,7 @@ int main(int argc, char **argv) bool useCoarseMem = getenv("USE_COARSE_MEM"); bool useSingleSync = getenv("USE_SINGLE_SYNC"); bool useInteractive = getenv("USE_INTERACTIVE"); + bool useSleep = getenv("USE_SLEEP"); int numWarmups = 3; int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10; @@ -99,6 +102,10 @@ int main(int argc, char **argv) printf("Running in interactive mode (USE_INTERACTIVE)\n"); else printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n"); + if (useSleep) + printf("Adding 100ms sleep after sync (USE_SLEEP)\n"); + else + printf("No sleep per sync (enable sleep via USE_SLEEP)\n"); printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATION=#)\n", numWarmups, numIterations); @@ -314,6 +321,7 @@ int main(int argc, char **argv) auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + if (useSleep) usleep(100000); if (iteration >= 0) { diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp index 5fde8f58fe..92c596d672 100644 --- a/tools/TransferBench/TransferBench.hpp +++ b/tools/TransferBench/TransferBench.hpp @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + // Helper macro for catching HIP errors #define HIP_CALL(cmd) \ do { \ From 000bce6f27b0689ce7f5d2d9b22d9b718548aee9 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Fri, 20 Dec 2019 11:41:56 -0700 Subject: [PATCH 6/8] Removing OpenMP from unit tests (#163) --- test/CMakeLists.txt | 7 ------- test/test_AllGather.cpp | 7 ++++--- test/test_AllReduce.cpp | 4 ++-- test/test_AllReduceAbort.cpp | 1 - test/test_Broadcast.cpp | 5 ++--- test/test_Broadcast.hpp | 1 - test/test_BroadcastAbort.cpp | 1 - test/test_CombinedCalls.cpp | 5 ++--- test/test_GroupCalls.cpp | 2 -- test/test_Reduce.cpp | 4 ++-- test/test_ReduceScatter.cpp | 5 ++--- 11 files changed, 14 insertions(+), 28 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 66236c2f18..ee542902b6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,13 +10,6 @@ if(BUILD_TESTS) message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)") endif() - # OpenMP is used to drive GPUs (one per thread) - if(EXISTS /etc/redhat-release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") - endif() - # Download and unpack googletest at configure time configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) execute_process( diff --git a/test/test_AllGather.cpp b/test/test_AllGather.cpp index b28df48e87..df5aea23e3 100644 --- a/test/test_AllGather.cpp +++ b/test/test_AllGather.cpp @@ -4,7 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ #include "test_AllGather.hpp" -#include namespace CorrectnessTests { @@ -23,13 +22,14 @@ namespace CorrectnessTests size_t const sendCount = dataset.numElements / dataset.numDevices; // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount), dataset.outputs[i], sendCount, dataType, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); @@ -68,13 +68,14 @@ namespace CorrectnessTests size_t const sendCount = subDataset.numElements / subDataset.numDevices; // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount), subDataset.outputs[i], sendCount, dataType, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp index 0fd5eedf91..86d4654b7b 100644 --- a/test/test_AllReduce.cpp +++ b/test/test_AllReduce.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_AllReduce.hpp" -#include namespace CorrectnessTests { @@ -20,12 +19,13 @@ namespace CorrectnessTests ComputeExpectedResults(dataset, op); // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllReduce(dataset.inputs[i], dataset.outputs[i], numElements, dataType, op, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp index 9400bd84fc..cccca4a547 100644 --- a/test/test_AllReduceAbort.cpp +++ b/test/test_AllReduceAbort.cpp @@ -6,7 +6,6 @@ #include "test_AllReduceAbort.hpp" #include "../include/core.h" -#include #define NUM_ITER 8 #define FAKE_OP_COUNT NUM_ITER+1 diff --git a/test/test_Broadcast.cpp b/test/test_Broadcast.cpp index 0e728b0153..a98d5884cb 100644 --- a/test/test_Broadcast.cpp +++ b/test/test_Broadcast.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_Broadcast.hpp" -#include namespace CorrectnessTests { @@ -25,7 +24,7 @@ namespace CorrectnessTests ComputeExpectedResults(dataset, root); // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclBroadcast(dataset.inputs[i], @@ -33,7 +32,7 @@ namespace CorrectnessTests numElements, dataType, root, comms[i], streams[i]); } - + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_Broadcast.hpp b/test/test_Broadcast.hpp index 5ccf351592..358d8cf09a 100644 --- a/test/test_Broadcast.hpp +++ b/test/test_Broadcast.hpp @@ -7,7 +7,6 @@ #define TEST_BROADCAST_HPP #include "CorrectnessTest.hpp" -#include namespace CorrectnessTests { diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp index 28596cc52a..824bced360 100644 --- a/test/test_BroadcastAbort.cpp +++ b/test/test_BroadcastAbort.cpp @@ -6,7 +6,6 @@ #include "test_BroadcastAbort.hpp" #include "../include/core.h" -#include #define NUM_ITER 8 #define FAKE_OP_COUNT NUM_ITER+1 diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp index 4b51ab5375..41e93c78a2 100644 --- a/test/test_CombinedCalls.cpp +++ b/test/test_CombinedCalls.cpp @@ -11,8 +11,6 @@ #include "test_Reduce.hpp" #include "test_ReduceScatter.hpp" -#include - namespace CorrectnessTests { TEST_P(CombinedCallsCorrectnessTest, Correctness) @@ -38,7 +36,7 @@ namespace CorrectnessTests size_t const byteCount = datasets[0].NumBytes() / numDevices; size_t const elemCount = numElements / numDevices; - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount), @@ -63,6 +61,7 @@ namespace CorrectnessTests elemCount, dataType, op, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp index 77780b633d..3cfd55c6f5 100644 --- a/test/test_GroupCalls.cpp +++ b/test/test_GroupCalls.cpp @@ -11,8 +11,6 @@ #include "test_Reduce.hpp" #include "test_ReduceScatter.hpp" -#include - namespace CorrectnessTests { TEST_P(GroupCallsCorrectnessTest, Correctness) diff --git a/test/test_Reduce.cpp b/test/test_Reduce.cpp index 9844e928c1..8234f2ab05 100644 --- a/test/test_Reduce.cpp +++ b/test/test_Reduce.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_Reduce.hpp" -#include namespace CorrectnessTests { @@ -25,7 +24,7 @@ namespace CorrectnessTests ComputeExpectedResults(dataset, op, root); // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclReduce(dataset.inputs[i], @@ -33,6 +32,7 @@ namespace CorrectnessTests numElements, dataType, op, root, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_ReduceScatter.cpp b/test/test_ReduceScatter.cpp index 11007732b5..f6461f20b8 100644 --- a/test/test_ReduceScatter.cpp +++ b/test/test_ReduceScatter.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_ReduceScatter.hpp" -#include namespace CorrectnessTests { @@ -24,7 +23,7 @@ namespace CorrectnessTests size_t const recvCount = dataset.numElements / dataset.numDevices; // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclReduceScatter(dataset.inputs[i], @@ -32,7 +31,7 @@ namespace CorrectnessTests recvCount, dataType, op, comms[i], streams[i]); } - + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); From e5074ce94de0df40c460ab611df1ee11eca6344f Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Fri, 20 Dec 2019 17:08:39 -0800 Subject: [PATCH 7/8] Changing single sync mode to time all iterations instead of just last --- tools/TransferBench/TransferBench.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp index 016ff80ed4..5bdccdf807 100644 --- a/tools/TransferBench/TransferBench.cpp +++ b/tools/TransferBench/TransferBench.cpp @@ -28,7 +28,9 @@ THE SOFTWARE. #include #include #include - +#include +#include +#include #include #include "copy_kernel.h" #include "TransferBench.hpp" @@ -272,7 +274,8 @@ int main(int argc, char **argv) { HIP_CALL(hipSetDevice(links[i].srcGpu)); - HIP_CALL(hipEventRecord(startEvents[i], streams[i])); + if (!useSingleSync || iteration == 0) + HIP_CALL(hipEventRecord(startEvents[i], streams[i])); if (useHipCall) { @@ -308,7 +311,8 @@ int main(int argc, char **argv) gpuBlockParams[i]); } } - HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); + if (!useSingleSync || iteration == numIterations - 1) + HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); } // Synchronize per iteration, unless in single sync mode, in which case @@ -378,8 +382,7 @@ int main(int argc, char **argv) } else { - if (!useSingleSync) - totalGpuTime[i] /= (1.0 * numIterations); + totalGpuTime[i] /= (1.0 * numIterations); printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]); } } From 15c917244d4a3c6992751aa30d38a61b3f109f83 Mon Sep 17 00:00:00 2001 From: paulfreddy <52053501+paulfreddy@users.noreply.github.com> Date: Wed, 8 Jan 2020 21:28:16 -0800 Subject: [PATCH 8/8] Changes for multiple ROCm installation (#164) * Changes for multiple ROCm installation 1. Set version to 2.10.1 2. Add CMAKE_INSTALL_PREFIX to neccessary places 3. Cleanup, fix rpath, use prefix in install.sh * Changes for multiple ROCm installation 1. Set soversion to match release version 2. Add CMAKE_INSTALL_PREFIX to neccessary places 3. Cleanup, fix rpath, use prefix in install.sh * Changes for multiple ROCm installation 1. Set soversion to match release version 2. Add CMAKE_INSTALL_PREFIX to neccessary places 3. Cleanup, fix rpath, use prefix in install.sh --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index a68b2836c6..b946053e47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,6 +170,9 @@ else() target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa) endif() +#Setup librccl.so version +rocm_set_soversion(rccl "1.0") + rocm_install_targets(TARGETS rccl PREFIX