diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e1fec1403..81cd32af5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,9 @@ else() target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa) endif() +#Setup librccl.so version +rocm_set_soversion(rccl "1.0") + rocm_install_targets(TARGETS rccl PREFIX @@ -197,8 +200,8 @@ rocm_export_targets(NAMESPACE DEPENDS hip) -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc") -set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)") +set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm") @@ -208,7 +211,7 @@ rocm_create_package( DESCRIPTION "Optimized primitives for collective multi-GPU communication" MAINTAINER - "" + "" LDCONFIG) rocm_install_symlink_subdir(rccl) diff --git a/README.md b/README.md index 56eca69c52..405b69127b 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git $ cd rccl $ mkdir build $ cd build -$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. +$ CXX=/opt/rocm/bin/hcc cmake .. $ make -j 8 ``` -You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. +You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example: +```shell +$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. +``` +Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. #### To build the RCCL package and install package : diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 66236c2f18..ee542902b6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,13 +10,6 @@ if(BUILD_TESTS) message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)") endif() - # OpenMP is used to drive GPUs (one per thread) - if(EXISTS /etc/redhat-release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") - endif() - # Download and unpack googletest at configure time configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) execute_process( diff --git a/test/test_AllGather.cpp b/test/test_AllGather.cpp index b28df48e87..df5aea23e3 100644 --- a/test/test_AllGather.cpp +++ b/test/test_AllGather.cpp @@ -4,7 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ #include "test_AllGather.hpp" -#include namespace CorrectnessTests { @@ -23,13 +22,14 @@ namespace CorrectnessTests size_t const sendCount = dataset.numElements / dataset.numDevices; // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount), dataset.outputs[i], sendCount, dataType, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); @@ -68,13 +68,14 @@ namespace CorrectnessTests size_t const sendCount = subDataset.numElements / subDataset.numDevices; // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount), subDataset.outputs[i], sendCount, dataType, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp index 0fd5eedf91..86d4654b7b 100644 --- a/test/test_AllReduce.cpp +++ b/test/test_AllReduce.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_AllReduce.hpp" -#include namespace CorrectnessTests { @@ -20,12 +19,13 @@ namespace CorrectnessTests ComputeExpectedResults(dataset, op); // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllReduce(dataset.inputs[i], dataset.outputs[i], numElements, dataType, op, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp index 7b329d249d..719bf323d4 100644 --- a/test/test_AllReduceAbort.cpp +++ b/test/test_AllReduceAbort.cpp @@ -6,7 +6,6 @@ #include "test_AllReduceAbort.hpp" #include "../include/comm.h" -#include #define NUM_ITER 8 #define FAKE_OP_COUNT NUM_ITER+1 diff --git a/test/test_Broadcast.cpp b/test/test_Broadcast.cpp index 0e728b0153..a98d5884cb 100644 --- a/test/test_Broadcast.cpp +++ b/test/test_Broadcast.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_Broadcast.hpp" -#include namespace CorrectnessTests { @@ -25,7 +24,7 @@ namespace CorrectnessTests ComputeExpectedResults(dataset, root); // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclBroadcast(dataset.inputs[i], @@ -33,7 +32,7 @@ namespace CorrectnessTests numElements, dataType, root, comms[i], streams[i]); } - + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_Broadcast.hpp b/test/test_Broadcast.hpp index 5ccf351592..358d8cf09a 100644 --- a/test/test_Broadcast.hpp +++ b/test/test_Broadcast.hpp @@ -7,7 +7,6 @@ #define TEST_BROADCAST_HPP #include "CorrectnessTest.hpp" -#include namespace CorrectnessTests { diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp index e49a872e0b..df0175002a 100644 --- a/test/test_BroadcastAbort.cpp +++ b/test/test_BroadcastAbort.cpp @@ -6,7 +6,6 @@ #include "test_BroadcastAbort.hpp" #include "../include/comm.h" -#include #define NUM_ITER 8 #define FAKE_OP_COUNT NUM_ITER+1 diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp index 4b51ab5375..41e93c78a2 100644 --- a/test/test_CombinedCalls.cpp +++ b/test/test_CombinedCalls.cpp @@ -11,8 +11,6 @@ #include "test_Reduce.hpp" #include "test_ReduceScatter.hpp" -#include - namespace CorrectnessTests { TEST_P(CombinedCallsCorrectnessTest, Correctness) @@ -38,7 +36,7 @@ namespace CorrectnessTests size_t const byteCount = datasets[0].NumBytes() / numDevices; size_t const elemCount = numElements / numDevices; - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount), @@ -63,6 +61,7 @@ namespace CorrectnessTests elemCount, dataType, op, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp index 77780b633d..3cfd55c6f5 100644 --- a/test/test_GroupCalls.cpp +++ b/test/test_GroupCalls.cpp @@ -11,8 +11,6 @@ #include "test_Reduce.hpp" #include "test_ReduceScatter.hpp" -#include - namespace CorrectnessTests { TEST_P(GroupCallsCorrectnessTest, Correctness) diff --git a/test/test_Reduce.cpp b/test/test_Reduce.cpp index 9844e928c1..8234f2ab05 100644 --- a/test/test_Reduce.cpp +++ b/test/test_Reduce.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_Reduce.hpp" -#include namespace CorrectnessTests { @@ -25,7 +24,7 @@ namespace CorrectnessTests ComputeExpectedResults(dataset, op, root); // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclReduce(dataset.inputs[i], @@ -33,6 +32,7 @@ namespace CorrectnessTests numElements, dataType, op, root, comms[i], streams[i]); } + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/test/test_ReduceScatter.cpp b/test/test_ReduceScatter.cpp index 11007732b5..f6461f20b8 100644 --- a/test/test_ReduceScatter.cpp +++ b/test/test_ReduceScatter.cpp @@ -5,7 +5,6 @@ ************************************************************************/ #include "test_ReduceScatter.hpp" -#include namespace CorrectnessTests { @@ -24,7 +23,7 @@ namespace CorrectnessTests size_t const recvCount = dataset.numElements / dataset.numDevices; // Launch the reduction (1 thread per GPU) - #pragma omp parallel for num_threads(numDevices) + ncclGroupStart(); for (int i = 0; i < numDevices; i++) { ncclReduceScatter(dataset.inputs[i], @@ -32,7 +31,7 @@ namespace CorrectnessTests recvCount, dataType, op, comms[i], streams[i]); } - + ncclGroupEnd(); // Wait for reduction to complete Synchronize(); diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp index afb548ff58..5bdccdf807 100644 --- a/tools/TransferBench/TransferBench.cpp +++ b/tools/TransferBench/TransferBench.cpp @@ -27,7 +27,10 @@ THE SOFTWARE. #include #include #include - +#include +#include +#include +#include #include #include "copy_kernel.h" #include "TransferBench.hpp" @@ -49,12 +52,13 @@ int main(int argc, char **argv) printf("\n"); printf("Environment variables:\n"); printf("======================\n"); - printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); - printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); - printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); - printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n"); - printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n"); + printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); + printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); + printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); + printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n"); + printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n"); printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n"); + printf(" USE_SLEEP - Adds a 100ms sleep after sync (for profiling)\n"); exit(0); } @@ -74,6 +78,7 @@ int main(int argc, char **argv) bool useCoarseMem = getenv("USE_COARSE_MEM"); bool useSingleSync = getenv("USE_SINGLE_SYNC"); bool useInteractive = getenv("USE_INTERACTIVE"); + bool useSleep = getenv("USE_SLEEP"); int numWarmups = 3; int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10; @@ -99,6 +104,10 @@ int main(int argc, char **argv) printf("Running in interactive mode (USE_INTERACTIVE)\n"); else printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n"); + if (useSleep) + printf("Adding 100ms sleep after sync (USE_SLEEP)\n"); + else + printf("No sleep per sync (enable sleep via USE_SLEEP)\n"); printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATION=#)\n", numWarmups, numIterations); @@ -265,7 +274,8 @@ int main(int argc, char **argv) { HIP_CALL(hipSetDevice(links[i].srcGpu)); - HIP_CALL(hipEventRecord(startEvents[i], streams[i])); + if (!useSingleSync || iteration == 0) + HIP_CALL(hipEventRecord(startEvents[i], streams[i])); if (useHipCall) { @@ -301,7 +311,8 @@ int main(int argc, char **argv) gpuBlockParams[i]); } } - HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); + if (!useSingleSync || iteration == numIterations - 1) + HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); } // Synchronize per iteration, unless in single sync mode, in which case @@ -314,6 +325,7 @@ int main(int argc, char **argv) auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + if (useSleep) usleep(100000); if (iteration >= 0) { @@ -370,8 +382,7 @@ int main(int argc, char **argv) } else { - if (!useSingleSync) - totalGpuTime[i] /= (1.0 * numIterations); + totalGpuTime[i] /= (1.0 * numIterations); printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]); } } diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp index 5fde8f58fe..92c596d672 100644 --- a/tools/TransferBench/TransferBench.hpp +++ b/tools/TransferBench/TransferBench.hpp @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + // Helper macro for catching HIP errors #define HIP_CALL(cmd) \ do { \