Merge remote-tracking branch 'remotes/rccl/master' into rccl_2.5.6_cleanup

This commit is contained in:
Wenkai Du
2020-01-29 15:28:03 -08:00
15 zmienionych plików z 49 dodań i 43 usunięć
+6 -3
Wyświetl plik
@@ -181,6 +181,9 @@ else()
target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
endif()
#Setup librccl.so version
rocm_set_soversion(rccl "1.0")
rocm_install_targets(TARGETS
rccl
PREFIX
@@ -197,8 +200,8 @@ rocm_export_targets(NAMESPACE
DEPENDS
hip)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)")
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
@@ -208,7 +211,7 @@ rocm_create_package(
DESCRIPTION
"Optimized primitives for collective multi-GPU communication"
MAINTAINER
"<no-reply@amd.com>"
"<rccl-maintainer@amd.com>"
LDCONFIG)
rocm_install_symlink_subdir(rccl)
+6 -2
Wyświetl plik
@@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
$ cd rccl
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
$ CXX=/opt/rocm/bin/hcc cmake ..
$ make -j 8
```
You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
```shell
$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
```
Note: ensure rocm-cmake is installed (e.g., `apt install rocm-cmake`).
#### To build and install the RCCL package:
-7
Wyświetl plik
@@ -10,13 +10,6 @@ if(BUILD_TESTS)
message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
endif()
# OpenMP is used to drive GPUs (one per thread)
if(EXISTS /etc/redhat-release)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
endif()
# Download and unpack googletest at configure time
configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
execute_process(
+4 -3
Wyświetl plik
@@ -4,7 +4,6 @@
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllGather.hpp"
#include <omp.h>
namespace CorrectnessTests
{
@@ -23,13 +22,14 @@ namespace CorrectnessTests
size_t const sendCount = dataset.numElements / dataset.numDevices;
// Launch the reduction (1 thread per GPU)
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
dataset.outputs[i], sendCount,
dataType, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
@@ -68,13 +68,14 @@ namespace CorrectnessTests
size_t const sendCount = subDataset.numElements / subDataset.numDevices;
// Launch the reduction (1 thread per GPU)
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
subDataset.outputs[i], sendCount,
dataType, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
+2 -2
Wyświetl plik
@@ -5,7 +5,6 @@
************************************************************************/
#include "test_AllReduce.hpp"
#include <omp.h>
namespace CorrectnessTests
{
@@ -20,12 +19,13 @@ namespace CorrectnessTests
ComputeExpectedResults(dataset, op);
// Launch the reduction (1 thread per GPU)
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
numElements, dataType, op, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
-1
Wyświetl plik
@@ -6,7 +6,6 @@
#include "test_AllReduceAbort.hpp"
#include "../include/comm.h"
#include <omp.h>
#define NUM_ITER 8
#define FAKE_OP_COUNT NUM_ITER+1
+2 -3
Wyświetl plik
@@ -5,7 +5,6 @@
************************************************************************/
#include "test_Broadcast.hpp"
#include <omp.h>
namespace CorrectnessTests
{
@@ -25,7 +24,7 @@ namespace CorrectnessTests
ComputeExpectedResults(dataset, root);
// Launch the reduction (1 thread per GPU)
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclBroadcast(dataset.inputs[i],
@@ -33,7 +32,7 @@ namespace CorrectnessTests
numElements, dataType,
root, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
-1
Wyświetl plik
@@ -7,7 +7,6 @@
#define TEST_BROADCAST_HPP
#include "CorrectnessTest.hpp"
#include <omp.h>
namespace CorrectnessTests
{
-1
Wyświetl plik
@@ -6,7 +6,6 @@
#include "test_BroadcastAbort.hpp"
#include "../include/comm.h"
#include <omp.h>
#define NUM_ITER 8
#define FAKE_OP_COUNT NUM_ITER+1
+2 -3
Wyświetl plik
@@ -11,8 +11,6 @@
#include "test_Reduce.hpp"
#include "test_ReduceScatter.hpp"
#include <omp.h>
namespace CorrectnessTests
{
TEST_P(CombinedCallsCorrectnessTest, Correctness)
@@ -38,7 +36,7 @@ namespace CorrectnessTests
size_t const byteCount = datasets[0].NumBytes() / numDevices;
size_t const elemCount = numElements / numDevices;
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
@@ -63,6 +61,7 @@ namespace CorrectnessTests
elemCount, dataType, op,
comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
-2
Wyświetl plik
@@ -11,8 +11,6 @@
#include "test_Reduce.hpp"
#include "test_ReduceScatter.hpp"
#include <omp.h>
namespace CorrectnessTests
{
TEST_P(GroupCallsCorrectnessTest, Correctness)
+2 -2
Wyświetl plik
@@ -5,7 +5,6 @@
************************************************************************/
#include "test_Reduce.hpp"
#include <omp.h>
namespace CorrectnessTests
{
@@ -25,7 +24,7 @@ namespace CorrectnessTests
ComputeExpectedResults(dataset, op, root);
// Launch the reduction (1 thread per GPU)
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclReduce(dataset.inputs[i],
@@ -33,6 +32,7 @@ namespace CorrectnessTests
numElements, dataType, op,
root, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
+2 -3
Wyświetl plik
@@ -5,7 +5,6 @@
************************************************************************/
#include "test_ReduceScatter.hpp"
#include <omp.h>
namespace CorrectnessTests
{
@@ -24,7 +23,7 @@ namespace CorrectnessTests
size_t const recvCount = dataset.numElements / dataset.numDevices;
// Launch the reduction (1 thread per GPU)
#pragma omp parallel for num_threads(numDevices)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclReduceScatter(dataset.inputs[i],
@@ -32,7 +31,7 @@ namespace CorrectnessTests
recvCount, dataType, op,
comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
+21 -10
Wyświetl plik
@@ -27,7 +27,10 @@ THE SOFTWARE.
#include <cstdio>
#include <cstdlib>
#include <set>
#include <unistd.h>
#include <map>
#include <iostream>
#include <sstream>
#include <hip/hip_runtime.h>
#include "copy_kernel.h"
#include "TransferBench.hpp"
@@ -49,12 +52,13 @@ int main(int argc, char **argv)
printf("\n");
printf("Environment variables:\n");
printf("======================\n");
printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n");
printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n");
printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n");
printf(" USE_SLEEP - Adds a 100ms sleep after sync (for profiling)\n");
exit(0);
}
@@ -74,6 +78,7 @@ int main(int argc, char **argv)
bool useCoarseMem = getenv("USE_COARSE_MEM");
bool useSingleSync = getenv("USE_SINGLE_SYNC");
bool useInteractive = getenv("USE_INTERACTIVE");
bool useSleep = getenv("USE_SLEEP");
int numWarmups = 3;
int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10;
@@ -99,6 +104,10 @@ int main(int argc, char **argv)
printf("Running in interactive mode (USE_INTERACTIVE)\n");
else
printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n");
if (useSleep)
printf("Adding 100ms sleep after sync (USE_SLEEP)\n");
else
printf("No sleep per sync (enable sleep via USE_SLEEP)\n");
// Report how many warmup and timed iterations will run. The hint must name the
// environment variable that is actually read (USE_ITERATIONS, plural), otherwise
// users following the message set a variable that has no effect.
printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATIONS=#)\n",
       numWarmups, numIterations);
@@ -265,7 +274,8 @@ int main(int argc, char **argv)
{
HIP_CALL(hipSetDevice(links[i].srcGpu));
HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
if (!useSingleSync || iteration == 0)
HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
if (useHipCall)
{
@@ -301,7 +311,8 @@ int main(int argc, char **argv)
gpuBlockParams[i]);
}
}
HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
if (!useSingleSync || iteration == numIterations - 1)
HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
}
// Synchronize per iteration, unless in single sync mode, in which case
@@ -314,6 +325,7 @@ int main(int argc, char **argv)
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (useSleep) usleep(100000);
if (iteration >= 0)
{
@@ -370,8 +382,7 @@ int main(int argc, char **argv)
}
else
{
if (!useSingleSync)
totalGpuTime[i] /= (1.0 * numIterations);
totalGpuTime[i] /= (1.0 * numIterations);
printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
}
}
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <sstream>
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \