Merge remote-tracking branch 'remotes/rccl/master' into rccl_2.5.6_cleanup
This commit is contained in:
+6
-3
@@ -181,6 +181,9 @@ else()
|
||||
target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
|
||||
endif()
|
||||
|
||||
#Setup librccl.so version
|
||||
rocm_set_soversion(rccl "1.0")
|
||||
|
||||
rocm_install_targets(TARGETS
|
||||
rccl
|
||||
PREFIX
|
||||
@@ -197,8 +200,8 @@ rocm_export_targets(NAMESPACE
|
||||
DEPENDS
|
||||
hip)
|
||||
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27")
|
||||
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
|
||||
|
||||
@@ -208,7 +211,7 @@ rocm_create_package(
|
||||
DESCRIPTION
|
||||
"Optimized primitives for collective multi-GPU communication"
|
||||
MAINTAINER
|
||||
"<no-reply@amd.com>"
|
||||
"<rccl-maintainer@amd.com>"
|
||||
LDCONFIG)
|
||||
|
||||
rocm_install_symlink_subdir(rccl)
|
||||
|
||||
+6
-2
@@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
|
||||
$ cd rccl
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
|
||||
$ CXX=/opt/rocm/bin/hcc cmake ..
|
||||
$ make -j 8
|
||||
```
|
||||
You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
|
||||
You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
|
||||
```shell
|
||||
$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
|
||||
```
|
||||
Note: ensure rocm-cmake is installed, e.g. via `apt install rocm-cmake`.
|
||||
|
||||
#### To build and install the RCCL package:
|
||||
|
||||
|
||||
@@ -10,13 +10,6 @@ if(BUILD_TESTS)
|
||||
message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
|
||||
endif()
|
||||
|
||||
# OpenMP is used to drive GPUs (one per thread)
|
||||
if(EXISTS /etc/redhat-release)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread")
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
|
||||
endif()
|
||||
|
||||
# Download and unpack googletest at configure time
|
||||
configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
|
||||
execute_process(
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_AllGather.hpp"
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
@@ -23,13 +22,14 @@ namespace CorrectnessTests
|
||||
size_t const sendCount = dataset.numElements / dataset.numDevices;
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
|
||||
dataset.outputs[i], sendCount,
|
||||
dataType, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
@@ -68,13 +68,14 @@ namespace CorrectnessTests
|
||||
size_t const sendCount = subDataset.numElements / subDataset.numDevices;
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
|
||||
subDataset.outputs[i], sendCount,
|
||||
dataType, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllReduce.hpp"
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
@@ -20,12 +19,13 @@ namespace CorrectnessTests
|
||||
ComputeExpectedResults(dataset, op);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
|
||||
numElements, dataType, op, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
#include "test_AllReduceAbort.hpp"
|
||||
#include "../include/comm.h"
|
||||
#include <omp.h>
|
||||
|
||||
#define NUM_ITER 8
|
||||
#define FAKE_OP_COUNT NUM_ITER+1
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "test_Broadcast.hpp"
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
@@ -25,7 +24,7 @@ namespace CorrectnessTests
|
||||
ComputeExpectedResults(dataset, root);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclBroadcast(dataset.inputs[i],
|
||||
@@ -33,7 +32,7 @@ namespace CorrectnessTests
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#define TEST_BROADCAST_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
#include "test_BroadcastAbort.hpp"
|
||||
#include "../include/comm.h"
|
||||
#include <omp.h>
|
||||
|
||||
#define NUM_ITER 8
|
||||
#define FAKE_OP_COUNT NUM_ITER+1
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
#include "test_Reduce.hpp"
|
||||
#include "test_ReduceScatter.hpp"
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(CombinedCallsCorrectnessTest, Correctness)
|
||||
@@ -38,7 +36,7 @@ namespace CorrectnessTests
|
||||
size_t const byteCount = datasets[0].NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
|
||||
@@ -63,6 +61,7 @@ namespace CorrectnessTests
|
||||
elemCount, dataType, op,
|
||||
comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
#include "test_Reduce.hpp"
|
||||
#include "test_ReduceScatter.hpp"
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(GroupCallsCorrectnessTest, Correctness)
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "test_Reduce.hpp"
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
@@ -25,7 +24,7 @@ namespace CorrectnessTests
|
||||
ComputeExpectedResults(dataset, op, root);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclReduce(dataset.inputs[i],
|
||||
@@ -33,6 +32,7 @@ namespace CorrectnessTests
|
||||
numElements, dataType, op,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "test_ReduceScatter.hpp"
|
||||
#include <omp.h>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
@@ -24,7 +23,7 @@ namespace CorrectnessTests
|
||||
size_t const recvCount = dataset.numElements / dataset.numDevices;
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
#pragma omp parallel for num_threads(numDevices)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclReduceScatter(dataset.inputs[i],
|
||||
@@ -32,7 +31,7 @@ namespace CorrectnessTests
|
||||
recvCount, dataType, op,
|
||||
comms[i], streams[i]);
|
||||
}
|
||||
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
@@ -27,7 +27,10 @@ THE SOFTWARE.
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <set>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "copy_kernel.h"
|
||||
#include "TransferBench.hpp"
|
||||
@@ -49,12 +52,13 @@ int main(int argc, char **argv)
|
||||
printf("\n");
|
||||
printf("Environment variables:\n");
|
||||
printf("======================\n");
|
||||
printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
|
||||
printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n");
|
||||
printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
|
||||
printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
|
||||
printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
|
||||
printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
|
||||
printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n");
|
||||
printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
|
||||
printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
|
||||
printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
|
||||
printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n");
|
||||
printf(" USE_SLEEP - Adds a 100ms sleep after sync (for profiling)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@@ -74,6 +78,7 @@ int main(int argc, char **argv)
|
||||
bool useCoarseMem = getenv("USE_COARSE_MEM");
|
||||
bool useSingleSync = getenv("USE_SINGLE_SYNC");
|
||||
bool useInteractive = getenv("USE_INTERACTIVE");
|
||||
bool useSleep = getenv("USE_SLEEP");
|
||||
|
||||
int numWarmups = 3;
|
||||
int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10;
|
||||
@@ -99,6 +104,10 @@ int main(int argc, char **argv)
|
||||
printf("Running in interactive mode (USE_INTERACTIVE)\n");
|
||||
else
|
||||
printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n");
|
||||
if (useSleep)
|
||||
printf("Adding 100ms sleep after sync (USE_SLEEP)\n");
|
||||
else
|
||||
printf("No sleep per sync (enable sleep via USE_SLEEP)\n");
|
||||
|
||||
printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATION=#)\n",
|
||||
numWarmups, numIterations);
|
||||
@@ -265,7 +274,8 @@ int main(int argc, char **argv)
|
||||
{
|
||||
HIP_CALL(hipSetDevice(links[i].srcGpu));
|
||||
|
||||
HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
|
||||
if (!useSingleSync || iteration == 0)
|
||||
HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
|
||||
|
||||
if (useHipCall)
|
||||
{
|
||||
@@ -301,7 +311,8 @@ int main(int argc, char **argv)
|
||||
gpuBlockParams[i]);
|
||||
}
|
||||
}
|
||||
HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
|
||||
if (!useSingleSync || iteration == numIterations - 1)
|
||||
HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
|
||||
}
|
||||
|
||||
// Synchronize per iteration, unless in single sync mode, in which case
|
||||
@@ -314,6 +325,7 @@ int main(int argc, char **argv)
|
||||
|
||||
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
|
||||
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
|
||||
if (useSleep) usleep(100000);
|
||||
|
||||
if (iteration >= 0)
|
||||
{
|
||||
@@ -370,8 +382,7 @@ int main(int argc, char **argv)
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!useSingleSync)
|
||||
totalGpuTime[i] /= (1.0 * numIterations);
|
||||
totalGpuTime[i] /= (1.0 * numIterations);
|
||||
printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <sstream>
|
||||
|
||||
// Helper macro for catching HIP errors
|
||||
#define HIP_CALL(cmd) \
|
||||
do { \
|
||||
|
||||
Reference in New Issue
Block a user