Adding unit tests and files for CI (#4)
* Adding initial unit test and Jenkins code.
Fixing scope of unit tests
Adding unit tests and files for CI
Fixing Jenkinsfile
* Removing typos from Jenkinsfile
* Making some fixes to the Jenkins file; temporarily disabling MPI
* Making corrections to Jenkinsfile
* Correcting dockerNodes entry in Jenkinsfile
* Fixed Jenkinsfile for CI
* Correcting Jenkinsfile for CI
* Updating README to include instructions on how to run unit tests.
[ROCm/rccl-tests commit: 924521ff57]
This commit is contained in:
committato da
Wenkai Du
parent
75b5e43633
commit
30d348de0e
esterno
+82
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env groovy
|
||||
// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS
|
||||
@Library('rocJenkins@noDocker') _
|
||||
|
||||
// This file is for internal AMD use.
|
||||
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
|
||||
|
||||
import com.amd.project.*
|
||||
import com.amd.docker.*
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Mostly generated from snippet generator 'properties; set job properties'
|
||||
// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
|
||||
// Job properties applied to this pipeline:
//  - pipelineTriggers: cron '0 1 * * *' fires daily at 01:00 (same field order as the
//    '30 2 * * *' example above); the PeriodicFolderTrigger entry is configured with '5m'.
//  - buildDiscarder/logRotator: only numToKeepStr is set ('10'); the day-based and
//    artifact-based limits are left empty.
//  - disableConcurrentBuilds(): presumably serializes builds of this job - TODO confirm.
//  - CopyArtifactPermissionProperty is granted with projectNames '*'.
properties([
|
||||
pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
|
||||
buildDiscarder(logRotator(
|
||||
artifactDaysToKeepStr: '',
|
||||
artifactNumToKeepStr: '',
|
||||
daysToKeepStr: '',
|
||||
numToKeepStr: '10')),
|
||||
disableConcurrentBuilds(),
|
||||
[$class: 'CopyArtifactPermissionProperty', projectNames: '*']
|
||||
])
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
import java.nio.file.Path;
|
||||
|
||||
rcclTestsCI:
|
||||
{
|
||||
def rcclTests = new rocProject('rcclTests')
|
||||
// customize for project
|
||||
rcclTests.paths.build_command = './install.sh'
|
||||
|
||||
// Define test architectures, optional rocm version argument is available
|
||||
def nodes = new dockerNodes(['RCCL'], rcclTests)
|
||||
|
||||
boolean formatCheck = false
|
||||
|
||||
// Build step closure, called with (platform, project).
// It first calls project.paths.construct_build_prefix(), then runs a bash script that:
//   1. removes any previous ./rccl checkout and clones RCCL from GitHub,
//   2. installs RCCL into ${WORKSPACE}/rccl/rccl-install via RCCL's own install.sh,
//   3. changes into project.paths.project_build_prefix and runs project.paths.build_command
//      with --rccl_home pointing at that install.
// ${...} expressions are interpolated by Groovy before the script runs; \$RCCL_PATH is
// escaped so that bash expands it at run time.
// NOTE(review): the script uses `set -x` only (no `set -e`), so an early failure does not
// stop the later commands - confirm this is intended.
def compileCommand =
|
||||
{
|
||||
platform, project->
|
||||
|
||||
project.paths.construct_build_prefix()
|
||||
|
||||
def command = """#!/usr/bin/env bash
|
||||
set -x
|
||||
rm -rf rccl
|
||||
git clone https://github.com/ROCmSoftwarePlatform/rccl
|
||||
cd rccl
|
||||
export RCCL_PATH=${WORKSPACE}/rccl/rccl-install
|
||||
./install.sh -i --prefix=\$RCCL_PATH
|
||||
cd ..
|
||||
cd ${project.paths.project_build_prefix}
|
||||
${project.paths.build_command} --rccl_home=\$RCCL_PATH
|
||||
"""
|
||||
sh command
|
||||
|
||||
}
|
||||
// Test step closure, called with (platform, project).
// Runs pytest through bash with the freshly built RCCL library directory appended to
// LD_LIBRARY_PATH. `-k "not MPI"` deselects every test whose name contains "MPI", and the
// results are written as JUnit XML to ./testreport.xml.
// The junit publishing step below is commented out, so the XML report is produced but not
// consumed here.
def testCommand =
|
||||
{
|
||||
platform, project->
|
||||
|
||||
def command = """#!/usr/bin/env bash
|
||||
set -x
|
||||
LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml
|
||||
"""
|
||||
|
||||
sh command
|
||||
|
||||
//junit "${project.paths.project_build_prefix}/build/release/*.xml"
|
||||
}
|
||||
|
||||
// Packaging step closure, called with (platform, project).
// Placeholder: it only defines an empty command string and never passes it to `sh`,
// so no packaging work is performed.
def packageCommand =
|
||||
{
|
||||
platform, project->
|
||||
|
||||
def command = """
|
||||
"""
|
||||
}
|
||||
|
||||
buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
|
||||
}
|
||||
@@ -64,6 +64,18 @@ All tests support the same set of arguments :
|
||||
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
|
||||
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
|
||||
|
||||
## Unit tests
|
||||
|
||||
Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests:
|
||||
|
||||
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
|
||||
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
|
||||
|
||||
The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests:
|
||||
```shell
|
||||
$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
|
||||
```
|
||||
|
||||
## Copyright
|
||||
|
||||
RCCL tests are provided under the BSD license.
|
||||
|
||||
Executable
+98
@@ -0,0 +1,98 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
# #################################################
|
||||
# helper functions
|
||||
# #################################################
|
||||
# Print the usage summary for this script to stdout.
# Every option accepted by the getopt call below is listed here, including -t|--test.
function display_help()
{
  echo "RCCL-tests build & installation helper script"
  echo "./install [-h|--help] "
  echo " [-h|--help] Prints this help message."
  echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
  echo " [-t|--test] Run the unit tests after building."
  echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
  echo " [--mpi_home] Specify path to your MPI installation."
}
|
||||
|
||||
# #################################################
|
||||
# global variables
|
||||
# #################################################
|
||||
run_tests=false          # -t|--test: run the pytest suite after building
build_release=true       # not read anywhere in this script
mpi_enabled=false        # -m|--mpi: build with MPI=1 (requires --mpi_home)
rccl_dir=/opt/rocm/rccl  # --rccl_home: RCCL installation to build against
mpi_dir=""               # --mpi_home: MPI installation prefix

# #################################################
# Parameter parsing
# #################################################

# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
  GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
else
  echo "Need a new version of getopt"
  exit 1
fi

# $? here is the status of the `if` compound above, i.e. the status of the last command it
# ran: the GETOPT_PARSE assignment, which carries getopt's own exit status.
if [[ $? -ne 0 ]]; then
  echo "getopt invocation failed; could not parse the command line";
  exit 1
fi

eval set -- "${GETOPT_PARSE}"

while true; do
  case "${1}" in
    -h|--help)
      display_help
      exit 0
      ;;
    -m|--mpi)
      mpi_enabled=true
      shift ;;
    -t|--test)
      run_tests=true
      shift ;;
    --rccl_home)
      rccl_dir=${2}
      shift 2 ;;
    --mpi_home)
      mpi_dir=${2}
      shift 2 ;;
    --) shift ; break ;;
    *)  echo "Unexpected command line parameter received; aborting";
        exit 1
        ;;
  esac
done

# Install the pre-commit hook
#bash ./githooks/install

build_dir=./build
# #################################################
# prep
# #################################################
# ensure a clean build environment
rm -rf "${build_dir}"

if [[ "${mpi_enabled}" == true ]]; then
  # -z tests for an empty string; the previous `-eq ""` was an arithmetic comparison and
  # raised a syntax error for any real path.
  if [[ -z "${mpi_dir}" ]]; then
    echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
    exit 1
  fi
  # Stop on a failed build: otherwise the trailing `if` below would end the script with
  # status 0 when tests are not requested, hiding the failure from callers such as CI.
  make NCCL_HOME="${rccl_dir}" CUSTOM_RCCL_LIB="${rccl_dir}/lib/librccl.so" MPI=1 MPI_HOME="${mpi_dir}" -j"$(nproc)" || exit $?
else
  make NCCL_HOME="${rccl_dir}" CUSTOM_RCCL_LIB="${rccl_dir}/lib/librccl.so" -j"$(nproc)" || exit $?
fi

# Optionally, run tests if they're enabled.
if [[ "${run_tests}" == true ]]; then
  # Do not run pytest from the wrong directory if the test folder is missing.
  cd test || exit 1
  if [[ "${mpi_enabled}" == true ]]; then
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${rccl_dir}/lib:${mpi_dir}/lib" PATH="${PATH}:${mpi_dir}/bin" python3 -m pytest
  else
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${rccl_dir}/lib" python3 -m pytest
  fi
fi
|
||||
@@ -292,7 +292,7 @@ void Barrier(struct threadArgs* args)
|
||||
args->barrier_idx=!args->barrier_idx;
|
||||
}
|
||||
|
||||
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
|
||||
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta, bool *error) {
|
||||
size_t count = args->expectedBytes/wordSize(type);
|
||||
double maxDelta = 0.0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
@@ -327,7 +327,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
#endif
|
||||
}
|
||||
double nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
|
||||
if (maxDelta > DeltaMaxValue(type)*(nranks - 1))
|
||||
{
|
||||
args->errors[0]++;
|
||||
*error = true;
|
||||
}
|
||||
*delta = maxDelta;
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -446,6 +450,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
Barrier(args);
|
||||
|
||||
double maxDelta = 0;
|
||||
bool error = false;
|
||||
static __thread int rep = 0;
|
||||
rep++;
|
||||
if (datacheck) {
|
||||
@@ -456,7 +461,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
TESTCHECK(startColl(args, type, op, root, in_place, 0));
|
||||
TESTCHECK(completeColl(args));
|
||||
|
||||
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
|
||||
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, &error));
|
||||
|
||||
//aggregate delta from all threads and procs
|
||||
Barrier(args);
|
||||
@@ -466,6 +471,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);
|
||||
#endif
|
||||
}
|
||||
Barrier(args);
|
||||
@@ -481,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
sprintf(timeStr, "%7.2f", timeUsec);
|
||||
}
|
||||
if (datacheck) {
|
||||
PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
|
||||
PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : "");
|
||||
} else {
|
||||
PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
|
||||
}
|
||||
@@ -757,7 +763,7 @@ testResult_t run() {
|
||||
#endif
|
||||
is_main_thread = (proc == 0) ? 1 : 0;
|
||||
|
||||
PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
|
||||
PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
|
||||
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
|
||||
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
|
||||
if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
|
||||
@@ -887,6 +893,7 @@ testResult_t run() {
|
||||
for (int t=nThreads-1; t>=0; t--) {
|
||||
if (t) pthread_join(threads[t].thread, NULL);
|
||||
TESTCHECK(threads[t].ret);
|
||||
|
||||
if (t) {
|
||||
errors[0] += errors[t];
|
||||
bw[0] += bw[t];
|
||||
@@ -927,6 +934,7 @@ testResult_t run() {
|
||||
double check_avg_bw = str ? atof(str) : -1;
|
||||
bw[0] /= bw_count[0];
|
||||
|
||||
if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n");
|
||||
PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
|
||||
PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
|
||||
PRINT("#\n");
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
@@ -0,0 +1,23 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
|
||||
# pytest hook: registers the --hostfile command-line option (string, default "").
# Tests read the value back through request.config.getoption('--hostfile').
def pytest_addoption(parser):
|
||||
parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")
|
||||
@@ -0,0 +1,102 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
|
||||
nthreads = ["1"]
|
||||
nprocs = ["2"]
|
||||
ngpus_single = ["1","2","4"]
|
||||
ngpus_mpi = ["1","2"]
|
||||
byte_range = [("4", "128M")]
|
||||
op = ["sum", "prod", "min", "max"]
|
||||
step_factor = ["2"]
|
||||
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
|
||||
memory_type = ["coarse","fine", "host"]
|
||||
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
executable = path + "/../build/all_gather_perf"
|
||||
|
||||
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run the all_gather_perf executable in a single process and require exit code 0.

    Each argument is a string taken from the module-level parameter lists;
    ``byte_range`` is a ``(min_bytes, max_bytes)`` pair passed as ``-b`` / ``-e``.
    On failure the captured stdout of the executable is printed so that it shows
    up in the pytest report.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]

    # Fine-grained memory runs need HSA_FORCE_FINE_GRAIN_PCIE=1 in the child's environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"

    # Argument list instead of a shell string: no quoting problems if the path has spaces.
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)

    # subprocess.run() does not raise on a non-zero exit unless check=True, so the
    # return code is inspected explicitly.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllGather test error(s) detected.")
|
||||
|
||||
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run the all_gather_perf executable under ``mpirun`` and require exit code 0.

    ``request`` gives access to the ``--hostfile`` pytest option; when it is set,
    its value is forwarded to ``mpirun``. The remaining arguments are strings from
    the module-level parameter lists. Memory type is not a parameter of this test,
    so no ``-y`` flag is passed to the executable.
    """
    mpi_hostfile = request.config.getoption('--hostfile')

    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the option is described as a hostfile but is forwarded via
        # `-host`; confirm whether `-hostfile` was intended for the MPI in use.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]

    # Show the exact command; the child's output is not captured and goes to pytest's capture.
    print(" ".join(args))
    rccl_test = subprocess.run(args, universal_newlines=True)

    # subprocess.run() does not raise on a non-zero exit unless check=True.
    if rccl_test.returncode != 0:
        pytest.fail("AllGather test error(s) detected.")
|
||||
@@ -0,0 +1,102 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
|
||||
nthreads = ["1"]
|
||||
nprocs = ["2"]
|
||||
ngpus_single = ["1","2","4"]
|
||||
ngpus_mpi = ["1","2"]
|
||||
byte_range = [("4", "128M")]
|
||||
op = ["sum", "prod", "min", "max"]
|
||||
step_factor = ["2"]
|
||||
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
|
||||
memory_type = ["coarse","fine", "host"]
|
||||
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
executable = path + "/../build/all_reduce_perf"
|
||||
|
||||
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run the all_reduce_perf executable in a single process and require exit code 0.

    Each argument is a string taken from the module-level parameter lists;
    ``byte_range`` is a ``(min_bytes, max_bytes)`` pair passed as ``-b`` / ``-e``.
    On failure the captured stdout of the executable is printed so that it shows
    up in the pytest report.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]

    # Fine-grained memory runs need HSA_FORCE_FINE_GRAIN_PCIE=1 in the child's environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"

    # Argument list instead of a shell string: no quoting problems if the path has spaces.
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)

    # subprocess.run() does not raise on a non-zero exit unless check=True, so the
    # return code is inspected explicitly.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllReduce test error(s) detected.")
|
||||
|
||||
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run the all_reduce_perf executable under ``mpirun`` and require exit code 0.

    ``request`` gives access to the ``--hostfile`` pytest option; when it is set,
    its value is forwarded to ``mpirun``. The remaining arguments are strings from
    the module-level parameter lists. Memory type is not a parameter of this test,
    so no ``-y`` flag is passed to the executable.
    """
    mpi_hostfile = request.config.getoption('--hostfile')

    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the option is described as a hostfile but is forwarded via
        # `-host`; confirm whether `-hostfile` was intended for the MPI in use.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]

    # Show the exact command; the child's output is not captured and goes to pytest's capture.
    print(" ".join(args))
    rccl_test = subprocess.run(args, universal_newlines=True)

    # subprocess.run() does not raise on a non-zero exit unless check=True.
    if rccl_test.returncode != 0:
        pytest.fail("AllReduce test error(s) detected.")
|
||||
@@ -0,0 +1,102 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
|
||||
nthreads = ["1"]
|
||||
nprocs = ["2"]
|
||||
ngpus_single = ["1","2","4"]
|
||||
ngpus_mpi = ["1","2"]
|
||||
byte_range = [("4", "128M")]
|
||||
op = ["sum", "prod", "min", "max"]
|
||||
step_factor = ["2"]
|
||||
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
|
||||
memory_type = ["coarse","fine", "host"]
|
||||
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
executable = path + "/../build/broadcast_perf"
|
||||
|
||||
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run the broadcast_perf executable in a single process and require exit code 0.

    Each argument is a string taken from the module-level parameter lists;
    ``byte_range`` is a ``(min_bytes, max_bytes)`` pair passed as ``-b`` / ``-e``.
    On failure the captured stdout of the executable is printed so that it shows
    up in the pytest report.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]

    # Fine-grained memory runs need HSA_FORCE_FINE_GRAIN_PCIE=1 in the child's environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"

    # Argument list instead of a shell string: no quoting problems if the path has spaces.
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)

    # subprocess.run() does not raise on a non-zero exit unless check=True, so the
    # return code is inspected explicitly.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("Broadcast test error(s) detected.")
|
||||
|
||||
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run the broadcast_perf executable under ``mpirun`` and require exit code 0.

    ``request`` gives access to the ``--hostfile`` pytest option; when it is set,
    its value is forwarded to ``mpirun``. The remaining arguments are strings from
    the module-level parameter lists. Memory type is not a parameter of this test,
    so no ``-y`` flag is passed to the executable.
    """
    mpi_hostfile = request.config.getoption('--hostfile')

    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the option is described as a hostfile but is forwarded via
        # `-host`; confirm whether `-hostfile` was intended for the MPI in use.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]

    # Show the exact command; the child's output is not captured and goes to pytest's capture.
    print(" ".join(args))
    rccl_test = subprocess.run(args, universal_newlines=True)

    # subprocess.run() does not raise on a non-zero exit unless check=True.
    if rccl_test.returncode != 0:
        pytest.fail("Broadcast test error(s) detected.")
|
||||
@@ -0,0 +1,102 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
|
||||
nthreads = ["1"]
|
||||
nprocs = ["2"]
|
||||
ngpus_single = ["1","2","4"]
|
||||
ngpus_mpi = ["1","2"]
|
||||
byte_range = [("4", "128M")]
|
||||
op = ["sum", "prod", "min", "max"]
|
||||
step_factor = ["2"]
|
||||
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
|
||||
memory_type = ["coarse","fine", "host"]
|
||||
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
executable = path + "/../build/reduce_perf"
|
||||
|
||||
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run the reduce_perf executable in a single process and require exit code 0.

    Each argument is a string taken from the module-level parameter lists;
    ``byte_range`` is a ``(min_bytes, max_bytes)`` pair passed as ``-b`` / ``-e``.
    On failure the captured stdout of the executable is printed so that it shows
    up in the pytest report.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]

    # Fine-grained memory runs need HSA_FORCE_FINE_GRAIN_PCIE=1 in the child's environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"

    # Argument list instead of a shell string: no quoting problems if the path has spaces.
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)

    # subprocess.run() does not raise on a non-zero exit unless check=True, so the
    # return code is inspected explicitly.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("Reduce test error(s) detected.")
|
||||
|
||||
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run the reduce_perf executable under ``mpirun`` and require exit code 0.

    ``request`` gives access to the ``--hostfile`` pytest option; when it is set,
    its value is forwarded to ``mpirun``. The remaining arguments are strings from
    the module-level parameter lists. Memory type is not a parameter of this test,
    so no ``-y`` flag is passed to the executable.
    """
    mpi_hostfile = request.config.getoption('--hostfile')

    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the option is described as a hostfile but is forwarded via
        # `-host`; confirm whether `-hostfile` was intended for the MPI in use.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]

    # Show the exact command; the child's output is not captured and goes to pytest's capture.
    print(" ".join(args))
    rccl_test = subprocess.run(args, universal_newlines=True)

    # subprocess.run() does not raise on a non-zero exit unless check=True.
    if rccl_test.returncode != 0:
        pytest.fail("Reduce test error(s) detected.")
|
||||
@@ -0,0 +1,102 @@
|
||||
#################################################################################
|
||||
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
# ies of the Software, and to permit persons to whom the Software is furnished
|
||||
# to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
################################################################################
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
|
||||
import pytest
|
||||
|
||||
# Parameter grids. Each test below is parametrized with the Cartesian product
# of the lists it names. Values are strings because they are joined directly
# into the benchmark's command line.
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1", "2", "4"]    # used by the single-process test
ngpus_mpi = ["1", "2"]            # used by the MPI test
byte_range = [("4", "128M")]      # (value for -b, value for -e)
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = [
    "int8", "uint8",
    "int32", "uint32",
    "int64", "uint64",
    "half", "float", "double",
]
memory_type = ["coarse", "fine", "host"]

# The benchmark binary is located relative to the directory holding this file.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_scatter_perf"
|
||||
|
||||
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run the ReduceScatter benchmark in one process and fail on a non-zero exit.

    The command line is assembled from the parametrized values, joined into a
    single string, and executed through the shell with stdout captured. On
    failure the captured output is printed before the test is failed.

    Args:
        nthreads: value passed to the benchmark's ``-t`` flag.
        ngpus_single: value passed to ``-g``.
        byte_range: ``(begin, end)`` pair passed to ``-b`` and ``-e``.
        op: value passed to ``-o``.
        step_factor: value passed to ``-f``.
        datatype: value passed to ``-d``.
        memory_type: value passed to ``-y``; ``"fine"`` additionally sets
            ``HSA_FORCE_FINE_GRAIN_PCIE=1`` for the run.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Shell-style ``VAR=value command`` prefix; this is why the command
        # is run with shell=True below.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)

    # subprocess.run() without check=True never raises CalledProcessError, so
    # the exit status is inspected explicitly; otherwise the captured stdout
    # would never be shown on failure.
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("ReduceScatter test error(s) detected.")
|
||||
|
||||
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run the ReduceScatter benchmark under ``mpirun`` and fail on a non-zero exit.

    The command line is assembled from the parametrized values, joined into a
    single string, echoed, and executed through the shell. When the pytest
    ``--hostfile`` option is set, its value is forwarded to ``mpirun`` via
    ``-host``.

    Args:
        request: pytest fixture used to read the ``--hostfile`` option.
        nthreads: value passed to the benchmark's ``-t`` flag.
        nprocs: number of MPI processes (``mpirun -np``).
        ngpus_mpi: value passed to the benchmark's ``-g`` flag.
        byte_range: ``(begin, end)`` pair passed to ``-b`` and ``-e``.
        op: value passed to ``-o``.
        step_factor: value passed to ``-f``.
        datatype: value passed to ``-d``.
    """
    mpi_hostfile = request.config.getoption('--hostfile')

    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the option is called --hostfile but is forwarded with
        # mpirun's -host flag; confirm whether a host list or a file path is
        # expected here.
        args += ["-host", mpi_hostfile]

    # ``memory_type`` is a module-level list, not a parameter of this test.
    # Passing it after "-y" made " ".join() raise TypeError whenever a
    # hostfile was given, and comparing it to "fine" was always false. No
    # "-y" flag is passed, matching the variant without a hostfile.
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]

    args_str = " ".join(args)
    print(args_str)

    # subprocess.run() without check=True never raises CalledProcessError, so
    # the exit status is inspected explicitly instead of via try/except.
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("ReduceScatter test error(s) detected.")
|
||||
Fai riferimento in un nuovo problema
Block a user