Adding unit tests and files for CI (#4)

* Adding initial unit test and Jenkins code.

Fixing scope of unit tests

Adding unit tests and files for CI

Fixing Jenkinsfile

* Removing typos from Jenkinsfile

* Making some fixes to the Jenkins file; temporarily disabling MPI

* Making corrections to Jenkinsfile

* Correcting dockerNodes entry in Jenkinsfile

* Fixed Jenkinsfile for CI

* Correcting Jenkinsfile for CI

* Updating README to include instructions on how to run unit tests.


[ROCm/rccl-tests commit: 924521ff57]
This commit is contained in:
Stanley Tsang
2019-06-28 09:52:44 -06:00
committato da Wenkai Du
parent 75b5e43633
commit 30d348de0e
11 ha cambiato i file con 758 aggiunte e 5 eliminazioni
+82
Vedi File
@@ -0,0 +1,82 @@
#!/usr/bin/env groovy
// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS
@Library('rocJenkins@noDocker') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
////////////////////////////////////////////////////////////////////////
// Mostly generated from snippet generator 'properties; set job properties'
// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM
// Job-level configuration applied every time this pipeline script is loaded.
properties([
// Two triggers: a cron schedule ('0 1 * * *' = minute 0, hour 1, every day) and a
// PeriodicFolderTrigger with a 5 minute interval.
pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]),
// Build retention: only numToKeepStr is set, so retention is limited by build count (10);
// the day-based and artifact-based limits are left empty.
buildDiscarder(logRotator(
artifactDaysToKeepStr: '',
artifactNumToKeepStr: '',
daysToKeepStr: '',
numToKeepStr: '10')),
// Do not run two builds of this job at the same time.
disableConcurrentBuilds(),
// Allow any project ('*') to copy artifacts produced by this job.
[$class: 'CopyArtifactPermissionProperty', projectNames: '*']
])
////////////////////////////////////////////////////////////////////////
import java.nio.file.Path;
// Pipeline entry block: configures the rcclTests project, defines the compile/test/package
// closures and hands them to buildProjectNoDocker.
// NOTE(review): rocProject, dockerNodes and buildProjectNoDocker are not defined in this file;
// presumably they come from the rocJenkins shared library loaded above - confirm.
rcclTestsCI:
{
def rcclTests = new rocProject('rcclTests')
// customize for project
rcclTests.paths.build_command = './install.sh'
// Define test architectures, optional rocm version argument is available
def nodes = new dockerNodes(['RCCL'], rcclTests)
boolean formatCheck = false
// Compile step: clones RCCL, installs it under ${WORKSPACE}/rccl/rccl-install, then runs the
// project's build command from the project build prefix with --rccl_home pointing at that install.
def compileCommand =
{
platform, project->
project.paths.construct_build_prefix()
// ${...} is interpolated by Groovy when the string is built; \$RCCL_PATH is escaped so that
// bash expands it at run time.
def command = """#!/usr/bin/env bash
set -x
rm -rf rccl
git clone https://github.com/ROCmSoftwarePlatform/rccl
cd rccl
export RCCL_PATH=${WORKSPACE}/rccl/rccl-install
./install.sh -i --prefix=\$RCCL_PATH
cd ..
cd ${project.paths.project_build_prefix}
${project.paths.build_command} --rccl_home=\$RCCL_PATH
"""
sh command
}
// Test step: runs pytest with the RCCL install's lib directory appended to LD_LIBRARY_PATH,
// deselecting tests whose name matches "MPI", and writes a JUnit report to ./testreport.xml.
def testCommand =
{
platform, project->
def command = """#!/usr/bin/env bash
set -x
LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml
"""
sh command
// Publishing of JUnit results is currently disabled; the pattern below also does not match
// the ./testreport.xml path written above.
//junit "${project.paths.project_build_prefix}/build/release/*.xml"
}
// Package step: placeholder. It only builds a blank script string and never executes it.
def packageCommand =
{
platform, project->
def command = """
"""
}
buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}
+12
Vedi File
@@ -64,6 +64,18 @@ All tests support the same set of arguments :
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
## Unit tests
Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests:
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests:
```shell
$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
```
## Copyright
RCCL tests are provided under the BSD license.
+98
Vedi File
@@ -0,0 +1,98 @@
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# helper functions
# #################################################
# Print the usage text for this script to stdout.
# Every option accepted by the getopt call further down must be listed here;
# -t|--test was previously accepted but undocumented.
function display_help()
{
  echo "RCCL-tests build & installation helper script"
  echo "./install [-h|--help] "
  echo " [-h|--help] Prints this help message."
  echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
  echo " [-t|--test] Run the unit tests (python3 -m pytest) after building."
  echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
  echo " [--mpi_home] Specify path to your MPI installation."
}
# #################################################
# global variables
# #################################################
# Defaults; each may be overridden by a command-line option below.
run_tests=false
build_release=true
mpi_enabled=false
rccl_dir=/opt/rocm/rccl
mpi_dir=""

# #################################################
# Parameter parsing
# #################################################

# An enhanced getopt (whitespace-safe, long options) reports exit status 4 for -T.
getopt -T
if [[ $? -ne 4 ]]; then
  echo "Need a new version of getopt"
  exit 1
fi

# Normalize the command line; abort if getopt rejects it.
if ! GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@"); then
  echo "getopt invocation failed; could not parse the command line";
  exit 1
fi

# Replace the positional parameters with getopt's normalized form.
eval set -- "${GETOPT_PARSE}"

# Consume options until the "--" terminator emitted by getopt.
while :; do
  case "${1}" in
    -h|--help)
      display_help
      exit 0
      ;;
    -m|--mpi)
      mpi_enabled=true
      shift
      ;;
    -t|--test)
      run_tests=true
      shift
      ;;
    --rccl_home)
      rccl_dir=${2}
      shift 2
      ;;
    --mpi_home)
      mpi_dir=${2}
      shift 2
      ;;
    --)
      shift
      break
      ;;
    *)
      echo "Unexpected command line parameter received; aborting";
      exit 1
      ;;
  esac
done
# Install the pre-commit hook
#bash ./githooks/install
build_dir=./build

# #################################################
# prep
# #################################################
# ensure a clean build environment
rm -rf "${build_dir}"

# #################################################
# build
# #################################################
if [[ "${mpi_enabled}" == true ]]; then
  # Use the string-emptiness test. The previous `-eq ""` forced arithmetic
  # evaluation of the path: absolute paths triggered a syntax error, and
  # identifier-like relative paths evaluated to 0 and were wrongly rejected.
  if [[ -z "${mpi_dir}" ]]; then
    echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
    exit 1
  fi
  # Propagate a failed build instead of continuing (and possibly exiting 0).
  make NCCL_HOME="${rccl_dir}" CUSTOM_RCCL_LIB="${rccl_dir}/lib/librccl.so" MPI=1 MPI_HOME="${mpi_dir}" -j"$(nproc)" || exit $?
else
  make NCCL_HOME="${rccl_dir}" CUSTOM_RCCL_LIB="${rccl_dir}/lib/librccl.so" -j"$(nproc)" || exit $?
fi

# Optionally, run tests if they're enabled.
# The script's exit status is then the exit status of pytest.
if [[ "${run_tests}" == true ]]; then
  if [[ "${mpi_enabled}" == true ]]; then
    # MPI runs additionally need the MPI libraries and mpirun on the search paths.
    cd test; LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib" PATH="$PATH:${mpi_dir}/bin" python3 -m pytest
  else
    cd test; LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${rccl_dir}/lib" python3 -m pytest
  fi
fi
+13 -5
Vedi File
@@ -292,7 +292,7 @@ void Barrier(struct threadArgs* args)
args->barrier_idx=!args->barrier_idx;
}
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta, bool *error) {
size_t count = args->expectedBytes/wordSize(type);
double maxDelta = 0.0;
for (int i=0; i<args->nGpus; i++) {
@@ -327,7 +327,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
#endif
}
double nranks = args->nProcs*args->nThreads*args->nGpus;
if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
if (maxDelta > DeltaMaxValue(type)*(nranks - 1))
{
args->errors[0]++;
*error = true;
}
*delta = maxDelta;
return testSuccess;
}
@@ -446,6 +450,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
Barrier(args);
double maxDelta = 0;
bool error = false;
static __thread int rep = 0;
rep++;
if (datacheck) {
@@ -456,7 +461,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
TESTCHECK(startColl(args, type, op, root, in_place, 0));
TESTCHECK(completeColl(args));
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, &error));
//aggregate delta from all threads and procs
Barrier(args);
@@ -466,6 +471,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
}
#ifdef MPI_SUPPORT
MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);
#endif
}
Barrier(args);
@@ -481,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
sprintf(timeStr, "%7.2f", timeUsec);
}
if (datacheck) {
PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta);
PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : "");
} else {
PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
}
@@ -757,7 +763,7 @@ testResult_t run() {
#endif
is_main_thread = (proc == 0) ? 1 : 0;
PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes,
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
@@ -887,6 +893,7 @@ testResult_t run() {
for (int t=nThreads-1; t>=0; t--) {
if (t) pthread_join(threads[t].thread, NULL);
TESTCHECK(threads[t].ret);
if (t) {
errors[0] += errors[t];
bw[0] += bw[t];
@@ -927,6 +934,7 @@ testResult_t run() {
double check_avg_bw = str ? atof(str) : -1;
bw[0] /= bw_count[0];
if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n");
PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
PRINT("#\n");
+20
Vedi File
@@ -0,0 +1,20 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
+23
Vedi File
@@ -0,0 +1,23 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
def pytest_addoption(parser):
    """Register the ``--hostfile`` command-line option with pytest.

    The option stores a string and defaults to the empty string when it is
    not given on the command line.
    """
    option_settings = {
        "action": "store",
        "default": "",
        "help": "specify MPI hostfile",
    }
    parser.addoption("--hostfile", **option_settings)
@@ -0,0 +1,102 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter axes for the parametrized tests in this module. Every value is a
# string because it is passed straight through as a command-line argument.
nthreads = ["1"]                 # -t
nprocs = ["2"]                   # mpirun -np
ngpus_single = ["1", "2", "4"]   # -g, single-process tests
ngpus_mpi = ["1", "2"]           # -g, MPI tests
byte_range = [("4", "128M")]     # (-b, -e) pairs
op = [                           # -o
    "sum",
    "prod",
    "min",
    "max",
]
step_factor = ["2"]              # -f
datatype = [                     # -d
    "int8", "uint8",
    "int32", "uint32",
    "int64", "uint64",
    "half", "float", "double",
]
memory_type = [                  # -y
    "coarse",
    "fine",
    "host",
]

# Directory containing this test module; the binary lives in ../build relative to it.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_gather_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run all_gather_perf in a single process and require a zero exit status.

    byte_range is a (min, max) pair forwarded as -b/-e; every other parameter
    is forwarded verbatim to the flag shown in the argument list below.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    # Pass the argument list directly (no shell) so a checkout path containing
    # spaces still works; the fine-grained switch goes through the environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)
    # subprocess.run() without check=True never raises CalledProcessError, so
    # inspect the return code and surface the captured output on failure.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
    assert rccl_test.returncode == 0, "AllGather test error(s) detected."
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
    """Run all_gather_perf under mpirun and require a zero exit status.

    memory_type is now a real test parameter. Previously the name resolved to
    the module-level list, which made the "fine" check always false and made
    joining the hostfile argument list raise TypeError.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the pytest option is described as a hostfile but is
        # forwarded with -host; confirm which mpirun flag is intended.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype,
             "-y", memory_type]
    # The fine-grained switch is handed to mpirun through its environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    print(" ".join(args))
    # Output is not captured here; it goes to pytest's own capture.
    rccl_test = subprocess.run(args, universal_newlines=True, env=env)
    assert rccl_test.returncode == 0, "AllGather test error(s) detected."
@@ -0,0 +1,102 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter axes for the parametrized tests in this module. Every value is a
# string because it is passed straight through as a command-line argument.
nthreads = ["1"]                 # -t
nprocs = ["2"]                   # mpirun -np
ngpus_single = ["1", "2", "4"]   # -g, single-process tests
ngpus_mpi = ["1", "2"]           # -g, MPI tests
byte_range = [("4", "128M")]     # (-b, -e) pairs
op = [                           # -o
    "sum",
    "prod",
    "min",
    "max",
]
step_factor = ["2"]              # -f
datatype = [                     # -d
    "int8", "uint8",
    "int32", "uint32",
    "int64", "uint64",
    "half", "float", "double",
]
memory_type = [                  # -y
    "coarse",
    "fine",
    "host",
]

# Directory containing this test module; the binary lives in ../build relative to it.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run all_reduce_perf in a single process and require a zero exit status.

    byte_range is a (min, max) pair forwarded as -b/-e; every other parameter
    is forwarded verbatim to the flag shown in the argument list below.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    # Pass the argument list directly (no shell) so a checkout path containing
    # spaces still works; the fine-grained switch goes through the environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)
    # subprocess.run() without check=True never raises CalledProcessError, so
    # inspect the return code and surface the captured output on failure.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
    assert rccl_test.returncode == 0, "AllReduce test error(s) detected."
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
    """Run all_reduce_perf under mpirun and require a zero exit status.

    memory_type is now a real test parameter. Previously the name resolved to
    the module-level list, which made the "fine" check always false and made
    joining the hostfile argument list raise TypeError.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the pytest option is described as a hostfile but is
        # forwarded with -host; confirm which mpirun flag is intended.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype,
             "-y", memory_type]
    # The fine-grained switch is handed to mpirun through its environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    print(" ".join(args))
    # Output is not captured here; it goes to pytest's own capture.
    rccl_test = subprocess.run(args, universal_newlines=True, env=env)
    assert rccl_test.returncode == 0, "AllReduce test error(s) detected."
@@ -0,0 +1,102 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter axes for the parametrized tests in this module. Every value is a
# string because it is passed straight through as a command-line argument.
nthreads = ["1"]                 # -t
nprocs = ["2"]                   # mpirun -np
ngpus_single = ["1", "2", "4"]   # -g, single-process tests
ngpus_mpi = ["1", "2"]           # -g, MPI tests
byte_range = [("4", "128M")]     # (-b, -e) pairs
op = [                           # -o
    "sum",
    "prod",
    "min",
    "max",
]
step_factor = ["2"]              # -f
datatype = [                     # -d
    "int8", "uint8",
    "int32", "uint32",
    "int64", "uint64",
    "half", "float", "double",
]
memory_type = [                  # -y
    "coarse",
    "fine",
    "host",
]

# Directory containing this test module; the binary lives in ../build relative to it.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/broadcast_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run broadcast_perf in a single process and require a zero exit status.

    byte_range is a (min, max) pair forwarded as -b/-e; every other parameter
    is forwarded verbatim to the flag shown in the argument list below.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    # Pass the argument list directly (no shell) so a checkout path containing
    # spaces still works; the fine-grained switch goes through the environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)
    # subprocess.run() without check=True never raises CalledProcessError, so
    # inspect the return code and surface the captured output on failure.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
    assert rccl_test.returncode == 0, "Broadcast test error(s) detected."
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
    """Run broadcast_perf under mpirun and require a zero exit status.

    memory_type is now a real test parameter. Previously the name resolved to
    the module-level list, which made the "fine" check always false and made
    joining the hostfile argument list raise TypeError.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the pytest option is described as a hostfile but is
        # forwarded with -host; confirm which mpirun flag is intended.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype,
             "-y", memory_type]
    # The fine-grained switch is handed to mpirun through its environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    print(" ".join(args))
    # Output is not captured here; it goes to pytest's own capture.
    rccl_test = subprocess.run(args, universal_newlines=True, env=env)
    assert rccl_test.returncode == 0, "Broadcast test error(s) detected."
+102
Vedi File
@@ -0,0 +1,102 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter axes for the parametrized tests in this module. Every value is a
# string because it is passed straight through as a command-line argument.
nthreads = ["1"]                 # -t
nprocs = ["2"]                   # mpirun -np
ngpus_single = ["1", "2", "4"]   # -g, single-process tests
ngpus_mpi = ["1", "2"]           # -g, MPI tests
byte_range = [("4", "128M")]     # (-b, -e) pairs
op = [                           # -o
    "sum",
    "prod",
    "min",
    "max",
]
step_factor = ["2"]              # -f
datatype = [                     # -d
    "int8", "uint8",
    "int32", "uint32",
    "int64", "uint64",
    "half", "float", "double",
]
memory_type = [                  # -y
    "coarse",
    "fine",
    "host",
]

# Directory containing this test module; the binary lives in ../build relative to it.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_perf in a single process and require a zero exit status.

    byte_range is a (min, max) pair forwarded as -b/-e; every other parameter
    is forwarded verbatim to the flag shown in the argument list below.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    # Pass the argument list directly (no shell) so a checkout path containing
    # spaces still works; the fine-grained switch goes through the environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)
    # subprocess.run() without check=True never raises CalledProcessError, so
    # inspect the return code and surface the captured output on failure.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
    assert rccl_test.returncode == 0, "Reduce test error(s) detected."
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_perf under mpirun and require a zero exit status.

    memory_type is now a real test parameter. Previously the name resolved to
    the module-level list, which made the "fine" check always false and made
    joining the hostfile argument list raise TypeError.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the pytest option is described as a hostfile but is
        # forwarded with -host; confirm which mpirun flag is intended.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype,
             "-y", memory_type]
    # The fine-grained switch is handed to mpirun through its environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    print(" ".join(args))
    # Output is not captured here; it goes to pytest's own capture.
    rccl_test = subprocess.run(args, universal_newlines=True, env=env)
    assert rccl_test.returncode == 0, "Reduce test error(s) detected."
@@ -0,0 +1,102 @@
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter axes for the parametrized tests in this module. Every value is a
# string because it is passed straight through as a command-line argument.
nthreads = ["1"]                 # -t
nprocs = ["2"]                   # mpirun -np
ngpus_single = ["1", "2", "4"]   # -g, single-process tests
ngpus_mpi = ["1", "2"]           # -g, MPI tests
byte_range = [("4", "128M")]     # (-b, -e) pairs
op = [                           # -o
    "sum",
    "prod",
    "min",
    "max",
]
step_factor = ["2"]              # -f
datatype = [                     # -d
    "int8", "uint8",
    "int32", "uint32",
    "int64", "uint64",
    "half", "float", "double",
]
memory_type = [                  # -y
    "coarse",
    "fine",
    "host",
]

# Directory containing this test module; the binary lives in ../build relative to it.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_scatter_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_scatter_perf in a single process and require a zero exit status.

    byte_range is a (min, max) pair forwarded as -b/-e; every other parameter
    is forwarded verbatim to the flag shown in the argument list below.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    # Pass the argument list directly (no shell) so a checkout path containing
    # spaces still works; the fine-grained switch goes through the environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, universal_newlines=True, env=env)
    # subprocess.run() without check=True never raises CalledProcessError, so
    # inspect the return code and surface the captured output on failure.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
    assert rccl_test.returncode == 0, "ReduceScatter test error(s) detected."
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_scatter_perf under mpirun and require a zero exit status.

    memory_type is now a real test parameter. Previously the name resolved to
    the module-level list, which made the "fine" check always false and made
    joining the hostfile argument list raise TypeError.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        # NOTE(review): the pytest option is described as a hostfile but is
        # forwarded with -host; confirm which mpirun flag is intended.
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p", "1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype,
             "-y", memory_type]
    # The fine-grained switch is handed to mpirun through its environment.
    env = dict(os.environ)
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
    print(" ".join(args))
    # Output is not captured here; it goes to pytest's own capture.
    rccl_test = subprocess.run(args, universal_newlines=True, env=env)
    assert rccl_test.returncode == 0, "ReduceScatter test error(s) detected."