From 30d348de0e627310c4c1b3885fc4122b076240cb Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Fri, 28 Jun 2019 09:52:44 -0600 Subject: [PATCH] Adding unit tests and files for CI (#4) * Adding initial unit test and Jenkins code. Fixing scope of unit tests Adding unit tests and files for CI Fixing Jenkinsfile * Removing typos from Jenkinsfile * Making some fixes to the Jenkins file; temporarily disabling MPI * Making corrections to Jenkinsfile * Correcting dockerNodes entry in Jenkinsfile * Fixed Jenkinsfile for CI * Correcting Jenkinsfile for CI * Updating README to include instructions on how to run unit tests. [ROCm/rccl-tests commit: 924521ff570069f2969377001af9913b3b026065] --- projects/rccl-tests/Jenkinsfile | 82 ++++++++++++++ projects/rccl-tests/README.md | 12 +++ projects/rccl-tests/install.sh | 98 +++++++++++++++++ projects/rccl-tests/src/common.cu | 18 +++- projects/rccl-tests/test/__init__.py | 20 ++++ projects/rccl-tests/test/conftest.py | 23 ++++ projects/rccl-tests/test/test_AllGather.py | 102 ++++++++++++++++++ projects/rccl-tests/test/test_AllReduce.py | 102 ++++++++++++++++++ projects/rccl-tests/test/test_Broadcast.py | 102 ++++++++++++++++++ projects/rccl-tests/test/test_Reduce.py | 102 ++++++++++++++++++ .../rccl-tests/test/test_ReduceScatter.py | 102 ++++++++++++++++++ 11 files changed, 758 insertions(+), 5 deletions(-) create mode 100644 projects/rccl-tests/Jenkinsfile create mode 100755 projects/rccl-tests/install.sh create mode 100644 projects/rccl-tests/test/__init__.py create mode 100644 projects/rccl-tests/test/conftest.py create mode 100644 projects/rccl-tests/test/test_AllGather.py create mode 100644 projects/rccl-tests/test/test_AllReduce.py create mode 100644 projects/rccl-tests/test/test_Broadcast.py create mode 100644 projects/rccl-tests/test/test_Reduce.py create mode 100644 projects/rccl-tests/test/test_ReduceScatter.py diff --git a/projects/rccl-tests/Jenkinsfile b/projects/rccl-tests/Jenkinsfile new file mode 100644 index 
0000000000..7589636c68 --- /dev/null +++ b/projects/rccl-tests/Jenkinsfile @@ -0,0 +1,82 @@ +#!/usr/bin/env groovy +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS +@Library('rocJenkins@noDocker') _ + +// This file is for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. + +import com.amd.project.* +import com.amd.docker.* + +//////////////////////////////////////////////////////////////////////// +// Mostly generated from snippet generator 'properties; set job properties' +// Time-based triggers added to execute nightly tests, e.g. '30 2 * * *' means 2:30 AM; the cron below ('0 1 * * *') fires at 1:00 AM +properties([ + pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), + buildDiscarder(logRotator( + artifactDaysToKeepStr: '', + artifactNumToKeepStr: '', + daysToKeepStr: '', + numToKeepStr: '10')), + disableConcurrentBuilds(), + [$class: 'CopyArtifactPermissionProperty', projectNames: '*'] + ]) + + +//////////////////////////////////////////////////////////////////////// +import java.nio.file.Path; + +rcclTestsCI: +{ + def rcclTests = new rocProject('rcclTests') + // Customize for this project: the build command is ./install.sh, which compileCommand below invokes + rcclTests.paths.build_command = './install.sh' + + // Define test architectures, optional rocm version argument is available + def nodes = new dockerNodes(['RCCL'], rcclTests) + + boolean formatCheck = false + + def compileCommand = + { + platform, project-> + + project.paths.construct_build_prefix() + + def command = """#!/usr/bin/env bash + set -x + rm -rf rccl + git clone https://github.com/ROCmSoftwarePlatform/rccl + cd rccl + export RCCL_PATH=${WORKSPACE}/rccl/rccl-install + ./install.sh -i --prefix=\$RCCL_PATH + cd .. 
+ cd ${project.paths.project_build_prefix} + ${project.paths.build_command} --rccl_home=\$RCCL_PATH + """ + sh command + } + def testCommand = + { + platform, project-> + + def command = """#!/usr/bin/env bash + set -x + LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml + """ + + sh command + //junit "${project.paths.project_build_prefix}/build/release/*.xml" + } + + def packageCommand = + { + platform, project-> + + def command = """ + """ + } + + buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) +} diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index 2731d65c65..dc3120f119 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -64,6 +64,18 @@ All tests support the same set of arguments : * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. +## Unit tests + +Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests: + +1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests. +2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type. + +The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests: +```shell +$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest +``` + ## Copyright RCCL tests are provided under the BSD license. 
diff --git a/projects/rccl-tests/install.sh b/projects/rccl-tests/install.sh
new file mode 100755
index 0000000000..32e5dc4d4e
--- /dev/null
+++ b/projects/rccl-tests/install.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+
+# #################################################
+# helper functions
+# #################################################
+function display_help()
+{
+  echo "RCCL-tests build & installation helper script"
+  echo "./install [-h|--help] "
+  echo " [-h|--help] Prints this help message."
+  echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
+  echo " [-t|--test] Run the pytest unit tests after building."
+  echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
+  echo " [--mpi_home] Specify path to your MPI installation."
+}
+
+# #################################################
+# global variables
+# #################################################
+run_tests=false
+build_release=true
+mpi_enabled=false
+rccl_dir=/opt/rocm/rccl
+mpi_dir=""
+# #################################################
+# Parameter parsing
+# #################################################
+
+# check if we have a modern version of getopt that can handle whitespace and long parameters
+getopt -T
+if [[ $? -eq 4 ]]; then
+  GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
+else
+  echo "Need a new version of getopt"
+  exit 1
+fi
+
+# $? is still getopt's exit status: an if-statement returns the status of the last command it ran.
+if [[ $? -ne 0 ]]; then
+  echo "getopt invocation failed; could not parse the command line";
+  exit 1
+fi
+
+eval set -- "${GETOPT_PARSE}"
+
+while true; do
+  case "${1}" in
+    -h|--help)
+        display_help
+        exit 0
+        ;;
+    -m|--mpi)
+        mpi_enabled=true
+        shift ;;
+    -t|--test)
+        run_tests=true
+        shift ;;
+    --rccl_home)
+        rccl_dir=${2}
+        shift 2 ;;
+    --mpi_home)
+        mpi_dir=${2}
+        shift 2 ;;
+    --) shift ; break ;;
+    *)  echo "Unexpected command line parameter received; aborting";
+        exit 1
+        ;;
+  esac
+done
+
+build_dir=./build
+# #################################################
+# prep
+# #################################################
+# ensure a clean build environment
+rm -rf "${build_dir}"
+
+if ($mpi_enabled); then
+  # -z tests for an empty string; -eq would evaluate the path as an arithmetic expression.
+  if [[ -z "${mpi_dir}" ]]; then
+    echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
+    exit 1
+  else
+    make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
+  fi
+else
+  make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
+fi
+
+# Optionally, run tests if they're enabled.
+if ($run_tests); then + if ($mpi_enabled); then + cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest + else + cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest + fi +fi diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 61084eb1bd..d708a7a916 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -292,7 +292,7 @@ void Barrier(struct threadArgs* args) args->barrier_idx=!args->barrier_idx; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta, bool *error) { size_t count = args->expectedBytes/wordSize(type); double maxDelta = 0.0; for (int i=0; i<args->nGpus; i++) { @@ -327,7 +327,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif } double nranks = args->nProcs*args->nThreads*args->nGpus; - if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) + { + args->errors[0]++; + *error = true; + } *delta = maxDelta; return testSuccess; } @@ -446,6 +450,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); double maxDelta = 0; + bool error = false; static __thread int rep = 0; rep++; if (datacheck) { @@ -456,7 +461,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(startColl(args, type, op, root, in_place, 0)); TESTCHECK(completeColl(args)); - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, &error)); //aggregate delta from all threads and procs Barrier(args); @@ -466,6 +471,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, 
ncclRedOp_t } #ifdef MPI_SUPPORT MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); #endif } Barrier(args); @@ -481,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t sprintf(timeStr, "%7.2f", timeUsec); } if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : ""); } else { PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } @@ -757,7 +763,7 @@ testResult_t run() { #endif is_main_thread = (proc == 0) ? 1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, + PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); @@ -887,6 +893,7 @@ testResult_t run() { for (int t=nThreads-1; t>=0; t--) { if (t) pthread_join(threads[t].thread, NULL); TESTCHECK(threads[t].ret); + if (t) { errors[0] += errors[t]; bw[0] += bw[t]; @@ -927,6 +934,7 @@ testResult_t run() { double check_avg_bw = str ? atof(str) : -1; bw[0] /= bw_count[0]; + if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n"); PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? 
"FAILED" : "OK")); PRINT("#\n"); diff --git a/projects/rccl-tests/test/__init__.py b/projects/rccl-tests/test/__init__.py new file mode 100644 index 0000000000..cfd487930d --- /dev/null +++ b/projects/rccl-tests/test/__init__.py @@ -0,0 +1,20 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ \ No newline at end of file diff --git a/projects/rccl-tests/test/conftest.py b/projects/rccl-tests/test/conftest.py new file mode 100644 index 0000000000..79ce9b8ef8 --- /dev/null +++ b/projects/rccl-tests/test/conftest.py @@ -0,0 +1,23 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +def pytest_addoption(parser): # pytest hook: registers --hostfile (default ""), which the *MPI tests read via request.config.getoption(); NOTE(review): they pass the value to mpirun -host, confirm that a hostfile is really what is expected + parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile") \ No newline at end of file diff --git a/projects/rccl-tests/test/test_AllGather.py b/projects/rccl-tests/test/test_AllGather.py new file mode 100644 index 0000000000..2d3d74bcef --- /dev/null +++ b/projects/rccl-tests/test/test_AllGather.py @@ -0,0 +1,102 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = ["1","2","4"]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/all_gather_perf"
+
+
+def _run_rccl_test(args, memory_type):
+    """Run one all_gather_perf command line and fail the test if it exits non-zero.
+
+    args is the argument list to execute; it is run directly, without a shell.
+    memory_type is the value given to -y; for "fine" the child environment also
+    gets HSA_FORCE_FINE_GRAIN_PCIE=1.
+    """
+    env = dict(os.environ)
+    if memory_type == "fine":
+        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
+    print(" ".join(args))
+    # subprocess.run() raises CalledProcessError only with check=True, so the exit
+    # status is inspected explicitly; the output is captured to print it on failure.
+    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                               universal_newlines=True, env=env)
+    if rccl_test.returncode != 0:
+        print(rccl_test.stdout)
+        pytest.fail("AllGather test error(s) detected.")
+
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_gather_perf in a single process for one parameter combination."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    _run_rccl_test(args, memory_type)
+
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_gather_perf under mpirun for one parameter combination.
+
+    memory_type is parametrized like in the single-process test so that -y receives
+    one value per run instead of the module-level list.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+    args = ["mpirun", "-np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): --hostfile is documented as a hostfile, but mpirun's -host
+        # switch is what receives it; confirm which of the two is intended.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p", "1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    _run_rccl_test(args, memory_type)
\ No newline at end of file
diff --git a/projects/rccl-tests/test/test_AllReduce.py b/projects/rccl-tests/test/test_AllReduce.py
new file mode 100644
index 0000000000..b3cb5f99ff
--- /dev/null
+++ b/projects/rccl-tests/test/test_AllReduce.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = ["1","2","4"]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/all_reduce_perf"
+
+
+def _run_rccl_test(args, memory_type):
+    """Run one all_reduce_perf command line and fail the test if it exits non-zero.
+
+    args is the argument list to execute; it is run directly, without a shell.
+    memory_type is the value given to -y; for "fine" the child environment also
+    gets HSA_FORCE_FINE_GRAIN_PCIE=1.
+    """
+    env = dict(os.environ)
+    if memory_type == "fine":
+        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
+    print(" ".join(args))
+    # subprocess.run() raises CalledProcessError only with check=True, so the exit
+    # status is inspected explicitly; the output is captured to print it on failure.
+    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                               universal_newlines=True, env=env)
+    if rccl_test.returncode != 0:
+        print(rccl_test.stdout)
+        pytest.fail("AllReduce test error(s) detected.")
+
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_reduce_perf in a single process for one parameter combination."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    _run_rccl_test(args, memory_type)
+
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run all_reduce_perf under mpirun for one parameter combination.
+
+    memory_type is parametrized like in the single-process test so that -y receives
+    one value per run instead of the module-level list.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+    args = ["mpirun", "-np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): --hostfile is documented as a hostfile, but mpirun's -host
+        # switch is what receives it; confirm which of the two is intended.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p", "1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    _run_rccl_test(args, memory_type)
\ No newline at end of file
diff --git a/projects/rccl-tests/test/test_Broadcast.py b/projects/rccl-tests/test/test_Broadcast.py
new file mode 100644
index 0000000000..f4b8b38363
--- /dev/null
+++ b/projects/rccl-tests/test/test_Broadcast.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = ["1","2","4"]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/broadcast_perf"
+
+
+def _run_rccl_test(args, memory_type):
+    """Run one broadcast_perf command line and fail the test if it exits non-zero.
+
+    args is the argument list to execute; it is run directly, without a shell.
+    memory_type is the value given to -y; for "fine" the child environment also
+    gets HSA_FORCE_FINE_GRAIN_PCIE=1.
+    """
+    env = dict(os.environ)
+    if memory_type == "fine":
+        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
+    print(" ".join(args))
+    # subprocess.run() raises CalledProcessError only with check=True, so the exit
+    # status is inspected explicitly; the output is captured to print it on failure.
+    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                               universal_newlines=True, env=env)
+    if rccl_test.returncode != 0:
+        print(rccl_test.stdout)
+        pytest.fail("Broadcast test error(s) detected.")
+
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run broadcast_perf in a single process for one parameter combination."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    _run_rccl_test(args, memory_type)
+
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run broadcast_perf under mpirun for one parameter combination.
+
+    memory_type is parametrized like in the single-process test so that -y receives
+    one value per run instead of the module-level list.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+    args = ["mpirun", "-np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): --hostfile is documented as a hostfile, but mpirun's -host
+        # switch is what receives it; confirm which of the two is intended.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p", "1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    _run_rccl_test(args, memory_type)
\ No newline at end of file
diff --git a/projects/rccl-tests/test/test_Reduce.py b/projects/rccl-tests/test/test_Reduce.py
new file mode 100644
index 0000000000..5df694490d
--- /dev/null
+++ b/projects/rccl-tests/test/test_Reduce.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = ["1","2","4"]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/reduce_perf"
+
+
+def _run_rccl_test(args, memory_type):
+    """Run one reduce_perf command line and fail the test if it exits non-zero.
+
+    args is the argument list to execute; it is run directly, without a shell.
+    memory_type is the value given to -y; for "fine" the child environment also
+    gets HSA_FORCE_FINE_GRAIN_PCIE=1.
+    """
+    env = dict(os.environ)
+    if memory_type == "fine":
+        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
+    print(" ".join(args))
+    # subprocess.run() raises CalledProcessError only with check=True, so the exit
+    # status is inspected explicitly; the output is captured to print it on failure.
+    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                               universal_newlines=True, env=env)
+    if rccl_test.returncode != 0:
+        print(rccl_test.stdout)
+        pytest.fail("Reduce test error(s) detected.")
+
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_perf in a single process for one parameter combination."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    _run_rccl_test(args, memory_type)
+
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_perf under mpirun for one parameter combination.
+
+    memory_type is parametrized like in the single-process test so that -y receives
+    one value per run instead of the module-level list.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+    args = ["mpirun", "-np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): --hostfile is documented as a hostfile, but mpirun's -host
+        # switch is what receives it; confirm which of the two is intended.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p", "1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    _run_rccl_test(args, memory_type)
\ No newline at end of file
diff --git a/projects/rccl-tests/test/test_ReduceScatter.py b/projects/rccl-tests/test/test_ReduceScatter.py
new file mode 100644
index 0000000000..66b431b00a
--- /dev/null
+++ b/projects/rccl-tests/test/test_ReduceScatter.py
@@ -0,0 +1,102 @@
+#################################################################################
+# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+# ies of the Software, and to permit persons to whom the Software is furnished
+# to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+################################################################################
+
+import os
+import subprocess
+import itertools
+
+import pytest
+
+nthreads = ["1"]
+nprocs = ["2"]
+ngpus_single = ["1","2","4"]
+ngpus_mpi = ["1","2"]
+byte_range = [("4", "128M")]
+op = ["sum", "prod", "min", "max"]
+step_factor = ["2"]
+datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
+memory_type = ["coarse","fine", "host"]
+
+path = os.path.dirname(os.path.abspath(__file__))
+executable = path + "/../build/reduce_scatter_perf"
+
+
+def _run_rccl_test(args, memory_type):
+    """Run one reduce_scatter_perf command line and fail the test if it exits non-zero.
+
+    args is the argument list to execute; it is run directly, without a shell.
+    memory_type is the value given to -y; for "fine" the child environment also
+    gets HSA_FORCE_FINE_GRAIN_PCIE=1.
+    """
+    env = dict(os.environ)
+    if memory_type == "fine":
+        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"
+    print(" ".join(args))
+    # subprocess.run() raises CalledProcessError only with check=True, so the exit
+    # status is inspected explicitly; the output is captured to print it on failure.
+    rccl_test = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                               universal_newlines=True, env=env)
+    if rccl_test.returncode != 0:
+        print(rccl_test.stdout)
+        pytest.fail("ReduceScatter test error(s) detected.")
+
+
+@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_scatter_perf in a single process for one parameter combination."""
+    args = [executable,
+            "-t", nthreads,
+            "-g", ngpus_single,
+            "-b", byte_range[0],
+            "-e", byte_range[1],
+            "-o", op,
+            "-f", step_factor,
+            "-d", datatype,
+            "-y", memory_type]
+    _run_rccl_test(args, memory_type)
+
+
+@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type",
+    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type))
+def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype, memory_type):
+    """Run reduce_scatter_perf under mpirun for one parameter combination.
+
+    memory_type is parametrized like in the single-process test so that -y receives
+    one value per run instead of the module-level list.
+    """
+    mpi_hostfile = request.config.getoption('--hostfile')
+    args = ["mpirun", "-np", nprocs]
+    if mpi_hostfile:
+        # NOTE(review): --hostfile is documented as a hostfile, but mpirun's -host
+        # switch is what receives it; confirm which of the two is intended.
+        args += ["-host", mpi_hostfile]
+    args += [executable,
+             "-p", "1",
+             "-t", nthreads,
+             "-g", ngpus_mpi,
+             "-b", byte_range[0],
+             "-e", byte_range[1],
+             "-o", op,
+             "-f", step_factor,
+             "-d", datatype,
+             "-y", memory_type]
+    _run_rccl_test(args, memory_type)
\ No newline at end of file