c5ab7dc5b5
* scaling tests to #ngpus
Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
* switching to rocminfo
---------
Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
[ROCm/rccl-tests commit: ae3e6357cb]
113 lines
4.7 KiB
Python
113 lines
4.7 KiB
Python
#################################################################################
|
|
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
# of this software and associated documentation files (the "Software"), to deal
|
|
# in the Software without restriction, including without limitation the rights
|
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
|
# ies of the Software, and to permit persons to whom the Software is furnished
|
|
# to do so, subject to the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be included in all
|
|
# copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
|
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
|
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
|
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
################################################################################
|
|
|
|
import os
|
|
import subprocess
|
|
import itertools
|
|
import math
|
|
|
|
import pytest
|
|
|
|
ngpus = 0
|
|
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
|
|
ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
|
|
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
|
|
ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
|
|
else:
|
|
ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
|
|
log_ngpus = int(math.log2(ngpus))
|
|
|
|
nthreads = ["1"]
|
|
nprocs = ["2"]
|
|
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
|
|
ngpus_mpi = ["1","2"]
|
|
byte_range = [("4", "128M")]
|
|
op = ["sum", "prod", "min", "max"]
|
|
step_factor = ["2"]
|
|
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
|
|
memory_type = ["coarse","fine", "host"]
|
|
|
|
path = os.path.dirname(os.path.abspath(__file__))
|
|
executable = path + "/../build/broadcast_perf"
|
|
|
|
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
|
|
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
|
|
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
|
|
try:
|
|
args = [executable,
|
|
"-t", nthreads,
|
|
"-g", ngpus_single,
|
|
"-b", byte_range[0],
|
|
"-e", byte_range[1],
|
|
"-o", op,
|
|
"-f", step_factor,
|
|
"-d", datatype,
|
|
"-y", memory_type]
|
|
if memory_type == "fine":
|
|
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
|
|
args_str = " ".join(args)
|
|
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
|
|
except subprocess.CalledProcessError as err:
|
|
print(rccl_test.stdout)
|
|
pytest.fail("Broadcast test error(s) detected.")
|
|
|
|
assert rccl_test.returncode == 0
|
|
|
|
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
|
|
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
|
|
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
|
|
try:
|
|
mpi_hostfile = request.config.getoption('--hostfile')
|
|
if not mpi_hostfile:
|
|
args = ["mpirun -np", nprocs,
|
|
executable,
|
|
"-p 1",
|
|
"-t", nthreads,
|
|
"-g", ngpus_mpi,
|
|
"-b", byte_range[0],
|
|
"-e", byte_range[1],
|
|
"-o", op,
|
|
"-f", step_factor,
|
|
"-d", datatype]
|
|
else:
|
|
args = ["mpirun -np", nprocs,
|
|
"-host", mpi_hostfile,
|
|
executable,
|
|
"-p 1",
|
|
"-t", nthreads,
|
|
"-g", ngpus_mpi,
|
|
"-b", byte_range[0],
|
|
"-e", byte_range[1],
|
|
"-o", op,
|
|
"-f", step_factor,
|
|
"-d", datatype,
|
|
"-y", memory_type]
|
|
if memory_type == "fine":
|
|
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
|
|
args_str = " ".join(args)
|
|
print(args_str)
|
|
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
|
|
except subprocess.CalledProcessError as err:
|
|
print(rccl_test.stdout)
|
|
pytest.fail("Broadcast test error(s) detected.")
|
|
|
|
assert rccl_test.returncode == 0
|