Scaling tests to #ngpus (#81)
* scaling tests to #ngpus
Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
* switching to rocminfo
---------
Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
[ROCm/rccl-tests commit: ae3e6357cb]
This commit is contained in:
@@ -22,12 +22,22 @@
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
def detect_ngpus(environ=None):
    """Return how many GPUs the tests should scale up to.

    The count comes from the first of these sources that is available:

    1. ``ROCR_VISIBLE_DEVICES`` -- number of comma-separated entries.
    2. ``HIP_VISIBLE_DEVICES`` -- number of comma-separated entries.
    3. ``rocminfo`` -- number of lines matching a GPU "Device Type".

    Args:
        environ: Mapping to read the variables from; defaults to
            ``os.environ``.

    Returns:
        The GPU count, or 0 when no device is listed or the ``rocminfo``
        pipeline cannot be run or parsed.
    """
    if environ is None:
        environ = os.environ
    for variable in ("ROCR_VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES"):
        visible = environ.get(variable)
        if visible is not None:
            # Blank entries (empty value, trailing comma) name no device;
            # a bare "".split(",") would otherwise report one GPU.
            return sum(1 for entry in visible.split(",") if entry.strip())
    try:
        # Raw string: "\s" has to reach grep untouched, and it is an invalid
        # escape sequence inside a regular Python string literal.
        return int(subprocess.check_output(
            r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))
    except (OSError, ValueError, subprocess.CalledProcessError):
        return 0


ngpus = detect_ngpus()
# floor(log2(ngpus)) in integer arithmetic. Yields -1 for zero GPUs, where
# math.log2(0) would raise ValueError while the module is being imported.
log_ngpus = ngpus.bit_length() - 1

nthreads = ["1"]
nprocs = ["2"]
# Powers of two up to the detected GPU count (empty when there are none).
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
|
||||
@@ -99,4 +109,4 @@ def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
|
||||
print(rccl_test.stdout)
|
||||
pytest.fail("AllGather test error(s) detected.")
|
||||
|
||||
assert rccl_test.returncode == 0
|
||||
assert rccl_test.returncode == 0
|
||||
|
||||
@@ -22,12 +22,22 @@
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
def detect_ngpus(environ=None):
    """Return how many GPUs the tests should scale up to.

    The count comes from the first of these sources that is available:

    1. ``ROCR_VISIBLE_DEVICES`` -- number of comma-separated entries.
    2. ``HIP_VISIBLE_DEVICES`` -- number of comma-separated entries.
    3. ``rocminfo`` -- number of lines matching a GPU "Device Type".

    Args:
        environ: Mapping to read the variables from; defaults to
            ``os.environ``.

    Returns:
        The GPU count, or 0 when no device is listed or the ``rocminfo``
        pipeline cannot be run or parsed.
    """
    if environ is None:
        environ = os.environ
    for variable in ("ROCR_VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES"):
        visible = environ.get(variable)
        if visible is not None:
            # Blank entries (empty value, trailing comma) name no device;
            # a bare "".split(",") would otherwise report one GPU.
            return sum(1 for entry in visible.split(",") if entry.strip())
    try:
        # Raw string: "\s" has to reach grep untouched, and it is an invalid
        # escape sequence inside a regular Python string literal.
        return int(subprocess.check_output(
            r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))
    except (OSError, ValueError, subprocess.CalledProcessError):
        return 0


ngpus = detect_ngpus()
# floor(log2(ngpus)) in integer arithmetic. Yields -1 for zero GPUs, where
# math.log2(0) would raise ValueError while the module is being imported.
log_ngpus = ngpus.bit_length() - 1

nthreads = ["1"]
nprocs = ["2"]
# Powers of two up to the detected GPU count (empty when there are none).
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
|
||||
@@ -99,4 +109,4 @@ def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
|
||||
print(rccl_test.stdout)
|
||||
pytest.fail("AllReduce test error(s) detected.")
|
||||
|
||||
assert rccl_test.returncode == 0
|
||||
assert rccl_test.returncode == 0
|
||||
|
||||
@@ -22,12 +22,22 @@
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
def detect_ngpus(environ=None):
    """Return how many GPUs the tests should scale up to.

    The count comes from the first of these sources that is available:

    1. ``ROCR_VISIBLE_DEVICES`` -- number of comma-separated entries.
    2. ``HIP_VISIBLE_DEVICES`` -- number of comma-separated entries.
    3. ``rocminfo`` -- number of lines matching a GPU "Device Type".

    Args:
        environ: Mapping to read the variables from; defaults to
            ``os.environ``.

    Returns:
        The GPU count, or 0 when no device is listed or the ``rocminfo``
        pipeline cannot be run or parsed.
    """
    if environ is None:
        environ = os.environ
    for variable in ("ROCR_VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES"):
        visible = environ.get(variable)
        if visible is not None:
            # Blank entries (empty value, trailing comma) name no device;
            # a bare "".split(",") would otherwise report one GPU.
            return sum(1 for entry in visible.split(",") if entry.strip())
    try:
        # Raw string: "\s" has to reach grep untouched, and it is an invalid
        # escape sequence inside a regular Python string literal.
        return int(subprocess.check_output(
            r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))
    except (OSError, ValueError, subprocess.CalledProcessError):
        return 0


ngpus = detect_ngpus()
# floor(log2(ngpus)) in integer arithmetic. Yields -1 for zero GPUs, where
# math.log2(0) would raise ValueError while the module is being imported.
log_ngpus = ngpus.bit_length() - 1

nthreads = ["1"]
nprocs = ["2"]
# Powers of two up to the detected GPU count (empty when there are none).
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
|
||||
@@ -99,4 +109,4 @@ def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
|
||||
print(rccl_test.stdout)
|
||||
pytest.fail("Broadcast test error(s) detected.")
|
||||
|
||||
assert rccl_test.returncode == 0
|
||||
assert rccl_test.returncode == 0
|
||||
|
||||
@@ -22,12 +22,22 @@
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
def detect_ngpus(environ=None):
    """Return how many GPUs the tests should scale up to.

    The count comes from the first of these sources that is available:

    1. ``ROCR_VISIBLE_DEVICES`` -- number of comma-separated entries.
    2. ``HIP_VISIBLE_DEVICES`` -- number of comma-separated entries.
    3. ``rocminfo`` -- number of lines matching a GPU "Device Type".

    Args:
        environ: Mapping to read the variables from; defaults to
            ``os.environ``.

    Returns:
        The GPU count, or 0 when no device is listed or the ``rocminfo``
        pipeline cannot be run or parsed.
    """
    if environ is None:
        environ = os.environ
    for variable in ("ROCR_VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES"):
        visible = environ.get(variable)
        if visible is not None:
            # Blank entries (empty value, trailing comma) name no device;
            # a bare "".split(",") would otherwise report one GPU.
            return sum(1 for entry in visible.split(",") if entry.strip())
    try:
        # Raw string: "\s" has to reach grep untouched, and it is an invalid
        # escape sequence inside a regular Python string literal.
        return int(subprocess.check_output(
            r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))
    except (OSError, ValueError, subprocess.CalledProcessError):
        return 0


ngpus = detect_ngpus()
# floor(log2(ngpus)) in integer arithmetic. Yields -1 for zero GPUs, where
# math.log2(0) would raise ValueError while the module is being imported.
log_ngpus = ngpus.bit_length() - 1

nthreads = ["1"]
nprocs = ["2"]
# Powers of two up to the detected GPU count (empty when there are none).
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
|
||||
@@ -99,4 +109,4 @@ def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_fa
|
||||
print(rccl_test.stdout)
|
||||
pytest.fail("Reduce test error(s) detected.")
|
||||
|
||||
assert rccl_test.returncode == 0
|
||||
assert rccl_test.returncode == 0
|
||||
|
||||
@@ -22,12 +22,22 @@
|
||||
import os
|
||||
import subprocess
|
||||
import itertools
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
def detect_ngpus(environ=None):
    """Return how many GPUs the tests should scale up to.

    The count comes from the first of these sources that is available:

    1. ``ROCR_VISIBLE_DEVICES`` -- number of comma-separated entries.
    2. ``HIP_VISIBLE_DEVICES`` -- number of comma-separated entries.
    3. ``rocminfo`` -- number of lines matching a GPU "Device Type".

    Args:
        environ: Mapping to read the variables from; defaults to
            ``os.environ``.

    Returns:
        The GPU count, or 0 when no device is listed or the ``rocminfo``
        pipeline cannot be run or parsed.
    """
    if environ is None:
        environ = os.environ
    for variable in ("ROCR_VISIBLE_DEVICES", "HIP_VISIBLE_DEVICES"):
        visible = environ.get(variable)
        if visible is not None:
            # Blank entries (empty value, trailing comma) name no device;
            # a bare "".split(",") would otherwise report one GPU.
            return sum(1 for entry in visible.split(",") if entry.strip())
    try:
        # Raw string: "\s" has to reach grep untouched, and it is an invalid
        # escape sequence inside a regular Python string literal.
        return int(subprocess.check_output(
            r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))
    except (OSError, ValueError, subprocess.CalledProcessError):
        return 0


ngpus = detect_ngpus()
# floor(log2(ngpus)) in integer arithmetic. Yields -1 for zero GPUs, where
# math.log2(0) would raise ValueError while the module is being imported.
log_ngpus = ngpus.bit_length() - 1

nthreads = ["1"]
nprocs = ["2"]
# Powers of two up to the detected GPU count (empty when there are none).
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
|
||||
@@ -99,4 +109,4 @@ def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op,
|
||||
print(rccl_test.stdout)
|
||||
pytest.fail("ReduceScatter test error(s) detected.")
|
||||
|
||||
assert rccl_test.returncode == 0
|
||||
assert rccl_test.returncode == 0
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user