Scaling tests to #ngpus (#81)
* scaling tests to #ngpus
Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
* switching to rocminfo
---------
Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>
[ROCm/rccl-tests commit: ae3e6357cb]
This commit is contained in:
@@ -22,12 +22,22 @@
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import itertools
|
import itertools
|
||||||
|
import math
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
ngpus = 0
|
||||||
|
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
|
||||||
|
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
|
||||||
|
else:
|
||||||
|
ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
|
||||||
|
log_ngpus = int(math.log2(ngpus))
|
||||||
|
|
||||||
nthreads = ["1"]
|
nthreads = ["1"]
|
||||||
nprocs = ["2"]
|
nprocs = ["2"]
|
||||||
ngpus_single = ["1","2","4"]
|
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
|
||||||
ngpus_mpi = ["1","2"]
|
ngpus_mpi = ["1","2"]
|
||||||
byte_range = [("4", "128M")]
|
byte_range = [("4", "128M")]
|
||||||
op = ["sum", "prod", "min", "max"]
|
op = ["sum", "prod", "min", "max"]
|
||||||
@@ -99,4 +109,4 @@ def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
|
|||||||
print(rccl_test.stdout)
|
print(rccl_test.stdout)
|
||||||
pytest.fail("AllGather test error(s) detected.")
|
pytest.fail("AllGather test error(s) detected.")
|
||||||
|
|
||||||
assert rccl_test.returncode == 0
|
assert rccl_test.returncode == 0
|
||||||
|
|||||||
@@ -22,12 +22,22 @@
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import itertools
|
import itertools
|
||||||
|
import math
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
ngpus = 0
|
||||||
|
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
|
||||||
|
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
|
||||||
|
else:
|
||||||
|
ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
|
||||||
|
log_ngpus = int(math.log2(ngpus))
|
||||||
|
|
||||||
nthreads = ["1"]
|
nthreads = ["1"]
|
||||||
nprocs = ["2"]
|
nprocs = ["2"]
|
||||||
ngpus_single = ["1","2","4"]
|
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
|
||||||
ngpus_mpi = ["1","2"]
|
ngpus_mpi = ["1","2"]
|
||||||
byte_range = [("4", "128M")]
|
byte_range = [("4", "128M")]
|
||||||
op = ["sum", "prod", "min", "max"]
|
op = ["sum", "prod", "min", "max"]
|
||||||
@@ -99,4 +109,4 @@ def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
|
|||||||
print(rccl_test.stdout)
|
print(rccl_test.stdout)
|
||||||
pytest.fail("AllReduce test error(s) detected.")
|
pytest.fail("AllReduce test error(s) detected.")
|
||||||
|
|
||||||
assert rccl_test.returncode == 0
|
assert rccl_test.returncode == 0
|
||||||
|
|||||||
@@ -22,12 +22,22 @@
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import itertools
|
import itertools
|
||||||
|
import math
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
ngpus = 0
|
||||||
|
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
|
||||||
|
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
|
||||||
|
else:
|
||||||
|
ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
|
||||||
|
log_ngpus = int(math.log2(ngpus))
|
||||||
|
|
||||||
nthreads = ["1"]
|
nthreads = ["1"]
|
||||||
nprocs = ["2"]
|
nprocs = ["2"]
|
||||||
ngpus_single = ["1","2","4"]
|
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
|
||||||
ngpus_mpi = ["1","2"]
|
ngpus_mpi = ["1","2"]
|
||||||
byte_range = [("4", "128M")]
|
byte_range = [("4", "128M")]
|
||||||
op = ["sum", "prod", "min", "max"]
|
op = ["sum", "prod", "min", "max"]
|
||||||
@@ -99,4 +109,4 @@ def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
|
|||||||
print(rccl_test.stdout)
|
print(rccl_test.stdout)
|
||||||
pytest.fail("Broadcast test error(s) detected.")
|
pytest.fail("Broadcast test error(s) detected.")
|
||||||
|
|
||||||
assert rccl_test.returncode == 0
|
assert rccl_test.returncode == 0
|
||||||
|
|||||||
@@ -22,12 +22,22 @@
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import itertools
|
import itertools
|
||||||
|
import math
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
ngpus = 0
|
||||||
|
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
|
||||||
|
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
|
||||||
|
else:
|
||||||
|
ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
|
||||||
|
log_ngpus = int(math.log2(ngpus))
|
||||||
|
|
||||||
nthreads = ["1"]
|
nthreads = ["1"]
|
||||||
nprocs = ["2"]
|
nprocs = ["2"]
|
||||||
ngpus_single = ["1","2","4"]
|
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
|
||||||
ngpus_mpi = ["1","2"]
|
ngpus_mpi = ["1","2"]
|
||||||
byte_range = [("4", "128M")]
|
byte_range = [("4", "128M")]
|
||||||
op = ["sum", "prod", "min", "max"]
|
op = ["sum", "prod", "min", "max"]
|
||||||
@@ -99,4 +109,4 @@ def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_fa
|
|||||||
print(rccl_test.stdout)
|
print(rccl_test.stdout)
|
||||||
pytest.fail("Reduce test error(s) detected.")
|
pytest.fail("Reduce test error(s) detected.")
|
||||||
|
|
||||||
assert rccl_test.returncode == 0
|
assert rccl_test.returncode == 0
|
||||||
|
|||||||
@@ -22,12 +22,22 @@
|
|||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import itertools
|
import itertools
|
||||||
|
import math
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
ngpus = 0
|
||||||
|
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
|
||||||
|
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
|
||||||
|
ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
|
||||||
|
else:
|
||||||
|
ngpus = int(subprocess.check_output("rocminfo | grep \"Device Type:.\s*.GPU\" | wc -l",shell=True))
|
||||||
|
log_ngpus = int(math.log2(ngpus))
|
||||||
|
|
||||||
nthreads = ["1"]
|
nthreads = ["1"]
|
||||||
nprocs = ["2"]
|
nprocs = ["2"]
|
||||||
ngpus_single = ["1","2","4"]
|
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
|
||||||
ngpus_mpi = ["1","2"]
|
ngpus_mpi = ["1","2"]
|
||||||
byte_range = [("4", "128M")]
|
byte_range = [("4", "128M")]
|
||||||
op = ["sum", "prod", "min", "max"]
|
op = ["sum", "prod", "min", "max"]
|
||||||
@@ -99,4 +109,4 @@ def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op,
|
|||||||
print(rccl_test.stdout)
|
print(rccl_test.stdout)
|
||||||
pytest.fail("ReduceScatter test error(s) detected.")
|
pytest.fail("ReduceScatter test error(s) detected.")
|
||||||
|
|
||||||
assert rccl_test.returncode == 0
|
assert rccl_test.returncode == 0
|
||||||
|
|||||||
Reference in New Issue
Block a user