Scaling tests to #ngpus (#81)

* scaling tests to #ngpus

Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>

* switching to rocminfo

---------

Signed-off-by: AtlantaPepsi <hyj1999110@gmail.com>

[ROCm/rccl-tests commit: ae3e6357cb]
이 커밋은 다음에 포함됨:
Tim
2024-09-10 19:05:22 -04:00
커밋한 사람 GitHub
부모 ee4dd140bf
커밋 c5ab7dc5b5
5개의 파일이 변경됨: 60줄 추가, 10줄 삭제
+12 -2
파일 보기
@@ -22,12 +22,22 @@
import os
import subprocess
import itertools
import math
import pytest
# Determine how many GPUs the tests may use. An explicit device list in
# ROCR_VISIBLE_DEVICES is checked first, then HIP_VISIBLE_DEVICES; when neither
# variable is set, count the rocminfo output lines that report a GPU device type.
ngpus = 0
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
    # Skip empty entries so "" or a trailing comma is not counted as a device.
    ngpus = len([dev for dev in os.environ['ROCR_VISIBLE_DEVICES'].split(",") if dev.strip()])
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
    ngpus = len([dev for dev in os.environ['HIP_VISIBLE_DEVICES'].split(",") if dev.strip()])
else:
    # Raw string: "\s" belongs to the grep pattern and is not a Python escape.
    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))

# math.log2(0) would fail with an opaque "math domain error"; report the real
# cause instead (the pipeline above also yields 0 when rocminfo is unavailable).
if ngpus < 1:
    raise ValueError("No GPUs detected: check ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES or the rocminfo installation")

# Exponent of the largest power of two that does not exceed the GPU count.
log_ngpus = int(math.log2(ngpus))

# Candidate values for the test parameters, kept as strings.
nthreads = ["1"]
nprocs = ["2"]
# Every power of two from 1 up to 2**log_ngpus.
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
# NOTE(review): presumably (min, max) message sizes -- confirm against the test bodies.
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
print(rccl_test.stdout)
pytest.fail("AllGather test error(s) detected.")
assert rccl_test.returncode == 0
assert rccl_test.returncode == 0
+12 -2
파일 보기
@@ -22,12 +22,22 @@
import os
import subprocess
import itertools
import math
import pytest
# Determine how many GPUs the tests may use. An explicit device list in
# ROCR_VISIBLE_DEVICES is checked first, then HIP_VISIBLE_DEVICES; when neither
# variable is set, count the rocminfo output lines that report a GPU device type.
ngpus = 0
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
    # Skip empty entries so "" or a trailing comma is not counted as a device.
    ngpus = len([dev for dev in os.environ['ROCR_VISIBLE_DEVICES'].split(",") if dev.strip()])
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
    ngpus = len([dev for dev in os.environ['HIP_VISIBLE_DEVICES'].split(",") if dev.strip()])
else:
    # Raw string: "\s" belongs to the grep pattern and is not a Python escape.
    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))

# math.log2(0) would fail with an opaque "math domain error"; report the real
# cause instead (the pipeline above also yields 0 when rocminfo is unavailable).
if ngpus < 1:
    raise ValueError("No GPUs detected: check ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES or the rocminfo installation")

# Exponent of the largest power of two that does not exceed the GPU count.
log_ngpus = int(math.log2(ngpus))

# Candidate values for the test parameters, kept as strings.
nthreads = ["1"]
nprocs = ["2"]
# Every power of two from 1 up to 2**log_ngpus.
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
# NOTE(review): presumably (min, max) message sizes -- confirm against the test bodies.
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
print(rccl_test.stdout)
pytest.fail("AllReduce test error(s) detected.")
assert rccl_test.returncode == 0
assert rccl_test.returncode == 0
+12 -2
파일 보기
@@ -22,12 +22,22 @@
import os
import subprocess
import itertools
import math
import pytest
# Determine how many GPUs the tests may use. An explicit device list in
# ROCR_VISIBLE_DEVICES is checked first, then HIP_VISIBLE_DEVICES; when neither
# variable is set, count the rocminfo output lines that report a GPU device type.
ngpus = 0
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
    # Skip empty entries so "" or a trailing comma is not counted as a device.
    ngpus = len([dev for dev in os.environ['ROCR_VISIBLE_DEVICES'].split(",") if dev.strip()])
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
    ngpus = len([dev for dev in os.environ['HIP_VISIBLE_DEVICES'].split(",") if dev.strip()])
else:
    # Raw string: "\s" belongs to the grep pattern and is not a Python escape.
    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))

# math.log2(0) would fail with an opaque "math domain error"; report the real
# cause instead (the pipeline above also yields 0 when rocminfo is unavailable).
if ngpus < 1:
    raise ValueError("No GPUs detected: check ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES or the rocminfo installation")

# Exponent of the largest power of two that does not exceed the GPU count.
log_ngpus = int(math.log2(ngpus))

# Candidate values for the test parameters, kept as strings.
nthreads = ["1"]
nprocs = ["2"]
# Every power of two from 1 up to 2**log_ngpus.
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
# NOTE(review): presumably (min, max) message sizes -- confirm against the test bodies.
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step
print(rccl_test.stdout)
pytest.fail("Broadcast test error(s) detected.")
assert rccl_test.returncode == 0
assert rccl_test.returncode == 0
+12 -2
파일 보기
@@ -22,12 +22,22 @@
import os
import subprocess
import itertools
import math
import pytest
# Determine how many GPUs the tests may use. An explicit device list in
# ROCR_VISIBLE_DEVICES is checked first, then HIP_VISIBLE_DEVICES; when neither
# variable is set, count the rocminfo output lines that report a GPU device type.
ngpus = 0
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
    # Skip empty entries so "" or a trailing comma is not counted as a device.
    ngpus = len([dev for dev in os.environ['ROCR_VISIBLE_DEVICES'].split(",") if dev.strip()])
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
    ngpus = len([dev for dev in os.environ['HIP_VISIBLE_DEVICES'].split(",") if dev.strip()])
else:
    # Raw string: "\s" belongs to the grep pattern and is not a Python escape.
    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))

# math.log2(0) would fail with an opaque "math domain error"; report the real
# cause instead (the pipeline above also yields 0 when rocminfo is unavailable).
if ngpus < 1:
    raise ValueError("No GPUs detected: check ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES or the rocminfo installation")

# Exponent of the largest power of two that does not exceed the GPU count.
log_ngpus = int(math.log2(ngpus))

# Candidate values for the test parameters, kept as strings.
nthreads = ["1"]
nprocs = ["2"]
# Every power of two from 1 up to 2**log_ngpus.
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
# NOTE(review): presumably (min, max) message sizes -- confirm against the test bodies.
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_fa
print(rccl_test.stdout)
pytest.fail("Reduce test error(s) detected.")
assert rccl_test.returncode == 0
assert rccl_test.returncode == 0
+12 -2
파일 보기
@@ -22,12 +22,22 @@
import os
import subprocess
import itertools
import math
import pytest
# Determine how many GPUs the tests may use. An explicit device list in
# ROCR_VISIBLE_DEVICES is checked first, then HIP_VISIBLE_DEVICES; when neither
# variable is set, count the rocminfo output lines that report a GPU device type.
ngpus = 0
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
    # Skip empty entries so "" or a trailing comma is not counted as a device.
    ngpus = len([dev for dev in os.environ['ROCR_VISIBLE_DEVICES'].split(",") if dev.strip()])
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
    ngpus = len([dev for dev in os.environ['HIP_VISIBLE_DEVICES'].split(",") if dev.strip()])
else:
    # Raw string: "\s" belongs to the grep pattern and is not a Python escape.
    ngpus = int(subprocess.check_output(r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))

# math.log2(0) would fail with an opaque "math domain error"; report the real
# cause instead (the pipeline above also yields 0 when rocminfo is unavailable).
if ngpus < 1:
    raise ValueError("No GPUs detected: check ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES or the rocminfo installation")

# Exponent of the largest power of two that does not exceed the GPU count.
log_ngpus = int(math.log2(ngpus))

# Candidate values for the test parameters, kept as strings.
nthreads = ["1"]
nprocs = ["2"]
# Every power of two from 1 up to 2**log_ngpus.
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
# NOTE(review): presumably (min, max) message sizes -- confirm against the test bodies.
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
@@ -99,4 +109,4 @@ def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op,
print(rccl_test.stdout)
pytest.fail("ReduceScatter test error(s) detected.")
assert rccl_test.returncode == 0
assert rccl_test.returncode == 0