gtest: dynamically generate tests based on test machine's GPU count (#467)

* gtest: dynamically generate tests based on test machine's GPU count

* Adjust test element size and bfloat16 threshold for up to 16 GPUs
Этот коммит содержится в:
Wenkai Du
2021-11-16 10:28:26 -08:00
коммит произвёл GitHub
родитель a6dba6b9dd
Коммит 03a830293c
14 изменённых файлов: 30 добавлений и 15 удалений
+7
Просмотреть файл
@@ -67,6 +67,13 @@ if(BUILD_TESTS)
target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device)
# Determine, at configure time, how many GPUs the build machine exposes and
# pass that count to the UnitTests sources as GTESTS_NUM_GPUS. The test
# sources use it as the upper bound of testing::Range(2, GTESTS_NUM_GPUS+1),
# so the value must never be below 2 or the range would be empty/invalid.
find_program( rocminfo_executable rocminfo )
set(gtest_num_gpus "")
if(rocminfo_executable)
# Only query the device list when rocminfo was actually found; otherwise the
# variable holds "<name>-NOTFOUND" and the pipeline would just emit an error.
execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus)
string(STRIP "${gtest_num_gpus}" gtest_num_gpus)
endif()
# Fall back to 2 when detection failed (empty / non-numeric output) or fewer
# than two GPUs were reported. A numeric LESS comparison is used on purpose:
# an unanchored MATCHES "0" / MATCHES "1" would also hit 10..19, 20, 21, ...
# and wrongly clamp large multi-GPU machines.
if(NOT "${gtest_num_gpus}" MATCHES "^[0-9]+$" OR "${gtest_num_gpus}" LESS 2)
set(gtest_num_gpus "2")
endif()
target_compile_definitions(UnitTests PRIVATE GTESTS_NUM_GPUS=${gtest_num_gpus})
# UnitTests using static library of rccl requires passing rccl
# through -l and -L instead of command line input.
if(BUILD_STATIC)
+1 -1
Просмотреть файл
@@ -812,7 +812,7 @@ dropback:
case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break;
case ncclFloat32: isMatch &= (fabs(outputF4[j] - expectedF4[j]) < 1e-5); break;
case ncclFloat64: isMatch &= (fabs(outputF8[j] - expectedF8[j]) < 1e-12); break;
case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 5e-2); break;
case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 9e-2); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
+3 -1
Просмотреть файл
@@ -9,6 +9,8 @@ namespace CorrectnessTests
{
TEST_P(AllGatherCorrectnessTest, Correctness)
{
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
@@ -107,7 +109,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+2 -2
Просмотреть файл
@@ -46,7 +46,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
@@ -71,7 +71,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+2 -2
Просмотреть файл
@@ -58,7 +58,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
@@ -74,7 +74,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+1 -1
Просмотреть файл
@@ -59,7 +59,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+1 -1
Просмотреть файл
@@ -67,7 +67,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+1 -1
Просмотреть файл
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+3 -1
Просмотреть файл
@@ -27,6 +27,8 @@ namespace CorrectnessTests
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
for (int i = 0; i < datasets.size(); i++)
{
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
@@ -119,7 +121,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
+1 -1
Просмотреть файл
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+3 -1
Просмотреть файл
@@ -26,6 +26,8 @@ namespace CorrectnessTests
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
for (int i = 0; i < datasets.size(); i++)
{
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
@@ -120,7 +122,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+1 -1
Просмотреть файл
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+3 -1
Просмотреть файл
@@ -10,6 +10,8 @@ namespace CorrectnessTests
{
TEST_P(ReduceScatterCorrectnessTest, Correctness)
{
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
@@ -61,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+1 -1
Просмотреть файл
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),