Fixed unit-test env var list parsing and improved filtered test run speed (#1626)
* Fixed parsing of env var lists, which was overwriting the mutable env var string and polluting future parses.
* Fixed all tests to obey UT_DATATYPES and UT_REDOPS filters.
* Allow tests to bail early via `GTEST_SKIP` if UT_DATATYPES or UT_REDOPS filters give a test size of zero. This allows tests to run much faster with filters on.
* Wrapped the support checks in helper functions on `TestBed`.
[ROCm/rccl commit: 18e9ad913b]
This commit is contained in:
committed by
GitHub
parent
2e0abab81a
commit
de82a18790
@@ -79,13 +79,19 @@ namespace RcclUnitTesting
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat64, ncclFloat16};
|
||||
std::vector<ncclDataType_t> const& testDataTypes = {ncclInt32, ncclFloat64, ncclFloat16};
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
bool const useHipGraph = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks : testBed.ev.GetNumGpusList())
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
@@ -137,13 +143,19 @@ namespace RcclUnitTesting
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat32, ncclInt8};
|
||||
std::vector<ncclDataType_t> const& testDataTypes = {ncclFloat32, ncclInt8};
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
bool const useHipGraph = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks : testBed.ev.GetNumGpusList())
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
|
||||
@@ -14,14 +14,26 @@ namespace RcclUnitTesting
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce, ncclCollAllReduce, ncclCollAllReduce};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclSum, ncclSum};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat, ncclFloat, ncclFloat};
|
||||
std::vector<ncclRedOp_t> const testRedOps = {ncclSum, ncclSum, ncclSum};
|
||||
std::vector<ncclDataType_t> const testDataTypes = {ncclFloat, ncclFloat, ncclFloat};
|
||||
std::vector<int> const numElements = {1048576, 384 * 1024, 384};
|
||||
|
||||
int const numCollPerGroup = numElements.size();
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
std::vector<ncclRedOp_t> redOps;
|
||||
testBed.GetSupportedRedOps(redOps, testRedOps);
|
||||
if (redOps.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks : testBed.ev.GetNumGpusList())
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
@@ -127,14 +139,26 @@ namespace RcclUnitTesting
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce, ncclCollAllReduce, ncclCollAllReduce};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclSum, ncclSum};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat16, ncclFloat32, ncclFloat64};
|
||||
std::vector<ncclRedOp_t> const testRedOps = {ncclSum, ncclSum, ncclSum};
|
||||
std::vector<ncclDataType_t> const testDataTypes = {ncclFloat16, ncclFloat32, ncclFloat64};
|
||||
std::vector<int> const numElements = {1048576, 384 * 1024, 384};
|
||||
|
||||
int const numCollPerGroup = numElements.size();
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
std::vector<ncclRedOp_t> redOps;
|
||||
testBed.GetSupportedRedOps(redOps, testRedOps);
|
||||
if (redOps.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks : testBed.ev.GetNumGpusList())
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
@@ -230,8 +254,8 @@ namespace RcclUnitTesting
|
||||
{ncclCollAllToAll, ncclCollGather},
|
||||
{ncclCollBroadcast, ncclCollReduceScatter}};
|
||||
std::vector<std::vector<int>> const numElements = {{1250, 1048576}, {384, 384 * 1024}, {1048576, 127}};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat16, ncclFloat32, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redops = {ncclSum, ncclProd, ncclMax};
|
||||
std::vector<ncclDataType_t> const testDataTypes = {ncclFloat16, ncclFloat32, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const testRedOps = {ncclSum, ncclProd, ncclMax};
|
||||
std::vector<int> const numCollsPerGroup = {2, 2, 2};
|
||||
std::vector<int> const numStreamsPerGroup = {1, 1, 1};
|
||||
std::vector<bool> const useHipGraphList = {true, false, true};
|
||||
@@ -241,6 +265,18 @@ namespace RcclUnitTesting
|
||||
int const numGroupCalls = groupCalls.size();
|
||||
int const numIterations = 10;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
std::vector<ncclRedOp_t> redOps;
|
||||
testBed.GetSupportedRedOps(redOps, testRedOps);
|
||||
if (redOps.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks : testBed.ev.GetNumGpusList())
|
||||
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
|
||||
@@ -258,7 +294,7 @@ namespace RcclUnitTesting
|
||||
{
|
||||
std::vector<ncclFunc_t> funcTypes = groupCalls[groupCallIdx];
|
||||
OptionalColArgs options;
|
||||
options.redOp = redops[groupCallIdx];
|
||||
options.redOp = redOps[groupCallIdx];
|
||||
options.root = 0;
|
||||
|
||||
for (int collIdx = 0; collIdx < numCollsPerGroup[groupCallIdx]; ++collIdx)
|
||||
|
||||
@@ -12,12 +12,19 @@ namespace RcclUnitTesting
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
|
||||
std::vector<ncclDataType_t> const& testDataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024, 0};
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
OptionalColArgs options;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
int numGpus = testBed.ev.maxGpus;
|
||||
for (int rpg=0; rpg < 2 && isCorrect; ++rpg)
|
||||
@@ -104,13 +111,20 @@ namespace RcclUnitTesting
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
|
||||
std::vector<ncclDataType_t> const& testDataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
bool const userRegistered = true;
|
||||
|
||||
OptionalColArgs options;
|
||||
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
int numGpus = testBed.ev.maxGpus;
|
||||
for (int rpg=0; rpg < 2 && isCorrect; ++rpg)
|
||||
|
||||
@@ -9,7 +9,9 @@
|
||||
#include <cstdlib>
|
||||
#include <unistd.h>
|
||||
#include <sys/wait.h>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace RcclUnitTesting
|
||||
@@ -337,11 +339,13 @@ namespace RcclUnitTesting
|
||||
std::vector<std::string> result;
|
||||
if (getenv(varname.c_str()))
|
||||
{
|
||||
char* token = strtok(getenv(varname.c_str()), ",;");
|
||||
while (token != NULL)
|
||||
std::string env = getenv(varname.c_str());
|
||||
std::replace(env.begin(), env.end(), ';', ',');
|
||||
std::istringstream ss(env);
|
||||
std::string token;
|
||||
while (std::getline(ss, token, ','))
|
||||
{
|
||||
result.push_back(token);
|
||||
token = strtok(NULL, ",;");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
@@ -564,6 +564,40 @@ namespace RcclUnitTesting
|
||||
return ev.GetAllSupportedDataTypes();
|
||||
}
|
||||
|
||||
void TestBed::GetSupportedRedOps(std::vector<ncclRedOp_t>& redOps, const std::vector<ncclRedOp_t>& testRedOps)
|
||||
{
|
||||
// Filter out any unsupported reduction ops, in case only subset has been compiled for
|
||||
auto& supportedOps = ev.GetAllSupportedRedOps();
|
||||
for (auto redop : testRedOps)
|
||||
{
|
||||
for (int i = 0; i < supportedOps.size(); ++i)
|
||||
{
|
||||
if (supportedOps[i] == redop)
|
||||
{
|
||||
redOps.push_back(redop);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::GetSupportedDataTypes(std::vector<ncclDataType_t>& dataTypes, const std::vector<ncclDataType_t>& testDataTypes)
|
||||
{
|
||||
// Filter out any unsupported datatypes, in case only subset has been compiled for
|
||||
auto& supportedDataTypes = ev.GetAllSupportedDataTypes();
|
||||
for (auto dt : testDataTypes)
|
||||
{
|
||||
for (int i = 0; i < supportedDataTypes.size(); ++i)
|
||||
{
|
||||
if (supportedDataTypes[i] == dt)
|
||||
{
|
||||
dataTypes.push_back(dt);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<int> const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup,
|
||||
int numGroupCalls)
|
||||
{
|
||||
@@ -642,34 +676,16 @@ namespace RcclUnitTesting
|
||||
std::vector<int> sortedN = numElements;
|
||||
std::sort(sortedN.rbegin(), sortedN.rend());
|
||||
OptionalColArgs optionalArgs;
|
||||
// Filter out any unsupported datatypes, in case only subset has been compiled for
|
||||
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
for (auto dt : tmpDataTypes)
|
||||
{
|
||||
for (int i = 0; i < supportedDataTypes.size(); ++i)
|
||||
{
|
||||
if (supportedDataTypes[i] == dt)
|
||||
{
|
||||
dataTypes.push_back(dt);
|
||||
break;
|
||||
}
|
||||
}
|
||||
this->GetSupportedDataTypes(dataTypes, tmpDataTypes);
|
||||
if (dataTypes.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
|
||||
}
|
||||
|
||||
// Filter out any unsupported reduction ops, in case only subset has been compiled for
|
||||
std::vector<ncclRedOp_t> const& supportedOps = this->GetAllSupportedRedOps();
|
||||
std::vector<ncclRedOp_t> redOps;
|
||||
for (auto redop : tmpRedOps)
|
||||
{
|
||||
for (int i = 0; i < supportedOps.size(); ++i)
|
||||
{
|
||||
if (supportedOps[i] == redop)
|
||||
{
|
||||
redOps.push_back(redop);
|
||||
break;
|
||||
}
|
||||
}
|
||||
this->GetSupportedRedOps(redOps, tmpRedOps);
|
||||
if (redOps.empty()) {
|
||||
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
|
||||
@@ -127,6 +127,12 @@ namespace RcclUnitTesting
|
||||
// Return all the supported data types based on build settings
|
||||
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
|
||||
|
||||
// Appends to redOps every entry of testRedOps that is also a supported reduction operation.
// Order and duplicates from testRedOps are kept; redOps is not cleared before appending.
|
||||
void GetSupportedRedOps(std::vector<ncclRedOp_t>& redOps, const std::vector<ncclRedOp_t>& testRedOps);
|
||||
|
||||
// Appends to dataTypes every entry of testDataTypes that is also a supported data type.
// Order and duplicates from testDataTypes are kept; dataTypes is not cleared before appending.
|
||||
void GetSupportedDataTypes(std::vector<ncclDataType_t>& dataTypes, const std::vector<ncclDataType_t>& testDataTypes);
|
||||
|
||||
// Return a list for # of collectives per group
|
||||
std::vector<int> const GetNumCollsPerGroup(int const numCollectivesInGroup,
|
||||
int const numGroupCalls);
|
||||
|
||||
Reference in New Issue
Block a user