Fixed unit-test env var list parsing and improved filtered test run speed (#1626)

* Fixed parsing of env var lists: the old `strtok`-based parser was overwriting the mutable env var string returned by `getenv`, polluting future parses of the same variable.

* Fixed all tests to obey UT_DATATYPES and UT_REDOPS filters.

* Allow tests to bail early via `GTEST_SKIP` if the UT_DATATYPES or UT_REDOPS filters leave a test with no datatypes or reduction operations to run. This allows tests to run much faster with filters on.

* Wrapped the support checks in helper functions on `TestBed`.
This commit is contained in:
corey-derochie-amd
2025-12-10 10:06:44 -07:00
committed by GitHub
orang tua 6af9087b0c
melakukan 18e9ad913b
6 mengubah file dengan 126 tambahan dan 38 penghapusan
+14 -2
Melihat File
@@ -79,13 +79,19 @@ namespace RcclUnitTesting
TestBed testBed;
// Configuration
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat64, ncclFloat16};
std::vector<ncclDataType_t> const& testDataTypes = {ncclInt32, ncclFloat64, ncclFloat16};
bool const inPlace = false;
bool const useManagedMem = false;
bool const useHipGraph = false;
OptionalColArgs options;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
bool isCorrect = true;
for (int totalRanks : testBed.ev.GetNumGpusList())
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
@@ -137,13 +143,19 @@ namespace RcclUnitTesting
TestBed testBed;
// Configuration
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat32, ncclInt8};
std::vector<ncclDataType_t> const& testDataTypes = {ncclFloat32, ncclInt8};
bool const inPlace = false;
bool const useManagedMem = false;
bool const useHipGraph = false;
OptionalColArgs options;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
bool isCorrect = true;
for (int totalRanks : testBed.ev.GetNumGpusList())
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
+43 -7
Melihat File
@@ -14,14 +14,26 @@ namespace RcclUnitTesting
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce, ncclCollAllReduce, ncclCollAllReduce};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclSum, ncclSum};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat, ncclFloat, ncclFloat};
std::vector<ncclRedOp_t> const testRedOps = {ncclSum, ncclSum, ncclSum};
std::vector<ncclDataType_t> const testDataTypes = {ncclFloat, ncclFloat, ncclFloat};
std::vector<int> const numElements = {1048576, 384 * 1024, 384};
int const numCollPerGroup = numElements.size();
bool const inPlace = false;
bool const useManagedMem = false;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
std::vector<ncclRedOp_t> redOps;
testBed.GetSupportedRedOps(redOps, testRedOps);
if (redOps.empty()) {
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
}
bool isCorrect = true;
for (int totalRanks : testBed.ev.GetNumGpusList())
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
@@ -127,14 +139,26 @@ namespace RcclUnitTesting
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce, ncclCollAllReduce, ncclCollAllReduce};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclSum, ncclSum};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat16, ncclFloat32, ncclFloat64};
std::vector<ncclRedOp_t> const testRedOps = {ncclSum, ncclSum, ncclSum};
std::vector<ncclDataType_t> const testDataTypes = {ncclFloat16, ncclFloat32, ncclFloat64};
std::vector<int> const numElements = {1048576, 384 * 1024, 384};
int const numCollPerGroup = numElements.size();
bool const inPlace = false;
bool const useManagedMem = false;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
std::vector<ncclRedOp_t> redOps;
testBed.GetSupportedRedOps(redOps, testRedOps);
if (redOps.empty()) {
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
}
bool isCorrect = true;
for (int totalRanks : testBed.ev.GetNumGpusList())
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
@@ -230,8 +254,8 @@ namespace RcclUnitTesting
{ncclCollAllToAll, ncclCollGather},
{ncclCollBroadcast, ncclCollReduceScatter}};
std::vector<std::vector<int>> const numElements = {{1250, 1048576}, {384, 384 * 1024}, {1048576, 127}};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat16, ncclFloat32, ncclBfloat16};
std::vector<ncclRedOp_t> const redops = {ncclSum, ncclProd, ncclMax};
std::vector<ncclDataType_t> const testDataTypes = {ncclFloat16, ncclFloat32, ncclBfloat16};
std::vector<ncclRedOp_t> const testRedOps = {ncclSum, ncclProd, ncclMax};
std::vector<int> const numCollsPerGroup = {2, 2, 2};
std::vector<int> const numStreamsPerGroup = {1, 1, 1};
std::vector<bool> const useHipGraphList = {true, false, true};
@@ -241,6 +265,18 @@ namespace RcclUnitTesting
int const numGroupCalls = groupCalls.size();
int const numIterations = 10;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
std::vector<ncclRedOp_t> redOps;
testBed.GetSupportedRedOps(redOps, testRedOps);
if (redOps.empty()) {
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
}
bool isCorrect = true;
for (int totalRanks : testBed.ev.GetNumGpusList())
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
@@ -258,7 +294,7 @@ namespace RcclUnitTesting
{
std::vector<ncclFunc_t> funcTypes = groupCalls[groupCallIdx];
OptionalColArgs options;
options.redOp = redops[groupCallIdx];
options.redOp = redOps[groupCallIdx];
options.root = 0;
for (int collIdx = 0; collIdx < numCollsPerGroup[groupCallIdx]; ++collIdx)
+16 -2
Melihat File
@@ -12,12 +12,19 @@ namespace RcclUnitTesting
TestBed testBed;
// Configuration
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
std::vector<ncclDataType_t> const& testDataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
std::vector<int> const numElements = {1048576, 53327, 1024, 0};
bool const inPlace = false;
bool const useManagedMem = false;
OptionalColArgs options;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
bool isCorrect = true;
int numGpus = testBed.ev.maxGpus;
for (int rpg=0; rpg < 2 && isCorrect; ++rpg)
@@ -104,13 +111,20 @@ namespace RcclUnitTesting
TestBed testBed;
// Configuration
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
std::vector<ncclDataType_t> const& testDataTypes = {ncclInt32, ncclFloat16, ncclFloat64};
std::vector<int> const numElements = {1048576, 53327, 1024};
bool const inPlace = false;
bool const useManagedMem = false;
bool const userRegistered = true;
OptionalColArgs options;
std::vector<ncclDataType_t> dataTypes;
testBed.GetSupportedDataTypes(dataTypes, testDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
bool isCorrect = true;
int numGpus = testBed.ev.maxGpus;
for (int rpg=0; rpg < 2 && isCorrect; ++rpg)
+7 -3
Melihat File
@@ -9,7 +9,9 @@
#include <cstdlib>
#include <unistd.h>
#include <sys/wait.h>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <unordered_map>
namespace RcclUnitTesting
@@ -337,11 +339,13 @@ namespace RcclUnitTesting
std::vector<std::string> result;
if (getenv(varname.c_str()))
{
char* token = strtok(getenv(varname.c_str()), ",;");
while (token != NULL)
std::string env = getenv(varname.c_str());
std::replace(env.begin(), env.end(), ';', ',');
std::istringstream ss(env);
std::string token;
while (std::getline(ss, token, ','))
{
result.push_back(token);
token = strtok(NULL, ",;");
}
}
return result;
+40 -24
Melihat File
@@ -564,6 +564,40 @@ namespace RcclUnitTesting
return ev.GetAllSupportedDataTypes();
}
// Appends to redOps each entry of testRedOps that also appears in the list
// returned by ev.GetAllSupportedRedOps().
//   redOps     - output list; entries are appended, existing contents are kept
//   testRedOps - reduction ops the calling test would like to exercise
// Order and duplicates of testRedOps are preserved for the entries that pass.
void TestBed::GetSupportedRedOps(std::vector<ncclRedOp_t>& redOps, const std::vector<ncclRedOp_t>& testRedOps)
{
auto& allowedOps = ev.GetAllSupportedRedOps();
for (auto candidate : testRedOps)
{
// Keep the candidate only if it is present in the supported list
bool const isAllowed =
std::find(allowedOps.begin(), allowedOps.end(), candidate) != allowedOps.end();
if (isAllowed)
{
redOps.push_back(candidate);
}
}
}
// Appends to dataTypes each entry of testDataTypes that also appears in the list
// returned by ev.GetAllSupportedDataTypes().
//   dataTypes     - output list; entries are appended, existing contents are kept
//   testDataTypes - datatypes the calling test would like to exercise
// Order and duplicates of testDataTypes are preserved for the entries that pass.
void TestBed::GetSupportedDataTypes(std::vector<ncclDataType_t>& dataTypes, const std::vector<ncclDataType_t>& testDataTypes)
{
auto& allowedTypes = ev.GetAllSupportedDataTypes();
for (auto candidate : testDataTypes)
{
// Keep the candidate only if it is present in the supported list
bool const isAllowed =
std::find(allowedTypes.begin(), allowedTypes.end(), candidate) != allowedTypes.end();
if (isAllowed)
{
dataTypes.push_back(candidate);
}
}
}
std::vector<int> const TestBed::GetNumCollsPerGroup(int numCollectivesInGroup,
int numGroupCalls)
{
@@ -642,34 +676,16 @@ namespace RcclUnitTesting
std::vector<int> sortedN = numElements;
std::sort(sortedN.rbegin(), sortedN.rend());
OptionalColArgs optionalArgs;
// Filter out any unsupported datatypes, in case only subset has been compiled for
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
std::vector<ncclDataType_t> dataTypes;
for (auto dt : tmpDataTypes)
{
for (int i = 0; i < supportedDataTypes.size(); ++i)
{
if (supportedDataTypes[i] == dt)
{
dataTypes.push_back(dt);
break;
}
}
this->GetSupportedDataTypes(dataTypes, tmpDataTypes);
if (dataTypes.empty()) {
GTEST_SKIP() << "Skipping... test datatypes excluded by UT_DATATYPES.";
}
// Filter out any unsupported reduction ops, in case only subset has been compiled for
std::vector<ncclRedOp_t> const& supportedOps = this->GetAllSupportedRedOps();
std::vector<ncclRedOp_t> redOps;
for (auto redop : tmpRedOps)
{
for (int i = 0; i < supportedOps.size(); ++i)
{
if (supportedOps[i] == redop)
{
redOps.push_back(redop);
break;
}
}
this->GetSupportedRedOps(redOps, tmpRedOps);
if (redOps.empty()) {
GTEST_SKIP() << "Skipping... test reduction operations excluded by UT_REDOPS.";
}
bool isCorrect = true;
+6
Melihat File
@@ -127,6 +127,12 @@ namespace RcclUnitTesting
// Return all the supported data types based on build settings
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
// Returns the intersection of testRedOps with supported reduction operations as redOps.
void GetSupportedRedOps(std::vector<ncclRedOp_t>& redOps, const std::vector<ncclRedOp_t>& testRedOps);
// Returns the intersection of testDataTypes with supported data types as dataTypes.
void GetSupportedDataTypes(std::vector<ncclDataType_t>& dataTypes, const std::vector<ncclDataType_t>& testDataTypes);
// Return a list for # of collectives per group
std::vector<int> const GetNumCollsPerGroup(int const numCollectivesInGroup,
int const numGroupCalls);