From a640c6983ff398a93a77ecf7550042234bcf0ea4 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Sat, 18 Feb 2023 09:50:46 -0700 Subject: [PATCH] Unit test fail check (#689) * Adding fall-through on unit test failure * Workaround for hipGraph validity check issue --- test/AllReduce_Msccl.cpp | 2 +- test/common/TestBed.cpp | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/AllReduce_Msccl.cpp b/test/AllReduce_Msccl.cpp index 086985b2be..22fe53facf 100644 --- a/test/AllReduce_Msccl.cpp +++ b/test/AllReduce_Msccl.cpp @@ -23,7 +23,7 @@ namespace RcclUnitTesting std::vector const numElements = {384 * 1024, 384}; std::vector const inPlaceList = {true, false}; std::vector const managedMemList = {true, false}; - std::vector const useHipGraphList = {true, false}; + std::vector const useHipGraphList = {false, true}; testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp index f922929237..64185712ff 100644 --- a/test/common/TestBed.cpp +++ b/test/common/TestBed.cpp @@ -40,6 +40,7 @@ if (response != TEST_SUCCESS) \ { \ ERROR("Child %d reports failure\n", childId); \ + ASSERT_EQ(response, TEST_SUCCESS); \ FAIL(); \ } \ } @@ -476,6 +477,7 @@ namespace RcclUnitTesting int const numChildren = isMultiProcess ? numGpus : 1; int const numRanks = numGpus*ranksPerGpu; this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu)); + if (testing::Test::HasFailure()) continue; for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx) for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx) @@ -499,9 +501,14 @@ namespace RcclUnitTesting numInputElements, numOutputElements, optionalArgs); + if (testing::Test::HasFailure()) continue; // Only allocate once for largest size - if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]); + if (neIdx == 0) + { + this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]); + if (testing::Test::HasFailure()) continue; + } for (int hgIdx = 0; hgIdx < useHipGraphList.size() && isCorrect; ++hgIdx) { @@ -512,6 +519,7 @@ namespace RcclUnitTesting funcTypes[ftIdx] == ncclCollReduce || funcTypes[ftIdx] == ncclCollAllReduce)); if (!canSkip) this->PrepareData(); + if (testing::Test::HasFailure()) continue; std::string name = this->GetTestCaseName(numGpus, isMultiProcess, funcTypes[ftIdx], dataTypes[dtIdx], @@ -526,6 +534,7 @@ namespace RcclUnitTesting std::vector currentRanksEmpty = {}; this->ExecuteCollectives(currentRanksEmpty, useHipGraphList[hgIdx]); + if (testing::Test::HasFailure()) continue; this->ValidateResults(isCorrect); if (!isCorrect) {