kfdtest: Add multi-process oversubscription test

This test is designed to reproduce soft-hangs cause by HWS running
with oversubscription.


Change-Id: I49861522b3ff5ba50df5ddc968545c35ccb25353
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>


[ROCm/ROCR-Runtime commit: 5475e618e5]
This commit is contained in:
Felix Kuehling
2019-06-12 21:44:46 -04:00
parent d9f3f826dc
commit 2ffc094890
3 ha cambiato i file con 186 aggiunte e 1 eliminazioni
@@ -26,6 +26,7 @@ PERMANENT_BLACKLIST_ALL_ASICS=\
# KFDQMTest.GPUDoorbellWrite fails intermittently (KFD-318)
# KFDQMTest.mGPUShareBO (KFD-334)
# KFDQMTest.PM4EventInterrupt/KFDDBGTest.BasicAddressWatch (KFD-431)
# KFDHWSTest.* (SWDEV-193035)
TEMPORARY_BLACKLIST_ALL_ASICS=\
"KFDGraphicsInterop.RegisterForeignDeviceMem:"\
"KFDPMTest.SuspendWithActiveProcess:"\
@@ -37,7 +38,8 @@ TEMPORARY_BLACKLIST_ALL_ASICS=\
"KFDDBGTest.BasicAddressWatch*:"\
"KFDQMTest.PM4EventInterrupt:"\
"KFDQMTest.SdmaEventInterrupt:"\
"KFDDBGTest.BasicDebuggerSuspendResume"
"KFDDBGTest.BasicDebuggerSuspendResume:"\
"KFDHWSTest.*"
BLACKLIST_ALL_ASICS=\
"$PERMANENT_BLACKLIST_ALL_ASICS:"\
@@ -0,0 +1,133 @@
/*
* Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "KFDHWSTest.hpp"
void KFDHWSTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDHWSTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
}
void KFDHWSTest::RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops) {
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
unsigned q, l;
bool timeout = false;
/* Fork the child processes */
ForkChildProcesses(nProcesses);
// Create queues
PM4Queue *queues = new PM4Queue[nQueues];
for (q = 0; q < nQueues; q++)
ASSERT_SUCCESS(queues[q].Create(defaultGPUNode));
// Create dispatch pointers. Each loop iteration creates fresh dispatches
Dispatch **dispatch = new Dispatch*[nQueues];
for (q = 0; q < nQueues; q++)
dispatch[q] = NULL;
// Logging: Each process prints its index after each loop iteration, all in one line.
std::ostream &log = LOG() << std::dec << "Process " << m_ProcessIndex << " starting." << std::endl;
// Run work on all queues
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetNoopIsa(isaBuffer);
for (l = 0; l < nLoops; l++) {
for (q = 0; q < nQueues; q++) {
if (dispatch[q])
delete dispatch[q];
dispatch[q] = new Dispatch(isaBuffer);
dispatch[q]->SetArgs(NULL, NULL);
dispatch[q]->SetDim(1, 1, 1);
dispatch[q]->Submit(queues[q]);
}
for (q = 0; q < nQueues; q++) {
timeout = dispatch[q]->SyncWithStatus(g_TestTimeOut);
if (timeout)
goto timeout;
}
log << m_ProcessIndex;
}
timeout:
log << std::endl;
if (timeout) {
WARN() << "Process " << m_ProcessIndex << " timeout." << std::endl;
} else {
LOG() << "Process " << m_ProcessIndex << " done. Waiting ..." << std::endl;
// Wait here before destroying queues. If another process' queues
// are soft-hanging, destroying queues can resolve the soft-hang
// by changing the run list. Make sure the other process's
// dispatches have a chance to time out first.
Delay(g_TestTimeOut+1000);
}
// Destroy queues and dispatches. Destroying the queues first
// ensures that the memory allocated by the Dispatch is no longer
// accessed by the GPU.
LOG() << "Process " << m_ProcessIndex << " cleaning up." << std::endl;
for (q = 0; q < nQueues; q++) {
EXPECT_SUCCESS(queues[q].Destroy());
if (dispatch[q])
delete dispatch[q];
}
delete[] queues;
delete[] dispatch;
// This is after all the cleanup to avoid leaving any garbage
// behind, but before WaitChildProcesses to ensure a child process
// with a timeout exits with an error that can be detected by the
// parent.
ASSERT_FALSE(timeout);
WaitChildProcesses();
}
TEST_F(KFDHWSTest, MultiProcessOversubscribed) {
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
TEST_START(TESTPROFILE_RUNALL);
RunTest(3, 13, 40);
TEST_END
}
@@ -0,0 +1,50 @@
/*
* Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __KFD_HWS_TEST__H__
#define __KFD_HWS_TEST__H__
#include <gtest/gtest.h>
#include "PM4Queue.hpp"
#include "IsaGenerator.hpp"
#include "KFDMultiProcessTest.hpp"
#include "Dispatch.hpp"
class KFDHWSTest : public KFDMultiProcessTest {
public:
KFDHWSTest():m_pIsaGen(NULL) {}
~KFDHWSTest() {}
protected:
virtual void SetUp();
virtual void TearDown();
void RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops);
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_QCM_TEST__H__