kfdtest: Add KFD SDMA queue reset testing

The KFD can reset individual SDMA queues, similar to compute queue reset.
Add a test for per-SDMA queue reset.
This commit is contained in:
Jonathan Kim
2025-01-30 14:01:52 -05:00
committed by Kim, Jonathan
parent d047708317
commit ee890e7d2b
2 changed files with 126 additions and 0 deletions
@@ -139,3 +139,127 @@ TEST_F(KFDNegativeTest, BasicPipeReset) {
TEST_END
}
/**
 * Basic SDMA Reset
 *
 * To check SDMA queue reset, launch a healthy SDMA queue (parent process) and a
 * bad SDMA queue (forked child process) on every SDMA engine in turn.
 * Similar to compute queue reset, only the process that owns the bad SDMA queue
 * should be reset, leaving the healthy SDMA queue unaffected.
 *
 * The two processes are kept in lock-step per engine with two pipes:
 *   pipe1: parent -> child ("healthy queue is scheduled" / "healthy queue checked")
 *   pipe2: child -> parent ("reset has been observed")
 *
 * The child reports its own expectation failures through its exit status, which
 * the parent verifies after reaping it.
 */
TEST_F(KFDNegativeTest, BasicSDMAReset) {
    TEST_START(TESTPROFILE_RUNALL);

    int gpuNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(gpuNode, 0) << "failed to get default GPU Node";

    const HsaNodeProperties *nodeProps = m_NodeInfo.GetNodeProperties(gpuNode);
    ASSERT_TRUE(nodeProps != nullptr) << "failed to get default GPU Node properties";

    int totalEngines = nodeProps->NumSdmaEngines + nodeProps->NumSdmaXgmiEngines;
    bool perSDMAQueueResetSupported = nodeProps->Capability2.ui32.PerSDMAQueueResetSupported;

    if (perSDMAQueueResetSupported) {
        int pipe1[2];
        int pipe2[2];

        ASSERT_EQ(0, pipe(pipe1)) << "failed to create parent->child pipe";
        if (pipe(pipe2) != 0) {
            close(pipe1[0]);
            close(pipe1[1]);
            FAIL() << "failed to create child->parent pipe";
        }

        LOG() << std::dec << "Running SDMA queue reset on " << totalEngines
              <<" SDMA engines" << std::endl;

        pid_t childPid = fork();
        if (childPid < 0) {
            close(pipe1[0]);
            close(pipe1[1]);
            close(pipe2[0]);
            close(pipe2[1]);
            FAIL() << "failed to fork child process";
        }

        if (childPid == 0) {
            KFDBaseComponentTest::TearDown();
            KFDBaseComponentTest::SetUp();

            close(pipe1[1]);  // Close write end of pipe1
            // The read end of pipe2 is intentionally kept open until the child is
            // done: with a reader still present, acking a parent that has already
            // bailed out cannot raise SIGPIPE in the child.

            // Run the child sequence in its own scope so that a fatal assertion
            // only leaves the lambda; the forked child must never return into the
            // gtest runner and execute the remaining tests.
            auto childBody = [&]() -> void {
                HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false);
                unsigned int *dest = destBuf.As<unsigned int*>();

                for (int i = 0; i < totalEngines; i++) {
                    HsaEvent *resetEvent;
                    ASSERT_SUCCESS(CreateHWExceptionEvent(false, false, gpuNode, &resetEvent));

                    // wait for parent to schedule healthy queue on engine
                    char buf1, buf2 ='x';
                    ASSERT_EQ(1, read(pipe1[0], &buf1, 1))
                        << "parent went away before scheduling engine " << i;

                    // submit bad queue and destroy to trigger reset
                    SDMAQueueByEngId queue(i);
                    ASSERT_SUCCESS(queue.Create(gpuNode));
                    queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), &dest[0], 0, 6));
                    Delay(50);
                    LOG() << std::dec << "Reset SDMA queue on engine " << i << std::endl;
                    queue.Destroy();

                    // child expects hw exception event
                    EXPECT_SUCCESS(hsaKmtWaitOnEvent(resetEvent, g_TestTimeOut));
                    EXPECT_EQ(resetEvent->EventData.EventType, HSA_EVENTTYPE_HW_EXCEPTION);
                    hsaKmtDestroyEvent(resetEvent);

                    // ack reset to parent and wait for parent to check healthy queue
                    ASSERT_EQ(1, write(pipe2[1], &buf2, 1))
                        << "failed to ack reset on engine " << i;
                    ASSERT_EQ(1, read(pipe1[0], &buf1, 1))
                        << "parent went away before checking engine " << i;
                }
            };

            // Skip the sequence if re-initialization in SetUp() already failed fatally.
            if (!::testing::Test::HasFatalFailure())
                childBody();

            close(pipe1[0]);
            close(pipe2[0]);
            close(pipe2[1]);
            LOG() << "Child ==> Complete" << std::endl;

            // Propagate any child-side expectation failure to the parent.
            exit(::testing::Test::HasFailure() ? 1 : 0);
        } else {
            int childStatus = 0;

            close(pipe2[1]);  // Close write end of pipe2
            // The read end of pipe1 is intentionally kept open until the parent is
            // done: with a reader still present, writing to a child that has
            // already exited cannot raise SIGPIPE and kill the whole test binary.
            // A dead child is detected through EOF on pipe2 instead.

            // Run the parent sequence in its own scope so that a fatal assertion
            // still reaches the pipe cleanup and waitpid() below.
            auto parentBody = [&]() -> void {
                // parent process should not intercept reset event on child queue reset
                HsaMemoryBuffer pollBuf(PAGE_SIZE, gpuNode, false);
                HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false);
                unsigned int *poll = pollBuf.As<unsigned int*>();
                unsigned int *dest = destBuf.As<unsigned int*>();
                uint32_t targetDestValue = 0x12345678;

                for (int i = 0; i < totalEngines; i++) {
                    poll[0] = 0;
                    dest[0] = 0;

                    HsaEvent *event;
                    HsaEvent *resetEvent;
                    ASSERT_SUCCESS(CreateHWExceptionEvent(false, false, gpuNode, &resetEvent));
                    ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, gpuNode, &event));

                    SDMAQueueByEngId queue(i);
                    ASSERT_SUCCESS(queue.Create(gpuNode));

                    // submit write on poll to maintain non-zero read/write pointer
                    // in engine during reset
                    queue.PlaceAndSubmitPacket(SDMAPollRegMemPacket(&poll[0], 1));
                    queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), &dest[0], targetDestValue));

                    // wait for child to trigger reset on engine
                    char buf1 = 'x', buf2;
                    if (write(pipe1[1], &buf1, 1) != 1) {
                        poll[0] = 1;  // release the pending poll before the queue is torn down
                        FAIL() << "failed to signal child for engine " << i;
                    }
                    // A short read means the child exited early; still drain the
                    // healthy queue below before giving up.
                    const bool childAcked = (read(pipe2[0], &buf2, 1) == 1);
                    EXPECT_TRUE(childAcked)
                        << "child exited before acknowledging reset on engine " << i;

                    // expect no reset event, then update poll to trigger write completion check
                    EXPECT_NE(HSAKMT_STATUS_SUCCESS, hsaKmtWaitOnEvent(resetEvent, 100));
                    poll[0] = 1;
                    queue.Wait4PacketConsumption();
                    EXPECT_TRUE(WaitOnValue(&dest[0], targetDestValue));

                    hsaKmtDestroyEvent(event);
                    hsaKmtDestroyEvent(resetEvent);
                    EXPECT_SUCCESS(queue.Destroy());

                    if (!childAcked)
                        return;

                    ASSERT_EQ(1, write(pipe1[1], &buf1, 1))
                        << "failed to release child after engine " << i;
                }
            };
            parentBody();

            // Close the write end first so a child still blocked on pipe1 sees EOF
            // and exits instead of deadlocking the waitpid() below.
            close(pipe1[1]);
            EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
            close(pipe1[0]);
            close(pipe2[0]);

            // The child encodes its own expectation failures in its exit status.
            EXPECT_TRUE(WIFEXITED(childStatus) && WEXITSTATUS(childStatus) == 0)
                << "child process reported failure, wait status = " << std::dec << childStatus;

            LOG() << "Parent ==> Complete" << std::endl;
        }
    } else {
        LOG() << "Skipping test: Family ID 0x" << m_FamilyId
              << " with per-sdma queue reset support = "
              << perSDMAQueueResetSupported << std::endl;
    }

    TEST_END
}
@@ -28,6 +28,8 @@
#include "PM4Queue.hpp"
#include "KFDBaseComponentTest.hpp"
#include "SDMAQueueByEngId.hpp"
#include "SDMAPacket.hpp"
class KFDNegativeTest : public KFDBaseComponentTest {
public: