diff --git a/libhsakmt/tests/kfdtest/src/KFDNegativeTest.cpp b/libhsakmt/tests/kfdtest/src/KFDNegativeTest.cpp index 00cfa6c3c6..5791296544 100644 --- a/libhsakmt/tests/kfdtest/src/KFDNegativeTest.cpp +++ b/libhsakmt/tests/kfdtest/src/KFDNegativeTest.cpp @@ -139,3 +139,127 @@ TEST_F(KFDNegativeTest, BasicPipeReset) { TEST_END } +/** + * Basic SDMA Reset + * + * To check SDMA queue reset, launch a healthy SDMA queue and a bad SDMA queue with + * dispatches per SDMA engine. + * Similar to compute queue reset, only processes that have bad SDMA queues should + * be reset, leaving healthy SDMA queue unaffected. + * + */ +TEST_F(KFDNegativeTest, BasicSDMAReset) { + TEST_START(TESTPROFILE_RUNALL); + + int gpuNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(gpuNode, 0) << "failed to get default GPU Node"; + + const HsaNodeProperties *nodeProps = m_NodeInfo.GetNodeProperties(gpuNode); + int totalEngines = nodeProps->NumSdmaEngines + nodeProps->NumSdmaXgmiEngines; + bool perSDMAQueueResetSupported = nodeProps->Capability2.ui32.PerSDMAQueueResetSupported; + + if (perSDMAQueueResetSupported) { + int pipe1[2]; + int pipe2[2]; + pipe(pipe1); + pipe(pipe2); + + LOG() << std::dec << "Running SDMA queue reset on " << totalEngines + <<" SDMA engines" << std::endl; + + pid_t childPid = fork(); + + if (childPid == 0) { + KFDBaseComponentTest::TearDown(); + KFDBaseComponentTest::SetUp(); + close(pipe1[1]); // Close write end of pipe1 + close(pipe2[0]); // Close read end of pipe2 + HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false); + unsigned int *dest = destBuf.As(); + for (int i = 0; i < totalEngines; i++) { + HsaEvent *resetEvent; + ASSERT_SUCCESS(CreateHWExceptionEvent(false, false, gpuNode, &resetEvent)); + + // wait for parent to schedule healthy queue on engine + char buf1, buf2 ='x'; + read(pipe1[0], &buf1, 1); + + // submit bad queue and destroy to trigger reset + SDMAQueueByEngId queue(i); + ASSERT_SUCCESS(queue.Create(gpuNode)); + queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), &dest[0], 0, 6)); + Delay(50); + LOG() << std::dec << "Reset SDMA queue on engine " << i << std::endl; + queue.Destroy(); + + // child expects hw exception event + EXPECT_SUCCESS(hsaKmtWaitOnEvent(resetEvent, g_TestTimeOut)); + EXPECT_EQ(resetEvent->EventData.EventType, HSA_EVENTTYPE_HW_EXCEPTION); + hsaKmtDestroyEvent(resetEvent); + + // ack reset to parent and wait for parent to check healthy queue + write(pipe2[1], &buf2, 1); + read(pipe1[0], &buf1, 1); + } + + close(pipe1[0]); + close(pipe2[1]); + LOG() << "Child ==> Complete" << std::endl; + exit(0); + } else { + int childStatus = 0; + close(pipe1[0]); // Close read end of pipe1 + close(pipe2[1]); // Close write end of pipe2 + + // parent process should not intercept reset event on child queue reset + HsaMemoryBuffer pollBuf(PAGE_SIZE, gpuNode, false); + HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false); + unsigned int *poll = pollBuf.As(); + unsigned int *dest = destBuf.As(); + uint32_t targetDestValue = 0x12345678; + + for (int i = 0; i < totalEngines; i++) { + poll[0] = 0; + dest[0] = 0; + HsaEvent *event; + HsaEvent *resetEvent; + ASSERT_SUCCESS(CreateHWExceptionEvent(false, false, gpuNode, &resetEvent)); + ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, gpuNode, &event)); + + SDMAQueueByEngId queue(i); + ASSERT_SUCCESS(queue.Create(gpuNode)); + + // submit write on poll to maintain non-zero read/write pointer + // in engine during reset + queue.PlaceAndSubmitPacket(SDMAPollRegMemPacket(&poll[0], 1)); + queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), &dest[0], targetDestValue)); + + // wait for for child to trigger reset on engine + char buf1 = 'x', buf2; + write(pipe1[1], &buf1, 1); + read(pipe2[0], &buf2, 1); + + // expect no reset event, then update poll to trigger write completion check + EXPECT_NE(HSAKMT_STATUS_SUCCESS, hsaKmtWaitOnEvent(resetEvent, 100)); + poll[0] = 1; + queue.Wait4PacketConsumption(); + EXPECT_TRUE(WaitOnValue(&dest[0], targetDestValue)); + hsaKmtDestroyEvent(event); + hsaKmtDestroyEvent(resetEvent); + EXPECT_SUCCESS(queue.Destroy()); + write(pipe1[1], &buf1, 1); + } + + waitpid(childPid, &childStatus, 0); + close(pipe1[1]); + close(pipe2[0]); + LOG() << "Parent ==> Complete" << std::endl; + } + } else { + LOG() << "Skipping test: Family ID 0x" << m_FamilyId + << " with per-sdma queue reset support = " + << perSDMAQueueResetSupported << std::endl; + } + + TEST_END +} diff --git a/libhsakmt/tests/kfdtest/src/KFDNegativeTest.hpp b/libhsakmt/tests/kfdtest/src/KFDNegativeTest.hpp index 3572586bf2..c76f102f7f 100644 --- a/libhsakmt/tests/kfdtest/src/KFDNegativeTest.hpp +++ b/libhsakmt/tests/kfdtest/src/KFDNegativeTest.hpp @@ -28,6 +28,8 @@ #include "PM4Queue.hpp" #include "KFDBaseComponentTest.hpp" +#include "SDMAQueueByEngId.hpp" +#include "SDMAPacket.hpp" class KFDNegativeTest : public KFDBaseComponentTest { public: