/* * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
* */
// NOTE(review): several #include directives below carry no header name in this
// view of the file, and the std::vector declarations further down carry no
// element type -- confirm both against version control.
#include
#include
#include
#include
#include
#include "KFDQMTest.hpp"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAPacket.hpp"
#include "XgmiOptimizedSDMAQueue.hpp"
#include "AqlQueue.hpp"
#include
#include "Dispatch.hpp"

// Declared here, defined elsewhere; read by OverSubscribeCpQueues.
extern unsigned int g_TestGPUsNum;

// Fixture set-up: forwards to the base component fixture.
void KFDQMTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

    ROUTINE_END
}

// Fixture tear-down: forwards to the base component fixture.
void KFDQMTest::TearDown() {
    ROUTINE_START

    KFDBaseComponentTest::TearDown();

    ROUTINE_END
}

/* Test body for one GPU node (pTestParamters->gpuNode); handed to
 * KFDTest_Launch by the matching TEST_F.
 * Creates a PM4 queue on the node and destroys it again.
 * pKFDQMTest is fetched but not used in this body.
 */
static void CreateDestroyCpQueue(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;

    PM4Queue queue;

    // Fatal on failure: nothing to destroy if creation did not succeed.
    ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode);

    EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode);
}

TEST_F(KFDQMTest, CreateDestroyCpQueue) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(CreateDestroyCpQueue));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Submits a single PM4 NOP packet to a PM4 queue and waits, using an event
 * obtained from CreateQueueTypeEvent, for the packet to be consumed.
 */
static void SubmitNopCpQueue(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;

    PM4Queue queue;

    HsaEvent *event;
    ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode);

    ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode);

    queue.PlaceAndSubmitPacket(PM4NopPacket());

    queue.Wait4PacketConsumption(event);

    hsaKmtDestroyEvent(event);
    EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode);
}

TEST_F(KFDQMTest, SubmitNopCpQueue) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(SubmitNopCpQueue));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Pre-fills a one-page buffer with 0xFF, submits a PM4 write-data packet
 * targeting the start of that buffer, and expects WaitOnValue to observe 0
 * there once the packet has been consumed.
 */
static void SubmitPacketCpQueue(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;

    HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false);

    destBuf.Fill(0xFF);

    HsaEvent *event;
    ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode);

    PM4Queue queue;

    ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode);

    queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0));

    queue.Wait4PacketConsumption(event);

    EXPECT_TRUE_GPU(WaitOnValue(destBuf.As(), 0), gpuNode);

    hsaKmtDestroyEvent(event);
    EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode);
}

TEST_F(KFDQMTest, SubmitPacketCpQueue) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(SubmitPacketCpQueue));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Creates as many PM4 queues as Get_NumCpQueues reports for the GPU, then for
 * each queue in turn submits a write-data packet followed by a release-memory
 * packet and checks that the queue's own slot in destBuf received the queue
 * index. All queues are created before any packet is submitted and destroyed
 * only after every queue has been exercised.
 */
static void AllCpQueues(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;
    int gpuIndex = pKFDQMTest->Get_NodeInfo()->HsaGPUindexFromGpuNode(gpuNode);
    HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode);

    HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false);

    destBuf.Fill(0xFF);

    unsigned int m_numCpQueues = pKFDQMTest->Get_NumCpQueues(gpuIndex);
    std::vector queues(m_numCpQueues);

    for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx)
        ASSERT_SUCCESS_GPU(queues[qidx].Create(gpuNode), gpuNode) << " QueueId=" << qidx;

    for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx) {
        // Each queue writes to its own location, two elements apart.
        queues[qidx].PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As()+qidx*2, qidx, qidx));
        queues[qidx].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0));

        queues[qidx].Wait4PacketConsumption();

        EXPECT_TRUE_GPU(WaitOnValue(destBuf.As()+qidx*2, qidx), gpuNode);
    }

    for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx)
        EXPECT_SUCCESS_GPU(queues[qidx].Destroy(), gpuNode);
}

TEST_F(KFDQMTest, AllCpQueues) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(AllCpQueues));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Creates an SDMA queue on the node and destroys it again.
 */
static void CreateDestroySdmaQueue(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;

    SDMAQueue queue;

    ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode);

    EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode);
}

TEST_F(KFDQMTest, CreateDestroySdmaQueue) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(CreateDestroySdmaQueue));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Submits a single SDMA NOP packet and waits for it to be consumed.
 */
static void SubmitNopSdmaQueue(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;

    SDMAQueue queue;

    ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode);

    queue.PlaceAndSubmitPacket(SDMANopPacket());

    queue.Wait4PacketConsumption();

    EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode);
}

TEST_F(KFDQMTest, SubmitNopSdmaQueue) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(SubmitNopSdmaQueue));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Pre-fills a one-page buffer with 0xFF, submits an SDMA write-data packet
 * carrying 0x02020202 for the start of the buffer, and expects WaitOnValue to
 * observe that value.
 */
static void SubmitPacketSdmaQueue(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;

    HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false);

    destBuf.Fill(0xFF);

    SDMAQueue queue;

    ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode);

    queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As(), 0x02020202));

    queue.Wait4PacketConsumption();

    EXPECT_TRUE_GPU(WaitOnValue(destBuf.As(), 0x02020202), gpuNode);

    EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode);
}

TEST_F(KFDQMTest, SubmitPacketSdmaQueue) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(SubmitPacketSdmaQueue));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Creates (SDMA engines x SDMA queues per engine) SDMA queues. For each queue
 * in turn it copies bufSize bytes from srcBuf to destBuf, then writes the
 * marker 0x02020202 past the copied region, waits for the marker, and compares
 * the copied bytes against the source.
 */
static void AllSdmaQueues(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;
    int gpuIndex = pKFDQMTest->Get_NodeInfo()->HsaGPUindexFromGpuNode(gpuNode);
    unsigned int m_numSdmaEngines = pKFDQMTest->Get_NumSdmaEngines(gpuIndex);
    unsigned int m_numSdmaQueuesPerEngine = pKFDQMTest->Get_NumSdmaSdmaQueuesPerEngine(gpuIndex);

    int bufSize = PAGE_SIZE;

    const unsigned int numSdmaQueues = m_numSdmaEngines * m_numSdmaQueuesPerEngine;

    LOG() << "Regular SDMA engines number: " << m_numSdmaEngines << " SDMA queues per engine: " << m_numSdmaQueuesPerEngine << std::endl;

    // destBuf is twice bufSize so the marker can live outside the copied area.
    HsaMemoryBuffer destBuf(bufSize << 1 , gpuNode, false);
    HsaMemoryBuffer srcBuf(bufSize, gpuNode, false);

    destBuf.Fill(0xFF);

    std::vector queues(numSdmaQueues);

    for (unsigned int qidx = 0; qidx < numSdmaQueues; ++qidx)
        ASSERT_SUCCESS_GPU(queues[qidx].Create(gpuNode), gpuNode);

    for (unsigned int qidx = 0; qidx < numSdmaQueues; ++qidx) {
        // Reset the destination and use a per-queue source pattern so a stale
        // result from the previous queue cannot satisfy the checks below.
        destBuf.Fill(0x0);
        srcBuf.Fill(qidx + 0xa0);
        queues[qidx].PlaceAndSubmitPacket(
            SDMACopyDataPacket(queues[qidx].GetFamilyId(), destBuf.As(), srcBuf.As(), bufSize));
        queues[qidx].PlaceAndSubmitPacket(
            SDMAWriteDataPacket(queues[qidx].GetFamilyId(), destBuf.As() + bufSize/4, 0x02020202));

        queues[qidx].Wait4PacketConsumption();

        EXPECT_TRUE_GPU(WaitOnValue(destBuf.As() + bufSize/4, 0x02020202), gpuNode);

        // memcmp returns 0 on equality, which the SUCCESS macro is given here.
        EXPECT_SUCCESS_GPU(memcmp(
            destBuf.As(), srcBuf.As(), bufSize), gpuNode);
    }

    for (unsigned int qidx = 0; qidx < numSdmaQueues; ++qidx)
        EXPECT_SUCCESS_GPU(queues[qidx].Destroy(), gpuNode);
}

TEST_F(KFDQMTest, AllSdmaQueues) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(AllSdmaQueues));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Same copy-then-marker sequence as AllSdmaQueues, run on
 * (XGMI SDMA engines x SDMA queues per engine) queues held in xgmiSdmaQueues.
 * Note: the loop index j is a signed int compared against an unsigned count.
 */
static void AllXgmiSdmaQueues(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;
    int gpuIndex = pKFDQMTest->Get_NodeInfo()->HsaGPUindexFromGpuNode(gpuNode);
    unsigned int m_numSdmaXgmiEngines = pKFDQMTest->Get_NumSdmaSdmaXgmiEngines(gpuIndex);
    unsigned int m_numSdmaQueuesPerEngine = pKFDQMTest->Get_NumSdmaSdmaQueuesPerEngine(gpuIndex);

    int bufSize = PAGE_SIZE;
    int j;

    const unsigned int numXgmiSdmaQueues = m_numSdmaXgmiEngines * m_numSdmaQueuesPerEngine;

    LOG() << "XGMI SDMA engines number: " << m_numSdmaXgmiEngines << " SDMA queues per engine: " << m_numSdmaQueuesPerEngine << std::endl;

    HsaMemoryBuffer destBuf(bufSize << 1 , gpuNode, false);
    HsaMemoryBuffer srcBuf(bufSize, gpuNode, false);

    destBuf.Fill(0xFF);

    std::vector xgmiSdmaQueues(numXgmiSdmaQueues);

    for (j = 0; j < numXgmiSdmaQueues; ++j)
        ASSERT_SUCCESS_GPU(xgmiSdmaQueues[j].Create(gpuNode), gpuNode);

    for (j = 0; j < numXgmiSdmaQueues; ++j) {
        destBuf.Fill(0x0);
        srcBuf.Fill(j + 0xa0);
        xgmiSdmaQueues[j].PlaceAndSubmitPacket(
            SDMACopyDataPacket(xgmiSdmaQueues[j].GetFamilyId(), destBuf.As(), srcBuf.As(), bufSize));
        xgmiSdmaQueues[j].PlaceAndSubmitPacket(
            SDMAWriteDataPacket(xgmiSdmaQueues[j].GetFamilyId(), destBuf.As() + bufSize/4, 0x02020202));

        xgmiSdmaQueues[j].Wait4PacketConsumption();

        EXPECT_TRUE_GPU(WaitOnValue(destBuf.As() + bufSize/4, 0x02020202), gpuNode);

        EXPECT_SUCCESS_GPU(memcmp(
            destBuf.As(), srcBuf.As(), bufSize), gpuNode);
    }

    for (j = 0; j < numXgmiSdmaQueues; ++j)
        EXPECT_SUCCESS_GPU(xgmiSdmaQueues[j].Destroy(), gpuNode);
}

TEST_F(KFDQMTest, AllXgmiSdmaQueues) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(AllXgmiSdmaQueues));

    TEST_END
}

/* Test body for one GPU node; handed to KFDTest_Launch by the matching TEST_F.
 * Combines AllCpQueues, AllSdmaQueues and AllXgmiSdmaQueues: every CP, SDMA
 * and XGMI SDMA queue is created up front so they all exist at the same time,
 * each group is then exercised in turn, and only afterwards is everything
 * destroyed. CP queues write into destBufCp; both SDMA groups share
 * destBuf/srcBuf.
 */
static void AllQueues(KFDTEST_PARAMETERS* pTestParamters) {
    int gpuNode = pTestParamters->gpuNode;
    KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject;
    int gpuIndex = pKFDQMTest->Get_NodeInfo()->HsaGPUindexFromGpuNode(gpuNode);
    HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode);
    unsigned int m_numSdmaXgmiEngines = pKFDQMTest->Get_NumSdmaSdmaXgmiEngines(gpuIndex);
    unsigned int m_numSdmaQueuesPerEngine = pKFDQMTest->Get_NumSdmaSdmaQueuesPerEngine(gpuIndex);
    unsigned int m_numSdmaEngines = pKFDQMTest->Get_NumSdmaEngines(gpuIndex);
    unsigned int m_numCpQueues = pKFDQMTest->Get_NumCpQueues(gpuIndex);

    int bufSize = PAGE_SIZE;

    unsigned int i, j;

    const unsigned int numCpQueues = m_numCpQueues;
    const unsigned int numSdmaQueues = m_numSdmaEngines * m_numSdmaQueuesPerEngine;
    const unsigned int numXgmiSdmaQueues = m_numSdmaXgmiEngines * m_numSdmaQueuesPerEngine;

    HsaMemoryBuffer destBufCp(PAGE_SIZE, gpuNode, false);
    destBufCp.Fill(0xFF);

    HsaMemoryBuffer destBuf(bufSize << 1 , gpuNode, false);
    HsaMemoryBuffer srcBuf(bufSize, gpuNode, false);
    destBuf.Fill(0xFF);

    std::vector cpQueues(numCpQueues);
    std::vector sdmaQueues(numSdmaQueues);
    std::vector xgmiSdmaQueues(numXgmiSdmaQueues);

    // Phase 1: create every queue of every kind.
    for (i = 0; i < numCpQueues; ++i)
        ASSERT_SUCCESS_GPU(cpQueues[i].Create(gpuNode), gpuNode) << " QueueId=" << i;

    for (j = 0; j < numSdmaQueues; ++j)
        ASSERT_SUCCESS_GPU(sdmaQueues[j].Create(gpuNode), gpuNode);

    for (j = 0; j < numXgmiSdmaQueues; ++j)
        ASSERT_SUCCESS_GPU(xgmiSdmaQueues[j].Create(gpuNode), gpuNode);

    // Phase 2a: CP queues, one write-data + release-memory pair per queue.
    for (i = 0; i < numCpQueues; ++i) {
        cpQueues[i].PlaceAndSubmitPacket(PM4WriteDataPacket(destBufCp.As()+i*2, i, i));
        cpQueues[i].PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0));

        cpQueues[i].Wait4PacketConsumption();

        EXPECT_TRUE_GPU(WaitOnValue(destBufCp.As()+i*2, i), gpuNode);
    }

    // Phase 2b: regular SDMA queues, copy + marker write per queue.
    for (j = 0; j < numSdmaQueues; ++j) {
        destBuf.Fill(0x0);
        srcBuf.Fill(j + 0xa0);
        sdmaQueues[j].PlaceAndSubmitPacket(
            SDMACopyDataPacket(sdmaQueues[j].GetFamilyId(), destBuf.As(), srcBuf.As(), bufSize));
        sdmaQueues[j].PlaceAndSubmitPacket(
            SDMAWriteDataPacket(sdmaQueues[j].GetFamilyId(), destBuf.As() + bufSize/4, 0x02020202));

        sdmaQueues[j].Wait4PacketConsumption();

        EXPECT_TRUE_GPU(WaitOnValue(destBuf.As() + bufSize/4, 0x02020202), gpuNode);

        EXPECT_SUCCESS_GPU(memcmp(
            destBuf.As(), srcBuf.As(), bufSize), gpuNode);
    }

    // Phase 2c: XGMI SDMA queues, same sequence as the regular SDMA queues.
    for (j = 0; j < numXgmiSdmaQueues; ++j) {
        destBuf.Fill(0x0);
        srcBuf.Fill(j + 0xa0);
        xgmiSdmaQueues[j].PlaceAndSubmitPacket(
            SDMACopyDataPacket(xgmiSdmaQueues[j].GetFamilyId(), destBuf.As(), srcBuf.As(), bufSize));
        xgmiSdmaQueues[j].PlaceAndSubmitPacket(
            SDMAWriteDataPacket(xgmiSdmaQueues[j].GetFamilyId(), destBuf.As() + bufSize/4, 0x02020202));

        xgmiSdmaQueues[j].Wait4PacketConsumption();

        EXPECT_TRUE_GPU(WaitOnValue(destBuf.As() + bufSize/4, 0x02020202), gpuNode);

        EXPECT_SUCCESS_GPU(memcmp(
            destBuf.As(), srcBuf.As(), bufSize), gpuNode);
    }

    // Phase 3: tear everything down.
    for (i = 0; i < numCpQueues; ++i)
        EXPECT_SUCCESS_GPU(cpQueues[i].Destroy(), gpuNode);

    for (j = 0; j < numSdmaQueues; ++j)
        EXPECT_SUCCESS_GPU(sdmaQueues[j].Destroy(), gpuNode);

    for (j = 0; j < numXgmiSdmaQueues; ++j)
        EXPECT_SUCCESS_GPU(xgmiSdmaQueues[j].Destroy(), gpuNode);
}

TEST_F(KFDQMTest, AllQueues) {
    TEST_START(TESTPROFILE_RUNALL)

    ASSERT_SUCCESS(KFDTest_Launch(AllQueues));

    TEST_END
}

/* The following test is designed to reproduce an intermittent hang on
 * Fiji and other VI/Polaris GPUs. This test typically hangs in a few
 * seconds. According to analysis done by HW engineers, the culprit
 * seems to be PCIe speed switching.
The problem can be worked around * by disabling the lowest DPM level on Fiji. */ static void SdmaConcurrentCopies(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; int gpuIndex = pKFDQMTest->Get_NodeInfo()->HsaGPUindexFromGpuNode(gpuNode); HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); #define BUFFER_SIZE (64*1024) #define NPACKETS 1 #define COPY_SIZE (BUFFER_SIZE / NPACKETS) HsaMemoryBuffer srcBuf(BUFFER_SIZE, 0, true); HsaMemoryBuffer dstBuf(BUFFER_SIZE, gpuNode, false, hsakmt_is_dgpu() ? true : false); SDMAQueue queue; ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); std::ostream &log = LOG(); char progress[] = "-\b"; log << "Running ... "; for (unsigned i = 0; i < 100000; i++) { if (i % 1000 == 0) { const char progressSteps[4] = {'-', '\\', '|', '/'}; progress[0] = progressSteps[(i/1000) % 4]; log << progress; } for (unsigned j = 0; j < NPACKETS; j++) queue.PlacePacket( SDMACopyDataPacket(queue.GetFamilyId(), dstBuf.As()+COPY_SIZE*j, srcBuf.As()+COPY_SIZE*j, COPY_SIZE)); queue.SubmitPacket(); /* Waste a variable amount of time. Submission timing * while SDMA runs concurrently seems to be critical for * reproducing the hang */ for (int k = 0; k < (i & 0xfff); k++) memcpy(srcBuf.As()+PAGE_SIZE, srcBuf.As(), 1024); /* Wait for idle every 8 packets to allow the SDMA engine to * run concurrently for a bit without getting too far ahead */ if ((i & 0x7) == 0) queue.Wait4PacketConsumption(); } log << "Done." 
<< std::endl; queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), srcBuf.As(), 0x02020202)); queue.Wait4PacketConsumption(); EXPECT_TRUE_GPU(WaitOnValue(srcBuf.As(), 0x02020202), gpuNode); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); } TEST_F(KFDQMTest, SdmaConcurrentCopies) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(SdmaConcurrentCopies)); TEST_END } static void DisableCpQueueByUpdateWithNullAddress(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false); destBuf.Fill(0xFFFFFFFF); PM4Queue queue; ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); HsaEvent *event; ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode); queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0, 0)); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0); destBuf.Fill(0xFFFFFFFF); EXPECT_SUCCESS_GPU(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, true), gpuNode); queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 1, 1)); // Don't sync since we don't expect rptr to change when the queue is disabled. 
Delay(2000); EXPECT_EQ_GPU(destBuf.As()[0], 0xFFFFFFFF, gpuNode) << "Packet executed even though the queue is supposed to be disabled!"; EXPECT_SUCCESS_GPU(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false), gpuNode); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 1); hsaKmtDestroyEvent(event); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); } TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(DisableCpQueueByUpdateWithNullAddress)); TEST_END } static void DisableSdmaQueueByUpdateWithNullAddress(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false); destBuf.Fill(0xFFFFFFFF); SDMAQueue queue; ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As(), 0)); WaitOnValue(destBuf.As(), 0); destBuf.Fill(0xFFFFFFFF); EXPECT_SUCCESS_GPU(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, true), gpuNode); queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As(), 0)); // Don't sync since we don't expect rptr to change when the queue is disabled. 
Delay(2000); EXPECT_EQ_GPU(destBuf.As()[0], 0xFFFFFFFF, gpuNode) << "Packet executed even though the queue is supposed to be disabled!"; EXPECT_SUCCESS_GPU(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false), gpuNode); queue.Wait4PacketConsumption(); WaitOnValue(destBuf.As(), 0); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); } TEST_F(KFDQMTest, DisableSdmaQueueByUpdateWithNullAddress) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(DisableSdmaQueueByUpdateWithNullAddress)); TEST_END } static void DisableCpQueueByUpdateWithZeroPercentage(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false); destBuf.Fill(0xFFFFFFFF); PM4Queue queue; ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); HsaEvent *event; ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode); PM4WriteDataPacket packet1, packet2; packet1.InitPacket(destBuf.As(), 0, 0); packet2.InitPacket(destBuf.As(), 1, 1); queue.PlaceAndSubmitPacket(packet1); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0); destBuf.Fill(0xFFFFFFFF); EXPECT_SUCCESS_GPU(queue.Update(0/*percentage*/, BaseQueue::DEFAULT_PRIORITY, false), gpuNode); queue.PlaceAndSubmitPacket(packet2); // Don't sync since we don't expect rptr to change when the queue is disabled. 
Delay(2000); EXPECT_EQ_GPU(destBuf.As()[0], 0xFFFFFFFF, gpuNode) << "Packet executed even though the queue is supposed to be disabled!"; EXPECT_SUCCESS_GPU(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false), gpuNode); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 1); hsaKmtDestroyEvent(event); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); } TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(DisableCpQueueByUpdateWithZeroPercentage)); TEST_END } static void CreateQueueStressSingleThreaded(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; static const HSAuint64 TEST_TIME_SEC = 15; HSAuint64 initialTime = GetSystemTickCountInMicroSec(); unsigned int numIter = 0; HSAuint64 timePassed = 0; do { // The following means we'll get the order 0,0 => 0,1 => 1,0 => 1,1 so we cover all options. unsigned int firstToCreate = (numIter % 2 != 0) ? 1 : 0; unsigned int firstToDestroy = (numIter % 4 > 1) ? 
1 : 0; unsigned int secondToCreate = (firstToCreate + 1)%2; unsigned int secondToDestroy = (firstToDestroy + 1)%2; BaseQueue *queues[2] = {new PM4Queue(), new SDMAQueue()}; ASSERT_SUCCESS_GPU(queues[firstToCreate]->Create(gpuNode), gpuNode); ASSERT_SUCCESS_GPU(queues[secondToCreate]->Create(gpuNode),gpuNode); EXPECT_SUCCESS_GPU(queues[firstToDestroy]->Destroy(), gpuNode); EXPECT_SUCCESS_GPU(queues[secondToDestroy]->Destroy(), gpuNode); delete queues[0]; delete queues[1]; ++numIter; HSAuint64 curTime = GetSystemTickCountInMicroSec(); timePassed = (curTime - initialTime) / 1000000; } while (timePassed < TEST_TIME_SEC); } TEST_F(KFDQMTest, CreateQueueStressSingleThreaded) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(CreateQueueStressSingleThreaded)); TEST_END } static void OverSubscribeCpQueues(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; const HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); if (m_FamilyId == FAMILY_CI || m_FamilyId == FAMILY_KV) { LOG() << "Skipping test: CI doesn't have HW scheduling." << std::endl; return; } /* The max queues per process is 1024 limited by * KFD, so MAX_CP_QUEUES is needed to adapt it * when total queues exceed it. */ static const unsigned int MAX_CP_QUEUES = g_TestGPUsNum > 15 ? 
1024 / g_TestGPUsNum : 65; static const unsigned int MAX_PACKETS = 100; HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode, false); destBuf.Fill(0xFF); PM4Queue queues[MAX_CP_QUEUES]; for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx) ASSERT_SUCCESS_GPU(queues[qidx].Create(gpuNode), gpuNode) << " QueueId=" << qidx; for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx) { unsigned int pktSizeDw = 0; for (unsigned int i = 0; i < MAX_PACKETS; i++) { PM4WriteDataPacket packet; packet.InitPacket(destBuf.As()+qidx*2, qidx+i, qidx+i); // two dwords per packet queues[qidx].PlacePacket(packet); } } for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx) queues[qidx].SubmitPacket(); // Delaying for 5 seconds in order to get all the results Delay(5000); for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx) EXPECT_TRUE_GPU(queues[qidx].AllPacketsSubmitted(), gpuNode)<< "QueueId=" << qidx;; for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx) EXPECT_SUCCESS_GPU(queues[qidx].Destroy(), gpuNode); } TEST_F(KFDQMTest, OverSubscribeCpQueues) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(OverSubscribeCpQueues)); TEST_END } HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) { HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false); HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false); EXPECT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetDim(1024, 16, 16); PM4Queue queue; EXPECT_SUCCESS(queue.Create(node)); EXPECT_SUCCESS(queue.SetCUMask(mask, mask_count)); queue.SetSkipWaitConsump(true); HSAuint64 startTime = GetSystemTickCountInMicroSec(); dispatch.Submit(queue); dispatch.Sync(); HSAuint64 endTime = GetSystemTickCountInMicroSec(); EXPECT_SUCCESS(queue.Destroy()); return endTime - startTime; } /* To cover for outliers, allow us to get the Average time based on 
a specified number of iterations */ HSAint64 KFDQMTest::GetAverageTimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count, int iterations) { HSAint64 timeArray[iterations]; HSAint64 timeTotal = 0; if (iterations < 1) { LOG() << "ERROR: At least 1 iteration must be performed" << std::endl; return 0; } for (int x = 0; x < iterations; x++) { timeArray[x] = TimeConsumedwithCUMask(node, mask, mask_count); timeTotal += timeArray[x]; } if (timeTotal == 0) { LOG() << "ERROR: Total time reported as 0. Exiting" << std::endl; return 0; } for (int x = 0; x < iterations; x++) { HSAint64 variance = timeArray[x] / (timeTotal / iterations); if (variance < CuNegVariance || variance > CuPosVariance) LOG() << "WARNING: Measurement #" << x << "/" << iterations << " (" << timeArray[x] << ") is at least " << CuVariance*100 << "% away from the mean (" << timeTotal/iterations << ")" << std::endl; } return timeTotal / iterations; } /* * Apply CU masking in a linear fashion, adding 1 CU per iteration * until all Shader Engines are full */ void BasicCuMaskingLinear(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; const HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); if (m_FamilyId >= FAMILY_VI) { const HsaNodeProperties *pNodeProperties = pKFDQMTest->Get_NodeInfo()->GetNodeProperties(gpuNode); uint32_t ActiveCU = (pNodeProperties->NumFComputeCores / pNodeProperties->NumSIMDPerCU); uint32_t numSEs = pNodeProperties->NumShaderBanks; LOG() << std::dec << "# Compute cores: " << pNodeProperties->NumFComputeCores << std::endl; LOG() << std::dec << "# SIMDs per CU: " << pNodeProperties->NumSIMDPerCU << std::endl; LOG() << std::dec << "# Shader engines: " << numSEs << std::endl; LOG() << std::dec << "# Active CUs: " << ActiveCU << std::endl; HSAint64 TimewithCU1, TimewithCU; uint32_t maskNumDwords = (ActiveCU + 31) / 32; /* Round up to the nearest multiple of 32 */ uint32_t 
maskNumBits = maskNumDwords * 32; uint32_t mask[maskNumDwords]; double ratio; mask[0] = 0x1; for (int i = 1; i < maskNumDwords; i++) mask[i] = 0x0; /* Execute once to get any HW optimizations out of the way */ pKFDQMTest->TimeConsumedwithCUMask(gpuNode, mask, maskNumBits); LOG() << "Getting baseline performance numbers (CU Mask: 0x1)" << std::endl; TimewithCU1 = pKFDQMTest->GetAverageTimeConsumedwithCUMask(gpuNode, mask, maskNumBits, 3); for (int nCUs = 2; nCUs <= ActiveCU; nCUs++) { int maskIndex = (nCUs - 1) / 32; mask[maskIndex] |= 1 << ((nCUs - 1) % 32); TimewithCU = pKFDQMTest->TimeConsumedwithCUMask(gpuNode, mask, maskNumBits); ratio = (double)(TimewithCU1) / ((double)(TimewithCU) * nCUs); LOG() << "Expected performance of " << nCUs << " CUs vs 1 CU:" << std::endl; LOG() << std::setprecision(2) << pKFDQMTest->CuNegVariance << " <= " << std::fixed << std::setprecision(8) << ratio << " <= " << std::setprecision(2) << pKFDQMTest->CuPosVariance << std::endl; EXPECT_TRUE((ratio >= pKFDQMTest->CuNegVariance) && (ratio <= pKFDQMTest->CuPosVariance)); RECORD(ratio) << "Ratio-" << nCUs << "-CUs"; } } else { LOG() << "Skipping test: Test not supported for family ID 0x" << m_FamilyId << "." << std::endl; } } TEST_F(KFDQMTest, BasicCuMaskingLinear) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(BasicCuMaskingLinear)); TEST_END } // ====== ExtendedCuMasking Helper Functions ====== // #define CUMASK_DEBUG 0 // Enable extra output for debugging issues #if CUMASK_DEBUG #define DBG_PRINT printf #else #define DBG_PRINT #endif /* * Helper function to print multi-dword mask. * * pHeader: A non-NULL pointer to a string to use as the header. * pMask: A pointer to the mask to print out. * numDwords: Number of elements in mask array. 
* */ static void printMask(const char *pHeader, uint32_t *pMask, uint32_t numDwords) { printf("%s0x", pHeader); for (int i = numDwords - 1; i >= 0; i--) { printf("%08x", pMask[i]); } printf("\n"); } /* * Set the CU mask for each specified WGPs. * * Note: The effect is cumulative, function can be called multiple times to * set up additional WGPs in the provided pMask. * * pMask: A non-NULL pointer to the CU mask. * maskConfig: Information on GPU configuration. * seMask: Specifies SEs that are targetted. * saMask: Specifies SAs that are targetted within the SEs specified. * wgpMask: Specifies WGPs that are targetted within the (SE,SA) specified. * * For seMask, saMask, and wgpMask: * One bit per SE/SA/WGP, multiple bits can be specified. * Masks cannot be 0 (at least 1 SE, 1 SA and 1 WGP must be specified). * Special value: -1 (specifies ALL) * */ static bool setCUMask(uint32_t *pMask, mask_config_t maskConfig, uint32_t seMask, uint32_t saMask, uint32_t wgpMask) { bool result = true; if (pMask) { if (seMask && saMask && wgpMask) { // proceed only with non-zero mask for (int i = 0; i < maskConfig.numWGPperSA; i++) { if (((wgpMask >> i) & 1)) { for (int j = 0; j < maskConfig.numSAperSE; j++) { if (((saMask >> j) & 1)) { for (int k = 0; k < maskConfig.numSEs; k++) { if (((seMask >> k) & 1)) { uint32_t insLoc = k * 2 + j * (2 * maskConfig.numSEs) + i * (2 * maskConfig.numSEs * maskConfig.numSAperSE); pMask[insLoc / 32] |= (0x3 << (insLoc % 32)); } } } } } } } else { LOG() << "ERROR: SE/SA/WGP mask values must be non-zero!\n"; result = false; } } else { LOG() << "ERROR: pMask is NULL!\n"; result = false; } return result; } /* * Compute an adjusted CU mask to use when some WGPs are inactive. * * The adjusted mask takes into account the inactive WGPs by removing their corresponding * bits from the mask as these are skipped by KFD. As bits are removed from the mask, * the remaining bit values are shifted right. 
* * pAdjMask: A non-NULL pointer where the adjusted mask will be written. * pMask: A non-NULL pointer to the CU mask. * maskConfig: Information on GPU configuration. * * Returns: * true: If adjusted mask has one or more non-zero bit set. * false: If the adjusted mask is all zeroes (no WGPs left to do work). * * When false is returned, we should skipped the specific test scenario. * */ bool adjustMask(uint32_t *pAdjMask, uint32_t *pMask, mask_config_t maskConfig) { int wi = 0; int totalBits = maskConfig.numBits; bool nonZero = false; memset(pAdjMask, 0, sizeof(uint32_t) * maskConfig.numDwords); for (int ri = 0; ri < totalBits; ri += 2) { uint32_t value = (pMask[ri / 32] >> (ri % 32)) & 0x3; if ((maskConfig.pInactiveMask[ri / 32] & (0x3 << (ri % 32))) != 0) { // skip that entry } else { uint32_t newValue = value << (wi % 32); pAdjMask[wi / 32] |= newValue; wi += 2; if (value != 0) { nonZero = true; } } } #if CUMASK_DEBUG printf("\nAdjusting mask:\n"); printMask(" mask: ", pMask, maskConfig.numDwords); printMask("inactive: ", maskConfig.pInactiveMask, maskConfig.numDwords); printMask("adjusted: ", pAdjMask, maskConfig.numDwords); printf("\n"); #endif //CUMASK_DEBUG return nonZero; } /* * Validates the result of a test. * * pMask: A non-NULL pointer to the CU mask that was used for the test. * maskConfig: Information on GPU configuration. * numWorkItems: Number of work items used for shader execution. * pOutput: Pointer to the output array. * pResultMask: If non-NULL, result mask constructed from output is stored at that memory location. 
* */ static bool validateTest(uint32_t *pMask, mask_config_t maskConfig, uint32_t numWorkItems, out_data_t *pOutput, uint32_t *pResultMask) { uint32_t resultMask[maskConfig.numDwords]; bool result = false; memset(resultMask, 0, sizeof(resultMask)); for (int i = 0; i < numWorkItems; i++) { DBG_PRINT("=== % 4d: 0x%08x [ se: %2d, sa: %2d, wgp: %2d]\n", i, pOutput[i].data, pOutput[i].se, pOutput[i].sa, pOutput[i].wgp); setCUMask(resultMask, maskConfig, 1 << pOutput[i].se, 1 << pOutput[i].sa, 1 << pOutput[i].wgp); } if (pResultMask) { memcpy(pResultMask, resultMask, sizeof(resultMask)); } if (maskConfig.pInactiveMask) { // If some WGPs were inactive, compute a verify mask taking into account the inactive WGPs. uint32_t verifyMask[maskConfig.numDwords]; memset(verifyMask, 0, sizeof(verifyMask)); for (int i = 0; i < maskConfig.numDwords; i++) { verifyMask[i] = pMask[i] & ~maskConfig.pInactiveMask[i]; } #if CUMASK_DEBUG printf("\nValidate test:\n"); printMask(" mask: ", pMask, maskConfig.numDwords); printMask(" resultMask: ", resultMask, maskConfig.numDwords); printMask("inactiveMask: ", maskConfig.pInactiveMask, maskConfig.numDwords); printMask(" verifyMask: ", verifyMask, maskConfig.numDwords); #endif //CUMASK_DEBUG result = (memcmp(verifyMask, resultMask, sizeof(resultMask)) == 0); } else { #if CUMASK_DEBUG printf("\nValidate test:\n"); printMask(" mask: ", pMask, maskConfig.numDwords); printMask(" resultMask: ", resultMask, maskConfig.numDwords); #endif //CUMASK_DEBUG result = (memcmp(pMask, resultMask, sizeof(resultMask)) == 0); } DBG_PRINT(" Result: %s\n\n", result ? "PASS" : "FAIL"); return result; } /* * Set CU Mask, submit the testing shader, and validate the results. * * gpuNode: The node to use for the test. * pMask: A non-NULL pointer to the CU mask to use for the test. * maskConfig: Information on GPU configuration. * programBuffer: The buffer that contains the shader program. * numWorkItems: The number of work items to use. 
* pOutput: A non-NULL pointer to the output buffer used by the shader. * pResultMask: If non-NULL, result mask constructed from output is stored at that memory location. * */ static bool testCUMask(int gpuNode, uint32_t *pMask, mask_config_t maskConfig, HsaMemoryBuffer &programBuffer, uint32_t numWorkItems, out_data_t *pOutput, uint32_t *pResultMask = NULL) { PM4Queue queue; uint32_t *pAdjMask = NULL; uint32_t adjMask[maskConfig.numDwords]; if (maskConfig.pInactiveMask) { if (adjustMask(adjMask, pMask, maskConfig)) { pAdjMask = adjMask; } else { // Adjusted mask is all zeroes, skip test and mark as passing. return true; } } else { pAdjMask = pMask; } Dispatch dispatch(programBuffer); dispatch.SetArgs(NULL, pOutput); dispatch.SetDim(numWorkItems, 1, 1); EXPECT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); EXPECT_SUCCESS_GPU(queue.SetCUMask(pAdjMask, maskConfig.numBits), gpuNode); dispatch.Submit(queue); dispatch.Sync(); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); return validateTest(pMask, maskConfig, numWorkItems, pOutput, pResultMask); } /* * ExtendedCuMasking * * Newer implementation of CU mask testing that focuses on correctness of masking. * * Unlike previous implementations, this new implementation does not rely on performance * measurements to decide if the masking took place. Instead, this implementation checks * if waves were executed on all the CUs enabled and only the CUs enabled. * * Implementation does a series of tests, new tests can be easily added as needed. * * For each test, these steps are performed: * * 1) Decide the units that are enabled for the test (SEs, SAs, WGPs). * 2) Generate a CU mask that specifies the WGPs enabled on each (SE,SA) pairs. * 3) Set the mask for the queue and run a special shader. * 4) Shader records in a buffer the unit that is used by the wave (SE,SA,WGP). * 5) Test program analyses the results and verifies if shader used all and only the * WGP units specified by the mask. 
* * Multiple tests are done with different combinations. * There are (2^numWGPs - 1) possibilities, not everything can be tested. * * For each new ASIC supported, the following changes might be required: * 1) Minor shader changes to put fill information into buffer. * 2) Format of out_data_t struct. * 3) Changes to validation code. * */ static void extendedCuMasking(KFDTEST_PARAMETERS* pTestParameters) { int gpuNode = pTestParameters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParameters->pTestObject; const HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); if (m_FamilyId >= FAMILY_GFX12) { // Supporting GFX12 and up for now // Lock to prevent interleave of logging on multigpu (multithreaded) testing static std::mutex logMutex; const HsaNodeProperties *pProps = pKFDQMTest->Get_NodeInfo()->GetNodeProperties(gpuNode); const uint32_t activeCU = (pProps->NumFComputeCores / pProps->NumSIMDPerCU); const uint32_t numSEs = pProps->NumShaderBanks; const uint32_t numSAperSE = pProps->NumArrays; const uint32_t numWGPperSA = pProps->NumCUPerArray / 2; const uint32_t maxCU = numSEs * numSAperSE * numWGPperSA * 2; std::ostringstream nodeStream; nodeStream << "(Node " << gpuNode << ")"; const std::string nodeStr = nodeStream.str(); logMutex.lock(); LOG() << std::endl; LOG() << std::dec << "****** GFX Configuration " << nodeStr << " ******" << std::endl; LOG() << std::dec << " Compute Cores (SIMD): " << std::setw(3) << pProps->NumFComputeCores << std::endl; LOG() << std::dec << " SIMDs per CU: " << std::setw(3) << pProps->NumSIMDPerCU << std::endl; LOG() << std::dec << " Active CUs: " << std::setw(3) << activeCU << std::endl; LOG() << std::dec << " Max CUs: " << std::setw(3) << maxCU << std::endl; LOG() << std::dec << " Shader Engines: " << std::setw(3) << numSEs << std::endl; LOG() << std::dec << " SAs per SE: " << std::setw(3) << numSAperSE << std::endl; LOG() << std::dec << " WGPs per SA: " << std::setw(3) << numWGPperSA << std::endl; LOG() << std::dec << 
"****************************************" << std::endl; logMutex.unlock(); const uint32_t maskNumDwords = (maxCU + 31) / 32; /* Round up to the nearest multiple of 32 */ const uint32_t maskNumBits = maskNumDwords * 32; uint32_t mask[maskNumDwords]; uint32_t inactiveMask[maskNumDwords]; mask_config_t maskConfig = { maskNumDwords, maskNumBits, numSEs, numSAperSE, numWGPperSA, NULL }; /* * Note: On system with WGPs, CU bits in the same WGP must be either both set or both unset * i.e. enabling/disabling is on a per-WGP basis. * * Format of CU Mask array (Assuming 4 SEs) * * Bit Value Masking * * 0,1 0x03 SE0 SA0 WGP0 (i.e. CU0 and CU1) * 2,3 0x0c SE1 SA0 WGP0 * 4,5 0x30 SE2 SA0 WGP0 * 6,7 0xc0 SE3 SA0 WGP0 * * 8,9 0x0300 SE0 SA1 WGP0 * 10,11 0x0c00 SE1 SA1 WGP0 * 12,13 0x3000 SE2 SA1 WGP0 * 14,15 0xc000 SE3 SA1 WGP0 * * 16,17 0x030000 SE0 SA0 WGP1 * 18,19 0x030000 SE1 SA0 WGP1 * ... * 32,33 SE0 SA0 WGP2 * ... * 48,49 SE0 SA0 WGP3 * ... * */ /* * Number of work items needs to be sufficiently large to have enough work items for each WGP enabled. * * Using total number of WGPs multiplied by 16. * */ const uint32_t numWorkItems = 16 * numSEs * numSAperSE * numWGPperSA; // Allocate buffers for program and output HsaMemoryBuffer programBuffer(PAGE_SIZE, gpuNode, true, false, true); HsaMemoryBuffer outputBuffer(((sizeof(out_data_t) * numWorkItems) + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1), gpuNode, true, false, false); out_data_t *pOutput = outputBuffer.As(); // Assemble shader Assembler *pAsm = pKFDQMTest->GetAssemblerFromNodeId(gpuNode); ASSERT_NOTNULL_GPU(pAsm, gpuNode); ASSERT_SUCCESS_GPU(pAsm->RunAssembleBuf(CheckCuMaskIsa, programBuffer.As()), gpuNode); /* * Check and record any inactive WPGs. 
* */ memset(mask, 0, sizeof(mask)); memset(inactiveMask, 0, sizeof(inactiveMask)); // Use full mask and collect all active CUs in inactiveMask setCUMask(mask, maskConfig, -1, -1, -1); if (testCUMask(gpuNode, mask, maskConfig, programBuffer, numWorkItems, pOutput, inactiveMask)) { // Using full mask, if all CUs are used, we expect them to be all active. EXPECT_TRUE_GPU(activeCU == maxCU, gpuNode); } else { // Some CUs were not used, generate inactive mask and count inactive CUs. uint32_t inactiveCount = 0; // Flip bits and count inactive for (int i = 0; i < maskNumDwords; i++) { inactiveMask[i] = ~inactiveMask[i]; inactiveCount += __builtin_popcount(inactiveMask[i]); } // Check if what we detected is consistent with info from KFD EXPECT_TRUE_GPU((activeCU + inactiveCount) == maxCU, gpuNode); maskConfig.pInactiveMask = inactiveMask; std::ostringstream logStr; logStr << nodeStr << " Inactive WGP detected: " << inactiveCount << " 0x" << std::hex << std::setw(8); for (int i = maskNumDwords - 1; i >= 0; i--) { logStr << inactiveMask[i]; } LOG() << logStr.str() << std::endl; } /* * Generate symmetric test configuration for all (SE, SA, WGP) combinations, one level at a time. * * Other levels fully enabled. * * Example: If testing SE disablement, all SA/WGP are enabled on the SE that are used. * If testing SA disablement, all SE are used, all WGP are enabled on the SA enabled. 
* */ uint32_t totalConfigTested = 0; // All SE combination (0 not allowed, need at least one enabled) LOG() << nodeStr << " === Testing SE mask (" << ((1 << numSEs) - 1) << " configs)\n"; for (int i = 1; i < (1 << numSEs); i++) { memset(mask, 0, sizeof(mask)); DBG_PRINT("SE mask: 0x%x\n", i); setCUMask(mask, maskConfig, i, -1, -1); EXPECT_TRUE_GPU(testCUMask(gpuNode, mask, maskConfig, programBuffer, numWorkItems, pOutput), gpuNode); totalConfigTested++; } // All SA combinations (0 not allowed, need at least one enabled) LOG() << nodeStr << " === Testing SA mask (" << ((1 << numSAperSE) - 1) << " configs)\n"; for (uint32_t i = 1; i < (1 << numSAperSE); i++) { memset(mask, 0, sizeof(mask)); DBG_PRINT("SA mask: 0x%x\n", i); setCUMask(mask, maskConfig, -1, i, -1); EXPECT_TRUE_GPU(testCUMask(gpuNode, mask, maskConfig, programBuffer, numWorkItems, pOutput), gpuNode); totalConfigTested++; } // All WGP combinations (0 not allowed, need at least one enabled) LOG() << nodeStr << " === Testing WGP mask (" << ((1 << numWGPperSA) - 1) << " configs)\n"; for (uint32_t i = 1; i < (1 << numWGPperSA); i++) { memset(mask, 0, sizeof(mask)); DBG_PRINT("WGP mask: 0x%x\n", i); setCUMask(mask, maskConfig, -1, -1, i); EXPECT_TRUE_GPU(testCUMask(gpuNode, mask, maskConfig, programBuffer, numWorkItems, pOutput), gpuNode); totalConfigTested++; } /* * Linear Masking * * Enable one WGP at a time until they are all enabled. * */ { uint32_t totalWGPs = numSEs * numSAperSE * numWGPperSA; LOG() << nodeStr << " === Testing linear mask (" << totalWGPs << " configs)\n"; memset(mask, 0, sizeof(mask)); for (int32_t i = 0; i < totalWGPs; i++) { mask[i / 16] |= (0x3 << (i * 2)); #if CUMASK_DEBUG printMask(" linear mask: ", mask, maskNumDwords); #endif //CUMASK_DEBUG EXPECT_TRUE_GPU(testCUMask(gpuNode, mask, maskConfig, programBuffer, numWorkItems, pOutput), gpuNode); totalConfigTested++; } } /* * Random asymmetric config. * * Asymmetric, different WGPs/SAs are enabled/disabled on different SEs. 
* */ { uint32_t randomCount = 1000; // Total number of random test to perform uint32_t seed = 1; // Specifying a seed to have deterministic random sequence srand(seed); LOG() << nodeStr << " === Testing " << randomCount << " random mask config...\n"; for (uint32_t i = 0; i < randomCount; i++) { memset(mask, 0, sizeof(mask)); uint32_t wgpLeft = maxCU / 2; // init to total WGPs uint32_t maskIndex = 0; while (wgpLeft > 0) { uint32_t wgpBlock = (wgpLeft > 16) ? 16 : wgpLeft; // max 16 WGPs at a time wgpLeft -= wgpBlock; /* * Pick random number between 0 to (2^wgpBlock - 1) - 1. * Then add 1 to get random number between 1 to (2^wgpBlock - 1). * This ensure that we don't end up with 0 for all the dwords in the mask. */ uint32_t wgpMask = (rand() % ((1ULL << wgpBlock) - 1)) + 1; // expand WGP mask to CU mask by doubling each individual bits. uint32_t expandToCUMask = 0; for (uint32_t j = 0; j < wgpBlock; j++) { if (wgpMask & (1 << j)) { expandToCUMask |= (0x3ULL << (j * 2)); } } DBG_PRINT("maskIndex: %u fullWGPMask: 0x%08x expand: 0x%08x\n", maskIndex, wgpMask, expandToCUMask); mask[maskIndex++] = expandToCUMask; } EXPECT_TRUE_GPU(testCUMask(gpuNode, mask, maskConfig, programBuffer, numWorkItems, pOutput), gpuNode); totalConfigTested++; } } LOG() << std::endl; LOG() << nodeStr << " Total config tested: " << totalConfigTested << std::endl; LOG() << std::endl; } else { LOG() << "Skipping test: Test not supported for family ID 0x" << m_FamilyId << "." << std::endl; } } TEST_F(KFDQMTest, ExtendedCuMasking) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(extendedCuMasking)); TEST_END } #undef CUMASK_DEBUG #undef DBG_PRINT // ====== End of ExtendedCUMasking Functions ====== // /** * Apply CU masking where the number of CUs is equal across all Shader Engines * This will work due to the HW splitting the workload unevenly across the Shader * Engines when ((#ofCUs)/(#ofShaderEngines)) is not a whole number. 
The tests above * will not yield viable results when an uneven distribution of CUs is used over multiple * shader engines (e.g. 0x1000100030003), until the HW changes how it schedules work. */ void BasicCuMaskingEven(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; const HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); if (m_FamilyId >= FAMILY_VI) { const HsaNodeProperties *pNodeProperties = pKFDQMTest->Get_NodeInfo()->GetNodeProperties(gpuNode); uint32_t ActiveCU = (pNodeProperties->NumFComputeCores / pNodeProperties->NumSIMDPerCU); uint32_t numShaderEngines = pNodeProperties->NumShaderBanks; if (numShaderEngines == 1) { LOG() << "Skipping test: Only 1 Shader Engine present." << std::endl; return; } LOG() << std::dec << "# Compute cores: " << pNodeProperties->NumFComputeCores << std::endl; LOG() << std::dec << "# SIMDs per CU: " << pNodeProperties->NumSIMDPerCU << std::endl; LOG() << std::dec << "# Shader engines: " << numShaderEngines << std::endl; LOG() << std::dec << "# Active CUs: " << ActiveCU << std::endl; HSAint64 TimewithCU1, TimewithCU; uint32_t maskNumDwords = (ActiveCU + 31) / 32; /* Round up to the nearest multiple of 32 */ uint32_t maskNumBits = maskNumDwords * 32; uint32_t mask[maskNumDwords]; int numCuPerShader = ActiveCU / numShaderEngines; double ratio; /* In KFD we symmetrically map mask to all SEs: * mask[0] bit0 -> se0 cu0; * mask[0] bit1 -> se1 cu0; * ... (if # SE is 4) * mask[0] bit4 -> se0 cu1; * ... 
*/ /* Set Mask to 1 CU per SE */ memset(mask, 0, maskNumDwords * sizeof(uint32_t)); for (int i = 0; i < numShaderEngines; i++) { int maskIndex = (i / 32) % maskNumDwords; mask[maskIndex] |= 1 << (i % 32); } /* Execute once to get any HW optimizations out of the way */ pKFDQMTest->TimeConsumedwithCUMask(gpuNode, mask, maskNumBits); LOG() << "Getting baseline performance numbers (1 CU per SE)" << std::endl; TimewithCU1 = pKFDQMTest->GetAverageTimeConsumedwithCUMask(gpuNode, mask, maskNumBits, 3); /* Each loop will add 1 more CU per SE. We use the mod and divide to handle * when SEs aren't distributed in multiples of 32 (e.g. Tonga) * OR the new bit in for simplicity instead of re-creating the mask each iteration */ for (int x = 0; x < numCuPerShader; x++) { for (int se = 0; se < numShaderEngines; se++) { int offset = x * numShaderEngines + se; int maskIndex = (offset / 32) % maskNumDwords; mask[maskIndex] |= 1 << (offset % 32); } int nCUs = x + 1; TimewithCU = pKFDQMTest->TimeConsumedwithCUMask(gpuNode, mask, maskNumBits); ratio = (double)(TimewithCU1) / ((double)(TimewithCU) * nCUs); LOG() << "Expected performance of " << nCUs << " CU(s)/SE vs 1 CU/SE:" << std::endl; LOG() << std::setprecision(2) << pKFDQMTest->CuNegVariance << " <= " << std::fixed << std::setprecision(8) << ratio << " <= " << std::setprecision(2) << pKFDQMTest->CuPosVariance << std::endl; EXPECT_TRUE_GPU((ratio >= pKFDQMTest->CuNegVariance) && (ratio <= pKFDQMTest->CuPosVariance), gpuNode); RECORD(ratio) << "Ratio-" << nCUs << "-CUs"; } } else { LOG() << "Skipping test: Test not supported for family ID 0x" << m_FamilyId << "." 
<< std::endl; } } TEST_F(KFDQMTest, BasicCuMaskingEven) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(BasicCuMaskingEven)); TEST_END } void testQueuePriority(KFDTEST_PARAMETERS* pTestParamters, bool isSamePipe) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; const HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); Assembler* m_pAsm; m_pAsm = pKFDQMTest->GetAssemblerFromNodeId(gpuNode); ASSERT_NOTNULL_GPU(m_pAsm, gpuNode); if (m_FamilyId < FAMILY_VI) { LOG() << "Skipping test: Shader won't run on CI." << std::endl; return; } // Reduce test case if running on emulator // Reduction applies to all 3 dims (effect is cubic) const int scaleDown = (g_IsEmuMode ? 4 : 1); HsaMemoryBuffer syncBuf(PAGE_SIZE, gpuNode, true/*zero*/, false/*local*/, true/*exec*/); HSAint32 *syncBuffer = syncBuf.As(); HsaMemoryBuffer isaBuffer(PAGE_SIZE, gpuNode, true/*zero*/, false/*local*/, true/*exec*/); //ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As())); ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As()), gpuNode); Dispatch dispatch[2] = { Dispatch(isaBuffer, true), Dispatch(isaBuffer, true) }; const int queueCount = isSamePipe ? 13 : 2; int activeTaskBitmap = 0x3; HSAuint64 startTime, endTime[2]; HsaEvent *pHsaEvent[2]; int numEvent = 2; PM4Queue queue[queueCount]; HSA_QUEUE_PRIORITY priority[2] = { HSA_QUEUE_PRIORITY_LOW, HSA_QUEUE_PRIORITY_HIGH }; int i; /* * For different pipe variation: * Only two queues are created, they should be on two different pipes. * * For same pipe variation: * queue[2..12] are dummy queues. Create queue in this sequence to * render queue[0] and queue[1] on same pipe with no assumptions * about the number of pipes used by KFD. 
Queue #12 is a multiple
     * of 1, 2, 3 and 4, so it falls on pipe 0 for any number of pipes
     */
    EXPECT_SUCCESS_GPU(queue[0].Create(gpuNode), gpuNode);  // Queue 0 is on Pipe 0
    if (isSamePipe) {
        // Create the filler queues between queue[0] and queue[1].
        for (i = 2; i < queueCount; i++)
            EXPECT_SUCCESS_GPU(queue[i].Create(gpuNode), gpuNode);
    }
    EXPECT_SUCCESS_GPU(queue[1].Create(gpuNode), gpuNode);

    // Configure the two measured queues: queue[0] low priority, queue[1] high priority.
    for (i = 0; i < 2; i++) {
        syncBuffer[i] = -1;
        queue[i].Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, priority[i], false);
        pHsaEvent[i] = dispatch[i].GetHsaEvent();
        // Point each event's sync variable at its own slot of syncBuffer; the
        // wait loop below compares that slot against the event id.
        pHsaEvent[i]->EventData.EventData.SyncVar.SyncVar.UserData = &syncBuffer[i];
        dispatch[i].SetDim(1024 / scaleDown , 16 / scaleDown, 16 / scaleDown);
    }

    startTime = GetSystemTickCountInMicroSec();
    for (i = 0; i < 2; i++)
        dispatch[i].Submit(queue[i]);

    // Loop until both tasks have been seen as finished; each bit of
    // activeTaskBitmap marks a task that is still outstanding.
    while (activeTaskBitmap > 0) {
        hsaKmtWaitOnMultipleEvents(pHsaEvent, numEvent, false, g_TestTimeOut);
        for (i = 0; i < 2; i++) {
            if ((activeTaskBitmap & (1 << i)) && (syncBuffer[i] == pHsaEvent[i]->EventId)) {
                endTime[i] = GetSystemTickCountInMicroSec();
                activeTaskBitmap &= ~(1 << i);
            }
        }
    }

    // Report how long each task took from the common start time.
    for (i = 0; i < 2; i++) {
        int usecs = endTime[i] - startTime;
        LOG() << "Task priority: " << std::dec << priority[i] << "\t";
        LOG() << "Task duration: " << std::dec << std::setw(10) << usecs << " usecs" << std::endl;
    }

    for (i = 0; i < queueCount; i++) {
        EXPECT_SUCCESS_GPU(queue[i].Destroy(), gpuNode);
    }
}

/* Per-GPU entry point: runs testQueuePriority with isSamePipe = false. */
static void QueuePriorityOnDifferentPipe(KFDTEST_PARAMETERS* pTestParamters) {
    testQueuePriority(pTestParamters, false);
}

TEST_F(KFDQMTest, QueuePriorityOnDifferentPipe) {
    TEST_START(TESTPROFILE_RUNALL);

    ASSERT_SUCCESS(KFDTest_Launch(QueuePriorityOnDifferentPipe));

    TEST_END
}

/* Per-GPU entry point: runs testQueuePriority with isSamePipe = true. */
void QueuePriorityOnSamePipe(KFDTEST_PARAMETERS* pTestParamters) {
    testQueuePriority(pTestParamters, true);
}

TEST_F(KFDQMTest, QueuePriorityOnSamePipe) {
    TEST_START(TESTPROFILE_RUNALL);

    ASSERT_SUCCESS(KFDTest_Launch(QueuePriorityOnSamePipe));

    TEST_END
}

void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node) {
    PM4Queue queue;

    if
(node == -1) node = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE_GPU(node, 0, node) << "failed to get GPU Node"; Dispatch dispatch(isaBuffer); dispatch.SetArgs(pSrcBuf, pDstBuf); dispatch.SetDim(1, 1, 1); ASSERT_SUCCESS_GPU(queue.Create(node), node); dispatch.Submit(queue); dispatch.Sync(); EXPECT_SUCCESS_GPU(queue.Destroy(), node); } void EmptyDispatch(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; Assembler* m_pAsm; m_pAsm = pKFDQMTest->GetAssemblerFromNodeId(gpuNode); ASSERT_NOTNULL_GPU(m_pAsm, gpuNode); HsaMemoryBuffer isaBuffer(PAGE_SIZE, gpuNode, true/*zero*/, false/*local*/, true/*exec*/); ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As()), gpuNode); pKFDQMTest->SyncDispatch(isaBuffer, NULL, NULL, gpuNode); } TEST_F(KFDQMTest, EmptyDispatch) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(EmptyDispatch)); TEST_END } void SimpleWriteDispatch(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; Assembler* m_pAsm; m_pAsm = pKFDQMTest->GetAssemblerFromNodeId(gpuNode); ASSERT_NOTNULL_GPU(m_pAsm, gpuNode); HsaMemoryBuffer isaBuffer(PAGE_SIZE, gpuNode, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer srcBuffer(PAGE_SIZE, gpuNode, false); HsaMemoryBuffer destBuffer(PAGE_SIZE, gpuNode); srcBuffer.Fill(0x01010101); ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As()),gpuNode); pKFDQMTest->SyncDispatch(isaBuffer, srcBuffer.As(), destBuffer.As(), gpuNode); EXPECT_EQ(destBuffer.As()[0], 0x01010101); } TEST_F(KFDQMTest, SimpleWriteDispatch) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(SimpleWriteDispatch)); TEST_END } static void MultipleCpQueuesStressDispatch(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; Assembler* 
m_pAsm; m_pAsm = pKFDQMTest->GetAssemblerFromNodeId(gpuNode); ASSERT_NOTNULL_GPU(m_pAsm, gpuNode); static const unsigned int MAX_CP_QUEUES = 16; HsaMemoryBuffer isaBuffer(PAGE_SIZE, gpuNode, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer srcBuffer(PAGE_SIZE, gpuNode, false); HsaMemoryBuffer destBuffer(PAGE_SIZE, gpuNode); unsigned int* src = srcBuffer.As(); unsigned int* dst = destBuffer.As(); static const HSAuint64 TEST_TIME_SEC = 15; HSAuint64 initialTime, curTime; unsigned int numIter = 0; HSAuint64 timePassed = 0; unsigned int i; PM4Queue queues[MAX_CP_QUEUES]; Dispatch* dispatch[MAX_CP_QUEUES]; destBuffer.Fill(0xFF); ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As()), gpuNode); for (i = 0; i < MAX_CP_QUEUES; ++i) ASSERT_SUCCESS_GPU(queues[i].Create(gpuNode), gpuNode) << " QueueId=" << i; initialTime = GetSystemTickCountInMicroSec(); do { for (i = 0; i < MAX_CP_QUEUES; ++i) { dispatch[i] = new Dispatch(isaBuffer); src[i] = numIter; dst[i] = 0xff; dispatch[i]->SetArgs(&src[i], &dst[i]); dispatch[i]->SetDim(1, 1, 1); dispatch[i]->Submit(queues[i]); } for (i = 0; i < MAX_CP_QUEUES; ++i) { dispatch[i]->Sync(); EXPECT_EQ_GPU(dst[i], src[i], gpuNode); delete dispatch[i]; } ++numIter; curTime = GetSystemTickCountInMicroSec(); timePassed = (curTime - initialTime) / 1000000; } while (timePassed < TEST_TIME_SEC); LOG() << "Total iterated : " << std::dec << numIter << std::endl; for (i = 0; i < MAX_CP_QUEUES; ++i) EXPECT_SUCCESS_GPU(queues[i].Destroy(), gpuNode); } TEST_F(KFDQMTest, MultipleCpQueuesStressDispatch) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(MultipleCpQueuesStressDispatch)); TEST_END } static void CpuWriteCoherence(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; PM4Queue queue; HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode); ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); HsaEvent *event; 
ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode); /* The queue might be full and we fail to submit. There is always one word space unused in queue. * So let rptr one step ahead then we continually submit packet. */ queue.PlaceAndSubmitPacket(PM4NopPacket()); queue.Wait4PacketConsumption(); EXPECT_EQ(1, queue.Rptr()); do { queue.PlaceAndSubmitPacket(PM4NopPacket()); } while (queue.Wptr() != 0); queue.Wait4PacketConsumption(); EXPECT_EQ_GPU(0, queue.Rptr(), gpuNode); /* Now that the GPU has cached the PQ contents, we modify them in CPU cache and * ensure that the GPU sees the updated value: */ queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), 0x42, 0x42)); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), 0x42); hsaKmtDestroyEvent(event); } TEST_F(KFDQMTest, CpuWriteCoherence) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(CpuWriteCoherence)); TEST_END } static void CreateAqlCpQueue(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; AqlQueue queue; HsaMemoryBuffer pointers(PAGE_SIZE, gpuNode, /*zero*/true, /*local*/false, /*exec*/false, /*isScratch */false, /* isReadOnly */false, /* isUncached */false, /* NonPaged */g_baseTest->NeedNonPagedWptr(gpuNode)); ASSERT_SUCCESS_GPU(queue.Create(gpuNode, PAGE_SIZE, pointers.As()), gpuNode); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); } TEST_F(KFDQMTest, CreateAqlCpQueue) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(CreateAqlCpQueue)); TEST_END } static void QueueLatency(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); PM4Queue queue; const int queueSize = PAGE_SIZE * 2; const int packetSize = PM4ReleaseMemoryPacket(m_FamilyId, 0, 0, 0, 0, 0).SizeInBytes(); /* We always leave one NOP(dword) 
empty after packet which is required by ring itself. * We also place NOPs when queue wraparound to avoid crossing buffer end. See PlacePacket(). * So the worst case is that we need two packetSize space to place one packet. * Like below, N=NOP,E=Empty,P=Packet. * |E|E|E|E|E|E|E|rptr...wptr|E|E|E|E|E| ---> |P|P|P|P|P|P|E|rptr...wptr|N|N|N|N|N| * So to respect that, we reserve packetSize space for these additional NOPs. * Also we reserve the remainder of the division by packetSize explicitly. * Reserve another packetSize for event-based wait which uses a releseMemory packet. */ const int reservedSpace = packetSize + queueSize % packetSize + packetSize; const int slots = (queueSize - reservedSpace) / packetSize; HSAint64 queue_latency_avg = 0, queue_latency_min, queue_latency_max, queue_latency_med; HSAint64 overhead, workload; HSAint64 *queue_latency_arr = reinterpret_cast(calloc(slots, sizeof(HSAint64))); const int skip = 2; const char *fs[skip] = {"1st", "2nd"}; HsaClockCounters *ts; HSAuint64 *qts; int i = 0; ASSERT_NE_GPU((HSAuint64)queue_latency_arr, 0, gpuNode); ASSERT_SUCCESS_GPU(queue.Create(gpuNode, queueSize), gpuNode); LOG() << std::dec << "Queue Submit NanoSeconds (" << slots << " Packets)" << std::endl; HsaMemoryBuffer buf(ALIGN_UP(slots * sizeof(HsaClockCounters), PAGE_SIZE), 0); ts = buf.As(); HsaMemoryBuffer qbuf(ALIGN_UP(slots * sizeof(HSAuint64), PAGE_SIZE), 0); qts = qbuf.As(); HsaEvent *event; ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode); /* GpuCounter overhead*/ do { hsaKmtGetClockCounters(gpuNode, &ts[i]); } while (++i < slots); overhead = ts[slots-1].GPUClockCounter - ts[0].GPUClockCounter; overhead /= 2 * (slots - 1); /* Submit packets serially*/ i = 0; do { queue.PlacePacket(PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)&qts[i], 0, true, 1)); hsaKmtGetClockCounters(gpuNode, &ts[i]); queue.SubmitPacket(); queue.Wait4PacketConsumption(event); } while (++i < slots); /* Calculate timing which includes 
workload and overhead*/ i = 0; do { HSAint64 queue_latency = qts[i] - ts[i].GPUClockCounter; EXPECT_GE_GPU(queue_latency, 0, gpuNode); queue_latency_arr[i] = queue_latency; if (i >= skip) queue_latency_avg += queue_latency; } while (++i < slots); /* Calculate avg from packet[skip, slots-1] */ queue_latency_avg /= (slots - skip); /* Workload of queue packet itself */ i = 0; do { queue.PlacePacket(PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)&qts[i], 0, true, 1)); } while (++i < slots); queue.SubmitPacket(); queue.Wait4PacketConsumption(event); hsaKmtDestroyEvent(event); /* qts[i] records the timestamp of the end of packet[i] which is * approximate that of the beginging of packet[i+1]. * The workload total is [0, skip], [skip+1, slots-1]. * And We ignore [0, skip], that means we ignore (skip+1) packets. */ workload = qts[slots - 1] - qts[skip]; workload /= (slots - 1 - skip); EXPECT_GE_GPU(workload, 0, gpuNode); i = 0; do { /* The queue_latency is not that correct as the workload and overhead are average*/ queue_latency_arr[i] -= workload + overhead; /* The First submit takes an HSAint64 time*/ if (i < skip) LOG() << "Queue Latency " << fs[i] << ": \t" << CounterToNanoSec(queue_latency_arr[i]) << std::endl; } while (++i < slots); std::sort(queue_latency_arr + skip, queue_latency_arr + slots); queue_latency_min = queue_latency_arr[skip]; queue_latency_med = queue_latency_arr[(slots+skip)/2]; queue_latency_max = queue_latency_arr[slots-1]; LOG() << "Queue Latency Avg: \t" << CounterToNanoSec(queue_latency_avg) << std::endl; LOG() << "Queue Latency Min: \t" << CounterToNanoSec(queue_latency_min) << std::endl; LOG() << "Queue Latency Median: \t" << CounterToNanoSec(queue_latency_med) << std::endl; LOG() << "Queue Latency Max: \t" << CounterToNanoSec(queue_latency_max) << std::endl; LOG() << "Queue Packet Workload: \t" << CounterToNanoSec(workload) << std::endl; LOG() << "Get GpuCounter Overhead: \t" << CounterToNanoSec(overhead) << std::endl; 
RECORD(CounterToNanoSec(queue_latency_avg)) << "Queue-Latency-Avg"; RECORD(CounterToNanoSec(queue_latency_min)) << "Queue-Latency-Min"; RECORD(CounterToNanoSec(queue_latency_med)) << "Queue-Latency-Med"; RECORD(CounterToNanoSec(queue_latency_max)) << "Queue-Latency-Max"; RECORD(CounterToNanoSec(workload)) << "Queue-Packet-Workload"; RECORD(CounterToNanoSec(overhead)) << "GpuCounter-Overhead"; } TEST_F(KFDQMTest, QueueLatency) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(QueueLatency)); TEST_END } static void CpQueueWraparound(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; PM4Queue queue; HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode); ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); HsaEvent *event; ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event), gpuNode); for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), pktIdx, pktIdx)); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), pktIdx); } for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As(), pktIdx, pktIdx)); queue.Wait4PacketConsumption(event); WaitOnValue(destBuf.As(), pktIdx); } hsaKmtDestroyEvent(event); EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); } TEST_F(KFDQMTest, CpQueueWraparound) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(CpQueueWraparound)); TEST_END } static void SdmaQueueWraparound(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; int bufSize = PAGE_SIZE; SDMAQueue queue; HsaMemoryBuffer destBuf(bufSize << 1, gpuNode, false); HsaMemoryBuffer srcBuf(bufSize, gpuNode, false); ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); for (unsigned int pktIdx = 0; 
pktIdx <= queue.Size()/sizeof(SDMA_PKT_COPY_LINEAR); ++pktIdx) { destBuf.Fill(0x0); srcBuf.Fill(pktIdx); queue.PlaceAndSubmitPacket( SDMACopyDataPacket(queue.GetFamilyId(), destBuf.As(), srcBuf.As(), bufSize)); queue.PlaceAndSubmitPacket( SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As() + bufSize/4, 0x02020202)); queue.Wait4PacketConsumption(); EXPECT_TRUE_GPU(WaitOnValue(destBuf.As() + bufSize/4, 0x02020202), gpuNode); EXPECT_SUCCESS_GPU(memcmp( destBuf.As(), srcBuf.As(), bufSize), gpuNode); } for (unsigned int pktIdx = 0; pktIdx <= queue.Size()/sizeof(SDMA_PKT_WRITE_UNTILED); ++pktIdx) { queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As(), pktIdx)); queue.Wait4PacketConsumption(); WaitOnValue(destBuf.As(), pktIdx); } EXPECT_SUCCESS_GPU(queue.Destroy(), gpuNode); }

/* Test entry point: opens the TESTPROFILE_RUNALL test scope and asserts that
 * KFDTest_Launch(SdmaQueueWraparound) reports success.
 */
TEST_F(KFDQMTest, SdmaQueueWraparound) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(SdmaQueueWraparound)); TEST_END }

/* Arguments handed to AtomicIncThread() through its void* context.
 *   pDest - location passed to AtomicInc() on every loop iteration.
 *   count - number of loop iterations the thread has completed; polled by Atomics().
 *   loop  - the thread keeps running while this is true; Atomics() clears it to
 *           stop the thread.
 * NOTE(review): count and loop are shared between two threads with only
 * 'volatile' on them; volatile is not a synchronization primitive in ISO C++,
 * consider std::atomic.
 */
struct AtomicIncThreadParams { HSAint64* pDest; volatile unsigned int count; volatile bool loop; };

/* Thread body used by Atomics(): while pArgs->loop is set it calls AtomicInc()
 * on pArgs->pDest and increments pArgs->count; once the flag is cleared it logs
 * a message and returns 0.
 * NOTE(review): the reinterpret_cast below has no target type in this copy of
 * the file (presumably <AtomicIncThreadParams*> was lost) - confirm against the
 * original source.
 */
unsigned int AtomicIncThread(void* pCtx) { AtomicIncThreadParams* pArgs = reinterpret_cast(pCtx); while (pArgs->loop) { AtomicInc(pArgs->pDest); ++pArgs->count; } LOG() << "CPU atomic increments finished" << std::endl; return 0; }

/* Per-node body of the Atomics test (started through KFDTest_Launch).
 * pTestParamters supplies gpuNode and the owning KFDQMTest (pTestObject).
 * Flow, as written:
 *   1. Fetch the node's Assembler; log and return early when
 *      hasPciAtomicsSupport(gpuNode) is false.
 *   2. Assemble AtomicIncIsa into isaBuf and prepare a 1024x1x1 dispatch whose
 *      first argument is destBuf.
 *   3. Start AtomicIncThread on the same destBuf, spin until it has counted
 *      1000 iterations, then submit the dispatch and wait for it.
 *   4. Stop the thread and expect destBuf[0] == 1024 + params.count.
 */
static void Atomics(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; Assembler* m_pAsm; m_pAsm = pKFDQMTest->GetAssemblerFromNodeId(gpuNode); ASSERT_NOTNULL_GPU(m_pAsm, gpuNode); if (!hasPciAtomicsSupport(gpuNode)) { LOG() << "Skipping test: Node doesn't support Atomics." 
<< std::endl; return; } HsaMemoryBuffer isaBuf(PAGE_SIZE, gpuNode, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer destBuf(PAGE_SIZE, gpuNode); PM4Queue queue; ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(AtomicIncIsa, isaBuf.As()), gpuNode); Dispatch dispatch(isaBuf); dispatch.SetArgs(destBuf.As(), NULL); dispatch.SetDim(1024, 1, 1); hsaKmtSetMemoryPolicy(gpuNode, HSA_CACHING_CACHED, HSA_CACHING_CACHED, NULL, 0); ASSERT_SUCCESS_GPU(queue.Create(gpuNode), gpuNode); AtomicIncThreadParams params; params.pDest = destBuf.As(); params.loop = true; params.count = 0; uint64_t threadId;
/* NOTE(review): "¶ms" in the next statement looks like "&params" damaged by an HTML-entity conversion - confirm against the original file. */
ASSERT_EQ_GPU(true, StartThread(&AtomicIncThread, ¶ms, threadId), gpuNode); LOG() << "Waiting for CPU to atomic increment 1000 times" << std::endl;
/* Busy-wait with an empty body and no timeout: this never exits if the thread stops advancing params.count. */
while (params.count < 1000) {} LOG() << "Submitting the GPU atomic increment shader" << std::endl; dispatch.Submit(queue); dispatch.Sync(); params.loop = false; WaitForThread(threadId); EXPECT_EQ_GPU(destBuf.As()[0], 1024 + params.count, gpuNode); LOG() << "GPU increments: 1024, CPU increments: " << std::dec << params.count << std::endl;
/* NOTE(review): the result of Destroy() is ignored here, whereas the other tests in this file wrap it in EXPECT_SUCCESS_GPU. */
queue.Destroy(); }

/* Test entry point: asserts that KFDTest_Launch(Atomics) reports success. */
TEST_F(KFDQMTest, Atomics) { TEST_START(TESTPROFILE_RUNALL); ASSERT_SUCCESS(KFDTest_Launch(Atomics)); TEST_END }

/* Multi-GPU shared buffer test.
 * Uses src_node = 2 and dst_node = 1 unless both g_TestNodeId and
 * g_TestDstNodeId are set (!= -1), in which case those values are used.
 * srcNodeMem is filled with 0x05050505 and CopyDwordIsa is assembled once.
 * SyncDispatch is then called with (srcNodeMem, shared_addr) on src_node and
 * with (shared_addr, dstNodeMem) on dst_node, after which the test expects
 * dstNodeMem[0] == 0x05050505 and unmaps shared_addr from dst_node.
 * NOTE(review): nothing visible here verifies that nodes 1 and 2 exist or are
 * GPUs before buffers are allocated on them - confirm whether TEST_START or the
 * HsaMemoryBuffer constructor guards against that.
 */
TEST_F(KFDQMTest, mGPUShareBO) { TEST_START(TESTPROFILE_RUNALL); unsigned int src_node = 2; unsigned int dst_node = 1; if (g_TestDstNodeId != -1 && g_TestNodeId != -1) { src_node = g_TestNodeId; dst_node = g_TestDstNodeId; } HsaMemoryBuffer shared_addr(PAGE_SIZE, dst_node, true, false, false, false); HsaMemoryBuffer srcNodeMem(PAGE_SIZE, src_node); HsaMemoryBuffer dstNodeMem(PAGE_SIZE, dst_node); /* Handle ISA to write to local memory BO */ HsaMemoryBuffer isaBufferSrc(PAGE_SIZE, src_node, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer isaBufferDst(PAGE_SIZE, dst_node, true/*zero*/, false/*local*/, true/*exec*/); srcNodeMem.Fill(0x05050505); ASSERT_SUCCESS(m_pAsm->RunAssemble(CopyDwordIsa)); m_pAsm->CopyInstrStream(isaBufferSrc.As()); 
SyncDispatch(isaBufferSrc, srcNodeMem.As(), shared_addr.As(), src_node); m_pAsm->CopyInstrStream(isaBufferDst.As()); SyncDispatch(isaBufferDst, shared_addr.As(), dstNodeMem.As(), dst_node); EXPECT_EQ(dstNodeMem.As()[0], 0x05050505); EXPECT_SUCCESS(shared_addr.UnmapMemToNodes(&dst_node, 1)); TEST_END }

/* Creates a temporary SDMA queue and event on 'node', submits a single
 * SDMACopyDataPacket built from (dst, src, n, size), waits for the queue to
 * consume it, then destroys the queue and the event.
 *   node - node the queue and event are created on.
 *   src  - source address handed to the packet.
 *   dst  - array of destination addresses.
 *   n    - passed next to dst; the caller in this file passes 1 or 2 for a
 *          two-entry array, so presumably the number of entries to use - confirm
 *          in SDMACopyDataPacket.
 *   size - forwarded to the packet (presumably bytes - confirm).
 * NOTE(review): if sdmaQueue.Create() fails, ASSERT_SUCCESS presumably returns
 * before hsaKmtDestroyEvent(event) runs, leaking the event - confirm.
 */
static void sdma_copy(HSAuint32 node, void *src, void *const dst[], int n, HSAuint64 size) { SDMAQueue sdmaQueue; HsaEvent *event; ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, node, &event)); ASSERT_SUCCESS(sdmaQueue.Create(node)); sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), dst, src, n, size)); sdmaQueue.Wait4PacketConsumption(event); EXPECT_SUCCESS(sdmaQueue.Destroy()); hsaKmtDestroyEvent(event); }

/* Creates a temporary SDMA queue and event on 'node', submits a single
 * SDMAFillDataPacket built from (dst, data, size), waits for the queue to
 * consume it, then destroys the queue and the event.
 *   node - node the queue and event are created on.
 *   dst  - destination address handed to the packet.
 *   data - fill value handed to the packet.
 *   size - forwarded to the packet (presumably bytes - confirm).
 * NOTE(review): 'node' is HSAint32 here but HSAuint32 in sdma_copy(); the same
 * early-return concern about the event applies as in sdma_copy().
 */
static void sdma_fill(HSAint32 node, void *dst, unsigned int data, HSAuint64 size) { SDMAQueue sdmaQueue; HsaEvent *event; ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, node, &event)); ASSERT_SUCCESS(sdmaQueue.Create(node)); sdmaQueue.PlaceAndSubmitPacket(SDMAFillDataPacket(sdmaQueue.GetFamilyId(), dst, data, size)); sdmaQueue.Wait4PacketConsumption(event); EXPECT_SUCCESS(sdmaQueue.Destroy()); hsaKmtDestroyEvent(event); }

/* Peer-to-peer push test across GPU nodes.
 * Skips (logs and returns) when hsakmt_is_dgpu() is false, when fewer than two
 * GPU nodes are reported, or - when both g_TestNodeId and g_TestDstNodeId are
 * given - when that pair is not peer-accessible or names the same node.
 * The first node's buffer is filled with MAGIC_NUM through sdma_fill(). Each
 * loop iteration clears sysBuf and uses sdma_copy() on the current node to write
 * into sysBuf and, except on the final iteration, into a buffer newly allocated
 * on the next node. The first and last dwords of sysBuf are then checked, the
 * current source buffer is unmapped and freed, and the next node becomes the
 * current one. Iterations whose next node fails IsPeerAccessibleByNode() are
 * skipped with 'continue'.
 * NOTE(review):
 *  - sysBuf is allocated and mapped under EXPECT_SUCCESS but is memset and read
 *    afterwards; if EXPECT_SUCCESS is non-fatal (presumably), a failed
 *    allocation does not stop the test before sysBuf is dereferenced.
 *  - 'size' is exactly 16ULL<<20 although the trailing comment says
 *    "bigger than 16MB".
 *  - the nodes.size() < 2 branch logs a large-BAR/XGMI message although the
 *    condition only counts nodes.
 *  - the reinterpret_cast calls and std::vector declarations have no template
 *    arguments in this copy of the file - confirm against the original source.
 */
TEST_F(KFDQMTest, P2PTest) { TEST_START(TESTPROFILE_RUNALL); if (!hsakmt_is_dgpu()) { LOG() << "Skipping test: Two GPUs are required, but no dGPUs are present." << std::endl; return; } const std::vector gpuNodes = m_NodeInfo.GetNodesWithGPU(); if (gpuNodes.size() < 2) { LOG() << "Skipping test: At least two GPUs are required." << std::endl; return; } std::vector nodes; /* This test simulates RT team's P2P part in IPCtest: * * +------------------------------------------------+ * | gpu1 gpu2 gpuX | * |gpu1 mem ----> gpu2 mem ----> gpuX mem | * | \ \ \ mGPUShareBO | * | \ \ \ | * | system buffer system buffer system buffer| * +------------------------------------------------+ * * Copy data from current GPU memory to next GPU memory and system memory * Using current GPU, aka p2p push. 
* Verify the system buffer has the expected content after each push. */ /* Users can use "--node=gpu1 --dst_node=gpu2" to specify devices */ if (g_TestDstNodeId != -1 && g_TestNodeId != -1) { nodes.push_back(g_TestNodeId); nodes.push_back(g_TestDstNodeId); if (!m_NodeInfo.IsPeerAccessibleByNode(g_TestNodeId, g_TestDstNodeId)) { LOG() << "Skipping test: Dst GPU specified is not peer-accessible." << std::endl; return; } if (nodes[0] == nodes[1]) { LOG() << "Skipping test: Different GPUs must be specified (2 GPUs required)." << std::endl; return; } } else { nodes = m_NodeInfo.GetNodesWithGPU(); if (nodes.size() < 2) { LOG() << "Skipping test: Test requires at least one large bar GPU." << std::endl; LOG() << " or two GPUs are XGMI connected." << std::endl; return; } } HSAuint32 *sysBuf; HSAuint32 size = 16ULL<<20; // bigger than 16MB to test non-contiguous memory HsaMemFlags memFlags = {0}; HsaMemMapFlags mapFlags = {0}; memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; memFlags.ui32.HostAccess = 0; memFlags.ui32.NonPaged = 1; memFlags.ui32.NoNUMABind = 1; unsigned int end = size / sizeof(HSAuint32) - 1; /* 1. 
Allocate a system buffer and allow the access to GPUs */ EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, m_MemoryFlags, reinterpret_cast(&sysBuf))); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(sysBuf, size, NULL, mapFlags, nodes.size(), (HSAuint32 *)&nodes[0])); #define MAGIC_NUM 0xdeadbeaf /* First GPU fills mem with MAGIC_NUM */ void *src, *dst; HSAuint32 cur = nodes[0], next; ASSERT_SUCCESS(hsaKmtAllocMemory(cur, size, memFlags, reinterpret_cast(&src))); ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(src, size, NULL)); sdma_fill(cur, src, MAGIC_NUM, size); for (unsigned i = 1; i <= nodes.size(); i++) { int n; memset(sysBuf, 0, size); /* Last GPU just copy mem to sysBuf*/ if (i == nodes.size()) { n = 1; next = 0;/*system memory node*/ dst = 0; } else { n = 2; next = nodes[i]; /* check if cur access next node */ if (!m_NodeInfo.IsPeerAccessibleByNode(next, cur)) continue; ASSERT_SUCCESS(hsaKmtAllocMemory(next, size, memFlags, reinterpret_cast(&dst))); ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(dst, size, NULL)); } LOG() << "Test " << cur << " -> " << next << std::endl; /* Copy to sysBuf and next GPU*/ void *dst_array[] = {sysBuf, dst}; sdma_copy(cur, src, dst_array, n, size); /* Verify the data*/ EXPECT_EQ(sysBuf[0], MAGIC_NUM); EXPECT_EQ(sysBuf[end], MAGIC_NUM); LOG() << "PASS " << cur << " -> " << next << std::endl; EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(src)); EXPECT_SUCCESS(hsaKmtFreeMemory(src, size)); cur = next; src = dst; } EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(sysBuf)); EXPECT_SUCCESS(hsaKmtFreeMemory(sysBuf, size)); TEST_END }

/* Per-node body of the PM4EventInterrupt test (started through KFDTest_Launch).
 * For numIter rounds (32 when g_IsEmuMode is set, otherwise 1024) it creates
 * 4 PM4 queues and 4 events, places packetCount PM4WriteDataPacket writes of
 * 0xdeadbeaf into each queue's buffer followed by one PM4ReleaseMemoryPacket
 * that references the queue's event, submits all queues, waits on every event,
 * checks the first and last dword of each buffer, clears the buffers and
 * destroys the queues and events.
 * queueSize is RoundToPowerOf2() of the total byte size of those packets.
 * NOTE(review): destBuf[] entries come from raw new and are deleted only at the
 * very end; if an ASSERT_*_GPU macro returns early (presumably it does) they
 * leak together with any queues/events already created.
 */
static void PM4EventInterrupt(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); const HSAuint64 bufSize = PAGE_SIZE; const int packetCount = bufSize / sizeof(unsigned int); const int totalPacketSize = packetCount * PM4WriteDataPacket(0, 0).SizeInBytes() + PM4ReleaseMemoryPacket(m_FamilyId, 0, 0, 
0).SizeInBytes(); const int queueSize = RoundToPowerOf2(totalPacketSize); /* Reduce number of iteration if running with emulator. */ const int numIter = (g_IsEmuMode ? 32 : 1024); /* 4 PM4 queues will be running at same time.*/ const int numPM4Queue = 4; HsaEvent *event[numPM4Queue]; PM4Queue queue[numPM4Queue]; HsaMemoryBuffer *destBuf[numPM4Queue]; unsigned int *buf[numPM4Queue]; for (int i = 0; i < numPM4Queue; i++) { destBuf[i] = new HsaMemoryBuffer(bufSize, gpuNode, true, false); // System memory buf[i] = destBuf[i]->As(); } /* A simple loop here to give more pressure.*/ for (int test_count = 0; test_count < numIter; test_count++) { for (int i = 0; i < numPM4Queue; i++) { ASSERT_SUCCESS_GPU(queue[i].Create(gpuNode, queueSize), gpuNode); ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event[i]), gpuNode); /* Let CP have some workload first.*/ for(int index = 0; index < packetCount; index++) queue[i].PlacePacket(PM4WriteDataPacket(buf[i] + index, 0xdeadbeaf)); /* releaseMemory packet makes sure all previous written data is visible.*/ queue[i].PlacePacket(PM4ReleaseMemoryPacket(m_FamilyId, 0, reinterpret_cast(event[i]->EventData.HWData2), event[i]->EventId, true)); } for (int i = 0; i < numPM4Queue; i++) queue[i].SubmitPacket(); for (int i = 0; i < numPM4Queue; i++) { EXPECT_SUCCESS_GPU(hsaKmtWaitOnEvent(event[i], g_TestTimeOut), gpuNode); EXPECT_EQ_GPU(buf[i][0], 0xdeadbeaf, gpuNode); EXPECT_EQ_GPU(buf[i][packetCount - 1], 0xdeadbeaf, gpuNode); memset(buf[i], 0, bufSize); } for (int i = 0; i < numPM4Queue; i++) { EXPECT_SUCCESS_GPU(queue[i].Destroy(), gpuNode); EXPECT_SUCCESS_GPU(hsaKmtDestroyEvent(event[i]), gpuNode); } } for (int i = 0; i < numPM4Queue; i++) delete destBuf[i]; } /* Test entry point: asserts that KFDTest_Launch(PM4EventInterrupt) reports success. */ TEST_F(KFDQMTest, PM4EventInterrupt) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(PM4EventInterrupt)); TEST_END } #include "KFDTestUtilQueue.hpp" /* Per-node body of the SdmaEventInterrupt test: for 1, 2 and then 3 SDMA queues, 2048 rounds each, every queue receives timestamp, copy, fence and trap packets, all queues are submitted, and each event is waited on with g_TestTimeOut; a timeout triggers one resubmit plus logging of the timestamp deltas. srcBuf serves both as the copy source and as the timestamp area. NOTE(review): the nested "for (int i = 1; i <= 4; i++)" loops shadow the queue index i, and the reinterpret_cast before ALIGN_UP has no target type in this copy of the file - confirm against the original source. */ static void SdmaEventInterrupt(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = 
pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; const HSAuint64 bufSize = 4 << 20; HsaMemoryBuffer srcBuf(bufSize, 0); // System memory. HSAuint64 *src = srcBuf.As(); TimeStamp *tsbuf = srcBuf.As(); tsbuf = reinterpret_castALIGN_UP(tsbuf, sizeof(TimeStamp)); /* Have 3 queues created for test.*/ const int numSDMAQueue = 3; HsaEvent *event[numSDMAQueue]; SDMAQueue queue[numSDMAQueue]; HsaMemoryBuffer *destBuf[numSDMAQueue]; HSAuint64 *dst[numSDMAQueue]; for (int i = 0; i < numSDMAQueue; i++) { destBuf[i] = new HsaMemoryBuffer(bufSize, gpuNode, true, false); // System memory dst[i] = destBuf[i]->As(); } /* Test 1 queue, 2 queues, 3 queues running at same time one by one.*/ for (int testSDMAQueue = 1; testSDMAQueue <= numSDMAQueue; testSDMAQueue++) /* A simple loop here to give more pressure.*/ for (int test_count = 0; test_count < 2048; test_count++) { for (int i = 0; i < testSDMAQueue; i++) { TimeStamp *ts = tsbuf + i * 32; ASSERT_SUCCESS_GPU(queue[i].Create(gpuNode), gpuNode); /* FIXME * We create event every time along with queue. * However that will significantly enhance the failure of sdma event timeout. */ ASSERT_SUCCESS_GPU(CreateQueueTypeEvent(false, false, gpuNode, &event[i]), gpuNode); /* Get the timestamp directly. The first member of HsaClockCounters and TimeStamp is GPU clock counter.*/ hsaKmtGetClockCounters(gpuNode, reinterpret_cast(&ts[0])); /* Let sDMA have some workload first.*/ queue[i].PlacePacket(SDMATimePacket(&ts[1])); queue[i].PlacePacket( SDMACopyDataPacket(queue[i].GetFamilyId(), dst[i], src, bufSize)); queue[i].PlacePacket(SDMATimePacket(&ts[2])); queue[i].PlacePacket( SDMAFencePacket(queue[i].GetFamilyId(), reinterpret_cast(event[i]->EventData.HWData2), event[i]->EventId)); queue[i].PlacePacket(SDMATimePacket(&ts[3])); queue[i].PlacePacket(SDMATrapPacket(event[i]->EventId)); queue[i].PlacePacket(SDMATimePacket(&ts[4])); /* Will verify the value of srcBuf and destBuf later. 
Give it a different value each time.*/ src[0] = ts[0].timestamp; } for (int i = 0; i < testSDMAQueue; i++) queue[i].SubmitPacket(); for (int i = 0; i < testSDMAQueue; i++) { TimeStamp *ts = tsbuf + i * 32; HSAKMT_STATUS ret = hsaKmtWaitOnEvent(event[i], g_TestTimeOut); if (dst[i][0] != src[0]) WARN() << "SDMACopyData FAIL! " << std::dec << dst[i][0] << " VS " << src[0] << std::endl; if (ret == HSAKMT_STATUS_SUCCESS) { for (int i = 1; i <= 4; i++) /* Is queue latency too big? The workload is really small.*/ if (CounterToNanoSec(ts[i].timestamp - ts[i - 1].timestamp) > 1000000000) WARN() << "SDMA queue latency is bigger than 1s!" << std::endl; } else { WARN() << "Event On Queue " << testSDMAQueue << ":" << i << " Timeout, try to resubmit packets!" << std::endl; queue[i].SubmitPacket(); if (hsaKmtWaitOnEvent(event[i], g_TestTimeOut) == HSAKMT_STATUS_SUCCESS) WARN() << "The timeout event is signaled!" << std::endl; else WARN() << "The timeout event is lost after resubmit!" << std::endl; LOG() << "Time Consumption (ns)" << std::endl; for (int i = 1; i <= 4; i++) LOG() << std::dec << i << ": " << CounterToNanoSec(ts[i].timestamp - ts[i - 1].timestamp) << std::endl; } EXPECT_SUCCESS_GPU(ret, gpuNode); } for (int i = 0; i < testSDMAQueue; i++) { EXPECT_SUCCESS_GPU(queue[i].Destroy(), gpuNode); EXPECT_SUCCESS_GPU(hsaKmtDestroyEvent(event[i]), gpuNode); } } for (int i = 0; i < numSDMAQueue; i++) delete destBuf[i]; }

/* Test entry point: asserts that KFDTest_Launch(SdmaEventInterrupt) reports success. */
TEST_F(KFDQMTest, SdmaEventInterrupt) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(SdmaEventInterrupt)); TEST_END } /* GPUDoorbellWrite (defined after the macro below) is the per-node body of the GPUDoorbellWrite test: a PM4WriteDataPacket is placed on pm4Queue without being submitted, then a second queue (SDMA when DOORBELL_WRITE_USE_SDMA is defined, otherwise PM4) writes pm4Queue's pending write pointer to its Queue_write_ptr and Queue_DoorBell addresses; the test then waits for 0x12345678 and 0x87654321 to appear in destBuf. Families below FAMILY_AI use the 32-bit pending wptr, all others the 64-bit one. */ #define DOORBELL_WRITE_USE_SDMA static void GPUDoorbellWrite(KFDTEST_PARAMETERS* pTestParamters) { int gpuNode = pTestParamters->gpuNode; KFDQMTest* pKFDQMTest = (KFDQMTest*)pTestParamters->pTestObject; HSAuint32 m_FamilyId = pKFDQMTest->GetFamilyIdFromNodeId(gpuNode); HsaMemoryBuffer destBuf(PAGE_SIZE, 0, true); PM4Queue pm4Queue; #ifdef DOORBELL_WRITE_USE_SDMA SDMAQueue otherQueue; #else PM4Queue otherQueue; #endif 
ASSERT_SUCCESS_GPU(pm4Queue.Create(gpuNode), gpuNode); ASSERT_SUCCESS_GPU(otherQueue.Create(gpuNode), gpuNode); /* Place PM4 packet in the queue, but don't submit it */ pm4Queue.PlacePacket(PM4WriteDataPacket(destBuf.As(), 0x12345678, 0x87654321)); HsaQueueResource *qRes = pm4Queue.GetResource(); if (m_FamilyId < FAMILY_AI) { unsigned int pendingWptr = pm4Queue.GetPendingWptr(); #ifdef DOORBELL_WRITE_USE_SDMA /* Write the wptr and doorbell update using the GPU's SDMA * engine. This should submit the PM4 packet on the first * queue. */ otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_write_ptr, pendingWptr)); otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_DoorBell, pendingWptr)); #else /* Write the wptr and doorbell update using WRITE_DATA packets * on a second PM4 queue. This should submit the PM4 packet on * the first queue. */ otherQueue.PlacePacket( PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_write_ptr, pendingWptr, false)); otherQueue.PlacePacket( PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_DoorBell, pendingWptr, false)); #endif otherQueue.SubmitPacket(); } else { HSAuint64 pendingWptr64 = pm4Queue.GetPendingWptr64(); #ifdef DOORBELL_WRITE_USE_SDMA /* Write the wptr and doorbell update using the GPU's SDMA * engine. This should submit the PM4 packet on the first * queue. */ otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_write_ptr, 2, &pendingWptr64)); otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_DoorBell, 2, &pendingWptr64)); #else /* Write the 64-bit wptr and doorbell update using RELEASE_MEM * packets without IRQs on a second PM4 queue. RELEASE_MEM * should perform one atomic 64-bit access. This should submit * the PM4 packet on the first queue. 
*/ otherQueue.PlacePacket( PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_write_ptr, pendingWptr64, true)); otherQueue.PlacePacket( PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_DoorBell, pendingWptr64, true)); #endif otherQueue.SubmitPacket(); } /* Check that the PM4 packet has been executed */ EXPECT_TRUE_GPU(WaitOnValue(destBuf.As(), 0x12345678), gpuNode); EXPECT_TRUE_GPU(WaitOnValue(destBuf.As()+1, 0x87654321), gpuNode); EXPECT_SUCCESS_GPU(pm4Queue.Destroy(), gpuNode); EXPECT_SUCCESS_GPU(otherQueue.Destroy(), gpuNode); }

/* Test entry point: asserts that KFDTest_Launch(GPUDoorbellWrite) reports success. */
TEST_F(KFDQMTest, GPUDoorbellWrite) { TEST_START(TESTPROFILE_RUNALL) ASSERT_SUCCESS(KFDTest_Launch(GPUDoorbellWrite)); TEST_END }

/* Validates the ring-buffer checks applied to user queues on the default GPU node.
 * Parent process: creates and destroys a compute queue on a PAGE_SIZE buffer,
 * then expects hsaKmtCreateQueue to fail for compute and SDMA queues when the
 * ring size is PAGE_SIZE * 2 or the ring buffer is NULL, and finally unmaps and
 * frees the buffer.
 * Child process (fork): re-runs TearDown()/SetUp(), creates a queue, updates it
 * to percentage 0, munmaps the reported UserContextSaveArea, and requires that
 * freeing and unmapping the ring buffer both fail while the queue still exists.
 * The child exits with 0 only if every step behaved as expected and cleanup
 * succeeded; the parent checks that it exited normally with status 0.
 * NOTE(review):
 *  - EXPECT_SUCCESS(!hsaKmtCreateQueue(...)) relies on '!' turning any non-zero
 *    status into 0; this presumably assumes HSAKMT_STATUS_SUCCESS == 0 - confirm.
 *  - QueueBuf is allocated with new in both processes and is never deleted in
 *    the visible code; its memory is released through direct hsaKmt* calls.
 *  - a fork() failure (-1) is not handled; the else branch would then call
 *    waitpid(-1, ...).
 */
TEST_F(KFDQMTest, UserQueueBufValidation) { TEST_START(TESTPROFILE_RUNALL) int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; HsaQueueResource QueueResources; HsaMemoryBuffer *QueueBuf; HSAKMT_STATUS status; memset(&QueueResources, 0, sizeof(QueueResources)); // System memory mapping on GPU QueueBuf = new HsaMemoryBuffer(PAGE_SIZE, defaultGPUNode); EXPECT_SUCCESS(hsaKmtCreateQueue(defaultGPUNode, HSA_QUEUE_COMPUTE, 100, HSA_QUEUE_PRIORITY_NORMAL, QueueBuf->As(), PAGE_SIZE, NULL, &QueueResources)); EXPECT_SUCCESS(hsaKmtDestroyQueue(QueueResources.QueueId)); // CP Queue creation should fail using wrong ring buffer size EXPECT_SUCCESS(!hsaKmtCreateQueue(defaultGPUNode, HSA_QUEUE_COMPUTE, 100, HSA_QUEUE_PRIORITY_NORMAL, QueueBuf->As(), PAGE_SIZE * 2, NULL, &QueueResources)); // SDMA queue create should fail using wrong ring buffer size EXPECT_SUCCESS(!hsaKmtCreateQueue(defaultGPUNode, HSA_QUEUE_SDMA, 100, HSA_QUEUE_PRIORITY_NORMAL, QueueBuf->As(), PAGE_SIZE * 2, NULL, &QueueResources)); // CP queue create should fail using NULL ring buffer EXPECT_SUCCESS(!hsaKmtCreateQueue(defaultGPUNode, HSA_QUEUE_COMPUTE, 100, HSA_QUEUE_PRIORITY_NORMAL, NULL, PAGE_SIZE, NULL, &QueueResources)); // SDMA queue create should fail using 
NULL ring buffer EXPECT_SUCCESS(!hsaKmtCreateQueue(defaultGPUNode, HSA_QUEUE_SDMA, 100, HSA_QUEUE_PRIORITY_NORMAL, NULL, PAGE_SIZE, NULL, &QueueResources)); EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(QueueBuf->As())); EXPECT_SUCCESS(hsaKmtFreeMemory(QueueBuf->As(), PAGE_SIZE)); // // This following negative test will evict user queues, must execute in child process, // because parent process is allowed to create queue to run the remaining tests. // pid_t childPid = fork(); if (childPid == 0) { /* Child process */ void *cwsr_addr; int exit_code = 1; TearDown(); SetUp(); // System memory mapping on GPU QueueBuf = new HsaMemoryBuffer(PAGE_SIZE, defaultGPUNode); memset(&QueueResources, 0, sizeof(QueueResources)); status = hsaKmtCreateQueue(defaultGPUNode, HSA_QUEUE_COMPUTE, 100, HSA_QUEUE_PRIORITY_NORMAL, QueueBuf->As(), PAGE_SIZE, NULL, &QueueResources); if (status != HSAKMT_STATUS_SUCCESS) { LOG() << "create queue failed." << std::endl; goto free_exit; } // Update queue percentage 0 to set queue inactive in order to get queue info CWSR area status = hsaKmtUpdateQueue(QueueResources.QueueId, 0, HSA_QUEUE_PRIORITY_NORMAL, QueueBuf->As(), PAGE_SIZE, NULL); if (status != HSAKMT_STATUS_SUCCESS) { LOG() << "update queue failed." << std::endl; goto err_exit; } HsaQueueInfo QueueInfo; status = hsaKmtGetQueueInfo(QueueResources.QueueId, &QueueInfo); if (status != HSAKMT_STATUS_SUCCESS) { LOG() << "get queue info failed." << std::endl; goto err_exit; } // unmap CWSR buffer will evict queue before queue is destroyed cwsr_addr = QueueInfo.UserContextSaveArea; munmap(cwsr_addr, PAGE_SIZE); // unmap and free queue ring buffer should fail before the queue is destroyed status = hsaKmtFreeMemory(QueueBuf->As(), PAGE_SIZE); if (status == HSAKMT_STATUS_SUCCESS) { LOG() << "free queue buf should fail." << std::endl; goto err_exit; } status = hsaKmtUnmapMemoryToGPU(QueueBuf->As()); if (status == HSAKMT_STATUS_SUCCESS) { LOG() << "unmap queue buf should fail." 
<< std::endl; goto err_exit; } exit_code = 0; err_exit: status = hsaKmtDestroyQueue(QueueResources.QueueId); if (status != HSAKMT_STATUS_SUCCESS) { LOG() << "destroy queue failed." << std::endl; exit_code = 1; } free_exit: status = hsaKmtUnmapMemoryToGPU(QueueBuf->As()); if (status != HSAKMT_STATUS_SUCCESS) { LOG() << "unmap queue buf failed." << std::endl; exit_code = 1; } status = hsaKmtFreeMemory(QueueBuf->As(), PAGE_SIZE); if (status != HSAKMT_STATUS_SUCCESS) { LOG() << "free queue buf failed." << std::endl; exit_code = 1; } exit(exit_code); } else { int childStatus; waitpid(childPid, &childStatus, 0); EXPECT_EQ(true, WIFEXITED(childStatus)); EXPECT_EQ(0, WEXITSTATUS(childStatus)); } TEST_END }