Files
rocm-systems/tests/kfdtest/src/KFDQMTest.cpp
T
Yong Zhao fe97612800 kfdtest: Add basic tests for XGMI SDMA queues
After XGMI SDMA queues were separated from regular SDMA queues, they
were not covered in the current tests. Add tests for them now.

Change-Id: I036e3ca5d583ab7f022a9dc6cda3ef867f4773a0
Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
2020-01-10 15:32:12 -05:00

1988 строки
68 KiB
C++

/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <sys/time.h>
#include <vector>
#include <utility>
#include "KFDQMTest.hpp"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAPacket.hpp"
#include "XgmiOptimizedSDMAQueue.hpp"
#include "AqlQueue.hpp"
#include <algorithm>
#include "Dispatch.hpp"
void KFDQMTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDQMTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
}
TEST_F(KFDQMTest, CreateDestroyCpQueue) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, SubmitNopCpQueue) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.PlaceAndSubmitPacket(PM4NopPacket());
queue.Wait4PacketConsumption();
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, SubmitPacketCpQueue) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFF);
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0, 0));
queue.Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>(), 0));
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, AllCpQueues) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFF);
std::vector<PM4Queue> queues(m_numCpQueues);
for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx)
ASSERT_SUCCESS(queues[qidx].Create(defaultGPUNode)) << " QueueId=" << qidx;
for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx) {
queues[qidx].PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>()+qidx*2, qidx, qidx));
queues[qidx].Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>()+qidx*2, qidx));
}
for (unsigned int qidx = 0; qidx < m_numCpQueues; ++qidx)
EXPECT_SUCCESS(queues[qidx].Destroy());
TEST_END
}
TEST_F(KFDQMTest, CreateDestroySdmaQueue) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
SDMAQueue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, SubmitNopSdmaQueue) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
SDMAQueue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.PlaceAndSubmitPacket(SDMANopPacket());
queue.Wait4PacketConsumption();
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, SubmitPacketSdmaQueue) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFF);
SDMAQueue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As<void *>(), 0x02020202));
queue.Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>(), 0x02020202));
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, AllSdmaQueues) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
int bufSize = PAGE_SIZE;
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
const unsigned int numSdmaQueues = m_numSdmaEngines * m_numSdmaQueuesPerEngine;
LOG() << "Regular SDMA engines number: " << m_numSdmaEngines
<< " SDMA queues per engine: " << m_numSdmaQueuesPerEngine << std::endl;
HsaMemoryBuffer destBuf(bufSize << 1 , defaultGPUNode, false);
HsaMemoryBuffer srcBuf(bufSize, defaultGPUNode, false);
destBuf.Fill(0xFF);
std::vector<SDMAQueue> queues(numSdmaQueues);
for (unsigned int qidx = 0; qidx < numSdmaQueues; ++qidx)
ASSERT_SUCCESS(queues[qidx].Create(defaultGPUNode));
for (unsigned int qidx = 0; qidx < numSdmaQueues; ++qidx) {
destBuf.Fill(0x0);
srcBuf.Fill(qidx + 0xa0);
queues[qidx].PlaceAndSubmitPacket(
SDMACopyDataPacket(queues[qidx].GetFamilyId(), destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
queues[qidx].PlaceAndSubmitPacket(
SDMAWriteDataPacket(queues[qidx].GetFamilyId(), destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
queues[qidx].Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
EXPECT_SUCCESS(memcmp(
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
}
for (unsigned int qidx = 0; qidx < numSdmaQueues; ++qidx)
EXPECT_SUCCESS(queues[qidx].Destroy());
TEST_END
}
TEST_F(KFDQMTest, AllXgmiSdmaQueues) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
int bufSize = PAGE_SIZE;
int j;
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
const unsigned int numXgmiSdmaQueues =
m_numSdmaXgmiEngines * m_numSdmaQueuesPerEngine;
LOG() << "XGMI SDMA engines number: " << m_numSdmaXgmiEngines
<< " SDMA queues per engine: " << m_numSdmaQueuesPerEngine << std::endl;
HsaMemoryBuffer destBuf(bufSize << 1 , defaultGPUNode, false);
HsaMemoryBuffer srcBuf(bufSize, defaultGPUNode, false);
destBuf.Fill(0xFF);
std::vector<XgmiOptimizedSDMAQueue> xgmiSdmaQueues(numXgmiSdmaQueues);
for (j = 0; j < numXgmiSdmaQueues; ++j)
ASSERT_SUCCESS(xgmiSdmaQueues[j].Create(defaultGPUNode));
for (j = 0; j < numXgmiSdmaQueues; ++j) {
destBuf.Fill(0x0);
srcBuf.Fill(j + 0xa0);
xgmiSdmaQueues[j].PlaceAndSubmitPacket(
SDMACopyDataPacket(xgmiSdmaQueues[j].GetFamilyId(),
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
xgmiSdmaQueues[j].PlaceAndSubmitPacket(
SDMAWriteDataPacket(xgmiSdmaQueues[j].GetFamilyId(),
destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
xgmiSdmaQueues[j].Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
EXPECT_SUCCESS(memcmp(
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
}
for (j = 0; j < numXgmiSdmaQueues; ++j)
EXPECT_SUCCESS(xgmiSdmaQueues[j].Destroy());
TEST_END
}
TEST_F(KFDQMTest, AllQueues) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
int bufSize = PAGE_SIZE;
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
unsigned int i, j;
const unsigned int numCpQueues = m_numCpQueues;
const unsigned int numSdmaQueues = m_numSdmaEngines * m_numSdmaQueuesPerEngine;
const unsigned int numXgmiSdmaQueues =
m_numSdmaXgmiEngines * m_numSdmaQueuesPerEngine;
HsaMemoryBuffer destBufCp(PAGE_SIZE, defaultGPUNode, false);
destBufCp.Fill(0xFF);
HsaMemoryBuffer destBuf(bufSize << 1 , defaultGPUNode, false);
HsaMemoryBuffer srcBuf(bufSize, defaultGPUNode, false);
destBuf.Fill(0xFF);
std::vector<PM4Queue> cpQueues(numCpQueues);
std::vector<SDMAQueue> sdmaQueues(numSdmaQueues);
std::vector<XgmiOptimizedSDMAQueue> xgmiSdmaQueues(numXgmiSdmaQueues);
for (i = 0; i < numCpQueues; ++i)
ASSERT_SUCCESS(cpQueues[i].Create(defaultGPUNode)) << " QueueId=" << i;
for (j = 0; j < numSdmaQueues; ++j)
ASSERT_SUCCESS(sdmaQueues[j].Create(defaultGPUNode));
for (j = 0; j < numXgmiSdmaQueues; ++j)
ASSERT_SUCCESS(xgmiSdmaQueues[j].Create(defaultGPUNode));
for (i = 0; i < numCpQueues; ++i) {
cpQueues[i].PlaceAndSubmitPacket(PM4WriteDataPacket(destBufCp.As<unsigned int*>()+i*2, i, i));
cpQueues[i].Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBufCp.As<unsigned int*>()+i*2, i));
}
for (j = 0; j < numSdmaQueues; ++j) {
destBuf.Fill(0x0);
srcBuf.Fill(j + 0xa0);
sdmaQueues[j].PlaceAndSubmitPacket(
SDMACopyDataPacket(sdmaQueues[j].GetFamilyId(), destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
sdmaQueues[j].PlaceAndSubmitPacket(
SDMAWriteDataPacket(sdmaQueues[j].GetFamilyId(), destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
sdmaQueues[j].Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
EXPECT_SUCCESS(memcmp(
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
}
for (j = 0; j < numXgmiSdmaQueues; ++j) {
destBuf.Fill(0x0);
srcBuf.Fill(j + 0xa0);
xgmiSdmaQueues[j].PlaceAndSubmitPacket(
SDMACopyDataPacket(xgmiSdmaQueues[j].GetFamilyId(),
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
xgmiSdmaQueues[j].PlaceAndSubmitPacket(
SDMAWriteDataPacket(xgmiSdmaQueues[j].GetFamilyId(),
destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
xgmiSdmaQueues[j].Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
EXPECT_SUCCESS(memcmp(
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
}
for (i = 0; i < numCpQueues; ++i)
EXPECT_SUCCESS(cpQueues[i].Destroy());
for (j = 0; j < numSdmaQueues; ++j)
EXPECT_SUCCESS(sdmaQueues[j].Destroy());
for (j = 0; j < numXgmiSdmaQueues; ++j)
EXPECT_SUCCESS(xgmiSdmaQueues[j].Destroy());
TEST_END
}
/* The following test is designed to reproduce an intermittent hang on
* Fiji and other VI/Polaris GPUs. This test typically hangs in a few
* seconds. According to analysis done by HW engineers, the culprit
* seems to be PCIe speed switching. The problem can be worked around
* by disabling the lowest DPM level on Fiji.
*/
TEST_F(KFDQMTest, SdmaConcurrentCopies) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
#define BUFFER_SIZE (64*1024)
#define NPACKETS 1
#define COPY_SIZE (BUFFER_SIZE / NPACKETS)
HsaMemoryBuffer srcBuf(BUFFER_SIZE, 0, true);
HsaMemoryBuffer dstBuf(BUFFER_SIZE, defaultGPUNode, false, is_dgpu() ? true : false);
SDMAQueue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
std::ostream &log = LOG();
char progress[] = "-\b";
log << "Running ... ";
for (unsigned i = 0; i < 100000; i++) {
if (i % 1000 == 0) {
const char progressSteps[4] = {'-', '\\', '|', '/'};
progress[0] = progressSteps[(i/1000) % 4];
log << progress;
}
for (unsigned j = 0; j < NPACKETS; j++)
queue.PlacePacket(
SDMACopyDataPacket(queue.GetFamilyId(), dstBuf.As<char *>()+COPY_SIZE*j,
srcBuf.As<char *>()+COPY_SIZE*j, COPY_SIZE));
queue.SubmitPacket();
/* Waste a variable amount of time. Submission timing
* while SDMA runs concurrently seems to be critical for
* reproducing the hang
*/
for (int k = 0; k < (i & 0xfff); k++)
memcpy(srcBuf.As<char *>()+PAGE_SIZE, srcBuf.As<char *>(), 1024);
/* Wait for idle every 8 packets to allow the SDMA engine to
* run concurrently for a bit without getting too far ahead
*/
if ((i & 0x7) == 0)
queue.Wait4PacketConsumption();
}
log << "Done." << std::endl;
queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), srcBuf.As<unsigned *>(), 0x02020202));
queue.Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(srcBuf.As<unsigned int*>(), 0x02020202));
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, DisableCpQueueByUpdateWithNullAddress) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFFFFFFFF);
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0, 0));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), 0);
destBuf.Fill(0xFFFFFFFF);
EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, true));
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 1, 1));
// Don't sync since we don't expect rptr to change when the queue is disabled.
Delay(2000);
EXPECT_EQ(destBuf.As<unsigned int*>()[0], 0xFFFFFFFF)
<< "Packet executed even though the queue is supposed to be disabled!";
EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), 1);
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, DisableSdmaQueueByUpdateWithNullAddress) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFFFFFFFF);
SDMAQueue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As<void*>(), 0));
WaitOnValue(destBuf.As<unsigned int*>(), 0);
destBuf.Fill(0xFFFFFFFF);
EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, true));
queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As<void*>(), 0));
// Don't sync since we don't expect rptr to change when the queue is disabled.
Delay(2000);
EXPECT_EQ(destBuf.As<unsigned int*>()[0], 0xFFFFFFFF)
<< "Packet executed even though the queue is supposed to be disabled!";
EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), 0);
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, DisableCpQueueByUpdateWithZeroPercentage) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFFFFFFFF);
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
PM4WriteDataPacket packet1, packet2;
packet1.InitPacket(destBuf.As<unsigned int*>(), 0, 0);
packet2.InitPacket(destBuf.As<unsigned int*>(), 1, 1);
queue.PlaceAndSubmitPacket(packet1);
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), 0);
destBuf.Fill(0xFFFFFFFF);
EXPECT_SUCCESS(queue.Update(0/*percentage*/, BaseQueue::DEFAULT_PRIORITY, false));
queue.PlaceAndSubmitPacket(packet2);
// Don't sync since we don't expect rptr to change when the queue is disabled.
Delay(2000);
EXPECT_EQ(destBuf.As<unsigned int*>()[0], 0xFFFFFFFF)
<< "Packet executed even though the queue is supposed to be disabled!";
EXPECT_SUCCESS(queue.Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, BaseQueue::DEFAULT_PRIORITY, false));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), 1);
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, CreateQueueStressSingleThreaded) {
TEST_START(TESTPROFILE_RUNALL)
static const HSAuint64 TEST_TIME_SEC = 15;
HSAuint64 initialTime = GetSystemTickCountInMicroSec();
unsigned int numIter = 0;
HSAuint64 timePassed = 0;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
do {
// The following means we'll get the order 0,0 => 0,1 => 1,0 => 1,1 so we cover all options.
unsigned int firstToCreate = (numIter % 2 != 0) ? 1 : 0;
unsigned int firstToDestroy = (numIter % 4 > 1) ? 1 : 0;
unsigned int secondToCreate = (firstToCreate + 1)%2;
unsigned int secondToDestroy = (firstToDestroy + 1)%2;
BaseQueue *queues[2] = {new PM4Queue(), new SDMAQueue()};
ASSERT_SUCCESS(queues[firstToCreate]->Create(defaultGPUNode));
ASSERT_SUCCESS(queues[secondToCreate]->Create(defaultGPUNode));
EXPECT_SUCCESS(queues[firstToDestroy]->Destroy());
EXPECT_SUCCESS(queues[secondToDestroy]->Destroy());
delete queues[0];
delete queues[1];
++numIter;
HSAuint64 curTime = GetSystemTickCountInMicroSec();
timePassed = (curTime - initialTime) / 1000000;
} while (timePassed < TEST_TIME_SEC);
TEST_END
}
TEST_F(KFDQMTest, OverSubscribeCpQueues) {
TEST_START(TESTPROFILE_RUNALL)
if (m_FamilyId == FAMILY_CI || m_FamilyId == FAMILY_KV) {
LOG() << "Skipping test: CI doesn't have HW scheduling." << std::endl;
return;
}
static const unsigned int MAX_CP_QUEUES = 65;
static const unsigned int MAX_PACKETS = 100;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode, false);
destBuf.Fill(0xFF);
PM4Queue queues[MAX_CP_QUEUES];
for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx)
ASSERT_SUCCESS(queues[qidx].Create(defaultGPUNode)) << " QueueId=" << qidx;
for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx) {
unsigned int pktSizeDw = 0;
for (unsigned int i = 0; i < MAX_PACKETS; i++) {
PM4WriteDataPacket packet;
packet.InitPacket(destBuf.As<unsigned int*>()+qidx*2, qidx+i, qidx+i); // two dwords per packet
queues[qidx].PlacePacket(packet);
}
}
for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx)
queues[qidx].SubmitPacket();
// Delaying for 5 seconds in order to get all the results
Delay(5000);
for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx)
EXPECT_TRUE(queues[qidx].AllPacketsSubmitted())<< "QueueId=" << qidx;;
for (unsigned int qidx = 0; qidx < MAX_CP_QUEUES; ++qidx)
EXPECT_SUCCESS(queues[qidx].Destroy());
TEST_END
}
/* A simple isa loop program with dense mathematic operations
* s1 controls the number iterations of the loop
* This shader can be used by GFX8, GFX9 and GFX10
*/
static const char *loop_isa = \
"\
shader loop_isa\n\
wave_size(32)\n\
type(CS)\n\
s_movk_i32 s0, 0x0008\n\
s_movk_i32 s1, 0x00ff\n\
v_mov_b32 v0, 0\n\
v_mov_b32 v1, 0\n\
v_mov_b32 v2, 0\n\
v_mov_b32 v3, 0\n\
v_mov_b32 v4, 0\n\
v_mov_b32 v5, 0\n\
v_mov_b32 v6, 0\n\
v_mov_b32 v7, 0\n\
v_mov_b32 v8, 0\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0\n\
v_mov_b32 v11, 0\n\
v_mov_b32 v12, 0\n\
v_mov_b32 v13, 0\n\
v_mov_b32 v14, 0\n\
v_mov_b32 v15, 0\n\
v_mov_b32 v16, 0\n\
LOOP:\n\
s_mov_b32 s8, s4\n\
s_mov_b32 s9, s1\n\
s_mov_b32 s10, s6\n\
s_mov_b32 s11, s7\n\
s_cmp_le_i32 s1, s0\n\
s_cbranch_scc1 END_OF_PGM\n\
s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10\n\
v_add_f32 v0, 2.0, v0\n\
v_cvt_f32_i32 v17, s1\n\
s_waitcnt lgkmcnt(0)\n\
v_add_f32 v18, s8, v17\n\
v_add_f32 v19, s9, v17\n\
v_add_f32 v20, s10, v17\n\
v_add_f32 v21, s11, v17\n\
v_add_f32 v22, s12, v17\n\
v_add_f32 v23, s13, v17\n\
v_add_f32 v24, s14, v17\n\
v_add_f32 v17, s15, v17\n\
v_log_f32 v25, v18\n\
v_mul_legacy_f32 v25, v22, v25\n\
v_exp_f32 v25, v25\n\
v_log_f32 v26, v19\n\
v_mul_legacy_f32 v26, v23, v26\n\
v_exp_f32 v26, v26\n\
v_log_f32 v27, v20\n\
v_mul_legacy_f32 v27, v24, v27\n\
v_exp_f32 v27, v27\n\
v_log_f32 v28, v21\n\
v_mul_legacy_f32 v28, v17, v28\n\
v_exp_f32 v28, v28\n\
v_add_f32 v5, v5, v25\n\
v_add_f32 v6, v6, v26\n\
v_add_f32 v7, v7, v27\n\
v_add_f32 v8, v8, v28\n\
v_mul_legacy_f32 v18, 0x3fb8aa3b, v18\n\
v_exp_f32 v18, v18\n\
v_mul_legacy_f32 v19, 0x3fb8aa3b, v19\n\
v_exp_f32 v19, v19\n\
v_mul_legacy_f32 v20, 0x3fb8aa3b, v20\n\
v_exp_f32 v20, v20\n\
v_mul_legacy_f32 v21, 0x3fb8aa3b, v21\n\
v_exp_f32 v21, v21\n\
v_add_f32 v9, v9, v18\n\
v_add_f32 v10, v10, v19\n\
v_add_f32 v11, v11, v20\n\
v_add_f32 v12, v12, v21\n\
v_sqrt_f32 v18, v22\n\
v_sqrt_f32 v19, v23\n\
v_sqrt_f32 v20, v24\n\
v_sqrt_f32 v21, v17\n\
v_add_f32 v13, v13, v18\n\
v_add_f32 v14, v14, v19\n\
v_add_f32 v15, v15, v20\n\
v_add_f32 v16, v16, v21\n\
v_rsq_f32 v18, v22\n\
v_rsq_f32 v19, v23\n\
v_rsq_f32 v20, v24\n\
v_rsq_f32 v17, v17\n\
v_add_f32 v1, v1, v18\n\
v_add_f32 v2, v2, v19\n\
v_add_f32 v3, v3, v20\n\
v_add_f32 v4, v4, v17\n\
s_add_u32 s0, s0, 1\n\
s_branch LOOP\n\
END_OF_PGM:\n\
s_endpgm\n\
end\n\
";
HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) {
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false);
HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false);
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
Dispatch dispatch(isaBuffer);
dispatch.SetDim(1024, 16, 16);
PM4Queue queue;
EXPECT_SUCCESS(queue.Create(node));
EXPECT_SUCCESS(queue.SetCUMask(mask, mask_count));
queue.SetSkipWaitConsump(true);
HSAuint64 startTime = GetSystemTickCountInMicroSec();
dispatch.Submit(queue);
dispatch.Sync();
HSAuint64 endTime = GetSystemTickCountInMicroSec();
EXPECT_SUCCESS(queue.Destroy());
return endTime - startTime;
}
/* To cover for outliers, allow us to get the Average time based on a specified number of iterations */
HSAint64 KFDQMTest::GetAverageTimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count, int iterations) {
HSAint64 timeArray[iterations];
HSAint64 timeTotal = 0;
if (iterations < 1) {
LOG() << "ERROR: At least 1 iteration must be performed" << std::endl;
return 0;
}
for (int x = 0; x < iterations; x++) {
timeArray[x] = TimeConsumedwithCUMask(node, mask, mask_count);
timeTotal += timeArray[x];
}
if (timeTotal == 0) {
LOG() << "ERROR: Total time reported as 0. Exiting" << std::endl;
return 0;
}
for (int x = 0; x < iterations; x++) {
HSAint64 variance = timeArray[x] / (timeTotal / iterations);
if (variance < CuNegVariance || variance > CuPosVariance)
LOG() << "WARNING: Measurement #" << x << "/" << iterations << " (" << timeArray[x]
<< ") is at least " << CuVariance*100 << "% away from the mean (" << timeTotal/iterations << ")"
<< std::endl;
}
return timeTotal / iterations;
}
/*
* Apply CU masking in a linear fashion, adding 1 CU per iteration
* until all Shader Engines are full
*/
TEST_F(KFDQMTest, BasicCuMaskingLinear) {
TEST_START(TESTPROFILE_RUNALL);
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
if (m_FamilyId >= FAMILY_VI) {
const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
uint32_t ActiveCU = (pNodeProperties->NumFComputeCores / pNodeProperties->NumSIMDPerCU);
uint32_t numSEs = pNodeProperties->NumShaderBanks;
LOG() << std::dec << "# Compute cores: " << pNodeProperties->NumFComputeCores << std::endl;
LOG() << std::dec << "# SIMDs per CU: " << pNodeProperties->NumSIMDPerCU << std::endl;
LOG() << std::dec << "# Shader engines: " << numSEs << std::endl;
LOG() << std::dec << "# Active CUs: " << ActiveCU << std::endl;
HSAint64 TimewithCU1, TimewithCU;
uint32_t maskNumDwords = (ActiveCU + 31) / 32; /* Round up to the nearest multiple of 32 */
uint32_t maskNumBits = maskNumDwords * 32;
uint32_t mask[maskNumDwords];
double ratio;
mask[0] = 0x1;
for (int i = 1; i < maskNumDwords; i++)
mask[i] = 0x0;
/* Execute once to get any HW optimizations out of the way */
TimeConsumedwithCUMask(defaultGPUNode, mask, maskNumBits);
LOG() << "Getting baseline performance numbers (CU Mask: 0x1)" << std::endl;
TimewithCU1 = GetAverageTimeConsumedwithCUMask(defaultGPUNode, mask, maskNumBits, 3);
for (int nCUs = 2; nCUs <= ActiveCU; nCUs++) {
int maskIndex = (nCUs - 1) / 32;
mask[maskIndex] |= 1 << ((nCUs - 1) % 32);
TimewithCU = TimeConsumedwithCUMask(defaultGPUNode, mask, maskNumBits);
ratio = (double)(TimewithCU1) / ((double)(TimewithCU) * nCUs);
LOG() << "Expected performance of " << nCUs << " CUs vs 1 CU:" << std::endl;
LOG() << std::setprecision(2) << CuNegVariance << " <= " << std::fixed << std::setprecision(8)
<< ratio << " <= " << std::setprecision(2) << CuPosVariance << std::endl;
EXPECT_TRUE((ratio >= CuNegVariance) && (ratio <= CuPosVariance));
RECORD(ratio) << "Ratio-" << nCUs << "-CUs";
}
} else {
LOG() << "Skipping test: Test not supported for family ID 0x" << m_FamilyId << "." << std::endl;
}
TEST_END
}
/**
* Apply CU masking where the number of CUs is equal across all Shader Engines
* This will work due to the HW splitting the workload unevenly across the Shader
* Engines when ((#ofCUs)/(#ofShaderEngines)) is not a whole number. The tests above
* will not yield viable results when an uneven distribution of CUs is used over multiple
* shader engines (e.g. 0x1000100030003), until the HW changes how it schedules work.
*/
TEST_F(KFDQMTest, BasicCuMaskingEven) {
TEST_START(TESTPROFILE_RUNALL);
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
if (m_FamilyId >= FAMILY_VI) {
const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
uint32_t ActiveCU = (pNodeProperties->NumFComputeCores / pNodeProperties->NumSIMDPerCU);
uint32_t numShaderEngines = pNodeProperties->NumShaderBanks;
if (numShaderEngines == 1) {
LOG() << "Skipping test: Only 1 Shader Engine present." << std::endl;
return;
}
LOG() << std::dec << "# Compute cores: " << pNodeProperties->NumFComputeCores << std::endl;
LOG() << std::dec << "# SIMDs per CU: " << pNodeProperties->NumSIMDPerCU << std::endl;
LOG() << std::dec << "# Shader engines: " << numShaderEngines << std::endl;
LOG() << std::dec << "# Active CUs: " << ActiveCU << std::endl;
HSAint64 TimewithCU1, TimewithCU;
uint32_t maskNumDwords = (ActiveCU + 31) / 32; /* Round up to the nearest multiple of 32 */
uint32_t maskNumBits = maskNumDwords * 32;
uint32_t mask[maskNumDwords];
int numCuPerShader = ActiveCU / numShaderEngines;
double ratio;
/* In KFD we symmetrically map mask to all SEs:
* mask[0] bit0 -> se0 cu0;
* mask[0] bit1 -> se1 cu0;
* ... (if # SE is 4)
* mask[0] bit4 -> se0 cu1;
* ...
*/
/* Set Mask to 1 CU per SE */
memset(mask, 0, maskNumDwords * sizeof(uint32_t));
for (int i = 0; i < numShaderEngines; i++) {
int maskIndex = (i / 32) % maskNumDwords;
mask[maskIndex] |= 1 << (i % 32);
}
/* Execute once to get any HW optimizations out of the way */
TimeConsumedwithCUMask(defaultGPUNode, mask, maskNumBits);
LOG() << "Getting baseline performance numbers (1 CU per SE)" << std::endl;
TimewithCU1 = GetAverageTimeConsumedwithCUMask(defaultGPUNode, mask, maskNumBits, 3);
/* Each loop will add 1 more CU per SE. We use the mod and divide to handle
* when SEs aren't distributed in multiples of 32 (e.g. Tonga)
* OR the new bit in for simplicity instead of re-creating the mask each iteration
*/
for (int x = 0; x < numCuPerShader; x++) {
for (int se = 0; se < numShaderEngines; se++) {
int offset = x * numShaderEngines + se;
int maskIndex = (offset / 32) % maskNumDwords;
mask[maskIndex] |= 1 << (offset % 32);
}
int nCUs = x + 1;
TimewithCU = TimeConsumedwithCUMask(defaultGPUNode, mask, maskNumBits);
ratio = (double)(TimewithCU1) / ((double)(TimewithCU) * nCUs);
LOG() << "Expected performance of " << nCUs << " CU(s)/SE vs 1 CU/SE:" << std::endl;
LOG() << std::setprecision(2) << CuNegVariance << " <= " << std::fixed << std::setprecision(8)
<< ratio << " <= " << std::setprecision(2) << CuPosVariance << std::endl;
EXPECT_TRUE((ratio >= CuNegVariance) && (ratio <= CuPosVariance));
RECORD(ratio) << "Ratio-" << nCUs << "-CUs";
}
} else {
LOG() << "Skipping test: Test not supported for family ID 0x" << m_FamilyId << "." << std::endl;
}
TEST_END
}
TEST_F(KFDQMTest, QueuePriorityOnDifferentPipe) {
TEST_START(TESTPROFILE_RUNALL);
if (m_FamilyId < FAMILY_VI) {
LOG() << "Skipping test: Shader won't run on CI." << std::endl;
return;
}
int node = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(node, 0) << "failed to get default GPU Node";
HsaMemoryBuffer syncBuf(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
Dispatch dispatch[2] = {
Dispatch(isaBuffer, true),
Dispatch(isaBuffer, true)
};
int activeTaskBitmap = 0x3;
HSAuint64 startTime, endTime[2];
HsaEvent *pHsaEvent[2];
int numEvent = 2;
PM4Queue queue[2];
HSA_QUEUE_PRIORITY priority[2] = {
HSA_QUEUE_PRIORITY_LOW,
HSA_QUEUE_PRIORITY_HIGH
};
int i;
for (i = 0; i < 2; i++) {
syncBuffer[i] = -1;
ASSERT_SUCCESS(queue[i].Create(node));
queue[i].Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, priority[i], false);
pHsaEvent[i] = dispatch[i].GetHsaEvent();
pHsaEvent[i]->EventData.EventData.SyncVar.SyncVar.UserData = &syncBuffer[i];
dispatch[i].SetDim(1024, 16, 16);
}
startTime = GetSystemTickCountInMicroSec();
for (i = 0; i < 2; i++)
dispatch[i].Submit(queue[i]);
while (activeTaskBitmap > 0) {
hsaKmtWaitOnMultipleEvents(pHsaEvent, numEvent, false, g_TestTimeOut);
for (i = 0; i < 2; i++) {
if ((activeTaskBitmap & (1 << i)) && (syncBuffer[i] == pHsaEvent[i]->EventId)) {
endTime[i] = GetSystemTickCountInMicroSec();
activeTaskBitmap &= ~(1 << i);
}
}
}
for (i = 0; i < 2; i++) {
EXPECT_SUCCESS(queue[i].Destroy());
int usecs = endTime[i] - startTime;
LOG() << "Task priority: " << std::dec << priority[i] << "\t";
LOG() << "Task duration: " << std::dec << usecs << "usecs" << std::endl;
}
TEST_END
}
TEST_F(KFDQMTest, QueuePriorityOnSamePipe) {
TEST_START(TESTPROFILE_RUNALL);
if (m_FamilyId < FAMILY_VI) {
LOG() << "Skipping test: Shader won't run on CI." << std::endl;
return;
}
int node = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(node, 0) << "failed to get default GPU Node";
HsaMemoryBuffer syncBuf(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
Dispatch dispatch[2] = {
Dispatch(isaBuffer, true),
Dispatch(isaBuffer, true)
};
int activeTaskBitmap = 0x3;
HSAuint64 startTime, endTime[2];
HsaEvent *pHsaEvent[2];
int numEvent = 2;
PM4Queue queue[13];
HSA_QUEUE_PRIORITY priority[2] = {
HSA_QUEUE_PRIORITY_LOW,
HSA_QUEUE_PRIORITY_HIGH
};
int i;
/* queue[2..12] are dummy queues. Create queue in this sequence to
* render queue[0] and queue[1] on same pipe with no assumptions
* about the number of pipes used by KFD. Queue #12 is a multiple
* of 1, 2, 3 and 4, so it falls on pipe 0 for any number of pipes
*/
EXPECT_SUCCESS(queue[0].Create(node)); // Queue 0 is on Pipe 0
for (i = 2; i <= 12; i++)
EXPECT_SUCCESS(queue[i].Create(node));
EXPECT_SUCCESS(queue[1].Create(node)); // Queue 12 is on Pipe 0
for (i = 0; i < 2; i++) {
syncBuffer[i] = -1;
queue[i].Update(BaseQueue::DEFAULT_QUEUE_PERCENTAGE, priority[i], false);
pHsaEvent[i] = dispatch[i].GetHsaEvent();
pHsaEvent[i]->EventData.EventData.SyncVar.SyncVar.UserData = &syncBuffer[i];
dispatch[i].SetDim(1024, 16, 16);
}
startTime = GetSystemTickCountInMicroSec();
for (i = 0; i < 2; i++)
dispatch[i].Submit(queue[i]);
while (activeTaskBitmap > 0) {
hsaKmtWaitOnMultipleEvents(pHsaEvent, numEvent, false, g_TestTimeOut);
for (i = 0; i < 2; i++) {
if ((activeTaskBitmap & (1 << i)) && (syncBuffer[i] == pHsaEvent[i]->EventId)) {
endTime[i] = GetSystemTickCountInMicroSec();
activeTaskBitmap &= ~(1 << i);
}
}
}
for (i = 0; i < 2; i++) {
int usecs = endTime[i] - startTime;
LOG() << "Task priority: " << std::dec << priority[i] << "\t";
LOG() << "Task duration: " << std::dec << usecs << "usecs" << std::endl;
}
for (i = 0; i <= 12; i++) {
EXPECT_SUCCESS(queue[i].Destroy());
}
TEST_END
}
void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node) {
PM4Queue queue;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
if (node != -1)
defaultGPUNode = node;
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(pSrcBuf, pDstBuf);
dispatch.SetDim(1, 1, 1);
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
dispatch.Submit(queue);
dispatch.Sync();
EXPECT_SUCCESS(queue.Destroy());
}
TEST_F(KFDQMTest, EmptyDispatch) {
TEST_START(TESTPROFILE_RUNALL);
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetNoopIsa(isaBuffer);
SyncDispatch(isaBuffer, NULL, NULL);
TEST_END
}
TEST_F(KFDQMTest, SimpleWriteDispatch) {
TEST_START(TESTPROFILE_RUNALL);
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode, false);
HsaMemoryBuffer destBuffer(PAGE_SIZE, defaultGPUNode);
srcBuffer.Fill(0x01010101);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>());
EXPECT_EQ(destBuffer.As<unsigned int*>()[0], 0x01010101);
TEST_END
}
TEST_F(KFDQMTest, MultipleCpQueuesStressDispatch) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
static const unsigned int MAX_CP_QUEUES = 16;
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode, false);
HsaMemoryBuffer destBuffer(PAGE_SIZE, defaultGPUNode);
unsigned int* src = srcBuffer.As<unsigned int*>();
unsigned int* dst = destBuffer.As<unsigned int*>();
static const HSAuint64 TEST_TIME_SEC = 15;
HSAuint64 initialTime, curTime;
unsigned int numIter = 0;
HSAuint64 timePassed = 0;
unsigned int i;
PM4Queue queues[MAX_CP_QUEUES];
Dispatch* dispatch[MAX_CP_QUEUES];
destBuffer.Fill(0xFF);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
for (i = 0; i < MAX_CP_QUEUES; ++i)
ASSERT_SUCCESS(queues[i].Create(defaultGPUNode)) << " QueueId=" << i;
initialTime = GetSystemTickCountInMicroSec();
do {
for (i = 0; i < MAX_CP_QUEUES; ++i) {
dispatch[i] = new Dispatch(isaBuffer);
src[i] = numIter;
dst[i] = 0xff;
dispatch[i]->SetArgs(&src[i], &dst[i]);
dispatch[i]->SetDim(1, 1, 1);
dispatch[i]->Submit(queues[i]);
}
for (i = 0; i < MAX_CP_QUEUES; ++i) {
dispatch[i]->Sync();
EXPECT_EQ(dst[i], src[i]);
delete dispatch[i];
}
++numIter;
curTime = GetSystemTickCountInMicroSec();
timePassed = (curTime - initialTime) / 1000000;
} while (timePassed < TEST_TIME_SEC);
LOG() << "Total iterated : " << std::dec << numIter << std::endl;
for (i = 0; i < MAX_CP_QUEUES; ++i)
EXPECT_SUCCESS(queues[i].Destroy());
TEST_END
}
TEST_F(KFDQMTest, CpuWriteCoherence) {
TEST_START(TESTPROFILE_RUNALL);
PM4Queue queue;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode);
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
/* The queue might be full and we fail to submit. There is always one word space unused in queue.
* So let rptr one step ahead then we continually submit packet.
*/
queue.PlaceAndSubmitPacket(PM4NopPacket());
queue.Wait4PacketConsumption();
EXPECT_EQ(1, queue.Rptr());
do {
queue.PlaceAndSubmitPacket(PM4NopPacket());
} while (queue.Wptr() != 0);
queue.Wait4PacketConsumption();
EXPECT_EQ(0, queue.Rptr());
/* Now that the GPU has cached the PQ contents, we modify them in CPU cache and
* ensure that the GPU sees the updated value:
*/
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0x42, 0x42));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), 0x42);
TEST_END
}
TEST_F(KFDQMTest, CreateAqlCpQueue) {
TEST_START(TESTPROFILE_RUNALL)
AqlQueue queue;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer pointers(PAGE_SIZE, defaultGPUNode);
ASSERT_SUCCESS(queue.Create(defaultGPUNode, PAGE_SIZE, pointers.As<HSAuint64 *>()));
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, QueueLatency) {
TEST_START(TESTPROFILE_RUNALL);
PM4Queue queue;
const int queueSize = PAGE_SIZE * 2;
const int packetSize = PM4ReleaseMemoryPacket(m_FamilyId, 0, 0, 0, 0, 0).SizeInBytes();
/* We always leave one NOP(dword) empty after packet which is required by ring itself.
* We also place NOPs when queue wraparound to avoid crossing buffer end. See PlacePacket().
* So the worst case is that we need two packetSize space to place one packet.
* Like below, N=NOP,E=Empty,P=Packet.
* |E|E|E|E|E|E|E|rptr...wptr|E|E|E|E|E| ---> |P|P|P|P|P|P|E|rptr...wptr|N|N|N|N|N|
* So to respect that, we reserve packetSize space for these additional NOPs.
* Also we reserve the remainder of the division by packetSize explicitly.
* Reserve another packetSize for event-based wait which uses a releseMemory packet.
*/
const int reservedSpace = packetSize + queueSize % packetSize + packetSize;
const int slots = (queueSize - reservedSpace) / packetSize;
HSAint64 queue_latency_avg = 0, queue_latency_min, queue_latency_max, queue_latency_med;
HSAint64 overhead, workload;
HSAint64 *queue_latency_arr = reinterpret_cast<HSAint64*>(calloc(slots, sizeof(HSAint64)));
const int skip = 2;
const char *fs[skip] = {"1st", "2nd"};
HsaClockCounters *ts;
HSAuint64 *qts;
int i = 0;
ASSERT_NE((HSAuint64)queue_latency_arr, 0);
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
ASSERT_SUCCESS(queue.Create(defaultGPUNode, queueSize));
LOG() << std::dec << "Queue Submit NanoSeconds (" << slots << " Packets)" << std::endl;
HsaMemoryBuffer buf(ALIGN_UP(slots * sizeof(HsaClockCounters), PAGE_SIZE), 0);
ts = buf.As<HsaClockCounters*>();
HsaMemoryBuffer qbuf(ALIGN_UP(slots * sizeof(HSAuint64), PAGE_SIZE), 0);
qts = qbuf.As<HSAuint64*>();
HsaEvent *event;
ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event));
/* GpuCounter overhead*/
do {
hsaKmtGetClockCounters(defaultGPUNode, &ts[i]);
} while (++i < slots);
overhead = ts[slots-1].GPUClockCounter - ts[0].GPUClockCounter;
overhead /= 2 * (slots - 1);
/* Submit packets serially*/
i = 0;
do {
queue.PlacePacket(PM4ReleaseMemoryPacket(m_FamilyId, true,
(HSAuint64)&qts[i],
0,
true,
1));
hsaKmtGetClockCounters(defaultGPUNode, &ts[i]);
queue.SubmitPacket();
queue.Wait4PacketConsumption(event);
} while (++i < slots);
/* Calculate timing which includes workload and overhead*/
i = 0;
do {
HSAint64 queue_latency = qts[i] - ts[i].GPUClockCounter;
EXPECT_GE(queue_latency, 0);
queue_latency_arr[i] = queue_latency;
if (i >= skip)
queue_latency_avg += queue_latency;
} while (++i < slots);
/* Calculate avg from packet[skip, slots-1] */
queue_latency_avg /= (slots - skip);
/* Workload of queue packet itself */
i = 0;
do {
queue.PlacePacket(PM4ReleaseMemoryPacket(m_FamilyId, true,
(HSAuint64)&qts[i],
0,
true,
1));
} while (++i < slots);
queue.SubmitPacket();
queue.Wait4PacketConsumption(event);
hsaKmtDestroyEvent(event);
/* qts[i] records the timestamp of the end of packet[i] which is
* approximate that of the beginging of packet[i+1].
* The workload total is [0, skip], [skip+1, slots-1].
* And We ignore [0, skip], that means we ignore (skip+1) packets.
*/
workload = qts[slots - 1] - qts[skip];
workload /= (slots - 1 - skip);
EXPECT_GE(workload, 0);
i = 0;
do {
/* The queue_latency is not that correct as the workload and overhead are average*/
queue_latency_arr[i] -= workload + overhead;
/* The First submit takes an HSAint64 time*/
if (i < skip)
LOG() << "Queue Latency " << fs[i] << ": \t" << CounterToNanoSec(queue_latency_arr[i]) << std::endl;
} while (++i < slots);
std::sort(queue_latency_arr + skip, queue_latency_arr + slots);
queue_latency_min = queue_latency_arr[skip];
queue_latency_med = queue_latency_arr[(slots+skip)/2];
queue_latency_max = queue_latency_arr[slots-1];
LOG() << "Queue Latency Avg: \t" << CounterToNanoSec(queue_latency_avg) << std::endl;
LOG() << "Queue Latency Min: \t" << CounterToNanoSec(queue_latency_min) << std::endl;
LOG() << "Queue Latency Median: \t" << CounterToNanoSec(queue_latency_med) << std::endl;
LOG() << "Queue Latency Max: \t" << CounterToNanoSec(queue_latency_max) << std::endl;
LOG() << "Queue Packet Workload: \t" << CounterToNanoSec(workload) << std::endl;
LOG() << "Get GpuCounter Overhead: \t" << CounterToNanoSec(overhead) << std::endl;
RECORD(CounterToNanoSec(queue_latency_avg)) << "Queue-Latency-Avg";
RECORD(CounterToNanoSec(queue_latency_min)) << "Queue-Latency-Min";
RECORD(CounterToNanoSec(queue_latency_med)) << "Queue-Latency-Med";
RECORD(CounterToNanoSec(queue_latency_max)) << "Queue-Latency-Max";
RECORD(CounterToNanoSec(workload)) << "Queue-Packet-Workload";
RECORD(CounterToNanoSec(overhead)) << "GpuCounter-Overhead";
TEST_END
}
TEST_F(KFDQMTest, CpQueueWraparound) {
TEST_START(TESTPROFILE_RUNALL);
PM4Queue queue;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode);
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) {
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), pktIdx, pktIdx));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), pktIdx);
}
for (unsigned int pktIdx = 0; pktIdx <= PAGE_SIZE/sizeof(PM4WRITE_DATA_CI); ++pktIdx) {
queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), pktIdx, pktIdx));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), pktIdx);
}
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
TEST_F(KFDQMTest, SdmaQueueWraparound) {
TEST_START(TESTPROFILE_RUNALL);
int bufSize = PAGE_SIZE;
SDMAQueue queue;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(bufSize << 1, defaultGPUNode, false);
HsaMemoryBuffer srcBuf(bufSize, defaultGPUNode, false);
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
for (unsigned int pktIdx = 0; pktIdx <= queue.Size()/sizeof(SDMA_PKT_COPY_LINEAR); ++pktIdx) {
destBuf.Fill(0x0);
srcBuf.Fill(pktIdx);
queue.PlaceAndSubmitPacket(
SDMACopyDataPacket(queue.GetFamilyId(), destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
queue.PlaceAndSubmitPacket(
SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
queue.Wait4PacketConsumption();
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int*>() + bufSize/4, 0x02020202));
EXPECT_SUCCESS(memcmp(
destBuf.As<unsigned int*>(), srcBuf.As<unsigned int*>(), bufSize));
}
for (unsigned int pktIdx = 0; pktIdx <= queue.Size()/sizeof(SDMA_PKT_WRITE_UNTILED); ++pktIdx) {
queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), destBuf.As<unsigned int*>(), pktIdx));
queue.Wait4PacketConsumption();
WaitOnValue(destBuf.As<unsigned int*>(), pktIdx);
}
EXPECT_SUCCESS(queue.Destroy());
TEST_END
}
struct AtomicIncThreadParams {
HSAint64* pDest;
volatile unsigned int count;
volatile bool loop;
};
unsigned int AtomicIncThread(void* pCtx) {
AtomicIncThreadParams* pArgs = reinterpret_cast<AtomicIncThreadParams*>(pCtx);
while (pArgs->loop) {
AtomicInc(pArgs->pDest);
++pArgs->count;
}
LOG() << "CPU atomic increments finished" << std::endl;
return 0;
}
TEST_F(KFDQMTest, Atomics) {
TEST_START(TESTPROFILE_RUNALL);
/* CI doesn't support Atomics. KV does, but gets its own FAMILY_KV def */
if (m_FamilyId == FAMILY_CI) {
LOG() << "Skipping test: CI doesn't support Atomics." << std::endl;
return;
}
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer isaBuf(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer destBuf(PAGE_SIZE, defaultGPUNode);
PM4Queue queue;
m_pIsaGen->GetAtomicIncIsa(isaBuf);
Dispatch dispatch(isaBuf);
dispatch.SetArgs(destBuf.As<void*>(), NULL);
dispatch.SetDim(1024, 1, 1);
hsaKmtSetMemoryPolicy(defaultGPUNode, HSA_CACHING_CACHED, HSA_CACHING_CACHED, NULL, 0);
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
AtomicIncThreadParams params;
params.pDest = destBuf.As<HSAint64*>();
params.loop = true;
params.count = 0;
uint64_t threadId;
ASSERT_EQ(true, StartThread(&AtomicIncThread, &params, threadId));
LOG() << "Waiting for CPU to atomic increment 1000 times" << std::endl;
while (params.count < 1000)
{}
LOG() << "Submitting the GPU atomic increment shader" << std::endl;
dispatch.Submit(queue);
dispatch.Sync();
params.loop = false;
WaitForThread(threadId);
EXPECT_EQ(destBuf.As<unsigned int*>()[0], 1024 + params.count);
LOG() << "GPU increments: 1024, CPU increments: " << std::dec
<< params.count << std::endl;
queue.Destroy();
TEST_END
}
TEST_F(KFDQMTest, mGPUShareBO) {
TEST_START(TESTPROFILE_RUNALL);
unsigned int src_node = 2;
unsigned int dst_node = 1;
if (g_TestDstNodeId != -1 && g_TestNodeId != -1) {
src_node = g_TestNodeId;
dst_node = g_TestDstNodeId;
}
HsaMemoryBuffer shared_addr(PAGE_SIZE, dst_node, true, false, false, false);
HsaMemoryBuffer srcNodeMem(PAGE_SIZE, src_node);
HsaMemoryBuffer dstNodeMem(PAGE_SIZE, dst_node);
/* Handle ISA to write to local memory BO */
HsaMemoryBuffer isaBufferSrc(PAGE_SIZE, src_node, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer isaBufferDst(PAGE_SIZE, dst_node, true/*zero*/, false/*local*/, true/*exec*/);
srcNodeMem.Fill(0x05050505);
m_pIsaGen->GetCopyDwordIsa(isaBufferSrc);
SyncDispatch(isaBufferSrc, srcNodeMem.As<void*>(), shared_addr.As<void *>(), src_node);
m_pIsaGen->GetCopyDwordIsa(isaBufferDst);
SyncDispatch(isaBufferDst, shared_addr.As<void *>(), dstNodeMem.As<void*>(), dst_node);
EXPECT_EQ(dstNodeMem.As<unsigned int*>()[0], 0x05050505);
EXPECT_SUCCESS(shared_addr.UnmapMemToNodes(&dst_node, 1));
TEST_END
}
static void
sdma_copy(HSAuint32 node, void *src, void *const dst[], int n, HSAuint64 size) {
SDMAQueue sdmaQueue;
HsaEvent *event;
ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, node, &event));
ASSERT_SUCCESS(sdmaQueue.Create(node));
sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), dst, src, n, size));
sdmaQueue.Wait4PacketConsumption(event);
EXPECT_SUCCESS(sdmaQueue.Destroy());
hsaKmtDestroyEvent(event);
}
static void
sdma_fill(HSAint32 node, void *dst, unsigned int data, HSAuint64 size) {
SDMAQueue sdmaQueue;
HsaEvent *event;
ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, node, &event));
ASSERT_SUCCESS(sdmaQueue.Create(node));
sdmaQueue.PlaceAndSubmitPacket(SDMAFillDataPacket(sdmaQueue.GetFamilyId(), dst, data, size));
sdmaQueue.Wait4PacketConsumption(event);
EXPECT_SUCCESS(sdmaQueue.Destroy());
hsaKmtDestroyEvent(event);
}
TEST_F(KFDQMTest, P2PTest) {
TEST_START(TESTPROFILE_RUNALL);
if (!is_dgpu()) {
LOG() << "Skipping test: Two GPUs are required, but no dGPUs are present." << std::endl;
return;
}
const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
if (gpuNodes.size() < 2) {
LOG() << "Skipping test: At least two GPUs are required." << std::endl;
return;
}
std::vector<HSAuint32> nodes;
/* This test simulates RT team's P2P part in IPCtest:
*
* +------------------------------------------------+
* | gpu1 gpu2 gpuX |
* |gpu1 mem ----> gpu2 mem ----> gpuX mem |
* | \ \ \ |
* | \ \ \ |
* | system buffer system buffer system buffer|
* +------------------------------------------------+
*
* Copy data from current GPU memory to next GPU memory and system memory
* Using current GPU, aka p2p push.
* Verify the system buffer has the expected content after each push.
*/
/* Users can use "--node=gpu1 --dst_node=gpu2" to specify devices */
if (g_TestDstNodeId != -1 && g_TestNodeId != -1) {
nodes.push_back(g_TestNodeId);
nodes.push_back(g_TestDstNodeId);
if (!m_NodeInfo.IsGPUNodeLargeBar(g_TestDstNodeId) &&
!m_NodeInfo.AreGPUNodesXGMI(g_TestNodeId, g_TestDstNodeId)) {
LOG() << "Skipping test: Dst GPU specified is not peer-accessible." << std::endl;
return;
}
if (nodes[0] == nodes[1]) {
LOG() << "Skipping test: Different GPUs must be specified (2 GPUs required)." << std::endl;
return;
}
} else {
HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
m_NodeInfo.FindAccessiblePeers(&nodes, defaultGPU, true);
if (nodes.size() < 2) {
LOG() << "Skipping test: Test requires at least one large bar GPU." << std::endl;
LOG() << " or two GPUs are XGMI connected." << std::endl;
return;
}
}
HSAuint32 *sysBuf;
HSAuint32 size = 16ULL<<20; // bigger than 16MB to test non-contiguous memory
HsaMemFlags memFlags = {0};
HsaMemMapFlags mapFlags = {0};
memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
memFlags.ui32.HostAccess = 1;
memFlags.ui32.NonPaged = 1;
memFlags.ui32.NoNUMABind = 1;
unsigned int end = size / sizeof(HSAuint32) - 1;
/* 1. Allocate a system buffer and allow the access to GPUs */
EXPECT_SUCCESS(hsaKmtAllocMemory(0, size, memFlags,
reinterpret_cast<void **>(&sysBuf)));
EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(sysBuf, size, NULL,
mapFlags, nodes.size(), &nodes[0]));
#define MAGIC_NUM 0xdeadbeaf
/* First GPU fills mem with MAGIC_NUM */
void *src, *dst;
HSAuint32 cur = nodes[0], next;
ASSERT_SUCCESS(hsaKmtAllocMemory(cur, size, memFlags, reinterpret_cast<void**>(&src)));
ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(src, size, NULL));
sdma_fill(cur, src, MAGIC_NUM, size);
for (unsigned i = 1; i <= nodes.size(); i++) {
int n;
memset(sysBuf, 0, size);
/* Last GPU just copy mem to sysBuf*/
if (i == nodes.size()) {
n = 1;
next = 0;/*system memory node*/
dst = 0;
} else {
n = 2;
next = nodes[i];
ASSERT_SUCCESS(hsaKmtAllocMemory(next, size, memFlags, reinterpret_cast<void**>(&dst)));
ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(dst, size, NULL));
}
LOG() << "Test " << cur << " -> " << next << std::endl;
/* Copy to sysBuf and next GPU*/
void *dst_array[] = {sysBuf, dst};
sdma_copy(cur, src, dst_array, n, size);
/* Verify the data*/
EXPECT_EQ(sysBuf[0], MAGIC_NUM);
EXPECT_EQ(sysBuf[end], MAGIC_NUM);
LOG() << "PASS " << cur << " -> " << next << std::endl;
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(src));
EXPECT_SUCCESS(hsaKmtFreeMemory(src, size));
cur = next;
src = dst;
}
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(sysBuf));
EXPECT_SUCCESS(hsaKmtFreeMemory(sysBuf, size));
TEST_END
}
TEST_F(KFDQMTest, PM4EventInterrupt) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
const HSAuint64 bufSize = PAGE_SIZE;
const int packetCount = bufSize / sizeof(unsigned int);
const int totalPacketSize = packetCount * PM4WriteDataPacket(0, 0).SizeInBytes() +
PM4ReleaseMemoryPacket(m_FamilyId, 0, 0, 0).SizeInBytes();
const int queueSize = RoundToPowerOf2(totalPacketSize);
/* 4 PM4 queues will be running at same time.*/
const int numPM4Queue = 4;
HsaEvent *event[numPM4Queue];
PM4Queue queue[numPM4Queue];
HsaMemoryBuffer *destBuf[numPM4Queue];
unsigned int *buf[numPM4Queue];
for (int i = 0; i < numPM4Queue; i++) {
destBuf[i] = new HsaMemoryBuffer(bufSize, defaultGPUNode, true, false); // System memory
buf[i] = destBuf[i]->As<unsigned int *>();
}
/* A simple loop here to give more pressure.*/
for (int test_count = 0; test_count < 1024; test_count++) {
for (int i = 0; i < numPM4Queue; i++) {
ASSERT_SUCCESS(queue[i].Create(defaultGPUNode, queueSize));
ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event[i]));
/* Let CP have some workload first.*/
for(int index = 0; index < packetCount; index++)
queue[i].PlacePacket(PM4WriteDataPacket(buf[i] + index, 0xdeadbeaf));
/* releaseMemory packet makes sure all previous written data is visible.*/
queue[i].PlacePacket(PM4ReleaseMemoryPacket(m_FamilyId, 0,
reinterpret_cast<HSAuint64>(event[i]->EventData.HWData2),
event[i]->EventId,
true));
}
for (int i = 0; i < numPM4Queue; i++)
queue[i].SubmitPacket();
for (int i = 0; i < numPM4Queue; i++) {
EXPECT_SUCCESS(hsaKmtWaitOnEvent(event[i], g_TestTimeOut));
EXPECT_EQ(buf[i][0], 0xdeadbeaf);
EXPECT_EQ(buf[i][packetCount - 1], 0xdeadbeaf);
memset(buf[i], 0, bufSize);
}
for (int i = 0; i < numPM4Queue; i++) {
EXPECT_SUCCESS(queue[i].Destroy());
EXPECT_SUCCESS(hsaKmtDestroyEvent(event[i]));
}
}
for (int i = 0; i < numPM4Queue; i++)
delete destBuf[i];
TEST_END
}
#include "KFDTestUtilQueue.hpp"
TEST_F(KFDQMTest, SdmaEventInterrupt) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
const HSAuint64 bufSize = 4 << 20;
HsaMemoryBuffer srcBuf(bufSize, 0); // System memory.
HSAuint64 *src = srcBuf.As<HSAuint64*>();
TimeStamp *tsbuf = srcBuf.As<TimeStamp*>();
tsbuf = reinterpret_cast<TimeStamp *>ALIGN_UP(tsbuf, sizeof(TimeStamp));
/* Have 3 queues created for test.*/
const int numSDMAQueue = 3;
HsaEvent *event[numSDMAQueue];
SDMAQueue queue[numSDMAQueue];
HsaMemoryBuffer *destBuf[numSDMAQueue];
HSAuint64 *dst[numSDMAQueue];
for (int i = 0; i < numSDMAQueue; i++) {
destBuf[i] = new HsaMemoryBuffer(bufSize, defaultGPUNode, true, false); // System memory
dst[i] = destBuf[i]->As<HSAuint64*>();
}
/* Test 1 queue, 2 queues, 3 queues running at same time one by one.*/
for (int testSDMAQueue = 1; testSDMAQueue <= numSDMAQueue; testSDMAQueue++)
/* A simple loop here to give more pressure.*/
for (int test_count = 0; test_count < 2048; test_count++) {
for (int i = 0; i < testSDMAQueue; i++) {
TimeStamp *ts = tsbuf + i * 32;
ASSERT_SUCCESS(queue[i].Create(defaultGPUNode));
/* FIXME
* We create event every time along with queue.
* However that will significantly enhance the failure of sdma event timeout.
*/
ASSERT_SUCCESS(CreateQueueTypeEvent(false, false, defaultGPUNode, &event[i]));
/* Get the timestamp directly. The first member of HsaClockCounters and TimeStamp is GPU clock counter.*/
hsaKmtGetClockCounters(defaultGPUNode, reinterpret_cast<HsaClockCounters*>(&ts[0]));
/* Let sDMA have some workload first.*/
queue[i].PlacePacket(SDMATimePacket(&ts[1]));
queue[i].PlacePacket(
SDMACopyDataPacket(queue[i].GetFamilyId(), dst[i], src, bufSize));
queue[i].PlacePacket(SDMATimePacket(&ts[2]));
queue[i].PlacePacket(
SDMAFencePacket(queue[i].GetFamilyId(),
reinterpret_cast<void*>(event[i]->EventData.HWData2), event[i]->EventId));
queue[i].PlacePacket(SDMATimePacket(&ts[3]));
queue[i].PlacePacket(SDMATrapPacket(event[i]->EventId));
queue[i].PlacePacket(SDMATimePacket(&ts[4]));
/* Will verify the value of srcBuf and destBuf later. Give it a different value each time.*/
src[0] = ts[0].timestamp;
}
for (int i = 0; i < testSDMAQueue; i++)
queue[i].SubmitPacket();
for (int i = 0; i < testSDMAQueue; i++) {
TimeStamp *ts = tsbuf + i * 32;
HSAKMT_STATUS ret = hsaKmtWaitOnEvent(event[i], g_TestTimeOut);
if (dst[i][0] != src[0])
WARN() << "SDMACopyData FAIL! " << std::dec
<< dst[i][0] << " VS " << src[0] << std::endl;
if (ret == HSAKMT_STATUS_SUCCESS) {
for (int i = 1; i <= 4; i++)
/* Is queue latency too big? The workload is really small.*/
if (CounterToNanoSec(ts[i].timestamp - ts[i - 1].timestamp) > 1000000000)
WARN() << "SDMA queue latency is bigger than 1s!" << std::endl;
} else {
WARN() << "Event On Queue " << testSDMAQueue << ":" << i
<< " Timeout, try to resubmit packets!" << std::endl;
queue[i].SubmitPacket();
if (hsaKmtWaitOnEvent(event[i], g_TestTimeOut) == HSAKMT_STATUS_SUCCESS)
WARN() << "The timeout event is signaled!" << std::endl;
else
WARN() << "The timeout event is lost after resubmit!" << std::endl;
LOG() << "Time Consumption (ns)" << std::endl;
for (int i = 1; i <= 4; i++)
LOG() << std::dec << i << ": "
<< CounterToNanoSec(ts[i].timestamp - ts[i - 1].timestamp) << std::endl;
}
EXPECT_SUCCESS(ret);
}
for (int i = 0; i < testSDMAQueue; i++) {
EXPECT_SUCCESS(queue[i].Destroy());
EXPECT_SUCCESS(hsaKmtDestroyEvent(event[i]));
}
}
for (int i = 0; i < numSDMAQueue; i++)
delete destBuf[i];
TEST_END
}
#define DOORBELL_WRITE_USE_SDMA
TEST_F(KFDQMTest, GPUDoorbellWrite) {
TEST_START(TESTPROFILE_RUNALL)
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HsaMemoryBuffer destBuf(PAGE_SIZE, 0, true);
PM4Queue pm4Queue;
#ifdef DOORBELL_WRITE_USE_SDMA
SDMAQueue otherQueue;
#else
PM4Queue otherQueue;
#endif
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
ASSERT_SUCCESS(otherQueue.Create(defaultGPUNode));
/* Place PM4 packet in the queue, but don't submit it */
pm4Queue.PlacePacket(PM4WriteDataPacket(destBuf.As<unsigned int*>(), 0x12345678, 0x87654321));
HsaQueueResource *qRes = pm4Queue.GetResource();
if (m_FamilyId < FAMILY_AI) {
unsigned int pendingWptr = pm4Queue.GetPendingWptr();
#ifdef DOORBELL_WRITE_USE_SDMA
/* Write the wptr and doorbell update using the GPU's SDMA
* engine. This should submit the PM4 packet on the first
* queue.
*/
otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_write_ptr,
pendingWptr));
otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_DoorBell,
pendingWptr));
#else
/* Write the wptr and doorbell update using WRITE_DATA packets
* on a second PM4 queue. This should submit the PM4 packet on
* the first queue.
*/
otherQueue.PlacePacket(
PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_write_ptr,
pendingWptr, false));
otherQueue.PlacePacket(
PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_DoorBell,
pendingWptr, false));
#endif
otherQueue.SubmitPacket();
} else {
HSAuint64 pendingWptr64 = pm4Queue.GetPendingWptr64();
#ifdef DOORBELL_WRITE_USE_SDMA
/* Write the wptr and doorbell update using the GPU's SDMA
* engine. This should submit the PM4 packet on the first
* queue.
*/
otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_write_ptr,
2, &pendingWptr64));
otherQueue.PlacePacket(SDMAWriteDataPacket(otherQueue.GetFamilyId(), qRes->Queue_DoorBell,
2, &pendingWptr64));
#else
/* Write the 64-bit wptr and doorbell update using RELEASE_MEM
* packets without IRQs on a second PM4 queue. RELEASE_MEM
* should perform one atomic 64-bit access. This should submit
* the PM4 packet on the first queue.
*/
otherQueue.PlacePacket(
PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_write_ptr,
pendingWptr64, true));
otherQueue.PlacePacket(
PM4ReleaseMemoryPacket(m_FamilyId, true, (HSAuint64)qRes->Queue_DoorBell,
pendingWptr64, true));
#endif
otherQueue.SubmitPacket();
}
/* Check that the PM4 packet has been executed */
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int *>(), 0x12345678));
EXPECT_TRUE(WaitOnValue(destBuf.As<unsigned int *>()+1, 0x87654321));
EXPECT_SUCCESS(pm4Queue.Destroy());
EXPECT_SUCCESS(otherQueue.Destroy());
TEST_END
}