KFDTest: add cache coherence tests for gfx90a

Three KFD subtests are added to verify the new XGMI connection with
the cache coherence HW link on A+A.

Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Change-Id: I6960ec91cbfb696c4e6acb3b79fd83107003acdd


[ROCm/ROCR-Runtime commit: 9aa521d1ff]
This commit is contained in:
Eric Huang
2020-04-15 16:13:52 -04:00
committed by Kent Russell
parent 60f8eb9441
commit 20b3f20fa0
@@ -229,6 +229,81 @@ type(CS)\n\
end\n\
";
/* gfx9 shader "CopyMemory": poll a flag, then copy one dword.
 *
 * Inputs (as assumed by the note embedded in the shader text):
 *   s[0:1] - address of the source buffer
 *   s[2:3] - address of the destination buffer
 *
 * Behavior:
 *   - Repeatedly loads the dword at src+0x0 (glc) and loops until it
 *     compares equal to 1.
 *   - Then loads the dword at src+0x4 and stores it to dst+0x0.
 *
 * NOTE(review): the polling s_load_dword result is compared immediately;
 * there is no s_waitcnt between that load and s_cmp_eq_i32, unlike the
 * later load/store - confirm this is intentional.
 * NOTE(review): the store spells its modifier "glc:1" while the other
 * shaders in this file use "glc" - confirm the assembler accepts both.
 */
const char* gfx9_PollAndCopy =
"\
shader CopyMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
s_movk_i32 s18, 0x1\n\
LOOP:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_cmp_eq_i32 s16, s18\n\
s_cbranch_scc0 LOOP\n\
s_load_dword s17, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s17, s[2:3], 0x0 glc:1\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
/* Aldebaran variant of the "CopyMemory" poll-and-copy shader.
 *
 * Inputs (as assumed by the note embedded in the shader text):
 *   s[0:1] - address of the source buffer
 *   s[2:3] - address of the destination buffer
 *
 * Behavior:
 *   - Copies the source address into v[0:1] and polls the dword at src+0x0
 *     with flat_load_dword (scc:1), waiting for each load to complete,
 *     until the value compares equal to 1.
 *   - Issues buffer_invl2 (the cache invalidate the tests refer to), then
 *     loads the dword at src+0x4 and stores it to dst+0x0.
 *   - Issues buffer_wbl2 (presumably an L2 write-back - confirm against
 *     the ISA guide) and waits before ending the program.
 */
const char* gfx9aldbrn_PollAndCopy =
"\
shader CopyMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v18, 0x1\n\
LOOP:\n\
flat_load_dword v16, v[0:1] scc:1\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
v_cmp_eq_i32 vcc, v16, v18\n\
s_cbranch_vccz LOOP\n\
buffer_invl2\n\
s_load_dword s17, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s17, s[2:3], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
buffer_wbl2\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
/* Aldebaran shader "WriteMemory": publish a value, then raise a flag.
 *
 * Input0 (s[0:1]): a buffer of at least 2 dwords.
 *         DW0: used as the signal; the shader writes 0x1 here last.
 *         DW1: receives the value read from Input1, for another
 *              device to read.
 * Input1 (s[2:3]): a buffer of at least 1 dword.
 *         DW0: the value to be written to Input0 DW1.
 *
 * Ordering in the shader text: the value is loaded from Input1 and stored
 * to Input0+0x4, buffer_wbl2 is issued and waited on (presumably an L2
 * write-back - confirm against the ISA guide), and only then is 0x1
 * written to Input0 DW0 with flat_store_dword (scc:1).
 */
const char* gfx9aldbrn_WriteFlagAndValue =
"\
shader WriteMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
s_load_dword s18, s[2:3], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s18, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
buffer_wbl2\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
v_mov_b32 v16, 0x1\n\
flat_store_dword v[0:1], v16 scc:1\n\
s_endpgm\n\
end\n\
";
//These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
void KFDMemoryTest::SetUp() {
@@ -2258,3 +2333,211 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
TEST_END
}
/* Test is for the new cache coherence on Aldebaran. It verifies that two
 * GPUs can coherently share a fine grain FB.
 *
 * Flow:
 *  1. Allocate one page of local FB on the default GPU, map it to both
 *     GPUs, clear it with SDMA and store 0x5678 at byte offset dwSource.
 *  2. Start gfx9aldbrn_PollAndCopy on the default GPU: it polls dword 0 of
 *     the buffer and, once it reads 1, copies dword 1 to byte offset
 *     dwLocation.
 *  3. Run gfx9aldbrn_WriteFlagAndValue on the other GPU: it copies the
 *     dword at dwSource into dword 1 and then writes 1 to dword 0.
 *  4. The test expects 0x5678 at dwLocation afterwards.
 *
 * The test is skipped unless the ASIC family is FAMILY_AL and a second GPU
 * node is available.
 */
TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    if (m_FamilyId != FAMILY_AL) {
        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
        return;
    }

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    if (gpuNodes.size() < 2) {
        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
        return;
    }

    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();

    /* Pick the first GPU node that is not the default one. The variable is
     * initialized and the search result is checked so that an indeterminate
     * node id can never reach the mapping and queue creation below.
     */
    HSAuint32 nondefaultNode = 0;
    bool foundRemoteNode = false;
    for (const int node : gpuNodes) {
        if (static_cast<HSAuint32>(node) != defaultGPUNode) {
            nondefaultNode = static_cast<HSAuint32>(node);
            foundRemoteNode = true;
            break;
        }
    }
    if (!foundRemoteNode) {
        LOG() << "Skipping test: No GPU node other than the default one was found." << std::endl;
        return;
    }

    /* Scratch buffer handed to IsPattern() when checking the result. It is
     * allocated only after the skip checks so a skipped run allocates nothing.
     */
    HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
    volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();

    /* Byte offsets inside the shared buffer. The 3rd/5th cache line remarks
     * hold for 128-byte cache lines - TODO confirm the line size.
     */
    const int dwSource = 0x40 * sizeof(int);   /* byte 0x100, 3rd cache line */
    const int dwLocation = 0x80 * sizeof(int); /* byte 0x200, 5th cache line */

    unsigned int nodes[2] = {defaultGPUNode, nondefaultNode};

    /* Allocate a local FB */
    HsaMemoryBuffer buffer(PAGE_SIZE, defaultGPUNode, false/*zero*/, true/*local*/, false/*exec*/);
    buffer.MapMemToNodes(&nodes[0], 2);

    SDMAQueue sdmaQueue;
    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
    buffer.Fill(0, sdmaQueue, 0, PAGE_SIZE);
    buffer.Fill(0x5678, sdmaQueue, dwSource, 4);

    /* The local shader reads buffer[0] as the flag, which pulls the cache
     * line holding it into the local GPU's cache while buffer[1] is still 0.
     */
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
    dispatch.Submit(queue);

    /* Delay 100ms to make sure the polling shader is executing */
    Delay(100);

    /* Use a shader on the remote GPU to copy the value from dwSource to
     * buffer[1] and then set the flag. The local shader should observe the
     * flag and execute CopyMemory.
     */
    PM4Queue queue1;
    ASSERT_SUCCESS(queue1.Create(nondefaultNode));
    HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
    Dispatch dispatch1(isaBuffer1);
    dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
    dispatch1.Submit(queue1);
    dispatch1.Sync(g_TestTimeOut);

    /* Check test result */
    dispatch.Sync(g_TestTimeOut);
    EXPECT_EQ(buffer.IsPattern(dwLocation, 0x5678, sdmaQueue, tmp), true);

    // Clean up
    EXPECT_SUCCESS(queue.Destroy());
    EXPECT_SUCCESS(queue1.Destroy());
    EXPECT_SUCCESS(sdmaQueue.Destroy());

    TEST_END
}
/* Test is for the new cache coherence on A+A (Aldebaran). It verifies the
 * new XGMI coherence HW link in caches between the CPU and GPUs, using a
 * local FB buffer in fine grain mode.
 *
 * Flow:
 *  1. Allocate a host-accessible, non-paged page on the default GPU node and
 *     zero the flag (dword 0) and the destination (dword dwLocation).
 *  2. Start gfx9aldbrn_PollAndCopy, which polls dword 0 and, once it reads
 *     1, copies dword 1 to dword dwLocation.
 *  3. The CPU writes 0x5678 to dword 1 and then 1 to dword 0.
 *  4. The test expects the CPU to read 0x5678 back from dword dwLocation.
 *
 * The test is skipped unless the ASIC family is FAMILY_AL and the default
 * GPU node reports an XGMI link to the CPU.
 */
TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    if (m_FamilyId != FAMILY_AL) {
        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
        return;
    }

    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    const int dwLocation = 0x80;  /* dword index of the shader's destination */

    if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
        LOG() << "Skipping test: XGMI link to CPU is required." << std::endl;
        return;
    }

    unsigned int *buffer = NULL;
    HsaMemFlags memFlags = {0};

    /* Allocate a fine grain local FB accessed by CPU */
    memFlags.ui32.HostAccess = 1;
    memFlags.ui32.NonPaged = 1;
    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memFlags,
                                     reinterpret_cast<void**>(&buffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));

    /* All CPU accesses go through a volatile view: the GPU reads and writes
     * this memory concurrently, so the compiler must neither reorder the
     * value/flag stores nor cache or drop any access.
     */
    volatile unsigned int *cpuView = buffer;
    cpuView[0] = 0;
    cpuView[dwLocation] = 0;

    /* Read buffer from shader to fill cache */
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer, buffer+dwLocation);
    dispatch.Submit(queue);

    /* Delay 100ms to make sure the polling shader is executing */
    Delay(100);

    /* CPU writes to the buffer: the value first, the flag last. The shader
     * should pick up the 0x5678 written by the CPU after invalidating its
     * cache (buffer_invl2) and then quit.
     */
    cpuView[1] = 0x5678;
    cpuView[0] = 1;

    /* Check test result */
    dispatch.Sync(g_TestTimeOut);
    const unsigned int copiedValue = cpuView[dwLocation];
    EXPECT_EQ(copiedValue, 0x5678u);

    // Clean up. The queue goes first so that a shader still polling after a
    // timed-out Sync cannot touch the buffer once it is unmapped and freed.
    EXPECT_SUCCESS(queue.Destroy());
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));

    TEST_END
}
/* Test is for the new cache coherence on Aldebaran. It verifies the new
 * XGMI coherence HW link in caches between the CPU and GPUs, using a buffer
 * in system RAM.
 *
 * Flow:
 *  1. Allocate one page with m_MemoryFlags, map it to the GPU, zero dwords
 *     0 and 1 and read dword dwLocation once from the CPU.
 *  2. Start a poll-and-copy shader (the Aldebaran variant when the default
 *     GPU node has an XGMI link to the CPU, the plain gfx9 one otherwise).
 *     It polls dword 0 and, once it reads 1, copies dword 1 to dword
 *     dwLocation.
 *  3. The CPU writes 0x5678 to dword 1 and then 1 to dword 0.
 *  4. The test expects the CPU to read 0x5678 back from dword dwLocation.
 *
 * The test is skipped unless the ASIC family is FAMILY_AL.
 */
TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    if (m_FamilyId != FAMILY_AL) {
        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
        return;
    }

    unsigned int *fineBuffer = NULL;
    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    const int dwLocation = 0x80;  /* dword index of the shader's destination */

    /* NOTE(review): the inline remark calls this a system allocation while
     * the node passed is the default GPU node; presumably the memory type
     * follows m_MemoryFlags - confirm.
     */
    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
                                     reinterpret_cast<void**>(&fineBuffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL));

    /* All CPU accesses go through a volatile view: the GPU reads and writes
     * this memory concurrently, so the compiler must neither reorder the
     * value/flag stores nor cache or drop any access.
     */
    volatile unsigned int *cpuView = fineBuffer;
    cpuView[0] = 0;
    cpuView[1] = 0;

    /* Read buffer from CPU to fill cache. The value itself is not needed;
     * the volatile access guarantees that the load is really performed.
     */
    const unsigned int tmp = cpuView[dwLocation];
    static_cast<void>(tmp);

    /* Read fine grain buffer from shader to fill cache */
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
        m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
    else
        m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
    dispatch.Submit(queue);

    /* Delay 100ms to make sure the polling shader is executing */
    Delay(100);

    /* CPU writes to the buffer: the value first, the flag last. The shader
     * should get what the CPU wrote and quit.
     */
    cpuView[1] = 0x5678;
    cpuView[0] = 1;

    /* Check test result, based on KFDEventTest.SignalEvent passed.
     * If Sync times out,
     *   it means a coherence issue: the GPU doesn't read what the CPU wrote.
     * If the buffer value is not the expected one,
     *   it means a coherence issue: the CPU doesn't read what the GPU wrote.
     */
    dispatch.Sync(g_TestTimeOut);
    const unsigned int copiedValue = cpuView[dwLocation];
    EXPECT_EQ(copiedValue, 0x5678u);

    // Clean up. The queue goes first so that a shader still polling after a
    // timed-out Sync cannot touch the buffer once it is unmapped and freed.
    EXPECT_SUCCESS(queue.Destroy());
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(fineBuffer));
    EXPECT_SUCCESS(hsaKmtFreeMemory(fineBuffer, PAGE_SIZE));

    TEST_END
}