diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp
index 54ab73d1c0..95997c0641 100644
--- a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -229,6 +229,88 @@ type(CS)\n\
     end\n\
 ";
 
+/* Continuously poll the flag at the src buffer.
+ * Once the flag at s[0:1] has been set to 1,
+ * copy the value from s[0:1]+4 to the dst buffer.
+ * The flag register is compared only after an s_waitcnt: a scalar load
+ * returns asynchronously, so s16 is not valid until the wait completes.
+ */
+const char* gfx9_PollAndCopy =
+"\
+shader CopyMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
+    s_movk_i32 s18, 0x1\n\
+    LOOP:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_cmp_eq_i32 s16, s18\n\
+    s_cbranch_scc0 LOOP\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s17, s[2:3], 0x0 glc:1\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
+/* Aldebaran variant of gfx9_PollAndCopy. The flag is polled with a
+ * flat_load (scc:1); buffer_invl2 is issued before the value is read
+ * and buffer_wbl2 after it has been stored to the dst buffer.
+ */
+const char* gfx9aldbrn_PollAndCopy =
+"\
+shader CopyMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    v_mov_b32 v18, 0x1\n\
+    LOOP:\n\
+    flat_load_dword v16, v[0:1] scc:1\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_cmp_eq_i32 vcc, v16, v18\n\
+    s_cbranch_vccz LOOP\n\
+    buffer_invl2\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s17, s[2:3], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    buffer_wbl2\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
+/* Input0: A buffer of at least 2 dwords.
+ *      DW0: used as a signal. 0x1 is written here to signal.
+ *      DW1: receives the value read from Input1 DW0,
+ *           for the other device to read.
+ * Input1: A buffer of at least 2 dwords.
+ *      DW0: the value to be written.
+ */
+const char* gfx9aldbrn_WriteFlagAndValue =
+"\
+shader WriteMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    s_load_dword s18, s[2:3], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s18, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    buffer_wbl2\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_mov_b32 v16, 0x1\n\
+    flat_store_dword v[0:1], v16 scc:1\n\
+    s_endpgm\n\
+    end\n\
+";
+
 //These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
 
 void KFDMemoryTest::SetUp() {
@@ -2258,3 +2340,246 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
 
     TEST_END
 }
+
+/* Test is for new cache coherence on Aldebaran. It is to verify
+ * two GPUs can coherently share a fine grain FB.
+ *
+ * The default GPU polls DW0 of a local FB buffer and, once it reads 1,
+ * copies DW1 to dwLocation. A second GPU copies the value stored at
+ * dwSource into DW1 and then sets DW0 to 1. The test expects 0x5678
+ * to end up at dwLocation.
+ */
+TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
+    if (gpuNodes.size() < 2) {
+        LOG() << "Skipping test: At least two GPUs are required."
+              << std::endl;
+        return;
+    }
+
+    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    /* Byte offsets into the shared buffer */
+    const int dwSource = 0x40 * sizeof(int); /* At 3rd cache line */
+    const int dwLocation = 0x80 * sizeof(int); /* At 5th cache line */
+
+    /* Pick the first GPU that is not the default one as the remote
+     * writer. nondefaultNode is initialized and the search result is
+     * asserted, so an indeterminate node id is never used below.
+     */
+    HSAuint32 nondefaultNode = 0;
+    bool foundRemoteNode = false;
+    for (unsigned i = 0; i < gpuNodes.size(); i++) {
+        if (static_cast<HSAuint32>(gpuNodes.at(i)) != defaultGPUNode) {
+            nondefaultNode = gpuNodes.at(i);
+            foundRemoteNode = true;
+            break;
+        }
+    }
+    ASSERT_TRUE(foundRemoteNode) << "failed to find a non-default GPU node";
+
+    unsigned int nodes[2] = {defaultGPUNode, nondefaultNode};
+
+    /* Scratch buffer handed to IsPattern for the final check */
+    HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
+    volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();
+
+    /* Allocate a local FB */
+    HsaMemoryBuffer buffer(PAGE_SIZE, defaultGPUNode, false/*zero*/, true/*local*/, false/*exec*/);
+    buffer.MapMemToNodes(&nodes[0], 2);
+    SDMAQueue sdmaQueue;
+    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
+    buffer.Fill(0, sdmaQueue, 0, PAGE_SIZE);
+    buffer.Fill(0x5678, sdmaQueue, dwSource, 4);
+
+    /* Read buffer[0] as flag from local shader to fill cache line (64 dws)
+     * which should have 0 at buffer[1]
+     */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* Using remote shader to copy the value from dwSource to buffer[1]
+     * and then write the flag at buffer[0].
+     * Local shader should get the flag and execute CopyMemory
+     */
+    PM4Queue queue1;
+    ASSERT_SUCCESS(queue1.Create(nondefaultNode));
+    HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
+    Dispatch dispatch1(isaBuffer1);
+    dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
+    dispatch1.Submit(queue1);
+    dispatch1.Sync(g_TestTimeOut);
+
+    /* Check test result*/
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_TRUE(buffer.IsPattern(dwLocation, 0x5678, sdmaQueue, tmp));
+
+    // Clean up
+    EXPECT_SUCCESS(queue.Destroy());
+    EXPECT_SUCCESS(queue1.Destroy());
+    EXPECT_SUCCESS(sdmaQueue.Destroy());
+
+    TEST_END
+}
+
+/* Test is for new cache coherence on A+A(Aldebaran). It is to verify
+ * new XGMI coherence HW link in caches between CPU and GPUs
+ * in local FB with fine grain mode.
+ */
+TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    const int dwLocation = 0x80; /* dword index of the shader's output */
+
+    if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
+        LOG() << "Skipping test: XGMI link to CPU is required."
+              << std::endl;
+        return;
+    }
+
+    unsigned int *buffer = NULL;
+    HsaMemFlags memFlags = {0};
+    /* Allocate a fine grain local FB accessed by CPU */
+    memFlags.ui32.HostAccess = 1;
+    memFlags.ui32.NonPaged = 1;
+    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memFlags,
+                   reinterpret_cast<void**>(&buffer)));
+    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));
+
+    /* The shader reads and writes this memory while the CPU does, so all
+     * CPU accesses go through a volatile view. That keeps the compiler
+     * from reordering the value/flag stores or caching the result read.
+     */
+    volatile unsigned int *cpuView = buffer;
+    cpuView[0] = 0;
+    cpuView[dwLocation] = 0;
+
+    /* Read buffer from shader to fill cache */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(buffer, buffer+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* CPU writes to buffer. Shader should get 0x5678 CPU writes
+     * after cache invalidating(buffer_invl2) and quits.
+     * The value must be stored before the flag.
+     */
+    cpuView[1] = 0x5678;
+    cpuView[0] = 1;
+
+    /* Check test result*/
+    dispatch.Sync(g_TestTimeOut);
+    const unsigned int copiedValue = cpuView[dwLocation];
+    EXPECT_EQ(copiedValue, 0x5678u);
+
+    // Clean up
+    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
+    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));
+    EXPECT_SUCCESS(queue.Destroy());
+
+    TEST_END
+}
+
+/* Test is for new cache coherence on Aldebaran. It is to verify
+ * new XGMI coherence HW link in caches between CPU and GPUs
+ * in system RAM.
+ */
+TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics."
+              << std::endl;
+        return;
+    }
+
+    unsigned int *fineBuffer = NULL;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    const int dwLocation = 0x80; /* dword index of the shader's output */
+
+    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
+                   reinterpret_cast<void**>(&fineBuffer)));
+    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL));
+
+    /* The shader reads and writes this memory while the CPU does, so all
+     * CPU accesses go through a volatile view. That keeps the compiler
+     * from dropping the warm-up read or reordering the value/flag stores.
+     */
+    volatile unsigned int *cpuView = fineBuffer;
+    cpuView[0] = 0;
+    cpuView[1] = 0;
+    /* Read buffer from CPU to fill cache */
+    const unsigned int warmUpRead = cpuView[dwLocation];
+    static_cast<void>(warmUpRead);
+
+    /* Read fine grain buffer from shader to fill cache */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+
+    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
+        m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    else
+        m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
+
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* CPU writes to buffer. Shader should get what CPU writes and quits.
+     * The value must be stored before the flag.
+     */
+    cpuView[1] = 0x5678;
+    cpuView[0] = 1;
+
+    /* Check test result, based on KFDEventTest.SignalEvent passed.
+     * if Sync times out,
+     * it means coherence issue that GPU doesn't read what CPU wrote.
+     * if buffer value is not expected,
+     * it means coherence issue that CPU doesn't read what GPU wrote.
+     */
+    dispatch.Sync(g_TestTimeOut);
+    const unsigned int copiedValue = cpuView[dwLocation];
+    EXPECT_EQ(copiedValue, 0x5678u);
+
+    // Clean up
+    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(fineBuffer));
+    EXPECT_SUCCESS(hsaKmtFreeMemory(fineBuffer, PAGE_SIZE));
+    EXPECT_SUCCESS(queue.Destroy());
+
+    TEST_END
+}