diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp
index 54ab73d1c0..95997c0641 100644
--- a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -229,6 +229,81 @@ type(CS)\n\
     end\n\
 ";
 
+/* Continuously poll the flag at src buffer
+ * After the flag of s[0:1] is 1 filled,
+ * copy the value from s[0:1]+4 to dst buffer
+ */
+const char* gfx9_PollAndCopy =
+"\
+shader CopyMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
+    s_movk_i32 s18, 0x1\n\
+    LOOP:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_cmp_eq_i32 s16, s18\n\
+    s_cbranch_scc0   LOOP\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s17, s[2:3], 0x0 glc:1\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
+const char* gfx9aldbrn_PollAndCopy =
+"\
+shader CopyMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    v_mov_b32 v18, 0x1\n\
+    LOOP:\n\
+    flat_load_dword v16, v[0:1] scc:1\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_cmp_eq_i32 vcc, v16, v18\n\
+    s_cbranch_vccz   LOOP\n\
+    buffer_invl2\n\
+    s_load_dword s17, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s17, s[2:3], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    buffer_wbl2\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_endpgm\n\
+    end\n\
+";
+
+/* Input0: A buffer of at least 2 dwords.
+ * DW0: used as a signal. Write 0x1 to signal
+ * DW1: Write the value from 2nd input buffer
+ *      for other device to read.
+ * Input1: A buffer of at least 2 dwords.
+ * DW0: used as the value to be written.
+ */
+const char* gfx9aldbrn_WriteFlagAndValue =
+"\
+shader WriteMemory\n\
+wave_size(32)\n\
+type(CS)\n\
+/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
+    v_mov_b32 v0, s0\n\
+    v_mov_b32 v1, s1\n\
+    s_load_dword s18, s[2:3], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    s_store_dword s18, s[0:1], 0x4 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    buffer_wbl2\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
+    v_mov_b32 v16, 0x1\n\
+    flat_store_dword v[0:1], v16 scc:1\n\
+    s_endpgm\n\
+    end\n\
+";
+
 //These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
 
 void KFDMemoryTest::SetUp() {
@@ -2258,3 +2333,211 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
 
     TEST_END
 }
+
+/* Test is for new cache coherence on Aldebaran. It is to verify
+ * two GPUs can coherently share a fine grain FB.
+ */
+TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
+    volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();
+    const int dwSource = 0x40 * sizeof(int); /* At 3rd cache line */
+    const int dwLocation = 0x80 * sizeof(int); /* At 5th cache line  */
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
+    if (gpuNodes.size() < 2) {
+        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
+        return;
+    }
+
+    HSAuint32 nondefaultNode;
+    for (unsigned i = 0; i < gpuNodes.size(); i++) {
+        if (gpuNodes.at(i) != defaultGPUNode) {
+            nondefaultNode = gpuNodes.at(i);
+            break;
+        }
+    }
+
+    unsigned int nodes[2] = {defaultGPUNode, nondefaultNode};
+
+    /* Allocate a local FB */
+    HsaMemoryBuffer buffer(PAGE_SIZE, defaultGPUNode, false/*zero*/, true/*local*/, false/*exec*/);
+    buffer.MapMemToNodes(&nodes[0], 2);
+    SDMAQueue sdmaQueue;
+    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
+    buffer.Fill(0, sdmaQueue, 0, PAGE_SIZE);
+    buffer.Fill(0x5678, sdmaQueue, dwSource, 4);
+
+    /* Read buffer[0] as flag from local shader to fill cache line (64 dws)
+     * which should has 0 at buffer[1]
+     */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* Using remote shader to write the flag and copy value from dwSource
+     * to dwLocation in buffer.
+     * Local shader should get the flag and execute CopyMemory
+     */
+    PM4Queue queue1;
+    ASSERT_SUCCESS(queue1.Create(nondefaultNode));
+    HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
+    Dispatch dispatch1(isaBuffer1);
+    dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
+    dispatch1.Submit(queue1);
+    dispatch1.Sync(g_TestTimeOut);
+
+    /* Check test result*/
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(buffer.IsPattern(dwLocation, 0x5678, sdmaQueue, tmp), true);
+
+    // Clean up
+    EXPECT_SUCCESS(queue.Destroy());
+    EXPECT_SUCCESS(queue1.Destroy());
+    EXPECT_SUCCESS(sdmaQueue.Destroy());
+
+    TEST_END
+}
+
+/* Test is for new cache coherence on A+A(Aldebaran). It is to verify
+ * new XGMI coherence HW link in caches between CPU and GPUs
+ * in local FB with fine grain mode.
+ */
+TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    const int dwLocation = 0x80;
+
+    if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
+        LOG() << "Skipping test: XGMI link to CPU is required." << std::endl;
+        return;
+    }
+
+    unsigned int *buffer;
+    HsaMemFlags memFlags = {0};
+    /* Allocate a fine grain local FB accessed by CPU */
+    memFlags.ui32.HostAccess = 1;
+    memFlags.ui32.NonPaged = 1;
+    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memFlags,
+            reinterpret_cast<void**>(&buffer)));
+    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));
+    buffer[0] = 0;
+    buffer[dwLocation] = 0;
+
+    /* Read buffer from shader to fill cache */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(buffer, buffer+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* CPU writes to buffer. Shader should get 0x5678 CPU writes
+     * after cache invalidating(buffer_invl2) and quits
+     */
+    buffer[1] = 0x5678;
+    buffer[0] = 1;
+
+    /* Check test result*/
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(buffer[dwLocation], 0x5678);
+
+    // Clean up
+    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
+    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));
+    EXPECT_SUCCESS(queue.Destroy());
+
+    TEST_END
+}
+
+/* Test is for new cache coherence on Aldebaran. It is to verify
+ * new XGMI coherence HW link in caches between CPU and GPUs
+ * in system RAM.
+ */
+TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (m_FamilyId != FAMILY_AL) {
+        LOG() << "Skipping test: Test requires aldebaran series asics." << std::endl;
+        return;
+    }
+
+    unsigned int *fineBuffer = NULL;
+    unsigned int tmp;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    const int dwLocation = 0x80;
+
+    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
+                       reinterpret_cast<void**>(&fineBuffer)));
+    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL));
+    fineBuffer[0] = 0;
+    fineBuffer[1] = 0;
+    /* Read buffer from CPU to fill cache */
+    tmp = fineBuffer[dwLocation];
+
+    /* Read fine grain buffer from shader to fill cache */
+    PM4Queue queue;
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+
+    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
+        m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+    else
+        m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
+
+    Dispatch dispatch(isaBuffer);
+    dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
+    dispatch.Submit(queue);
+
+    /* Delay 100ms to make sure shader executed*/
+    Delay(100);
+
+    /* CPU writes to buffer. Shader should get what CPU writes and quits*/
+    fineBuffer[1] = 0x5678;
+    fineBuffer[0] = 1;
+
+    /* Check test result, based on KFDEventTest.SignalEvent passed.
+     * if Sync times out,
+     * it means coherence issue that GPU doesn't read what CPU wrote.
+     * if buffer value is not expected,
+     * it means coherence issue that CPU doesn't read what GPU wrote.
+     */
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(fineBuffer[dwLocation], 0x5678);
+
+    // Clean up
+    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(fineBuffer));
+    EXPECT_SUCCESS(hsaKmtFreeMemory(fineBuffer, PAGE_SIZE));
+    EXPECT_SUCCESS(queue.Destroy());
+
+    TEST_END
+}