tests/kfdtest/src/KFDLocalMemoryTest.cpp

/*
 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "KFDLocalMemoryTest.hpp"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAPacket.hpp"
#include "SDMAQueue.hpp"
#include "Dispatch.hpp"

// All tests are marked by their serial number in the QCM FDD

void KFDLocalMemoryTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

    m_pIsaGen = IsaGenerator::Create(m_FamilyId);

    ROUTINE_END
}

void KFDLocalMemoryTest::TearDown() {
    ROUTINE_START

    if (m_pIsaGen)
        delete m_pIsaGen;
    m_pIsaGen = NULL;

    KFDBaseComponentTest::TearDown();

    ROUTINE_END
}

TEST_F(KFDLocalMemoryTest, BasicTest) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    PM4Queue queue;
    HSAuint64 AlternateVAGPU;
    unsigned int BufferSize = PAGE_SIZE;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    if (!GetVramSize(defaultGPUNode)) {
        LOG() << "No VRAM found, skipping the test" << std::endl;
        return;
    }

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer srcSysBuffer(BufferSize, defaultGPUNode, false);
    HsaMemoryBuffer destSysBuffer(BufferSize, defaultGPUNode);
    HsaMemoryBuffer srcLocalBuffer(BufferSize, defaultGPUNode, false, true);
    HsaMemoryBuffer dstLocalBuffer(BufferSize, defaultGPUNode, false, true);

    srcSysBuffer.Fill(0x01010101);

    m_pIsaGen->GetCopyDwordIsa(isaBuffer);

    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(srcLocalBuffer.As<void*>(), srcLocalBuffer.Size(), &AlternateVAGPU));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(dstLocalBuffer.As<void*>(), dstLocalBuffer.Size(), &AlternateVAGPU));

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.SetSkipWaitConsump(0);

    Dispatch dispatch(isaBuffer);

    dispatch.SetArgs(srcSysBuffer.As<void*>(), srcLocalBuffer.As<void*>());
    dispatch.Submit(queue);
    dispatch.Sync(g_TestTimeOut);

    dispatch.SetArgs(srcLocalBuffer.As<void*>(), dstLocalBuffer.As<void*>());
    dispatch.Submit(queue);
    dispatch.Sync(g_TestTimeOut);

    dispatch.SetArgs(dstLocalBuffer.As<void*>(), destSysBuffer.As<void*>());
    dispatch.Submit(queue);
    dispatch.Sync(g_TestTimeOut);

    ASSERT_SUCCESS(queue.Destroy());

    ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(srcLocalBuffer.As<void*>()));
    ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(dstLocalBuffer.As<void*>()));
    ASSERT_EQ(destSysBuffer.As<unsigned int*>()[0], 0x01010101);

    TEST_END
}

TEST_F(KFDLocalMemoryTest, VerifyContentsAfterUnmapAndMap) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    PM4Queue queue;
    HSAuint64 AlternateVAGPU;
    unsigned int BufferSize = PAGE_SIZE;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    if (!GetVramSize(defaultGPUNode)) {
        LOG() << "No VRAM found, skipping the test" << std::endl;
        return;
    }

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer SysBufferA(BufferSize, defaultGPUNode, false);
    HsaMemoryBuffer SysBufferB(BufferSize, defaultGPUNode, true);
    HsaMemoryBuffer LocalBuffer(BufferSize, defaultGPUNode, true, true);

    SysBufferA.Fill(0x01010101);

    m_pIsaGen->GetCopyDwordIsa(isaBuffer);

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.SetSkipWaitConsump(0);

    if (!is_dgpu())
        ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(LocalBuffer.As<void*>(), LocalBuffer.Size(), &AlternateVAGPU));

    Dispatch dispatch(isaBuffer);

    dispatch.SetArgs(SysBufferA.As<void*>(), LocalBuffer.As<void*>());
    dispatch.Submit(queue);
    dispatch.Sync(g_TestTimeOut);

    ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(LocalBuffer.As<void*>()));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(LocalBuffer.As<void*>(), LocalBuffer.Size(), &AlternateVAGPU));

    dispatch.SetArgs(LocalBuffer.As<void*>(), SysBufferB.As<void*>());
    dispatch.Submit(queue);
    dispatch.Sync(g_TestTimeOut);

    ASSERT_SUCCESS(queue.Destroy());
    ASSERT_EQ(SysBufferB.As<unsigned int*>()[0], 0x01010101);
    if (!is_dgpu())
        ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(LocalBuffer.As<void*>()));

    TEST_END
}

/* Deliberately fragment GPUVM aperture to fill up address space
 *
 * General idea: Allocate buffers, but don't map them to GPU. This
 * will reserve virtual address space without pinning physical
 * memory. It should allow using more address space than physically
 * available memory.
 *
 * Even without pinning memory, TTM will still commit memory at
 * allocation time and swap out movable buffers to system memory or
 * even the hard drive, if it needs to. So we can't allocate arbitrary
 * amounts of virtual memory.
 *
 * Strategy to maximize the amount of allocated, fragmented address
 * space while keeping the amount of committed memory bounded at all
 * times:
 *
 * 1. Allocate N blocks of a given size, initially 1 page
 * 2. Free every other block, creating holes in the address space.
 *    This frees up half the memory
 * 3. Allocate N/4 blocks of 2-pages each. This requires as much
 *    memory as was freed in step 2. The block size is bigger than
 *    the 1-page holes, so new address space will be used.
 * 4. Free half the blocks just allocated, and half of the
 *    remaining blocks of step 1. This creates 3-page holes between
 *    the 1-page blocks from step 1, and 2-page holes between the
 *    2-page blocks from step 3. It frees up half of the total
 *    memory.
 * 5. Double the block size to 4, devide number of blocks by 2.
 *    Again, this will require the amount of memory freed in step 4.
 *    The block size 4 is bigger than the biggest hole (3 pages).
 * 6. Free half the memory again, creating 7-page holes between
 *    1-page blocks, 6-page holes between 2-page blocks, and 4-page
 *    holes between 4-page blocks.
 *
 * Repeat, doubling block size and halving number of blocks in each
 * iteration. Each iteration starts and ends with half the total
 * memory free. Because the block size is always bigger than the
 * biggest hole, each iteration increases the amount of address space
 * occupied by half the total memory size. Once the block size reaches
 * half of the free memory (1/4 of total memory) the limit is reached.
 *
 * With 2^n pages available memory, n * 2^(n-1) pages of address space
 * can be reserved. At the end of that process, half the memory will
 * be free.
 *
 *     Total memory     | Fragmented address space
 * order | pages | size | pages |  size | ratio
 * ------+-------+------+-------+-------+-------
 *     2 |    4  |  16K |    4  |   16K |   1
 *     3 |    8  |  32K |   12  |   48K |   1.5
 *     4 |   16  |  64K |   32  |  128K |   2
 *     5 |   32  | 128K |   80  |  320K |   2.5
 *     6 |   64  | 256K |  192  |  768K |   3
 *     7 |  128  | 512K |  448  | 1.75M |   3.5
 *     8 |  256  |   1M |    1M |    4M |   4
 *     9 |  512  |   2M | 2.25M |    9M |   4.5
 *    10 |    1K |   4M |    5M |   20M |   5
 *    11 |    2K |   8M |   11M |   44M |   5.5
 *    12 |    4K |  16M |   24M |   96M |   6
 *    13 |    8K |  32M |   52M |  208M |   6.5
 *    14 |   16K |  64M |  112M |  448M |   7
 *    15 |   32K | 128M |  240M |  960M |   7.5
 *    16 |   64K | 256M |  512M |    2G |   8
 *    17 |  128K | 512M | 1088M | 4.25G |   8.5
 *    18 |  256K |   1G | 2.25G |    9G |   9
 *    19 |  512K |   2G | 4.75G |   19G |   9.5
 *    20 |    1M |   4G |   10G |   40G |  10
 */
TEST_F(KFDLocalMemoryTest, Fragmentation) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HSAuint64 fbSize;

    fbSize = GetVramSize(defaultGPUNode);

    if (!fbSize) {
        LOG() << "No VRAM found, skipping test." << std::endl;
        return;
    } else {
        LOG() << "Found VRAM of " << std::dec << (fbSize >> 20) << "MB." << std::endl;
    }

    /* Use up to half of available memory. Using more results in
     * excessive memory movement in TTM and slows down the test too
     * much. maxOrder is the size of the biggest block that will be
     * allocated. It's 1/4 of the usable memory, so 1/8 the total FB
     * size in pages.
     *
     * Use 8x bigger page size on dGPU to match Tonga alignment
     * workaround. Also nicely matches the 8x bigger GPUVM address
     * space on AMDGPU compared to RADEON.
     */
    unsigned pageSize = is_dgpu() ? PAGE_SIZE*8 : PAGE_SIZE;
    fbSize /= pageSize;
    unsigned maxOrder = 0;
    // Limit maxOrder up to 14 so this test doesn't run longer than 10 mins
    while (((fbSize >> maxOrder) >= 16) && (maxOrder < 14))
        maxOrder++;

    /* Queue and memory used by the shader copy tests */
    HsaMemoryBuffer sysBuffer(PAGE_SIZE, defaultGPUNode, false);
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
    m_pIsaGen->GetCopyDwordIsa(isaBuffer);

    /* Allocate and test memory using the strategy explained at the top */
    HSAKMT_STATUS status;
    HsaMemFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 0;
    memFlags.ui32.NonPaged = 1;
    struct {
        void **pointers;
        unsigned long nPages;
    } pages[maxOrder+1];
    unsigned order, o;
    unsigned long p;
    HSAuint64 size;
    unsigned value = 0;
    memset(pages, 0, sizeof(pages));
    for (order = 0; order <= maxOrder; order++) {
        // At maxOrder, block sizes is 1/4 of available memory
        pages[order].nPages = 1UL << (maxOrder - order + 2);
        // At order != 0, 1/2 the memory is already allocated
        if (order > 0)
            pages[order].nPages >>= 1;
        // Allocate page pointers
        pages[order].pointers = new void *[pages[order].nPages];
        EXPECT_NE((void **)NULL, pages[order].pointers)
            << "Couldn't allocate memory for " << pages[order].nPages
            << " pointers at order " << order << std::endl;
        if (!pages[order].pointers) {
            pages[order].nPages = 0;
            break;
        }
        /* Allocate buffers and access the start and end of every one:
         * 1. Copy from sysBuffer[0] to start of block
         * 2. Copy from start of block to end of block
         * 3. Copy from end of block to sysBuffer[1]
         * 4. Compare results */
        size = (HSAuint64)(1 << order) * pageSize;
        LOG() << std::dec << "Trying to allocate " << pages[order].nPages
              << " order " << order << " blocks " << std::endl;
        for (p = 0; p < pages[order].nPages; p++) {
            status = hsaKmtAllocMemory(defaultGPUNode, size,
                                       memFlags, &pages[order].pointers[p]);
            if (status != HSAKMT_STATUS_SUCCESS) {
                EXPECT_EQ(HSAKMT_STATUS_NO_MEMORY, status);
                pages[order].nPages = p;
                break;
            }

            void *bufferEnd = (void *)((unsigned long)pages[order].pointers[p]
                                       + size - sizeof(unsigned));
            sysBuffer.As<unsigned *>()[0] = ++value;

            status = hsaKmtMapMemoryToGPU(pages[order].pointers[p],
                                                size, NULL);
            if (status != HSAKMT_STATUS_SUCCESS) {
                ASSERT_SUCCESS(hsaKmtFreeMemory(pages[order].pointers[p],
                                                size));
                pages[order].nPages = p;
                break;
            }
            Dispatch dispatch1(isaBuffer);
            dispatch1.SetArgs(sysBuffer.As<void*>(), pages[order].pointers[p]);
            dispatch1.Submit(queue);
            // no sync needed for multiple GPU dispatches to the same queue

            Dispatch dispatch2(isaBuffer);
            dispatch2.SetArgs(pages[order].pointers[p], bufferEnd);
            dispatch2.Submit(queue);
            // no sync needed for multiple GPU dispatches to the same queue

            Dispatch dispatch3(isaBuffer);
            dispatch3.SetArgs(bufferEnd,
                              (void *)&(sysBuffer.As<unsigned*>()[1]));
            dispatch3.Submit(queue);
            dispatch3.Sync(g_TestTimeOut);
            EXPECT_EQ(value, sysBuffer.As<unsigned *>()[1]);

            EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pages[order].pointers[p]));
        }
        LOG() << "  Got " << pages[order].nPages
              << ", end of last block addr: "
              << (void *)((unsigned long)pages[order].pointers[p-1] + size - 1)
              << std::endl;

        // Now free half the memory
        for (o = 0; o <= order; o++) {
            unsigned long step = 1UL << (order - o + 1);
            unsigned long offset = (step >> 1) - 1;
            size = (HSAuint64)(1 << o) * pageSize;
            LOG() << "  Freeing every " << step << "th order "
                  << o << " block starting with " << offset << std::endl;
            for (p = offset; p < pages[o].nPages; p += step) {
                ASSERT_NE((void **)NULL, pages[o].pointers[p]);
                EXPECT_SUCCESS(hsaKmtFreeMemory(pages[o].pointers[p], size));
                pages[o].pointers[p] = NULL;
            }
        }
    }

    /* Clean up */
    for (order = 0; order <= maxOrder; order++) {
        if (pages[order].pointers == NULL)
            continue;

        size = (HSAuint64)(1 << order) * pageSize;
        for (p = 0; p < pages[order].nPages; p++)
            if (pages[order].pointers[p] != NULL)
                EXPECT_SUCCESS(hsaKmtFreeMemory(pages[order].pointers[p], size));

        delete[] pages[order].pointers;
    }

    ASSERT_SUCCESS(queue.Destroy());

    TEST_END
}

TEST_F(KFDLocalMemoryTest, CheckZeroInitializationVram) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    /* Testing VRAM */
    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    if (!vramSizeMB) {
        LOG() << "No VRAM found, skipping the test" << std::endl;
        return;
    }

    HSAuint64 vramBufSizeMB = vramSizeMB >> 2;
    /* limit the buffer size in order not to overflow the SDMA queue buffer. */
    if (vramBufSizeMB > 1024) {
        vramBufSizeMB = 1024;
    }
    HSAuint64 vramBufSize = vramBufSizeMB * 1024 * 1024;

    /* Make sure the entire VRAM is used at least once */
    int count = (vramSizeMB + vramBufSizeMB - 1) / vramBufSizeMB + 1;

    LOG() << "Using " << std::dec << vramBufSizeMB
            << "MB VRAM buffer to test " << std::dec << count
            << " times"<< std::endl;

    SDMAQueue sdmaQueue;
    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode, 8 * PAGE_SIZE));

    HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);
    volatile HSAuint32 *tmp = tmpBuffer.As<volatile HSAuint32 *>();

    unsigned int offset = 2060;  // a constant offset, should be 4 aligned.

    while (count--) {
        HsaMemoryBuffer localBuffer(vramBufSize, defaultGPUNode, false, true);

        EXPECT_TRUE(localBuffer.IsPattern(0, 0, sdmaQueue, tmp));

        for (HSAuint64 i = offset; i < vramBufSize;) {
            EXPECT_TRUE(localBuffer.IsPattern(i, 0, sdmaQueue, tmp));
            i += 4096;
        }

        /* Checking last 4 bytes */
        EXPECT_TRUE(localBuffer.IsPattern(vramBufSize - 4, 0, sdmaQueue, tmp));

        localBuffer.Fill(0xABCDEFFF, sdmaQueue);
    }

    TEST_END
}

TEST_F(KFDLocalMemoryTest, MapVramToGPUNodesTest) {
    TEST_START(TESTPROFILE_RUNALL);

    HSAint32 src_node;
    HSAint32 dst_node;
    HsaPointerInfo info;

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    if (gpuNodes.size() < 2) {
        LOG() << "Skipping test: Need at least two GPUs" << std::endl;
        return;
    }

    if (g_TestDstNodeId != -1 && g_TestNodeId != -1) {
        src_node = g_TestNodeId;
        dst_node = g_TestDstNodeId;
    } else {
        int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();

        dst_node = m_NodeInfo.FindLargeBarGPUNode();
        if (dst_node < 0) {
            LOG() << "Skipping test: Need at least one large bar GPU" << std::endl;
            return;
        }

        if (dst_node != defaultGPUNode) {
            /* at least one node should be defaultGPUNode */
            src_node = defaultGPUNode;
        } else {
            for (auto node : gpuNodes) {
                if (node != dst_node) {
                    src_node = node;
                    break;
                }
            }
        }
    }

    LOG() << "Testing from GPU " << src_node << " to GPU " << dst_node << std::endl;

    void *shared_addr;
    HSAuint32 nodes[] = { (HSAuint32)src_node, (HSAuint32)dst_node };
    HsaMemFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;
    memFlags.ui32.NonPaged = 1;
    memFlags.ui32.ExecuteAccess = 1;

    HsaMemMapFlags mapFlags = {0};

    EXPECT_SUCCESS(hsaKmtAllocMemory(nodes[1], PAGE_SIZE, memFlags, &shared_addr));
    EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes(shared_addr, PAGE_SIZE, 2, nodes));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 2, nodes));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));
    EXPECT_EQ(info.NRegisteredNodes, 2);
    EXPECT_EQ(info.NMappedNodes, 2);

    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[0]));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));
    EXPECT_EQ(info.NRegisteredNodes, 2);
    EXPECT_EQ(info.NMappedNodes, 1);
    EXPECT_EQ(info.MappedNodes[0], nodes[0]);

    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[1]));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));
    EXPECT_EQ(info.NRegisteredNodes, 2);
    EXPECT_EQ(info.NMappedNodes, 1);
    EXPECT_EQ(info.MappedNodes[0], nodes[1]);

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(shared_addr));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));
    EXPECT_EQ(info.NRegisteredNodes, 2);
    EXPECT_EQ(info.NMappedNodes, 0);

    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[0]));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));
    EXPECT_EQ(info.NRegisteredNodes, 2);
    EXPECT_EQ(info.NMappedNodes, 1);
    EXPECT_EQ(info.MappedNodes[0], nodes[0]);

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(shared_addr));
    EXPECT_SUCCESS(hsaKmtFreeMemory(shared_addr, PAGE_SIZE));

    TEST_END
}
kfdtest: Add kfdtest source code 2018-07-23 14:45:44 -04:00			`/*`
			`* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in`
			`* all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL`
			`* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR`
			`* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,`
			`* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR`
			`* OTHER DEALINGS IN THE SOFTWARE.`
			`*`
			`*/`

			`#include "KFDLocalMemoryTest.hpp"`
			`#include "PM4Queue.hpp"`
			`#include "PM4Packet.hpp"`
			`#include "SDMAPacket.hpp"`
			`#include "SDMAQueue.hpp"`
			`#include "Dispatch.hpp"`

			`// All tests are marked by their serial number in the QCM FDD`

			`void KFDLocalMemoryTest::SetUp() {`
			`ROUTINE_START`

			`KFDBaseComponentTest::SetUp();`

			`m_pIsaGen = IsaGenerator::Create(m_FamilyId);`

			`ROUTINE_END`
			`}`

			`void KFDLocalMemoryTest::TearDown() {`
			`ROUTINE_START`

			`if (m_pIsaGen)`
			`delete m_pIsaGen;`
			`m_pIsaGen = NULL;`

			`KFDBaseComponentTest::TearDown();`

			`ROUTINE_END`
			`}`

			`TEST_F(KFDLocalMemoryTest, BasicTest) {`
			`TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);`
			`TEST_START(TESTPROFILE_RUNALL);`

			`PM4Queue queue;`
			`HSAuint64 AlternateVAGPU;`
			`unsigned int BufferSize = PAGE_SIZE;`

			`int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();`
			`ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";`

			`if (!GetVramSize(defaultGPUNode)) {`
			`LOG() << "No VRAM found, skipping the test" << std::endl;`
			`return;`
			`}`

			`HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);`
			`HsaMemoryBuffer srcSysBuffer(BufferSize, defaultGPUNode, false);`
			`HsaMemoryBuffer destSysBuffer(BufferSize, defaultGPUNode);`
			`HsaMemoryBuffer srcLocalBuffer(BufferSize, defaultGPUNode, false, true);`
			`HsaMemoryBuffer dstLocalBuffer(BufferSize, defaultGPUNode, false, true);`

			`srcSysBuffer.Fill(0x01010101);`

			`m_pIsaGen->GetCopyDwordIsa(isaBuffer);`

			`ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(srcLocalBuffer.As<void*>(), srcLocalBuffer.Size(), &AlternateVAGPU));`
			`ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(dstLocalBuffer.As<void*>(), dstLocalBuffer.Size(), &AlternateVAGPU));`

			`ASSERT_SUCCESS(queue.Create(defaultGPUNode));`
			`queue.SetSkipWaitConsump(0);`

			`Dispatch dispatch(isaBuffer);`

			`dispatch.SetArgs(srcSysBuffer.As<void>(), srcLocalBuffer.As<void>());`
			`dispatch.Submit(queue);`
			`dispatch.Sync(g_TestTimeOut);`

			`dispatch.SetArgs(srcLocalBuffer.As<void>(), dstLocalBuffer.As<void>());`
			`dispatch.Submit(queue);`
			`dispatch.Sync(g_TestTimeOut);`

			`dispatch.SetArgs(dstLocalBuffer.As<void>(), destSysBuffer.As<void>());`
			`dispatch.Submit(queue);`
			`dispatch.Sync(g_TestTimeOut);`

			`ASSERT_SUCCESS(queue.Destroy());`

			`ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(srcLocalBuffer.As<void*>()));`
			`ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(dstLocalBuffer.As<void*>()));`
			`ASSERT_EQ(destSysBuffer.As<unsigned int*>()[0], 0x01010101);`

			`TEST_END`
			`}`

			`TEST_F(KFDLocalMemoryTest, VerifyContentsAfterUnmapAndMap) {`
			`TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);`
			`TEST_START(TESTPROFILE_RUNALL);`

			`PM4Queue queue;`
			`HSAuint64 AlternateVAGPU;`
			`unsigned int BufferSize = PAGE_SIZE;`

			`int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();`
			`ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";`

			`if (!GetVramSize(defaultGPUNode)) {`
			`LOG() << "No VRAM found, skipping the test" << std::endl;`
			`return;`
			`}`

			`HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);`
			`HsaMemoryBuffer SysBufferA(BufferSize, defaultGPUNode, false);`
			`HsaMemoryBuffer SysBufferB(BufferSize, defaultGPUNode, true);`
			`HsaMemoryBuffer LocalBuffer(BufferSize, defaultGPUNode, true, true);`

			`SysBufferA.Fill(0x01010101);`

			`m_pIsaGen->GetCopyDwordIsa(isaBuffer);`

			`ASSERT_SUCCESS(queue.Create(defaultGPUNode));`
			`queue.SetSkipWaitConsump(0);`

			`if (!is_dgpu())`
			`ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(LocalBuffer.As<void*>(), LocalBuffer.Size(), &AlternateVAGPU));`

			`Dispatch dispatch(isaBuffer);`

			`dispatch.SetArgs(SysBufferA.As<void>(), LocalBuffer.As<void>());`
			`dispatch.Submit(queue);`
			`dispatch.Sync(g_TestTimeOut);`

			`ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(LocalBuffer.As<void*>()));`
			`ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(LocalBuffer.As<void*>(), LocalBuffer.Size(), &AlternateVAGPU));`

			`dispatch.SetArgs(LocalBuffer.As<void>(), SysBufferB.As<void>());`
			`dispatch.Submit(queue);`
			`dispatch.Sync(g_TestTimeOut);`

			`ASSERT_SUCCESS(queue.Destroy());`
			`ASSERT_EQ(SysBufferB.As<unsigned int*>()[0], 0x01010101);`
			`if (!is_dgpu())`
			`ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(LocalBuffer.As<void*>()));`

			`TEST_END`
			`}`

			`/* Deliberately fragment GPUVM aperture to fill up address space`
			`*`
			`* General idea: Allocate buffers, but don't map them to GPU. This`
			`* will reserve virtual address space without pinning physical`
			`* memory. It should allow using more address space than physically`
			`* available memory.`
			`*`
			`* Even without pinning memory, TTM will still commit memory at`
			`* allocation time and swap out movable buffers to system memory or`
			`* even the hard drive, if it needs to. So we can't allocate arbitrary`
			`* amounts of virtual memory.`
			`*`
			`* Strategy to maximize the amount of allocated, fragmented address`
			`* space while keeping the amount of committed memory bounded at all`
			`* times:`
			`*`
			`* 1. Allocate N blocks of a given size, initially 1 page`
			`* 2. Free every other block, creating holes in the address space.`
			`* This frees up half the memory`
			`* 3. Allocate N/4 blocks of 2-pages each. This requires as much`
			`* memory as was freed in step 2. The block size is bigger than`
			`* the 1-page holes, so new address space will be used.`
			`* 4. Free half the blocks just allocated, and half of the`
			`* remaining blocks of step 1. This creates 3-page holes between`
			`* the 1-page blocks from step 1, and 2-page holes between the`
			`* 2-page blocks from step 3. It frees up half of the total`
			`* memory.`
			`* 5. Double the block size to 4, devide number of blocks by 2.`
			`* Again, this will require the amount of memory freed in step 4.`
			`* The block size 4 is bigger than the biggest hole (3 pages).`
			`* 6. Free half the memory again, creating 7-page holes between`
			`* 1-page blocks, 6-page holes between 2-page blocks, and 4-page`
			`* holes between 4-page blocks.`
			`*`
			`* Repeat, doubling block size and halving number of blocks in each`
			`* iteration. Each iteration starts and ends with half the total`
			`* memory free. Because the block size is always bigger than the`
			`* biggest hole, each iteration increases the amount of address space`
			`* occupied by half the total memory size. Once the block size reaches`
			`* half of the free memory (1/4 of total memory) the limit is reached.`
			`*`
			`* With 2^n pages available memory, n * 2^(n-1) pages of address space`
			`* can be reserved. At the end of that process, half the memory will`
			`* be free.`
			`*`
			`* Total memory \| Fragmented address space`
			`* order \| pages \| size \| pages \| size \| ratio`
			`* ------+-------+------+-------+-------+-------`
			`* 2 \| 4 \| 16K \| 4 \| 16K \| 1`
			`* 3 \| 8 \| 32K \| 12 \| 48K \| 1.5`
			`* 4 \| 16 \| 64K \| 32 \| 128K \| 2`
			`* 5 \| 32 \| 128K \| 80 \| 320K \| 2.5`
			`* 6 \| 64 \| 256K \| 192 \| 768K \| 3`
			`* 7 \| 128 \| 512K \| 448 \| 1.75M \| 3.5`
			`* 8 \| 256 \| 1M \| 1M \| 4M \| 4`
			`* 9 \| 512 \| 2M \| 2.25M \| 9M \| 4.5`
			`* 10 \| 1K \| 4M \| 5M \| 20M \| 5`
			`* 11 \| 2K \| 8M \| 11M \| 44M \| 5.5`
			`* 12 \| 4K \| 16M \| 24M \| 96M \| 6`
			`* 13 \| 8K \| 32M \| 52M \| 208M \| 6.5`
			`* 14 \| 16K \| 64M \| 112M \| 448M \| 7`
			`* 15 \| 32K \| 128M \| 240M \| 960M \| 7.5`
			`* 16 \| 64K \| 256M \| 512M \| 2G \| 8`
			`* 17 \| 128K \| 512M \| 1088M \| 4.25G \| 8.5`
			`* 18 \| 256K \| 1G \| 2.25G \| 9G \| 9`
			`* 19 \| 512K \| 2G \| 4.75G \| 19G \| 9.5`
			`* 20 \| 1M \| 4G \| 10G \| 40G \| 10`
			`*/`
			`TEST_F(KFDLocalMemoryTest, Fragmentation) {`
			`TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);`
			`TEST_START(TESTPROFILE_RUNALL);`

			`int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();`
			`ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";`

			`HSAuint64 fbSize;`

			`fbSize = GetVramSize(defaultGPUNode);`

			`if (!fbSize) {`
			`LOG() << "No VRAM found, skipping test." << std::endl;`
			`return;`
			`} else {`
			`LOG() << "Found VRAM of " << std::dec << (fbSize >> 20) << "MB." << std::endl;`
			`}`

			`/* Use up to half of available memory. Using more results in`
			`* excessive memory movement in TTM and slows down the test too`
			`* much. maxOrder is the size of the biggest block that will be`
			`* allocated. It's 1/4 of the usable memory, so 1/8 the total FB`
			`* size in pages.`
			`*`
			`* Use 8x bigger page size on dGPU to match Tonga alignment`
			`* workaround. Also nicely matches the 8x bigger GPUVM address`
			`* space on AMDGPU compared to RADEON.`
			`*/`
			`unsigned pageSize = is_dgpu() ? PAGE_SIZE*8 : PAGE_SIZE;`
			`fbSize /= pageSize;`
			`unsigned maxOrder = 0;`
			`// Limit maxOrder up to 14 so this test doesn't run longer than 10 mins`
			`while (((fbSize >> maxOrder) >= 16) && (maxOrder < 14))`
			`maxOrder++;`

			`/* Queue and memory used by the shader copy tests */`
			`HsaMemoryBuffer sysBuffer(PAGE_SIZE, defaultGPUNode, false);`
			`PM4Queue queue;`
			`ASSERT_SUCCESS(queue.Create(defaultGPUNode));`
			`HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);`
			`m_pIsaGen->GetCopyDwordIsa(isaBuffer);`

			`/* Allocate and test memory using the strategy explained at the top */`
			`HSAKMT_STATUS status;`
			`HsaMemFlags memFlags = {0};`
			`memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;`
			`memFlags.ui32.HostAccess = 0;`
			`memFlags.ui32.NonPaged = 1;`
			`struct {`
			`void **pointers;`
			`unsigned long nPages;`
			`} pages[maxOrder+1];`
			`unsigned order, o;`
			`unsigned long p;`
			`HSAuint64 size;`
			`unsigned value = 0;`
			`memset(pages, 0, sizeof(pages));`
			`for (order = 0; order <= maxOrder; order++) {`
			`// At maxOrder, block sizes is 1/4 of available memory`
			`pages[order].nPages = 1UL << (maxOrder - order + 2);`
			`// At order != 0, 1/2 the memory is already allocated`
			`if (order > 0)`
			`pages[order].nPages >>= 1;`
			`// Allocate page pointers`
			`pages[order].pointers = new void *[pages[order].nPages];`
			`EXPECT_NE((void **)NULL, pages[order].pointers)`
			`<< "Couldn't allocate memory for " << pages[order].nPages`
			`<< " pointers at order " << order << std::endl;`
			`if (!pages[order].pointers) {`
			`pages[order].nPages = 0;`
			`break;`
			`}`
			`/* Allocate buffers and access the start and end of every one:`
			`* 1. Copy from sysBuffer[0] to start of block`
			`* 2. Copy from start of block to end of block`
			`* 3. Copy from end of block to sysBuffer[1]`
			`* 4. Compare results */`
			`size = (HSAuint64)(1 << order) * pageSize;`
			`LOG() << std::dec << "Trying to allocate " << pages[order].nPages`
			`<< " order " << order << " blocks " << std::endl;`
			`for (p = 0; p < pages[order].nPages; p++) {`
			`status = hsaKmtAllocMemory(defaultGPUNode, size,`
			`memFlags, &pages[order].pointers[p]);`
			`if (status != HSAKMT_STATUS_SUCCESS) {`
			`EXPECT_EQ(HSAKMT_STATUS_NO_MEMORY, status);`
			`pages[order].nPages = p;`
			`break;`
			`}`

			`void bufferEnd = (void )((unsigned long)pages[order].pointers[p]`
			`+ size - sizeof(unsigned));`
			`sysBuffer.As<unsigned *>()[0] = ++value;`

			`status = hsaKmtMapMemoryToGPU(pages[order].pointers[p],`
			`size, NULL);`
			`if (status != HSAKMT_STATUS_SUCCESS) {`
			`ASSERT_SUCCESS(hsaKmtFreeMemory(pages[order].pointers[p],`
			`size));`
			`pages[order].nPages = p;`
			`break;`
			`}`
			`Dispatch dispatch1(isaBuffer);`
			`dispatch1.SetArgs(sysBuffer.As<void*>(), pages[order].pointers[p]);`
			`dispatch1.Submit(queue);`
			`// no sync needed for multiple GPU dispatches to the same queue`

			`Dispatch dispatch2(isaBuffer);`
			`dispatch2.SetArgs(pages[order].pointers[p], bufferEnd);`
			`dispatch2.Submit(queue);`
			`// no sync needed for multiple GPU dispatches to the same queue`

			`Dispatch dispatch3(isaBuffer);`
			`dispatch3.SetArgs(bufferEnd,`
			`(void )&(sysBuffer.As<unsigned>()[1]));`
			`dispatch3.Submit(queue);`
			`dispatch3.Sync(g_TestTimeOut);`
			`EXPECT_EQ(value, sysBuffer.As<unsigned *>()[1]);`

			`EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pages[order].pointers[p]));`
			`}`
			`LOG() << " Got " << pages[order].nPages`
			`<< ", end of last block addr: "`
			`<< (void *)((unsigned long)pages[order].pointers[p-1] + size - 1)`
			`<< std::endl;`

			`// Now free half the memory`
			`for (o = 0; o <= order; o++) {`
			`unsigned long step = 1UL << (order - o + 1);`
			`unsigned long offset = (step >> 1) - 1;`
			`size = (HSAuint64)(1 << o) * pageSize;`
			`LOG() << " Freeing every " << step << "th order "`
			`<< o << " block starting with " << offset << std::endl;`
			`for (p = offset; p < pages[o].nPages; p += step) {`
			`ASSERT_NE((void **)NULL, pages[o].pointers[p]);`
			`EXPECT_SUCCESS(hsaKmtFreeMemory(pages[o].pointers[p], size));`
			`pages[o].pointers[p] = NULL;`
			`}`
			`}`
			`}`

			`/* Clean up */`
			`for (order = 0; order <= maxOrder; order++) {`
			`if (pages[order].pointers == NULL)`
			`continue;`

			`size = (HSAuint64)(1 << order) * pageSize;`
			`for (p = 0; p < pages[order].nPages; p++)`
			`if (pages[order].pointers[p] != NULL)`
			`EXPECT_SUCCESS(hsaKmtFreeMemory(pages[order].pointers[p], size));`

			`delete[] pages[order].pointers;`
			`}`

			`ASSERT_SUCCESS(queue.Destroy());`

			`TEST_END`
			`}`

			`TEST_F(KFDLocalMemoryTest, CheckZeroInitializationVram) {`
			`TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);`
			`TEST_START(TESTPROFILE_RUNALL);`

			`int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();`
			`ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";`

			`/* Testing VRAM */`
			`HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;`

			`if (!vramSizeMB) {`
			`LOG() << "No VRAM found, skipping the test" << std::endl;`
			`return;`
			`}`

			`HSAuint64 vramBufSizeMB = vramSizeMB >> 2;`
			`/* limit the buffer size in order not to overflow the SDMA queue buffer. */`
			`if (vramBufSizeMB > 1024) {`
			`vramBufSizeMB = 1024;`
			`}`
			`HSAuint64 vramBufSize = vramBufSizeMB * 1024 * 1024;`

			`/* Make sure the entire VRAM is used at least once */`
			`int count = (vramSizeMB + vramBufSizeMB - 1) / vramBufSizeMB + 1;`

			`LOG() << "Using " << std::dec << vramBufSizeMB`
			`<< "MB VRAM buffer to test " << std::dec << count`
			`<< " times"<< std::endl;`

			`SDMAQueue sdmaQueue;`
			`ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode, 8 * PAGE_SIZE));`

			`HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */);`
			`volatile HSAuint32 tmp = tmpBuffer.As<volatile HSAuint32 >();`

			`unsigned int offset = 2060; // a constant offset, should be 4 aligned.`

			`while (count--) {`
			`HsaMemoryBuffer localBuffer(vramBufSize, defaultGPUNode, false, true);`

			`EXPECT_TRUE(localBuffer.IsPattern(0, 0, sdmaQueue, tmp));`

			`for (HSAuint64 i = offset; i < vramBufSize;) {`
			`EXPECT_TRUE(localBuffer.IsPattern(i, 0, sdmaQueue, tmp));`
			`i += 4096;`
			`}`

			`/* Checking last 4 bytes */`
			`EXPECT_TRUE(localBuffer.IsPattern(vramBufSize - 4, 0, sdmaQueue, tmp));`

			`localBuffer.Fill(0xABCDEFFF, sdmaQueue);`
			`}`

			`TEST_END`
			`}`

			`TEST_F(KFDLocalMemoryTest, MapVramToGPUNodesTest) {`
			`TEST_START(TESTPROFILE_RUNALL);`

			`HSAint32 src_node;`
			`HSAint32 dst_node;`
			`HsaPointerInfo info;`

			`const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();`
			`if (gpuNodes.size() < 2) {`
			`LOG() << "Skipping test: Need at least two GPUs" << std::endl;`
			`return;`
			`}`

			`if (g_TestDstNodeId != -1 && g_TestNodeId != -1) {`
			`src_node = g_TestNodeId;`
			`dst_node = g_TestDstNodeId;`
			`} else {`
			`int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();`

			`dst_node = m_NodeInfo.FindLargeBarGPUNode();`
			`if (dst_node < 0) {`
			`LOG() << "Skipping test: Need at least one large bar GPU" << std::endl;`
			`return;`
			`}`

			`if (dst_node != defaultGPUNode) {`
			`/* at least one node should be defaultGPUNode */`
			`src_node = defaultGPUNode;`
			`} else {`
			`for (auto node : gpuNodes) {`
			`if (node != dst_node) {`
			`src_node = node;`
			`break;`
			`}`
			`}`
			`}`
			`}`

			`LOG() << "Testing from GPU " << src_node << " to GPU " << dst_node << std::endl;`

			`void *shared_addr;`
			`HSAuint32 nodes[] = { (HSAuint32)src_node, (HSAuint32)dst_node };`
			`HsaMemFlags memFlags = {0};`
			`memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;`
			`memFlags.ui32.HostAccess = 1;`
			`memFlags.ui32.NonPaged = 1;`
			`memFlags.ui32.ExecuteAccess = 1;`

			`HsaMemMapFlags mapFlags = {0};`

			`EXPECT_SUCCESS(hsaKmtAllocMemory(nodes[1], PAGE_SIZE, memFlags, &shared_addr));`
			`EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes(shared_addr, PAGE_SIZE, 2, nodes));`
			`EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 2, nodes));`
			`EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));`
			`EXPECT_EQ(info.NRegisteredNodes, 2);`
			`EXPECT_EQ(info.NMappedNodes, 2);`

			`EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[0]));`
			`EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));`
			`EXPECT_EQ(info.NRegisteredNodes, 2);`
			`EXPECT_EQ(info.NMappedNodes, 1);`
			`EXPECT_EQ(info.MappedNodes[0], nodes[0]);`

			`EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[1]));`
			`EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));`
			`EXPECT_EQ(info.NRegisteredNodes, 2);`
			`EXPECT_EQ(info.NMappedNodes, 1);`
			`EXPECT_EQ(info.MappedNodes[0], nodes[1]);`

			`EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(shared_addr));`
			`EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));`
			`EXPECT_EQ(info.NRegisteredNodes, 2);`
			`EXPECT_EQ(info.NMappedNodes, 0);`

			`EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[0]));`
			`EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info));`
			`EXPECT_EQ(info.NRegisteredNodes, 2);`
			`EXPECT_EQ(info.NMappedNodes, 1);`
			`EXPECT_EQ(info.MappedNodes[0], nodes[0]);`

			`EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(shared_addr));`
			`EXPECT_SUCCESS(hsaKmtFreeMemory(shared_addr, PAGE_SIZE));`

			`TEST_END`
			`}`