rocm-systems/tests/kfdtest/src/KFDMemoryTest.cpp

/*
 * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "KFDMemoryTest.hpp"
#include <sys/prctl.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
#include <numa.h>
#include <vector>
#include "Dispatch.hpp"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAQueue.hpp"
#include "SDMAPacket.hpp"
#include "linux/kfd_ioctl.h"

const char* gfx8_ScratchCopyDword =
"\
shader ScratchCopyDword\n\
asic(VI)\n\
type(CS)\n\
/*copy the parameters from scalar registers to vector registers*/\n\
    v_mov_b32 v0, s0\n\
    v_mov_b32 v1, s1\n\
    v_mov_b32 v2, s2\n\
    v_mov_b32 v3, s3\n\
/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
    s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n\
    s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n\
/*copy a dword between the passed addresses*/\n\
    flat_load_dword v4, v[0:1] slc\n\
    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
    flat_store_dword v[2:3], v4 slc\n\
    \n\
    s_endpgm\n\
    \n\
end\n\
";

const char* gfx9_ScratchCopyDword =
"\
shader ScratchCopyDword\n\
asic(GFX9)\n\
type(CS)\n\
/*copy the parameters from scalar registers to vector registers*/\n\
    v_mov_b32 v0, s0\n\
    v_mov_b32 v1, s1\n\
    v_mov_b32 v2, s2\n\
    v_mov_b32 v3, s3\n\
/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
    s_mov_b32 flat_scratch_lo, s4\n\
    s_mov_b32 flat_scratch_hi, s5\n\
/*copy a dword between the passed addresses*/\n\
    flat_load_dword v4, v[0:1] slc\n\
    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
    flat_store_dword v[2:3], v4 slc\n\
    \n\
    s_endpgm\n\
    \n\
end\n\
";

/* Continuously poll src buffer and check buffer value
 * After src buffer is filled with specific value (0x5678,
 * by host program), fill dst buffer with specific
 * value(0x5678) and quit
 */
const char* gfx9_PollMemory =
"\
shader ReadMemory\n\
asic(GFX9)\n\
type(CS)\n\
/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
    s_movk_i32 s18, 0x5678\n\
    LOOP:\n\
    s_load_dword s16, s[0:1], 0x0 glc\n\
    s_cmp_eq_i32 s16, s18\n\
    s_cbranch_scc0   LOOP\n\
    s_store_dword s18, s[2:3], 0x0 glc\n\
    s_endpgm\n\
    end\n\
";

/* Input: A buffer of at least 3 dwords.
 * DW0: used as a signal. 0xcafe means it is signaled
 * DW1: Input buffer for device to read.
 * DW2: Output buffer for device to write.
 * Once receive signal, device will copy DW1 to DW2
 * This shader continously poll the signal buffer,
 * Once signal buffer is signaled, it copies input buffer
 * to output buffer
 */
const char* gfx9_CopyOnSignal =
"\
shader CopyOnSignal\n\
asic(GFX9)\n\
type(CS)\n\
/* Assume input buffer in s0, s1 */\n\
    s_mov_b32 s18, 0xcafe\n\
POLLSIGNAL:\n\
    s_load_dword s16, s[0:1], 0x0 glc\n\
    s_cmp_eq_i32 s16, s18\n\
    s_cbranch_scc0   POLLSIGNAL\n\
    s_load_dword s17, s[0:1], 0x4 glc\n\
    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
    s_store_dword s17, s[0:1], 0x8 glc\n\
    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
    s_endpgm\n\
    end\n\
";

/* Input0: A buffer of at least 2 dwords.
 * DW0: used as a signal. Write 0xcafe to signal
 * DW1: Write to this buffer for other device to read.
 * Input1: mmio base address
 */
const char* gfx9_WriteAndSignal =
"\
shader WriteAndSignal\n\
asic(GFX9)\n\
type(CS)\n\
/* Assume input buffer in s0, s1 */\n\
    s_mov_b32 s18, 0xbeef\n\
    s_store_dword s18, s[0:1], 0x4 glc\n\
    s_mov_b32 s18, 0x1\n\
    s_store_dword s18, s[2:3], 0 glc\n\
    s_mov_b32 s18, 0xcafe\n\
    s_store_dword s18, s[0:1], 0x0 glc\n\
    s_endpgm\n\
    end\n\
";

void KFDMemoryTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

    m_pIsaGen = IsaGenerator::Create(m_FamilyId);

    ROUTINE_END
}

void KFDMemoryTest::TearDown() {
    ROUTINE_START

    if (m_pIsaGen)
        delete m_pIsaGen;
    m_pIsaGen = NULL;

    KFDBaseComponentTest::TearDown();

    ROUTINE_END
}

#include <sys/mman.h>
#define GB(x) ((x) << 30)

/*
 * Try to map as much as possible system memory to gpu
 * to see if KFD supports 1TB memory correctly or not.
 * After this test case, we can observe if there are any side effects.
 * NOTICE: There are memory usage limit checks in hsa/kfd according to the total
 * physical system memory.
 */
TEST_F(KFDMemoryTest, MMapLarge) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL)

    if (!is_dgpu()) {
        LOG() << "Skipping test: Test not supported on APU." << std::endl;
        return;
    }

    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
    const HSAuint64 nObjects = 1<<14;
    HSAuint64 *AlternateVAGPU = new HSAuint64[nObjects];
    ASSERT_NE((HSAuint64)AlternateVAGPU, 0);
    HsaMemMapFlags mapFlags = {0};
    HSAuint64 s;
    char *addr;
    HSAuint64 flags = MAP_ANONYMOUS | MAP_PRIVATE;

    /* Test up to 1TB memory*/
    s = GB(1024ULL) / nObjects;
    addr = reinterpret_cast<char*>(mmap(0, s, PROT_READ | PROT_WRITE, flags, -1, 0));
    ASSERT_NE(addr, MAP_FAILED);
    memset(addr, 0, s);

    int i = 0;
    /* Allocate 1024GB, aka 1TB*/
    for (; i < nObjects; i++) {
        if (hsaKmtRegisterMemory(addr + i, s - i))
            break;
        if (hsaKmtMapMemoryToGPUNodes(addr + i, s - i,
                    &AlternateVAGPU[i], mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode))) {
            hsaKmtDeregisterMemory(addr + i);
            break;
        }
    }

    LOG() << "Successfully registered and mapped " << (i * s >> 30)
            << "GB system memory to gpu" << std::endl;

    RECORD(i * s >> 30) << "Mmap-SysMem-Size";

    while (i--) {
        EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void*>(AlternateVAGPU[i])));
        EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void*>(AlternateVAGPU[i])));
    }

    munmap(addr, s);
    delete []AlternateVAGPU;

    TEST_END
}

/* Keep memory mapped to default node
 * Keep mapping/unmapping memory to/from non-default node
 * A shader running on default node consistantly accesses
 * memory - make sure memory is always accessible by default,
 * i.e. there is no gpu vm fault.
 * Synchronization b/t host program and shader:
 * 1. Host initializes src and dst buffer to 0
 * 2. Shader keeps reading src buffer and check value
 * 3. Host writes src buffer to 0x5678 to indicate quit, polling dst until it becomes 0x5678
 * 4. Shader write dst buffer to 0x5678 after src changes to 0x5678, then quits
 * 5. Host program quits after dst becomes 0x5678
 * Need at least two gpu nodes to run the test. The default node has to be a gfx9 node,
 * otherwise, test is skipped. Use kfdtest --node=$$ to specify the default node
 * This test case is introduced as a side-result of investigation of SWDEV-134798, which
 * is a gpu vm fault while running rocr conformance test. Here we try to simulate the
 * same test behaviour.
 */
TEST_F(KFDMemoryTest, MapUnmapToNodes) {
    TEST_START(TESTPROFILE_RUNALL)
    if (m_FamilyId != FAMILY_AI) {
        LOG() << "Skipping test: GFX9-based shader not supported on other ASICs." << std::endl;
        return;
    }

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    if (gpuNodes.size() < 2) {
        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
        return;
    }
    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    LOG() << "default GPU node" << defaultGPUNode << std::endl;
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HSAuint32 nondefaultNode;
    for (unsigned i = 0; i < gpuNodes.size(); i++) {
        if (gpuNodes.at(i) != defaultGPUNode) {
            nondefaultNode = gpuNodes.at(i);
            break;
        }
    }
    HSAuint32 mapNodes[2] = {defaultGPUNode, nondefaultNode};

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);

    m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);

    PM4Queue pm4Queue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));

    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
    dispatch0.Submit(pm4Queue);

    HsaMemMapFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;

    for (unsigned i = 0; i < 1<<14; i ++) {
        hsaKmtMapMemoryToGPUNodes(srcBuffer.As<void*>(), PAGE_SIZE, NULL, memFlags, (i>>5)&1+1, mapNodes);
    }

    /* Fill src buffer so shader quits */
    srcBuffer.Fill(0x5678);
    WaitOnValue(dstBuffer.As<uint32_t *>(), 0x5678);
    EXPECT_EQ(*dstBuffer.As<uint32_t *>(), 0x5678);
    EXPECT_SUCCESS(pm4Queue.Destroy());
    TEST_END
}

// Basic test of hsaKmtMapMemoryToGPU and hsaKmtUnmapMemoryToGPU
TEST_F(KFDMemoryTest , MapMemoryToGPU) {
    TEST_START(TESTPROFILE_RUNALL)

    unsigned int *nullPtr = NULL;
    unsigned int* pDb = NULL;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
                   reinterpret_cast<void**>(&pDb)));
    // verify that pDb is not null before it's being used
    ASSERT_NE(nullPtr, pDb) << "hsaKmtAllocMemory returned a null pointer";
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(pDb, PAGE_SIZE, NULL));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
    // Release the buffers
    EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, PAGE_SIZE));

    TEST_END
}

// Following tests are for hsaKmtAllocMemory with invalid params
TEST_F(KFDMemoryTest, InvalidMemoryPointerAlloc) {
    TEST_START(TESTPROFILE_RUNALL)

    EXPECT_EQ(HSAKMT_STATUS_INVALID_PARAMETER, hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, NULL));

    TEST_END
}

TEST_F(KFDMemoryTest, ZeroMemorySizeAlloc) {
    TEST_START(TESTPROFILE_RUNALL)

    unsigned int* pDb = NULL;
    EXPECT_EQ(HSAKMT_STATUS_INVALID_PARAMETER, hsaKmtAllocMemory(0 /* system */, 0, m_MemoryFlags,
              reinterpret_cast<void**>(&pDb)));

    TEST_END
}

// Basic test for hsaKmtAllocMemory
TEST_F(KFDMemoryTest, MemoryAlloc) {
    TEST_START(TESTPROFILE_RUNALL)

    unsigned int* pDb = NULL;
    EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pDb)));

    TEST_END
}

TEST_F(KFDMemoryTest, AccessPPRMem) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    if (is_dgpu()) {
        LOG() << "Skipping test: Test requires APU." << std::endl;
        return;
    }

    unsigned int *destBuf = (unsigned int *)VirtualAllocMemory(NULL, PAGE_SIZE,
                                            MEM_READ | MEM_WRITE);

    PM4Queue queue;

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));

    queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf,
                                0xABCDEF09, 0x12345678));

    queue.Wait4PacketConsumption();

    WaitOnValue(destBuf, 0xABCDEF09);
    WaitOnValue(destBuf + 1, 0x12345678);

    EXPECT_SUCCESS(queue.Destroy());

    /* This sleep hides the dmesg PPR message storm on Raven, which happens
     * when the CPU buffer is freed before the excessive PPRs are all
     * consumed by IOMMU HW. Because of that, a kernel driver workaround
     * is put in place to address that, so we don't need to wait here.
     */
    // sleep(5);

    VirtualFreeMemory(destBuf, PAGE_SIZE);

    TEST_END
}

// Linux OS-specific Test for registering OS allocated memory
TEST_F(KFDMemoryTest, MemoryRegister) {
    const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties();
    if (isTonga(pNodeProperties)) {
        LOG() << "Skipping test: Workaround in thunk for Tonga causes failure." << std::endl;
        return;
    }

    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    /* Different unaligned memory locations to be mapped for GPU
     * access:
     *
     * - initialized data segment (file backed)
     * - stack (anonymous memory)
     *
     * Separate them enough so they are in different cache lines
     * (64-byte = 16-dword).
     */
    static volatile HSAuint32 globalData = 0xdeadbeef;
    volatile HSAuint32 stackData[17] = {0};
    const unsigned dstOffset = 0;
    const unsigned sdmaOffset = 16;

    HsaMemoryBuffer srcBuffer((void *)&globalData, sizeof(HSAuint32));
    HsaMemoryBuffer dstBuffer((void *)&stackData[dstOffset], sizeof(HSAuint32));
    HsaMemoryBuffer sdmaBuffer((void *)&stackData[sdmaOffset], sizeof(HSAuint32));

    /* Create PM4 and SDMA queues before fork+COW to test queue
     * eviction and restore
     */
    PM4Queue pm4Queue;
    SDMAQueue sdmaQueue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->GetCopyDwordIsa(isaBuffer);

    /* First submit just so the queues are not empty, and to get the
     * TLB populated (in case we need to flush TLBs somewhere after
     * updating the page tables)
     */
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
    dispatch0.Submit(pm4Queue);
    dispatch0.Sync(g_TestTimeOut);

    sdmaQueue.PlaceAndSubmitPacket(SDMAWriteDataPacket(sdmaBuffer.As<HSAuint32 *>(), 0x12345678));
    sdmaQueue.Wait4PacketConsumption();
    EXPECT_TRUE(WaitOnValue(&stackData[sdmaOffset], 0x12345678));

    /* Fork a child process to mark pages as COW */
    pid_t pid = fork();
    ASSERT_GE(pid, 0);
    if (pid == 0) {
        /* Child process waits for a SIGTERM from the parent. It can't
         * make any write access to the stack because we want the
         * parent to make the first write access and get a new copy. A
         * busy loop is the safest way to do that, since any function
         * call (e.g. sleep) would write to the stack.
         */
        while (1)
        {}
        WARN() << "Shouldn't get here!" << std::endl;
        exit(0);
    }

    /* Parent process writes to COW page(s) and gets a new copy. MMU
     * notifier needs to update the GPU mapping(s) for the test to
     * pass.
     */
    globalData = 0xD00BED00;
    stackData[dstOffset] = 0xdeadbeef;
    stackData[sdmaOffset] = 0xdeadbeef;

    /* Terminate the child process before a possible test failure that
     * would leave it spinning in the background indefinitely.
     */
    int status;
    EXPECT_EQ(0, kill(pid, SIGTERM));
    EXPECT_EQ(pid, waitpid(pid, &status, 0));
    EXPECT_NE(0, WIFSIGNALED(status));
    EXPECT_EQ(SIGTERM, WTERMSIG(status));

    /* Now check that the GPU is accessing the correct page */
    Dispatch dispatch1(isaBuffer);
    dispatch1.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
    dispatch1.Submit(pm4Queue);
    dispatch1.Sync(g_TestTimeOut);

    sdmaQueue.PlaceAndSubmitPacket(SDMAWriteDataPacket(sdmaBuffer.As<HSAuint32 *>(), 0xD0BED0BE));
    sdmaQueue.Wait4PacketConsumption();

    EXPECT_SUCCESS(pm4Queue.Destroy());
    EXPECT_SUCCESS(sdmaQueue.Destroy());

    EXPECT_EQ(0xD00BED00, globalData);
    EXPECT_EQ(0xD00BED00, stackData[dstOffset]);
    EXPECT_EQ(0xD0BED0BE, stackData[sdmaOffset]);

    TEST_END
}

TEST_F(KFDMemoryTest, MemoryRegisterSamePtr) {
    if (!is_dgpu()) {
        LOG() << "Skipping test: Will run on APU once APU+dGPU supported." << std::endl;
        return;
    }

    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    HSAuint64 nGPU = gpuNodes.size();  // number of gpu nodes
    static volatile HSAuint32 mem[4];
    HSAuint64 gpuva1, gpuva2;

    /* Same address, different size */
    EXPECT_SUCCESS(hsaKmtRegisterMemory((void *)&mem[0], sizeof(HSAuint32)*2));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPU((void *)&mem[0], sizeof(HSAuint32)*2,
                                        &gpuva1));
    EXPECT_SUCCESS(hsaKmtRegisterMemory((void *)&mem[0], sizeof(HSAuint32)));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPU((void *)&mem[0], sizeof(HSAuint32),
                                        &gpuva2));
    EXPECT_TRUE(gpuva1 != gpuva2);
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva1)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva1)));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva2)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva2)));

    /* Same address, same size */
    HsaMemMapFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;

    HSAuint32 nodes[nGPU];
    for (unsigned int i = 0; i < nGPU; i++)
        nodes[i] = gpuNodes.at(i);
    EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes((void *)&mem[2],
                            sizeof(HSAuint32)*2, nGPU, nodes));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes((void *)&mem[2],
                                        sizeof(HSAuint32) * 2,
                                        &gpuva1, memFlags, nGPU, nodes));
    EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes((void *)&mem[2],
                                        sizeof(HSAuint32) * 2, nGPU, nodes));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes((void *)&mem[2],
                                        sizeof(HSAuint32) * 2,
                                        &gpuva2, memFlags, nGPU, nodes));
    EXPECT_EQ(gpuva1, gpuva2);
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva1)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva1)));
    /* Confirm that we still have access to the memory, mem[2] */
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    mem[2] = 0x0;
    queue.PlaceAndSubmitPacket(PM4WriteDataPacket(reinterpret_cast<unsigned int *>(gpuva2),
                                                  0xdeadbeef));
    queue.PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(true, 0, 0));
    queue.Wait4PacketConsumption();
    EXPECT_EQ(true, WaitOnValue((unsigned int *)(&mem[2]), 0xdeadbeef));
    EXPECT_SUCCESS(queue.Destroy());
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva2)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva2)));

    TEST_END
}

/* FlatScratchAccess
 * Since HsaMemoryBuffer has to be associated with a specific GPU node, this function in the current form
 * will not work for multiple GPU nodes. For now test only one default GPU node.
 * TODO: Generalize it to support multiple nodes
 */

#define SCRATCH_SLICE_SIZE 0x10000
#define SCRATCH_SLICE_NUM 3
#define SCRATCH_SIZE (SCRATCH_SLICE_NUM * SCRATCH_SLICE_SIZE)
#define SCRATCH_SLICE_OFFSET(i) ((i) * SCRATCH_SLICE_SIZE)

TEST_F(KFDMemoryTest, FlatScratchAccess) {
    TEST_START(TESTPROFILE_RUNALL)
    if (m_FamilyId == FAMILY_CI || m_FamilyId == FAMILY_KV) {
        LOG() << "Skipping test: VI-based shader not supported on other ASICs." << std::endl;
        return;
    }

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    HsaMemoryBuffer scratchBuffer(SCRATCH_SIZE, defaultGPUNode, false/*zero*/, false/*local*/,
                                  false/*exec*/, true /*scratch*/);

    // Unmap scratch for sub-allocation mapping tests
    ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<void*>()));

    // Map and unmap a few slices in different order: 2-0-1, 0-2-1
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(2),
                                        SCRATCH_SLICE_SIZE, NULL));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(0),
                                        SCRATCH_SLICE_SIZE, NULL));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(1),
                                        SCRATCH_SLICE_SIZE, NULL));

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(1)));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(2)));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(0)));

    // Map everything for test below
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>(), SCRATCH_SIZE, NULL));

    HsaMemoryBuffer srcMemBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer dstMemBuffer(PAGE_SIZE, defaultGPUNode);

    // Initialize the srcBuffer to some fixed value
    srcMemBuffer.Fill(0x01010101);

    // Initialize a buffer with a dword copy ISA
    m_pIsaGen->CompileShader((m_FamilyId >= FAMILY_AI) ? gfx9_ScratchCopyDword : gfx8_ScratchCopyDword,
            "ScratchCopyDword", isaBuffer);

    const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);

    /* TODO: Add support to all GPU Nodes.
     * The loop over the system nodes is removed as the test can be executed only on GPU nodes. This
     * also requires changes to be made to all the HsaMemoryBuffer variables defined above, as
     * HsaMemoryBuffer is now associated with a Node.
     */
    if (pNodeProperties != NULL) {
        // Get the aperture of the scratch buffer
        HsaMemoryProperties *memoryProperties = new HsaMemoryProperties[pNodeProperties->NumMemoryBanks];
        EXPECT_SUCCESS(hsaKmtGetNodeMemoryProperties(defaultGPUNode, pNodeProperties->NumMemoryBanks,
                       memoryProperties));

        for (unsigned int bank = 0; bank < pNodeProperties->NumMemoryBanks; bank++) {
            if (memoryProperties[bank].HeapType == HSA_HEAPTYPE_GPU_SCRATCH) {
                int numWaves = 4;  // WAVES must be >= # SE
                int waveSize = 1;  // Amount of space used by each wave in units of 256 dwords

                PM4Queue queue;
                ASSERT_SUCCESS(queue.Create(defaultGPUNode));

                HSAuint64 scratchApertureAddr = memoryProperties[bank].VirtualBaseAddress;

                // Create a dispatch packet to copy
                Dispatch dispatchSrcToScratch(isaBuffer);

                // Setup the dispatch packet
                // Copying from the source Memory Buffer to the scratch buffer
                dispatchSrcToScratch.SetArgs(srcMemBuffer.As<void*>(), reinterpret_cast<void*>(scratchApertureAddr));
                dispatchSrcToScratch.SetDim(1, 1, 1);
                dispatchSrcToScratch.SetScratch(numWaves, waveSize, scratchBuffer.As<uint64_t>());
                // Submit the packet
                dispatchSrcToScratch.Submit(queue);
                dispatchSrcToScratch.Sync();

                // Create another dispatch packet to copy scratch buffer contents to destination buffer.
                Dispatch dispatchScratchToDst(isaBuffer);

                // Set the arguments to copy from the scratch buffer to the destination buffer
                dispatchScratchToDst.SetArgs(reinterpret_cast<void*>(scratchApertureAddr), dstMemBuffer.As<void*>());
                dispatchScratchToDst.SetDim(1, 1, 1);
                dispatchScratchToDst.SetScratch(numWaves, waveSize, scratchBuffer.As<uint64_t>());

                // Submit the packet
                dispatchScratchToDst.Submit(queue);
                dispatchScratchToDst.Sync();

                // Check that the scratch buffer contents were correctly copied over to the system memory buffer
                EXPECT_EQ(dstMemBuffer.As<unsigned int*>()[0], 0x01010101);
            }
        }

        delete [] memoryProperties;
    }

    TEST_END
}

TEST_F(KFDMemoryTest, GetTileConfigTest) {
    TEST_START(TESTPROFILE_RUNALL)

    HSAuint32 tile_config[32] = {0};
    HSAuint32 macro_tile_config[16] = {0};
    unsigned int i;
    HsaGpuTileConfig config = {0};

    config.TileConfig = tile_config;
    config.MacroTileConfig = macro_tile_config;
    config.NumTileConfigs = 32;
    config.NumMacroTileConfigs = 16;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();

    ASSERT_SUCCESS(hsaKmtGetTileConfig(defaultGPUNode, &config));

    LOG() << "tile_config:" << std::endl;
    for (i = 0; i < config.NumTileConfigs; i++)
        LOG() << "\t" << std::dec << i << ": 0x" << std::hex
                << tile_config[i] << std::endl;

    LOG() << "macro_tile_config:" << std::endl;
    for (i = 0; i < config.NumMacroTileConfigs; i++)
        LOG() << "\t" << std::dec << i << ": 0x" << std::hex
                << macro_tile_config[i] << std::endl;

    LOG() << "gb_addr_config: 0x" << std::hex << config.GbAddrConfig
            << std::endl;
    LOG() << "num_banks: 0x" << std::hex << config.NumBanks << std::endl;
    LOG() << "num_ranks: 0x" << std::hex << config.NumRanks << std::endl;

    TEST_END
}

void KFDMemoryTest::BigBufferSystemMemory(int defaultGPUNode, HSAuint64 granularityMB,
                                                HSAuint64 *lastSize) {
    HSAuint64 sysMemSizeMB;
    HsaMemMapFlags mapFlags = {0};
    HSAuint64 AlternateVAGPU;
    int ret;

    sysMemSizeMB = GetSysMemSize() >> 20;

    LOG() << "Found System Memory of " << std::dec << sysMemSizeMB
                << "MB" << std::endl;

    /* Testing big buffers in system memory */
    unsigned int * pDb = NULL;
    HSAuint64 lowMB = 0;
    HSAuint64 highMB = (sysMemSizeMB + granularityMB - 1) & ~(granularityMB - 1);

    HSAuint64 sizeMB;
    HSAuint64 size = 0;
    HSAuint64 lastTestedSize = 0;

    while (highMB - lowMB > granularityMB) {
        sizeMB = (lowMB + highMB) / 2;
        size = sizeMB * 1024 * 1024;
        ret = hsaKmtAllocMemory(0 /* system */, size, m_MemoryFlags,
                                reinterpret_cast<void**>(&pDb));
        if (ret) {
            highMB = sizeMB;
            continue;
        }

        ret = hsaKmtMapMemoryToGPUNodes(pDb, size, &AlternateVAGPU,
                        mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode));
        if (ret) {
            EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));
            highMB = sizeMB;
            continue;
        }
        EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
        EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));

        lowMB = sizeMB;
        lastTestedSize = sizeMB;
    }

    /* Save the biggest allocated system buffer for signal handling test */
    LOG() << "The biggest allocated system buffer is " << std::dec
            << lastTestedSize << "MB" << std::endl;
    if (lastSize)
        *lastSize = lastTestedSize * 1024 *1024;
}

void KFDMemoryTest::BigBufferVRAM(int defaultGPUNode, HSAuint64 granularityMB,
                                        HSAuint64 *lastSize) {
    HSAuint64 AlternateVAGPU;
    int ret;
    HSAuint64 vramSizeMB;
    HsaMemFlags memFlags;
    HsaMemMapFlags mapFlags = {0};

    vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    LOG() << "Found VRAM of " << std::dec << vramSizeMB << "MB." << std::endl;

    /* Testing big buffers in VRAM */
    unsigned int * pDb = NULL;
    HSAuint64 lowMB = 0;
    HSAuint64 highMB = (vramSizeMB + granularityMB - 1) & ~(granularityMB - 1);

    HSAuint64 sizeMB;
    HSAuint64 size = 0;
    HSAuint64 lastTestedSize = 0;

    memset(&memFlags, 0, sizeof(memFlags));
    memFlags.ui32.HostAccess = 0;
    memFlags.ui32.NonPaged = 1;

    while (highMB - lowMB > granularityMB) {
        sizeMB = (lowMB + highMB) / 2;
        size = sizeMB * 1024 * 1024;
        ret = hsaKmtAllocMemory(defaultGPUNode, size, memFlags,
                                reinterpret_cast<void**>(&pDb));
        if (ret) {
            highMB = sizeMB;
            continue;
        }

        ret = hsaKmtMapMemoryToGPUNodes(pDb, size, &AlternateVAGPU,
                        mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode));
        if (ret) {
            EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));
            highMB = sizeMB;
            continue;
        }
        EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
        EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));

        lowMB = sizeMB;
        lastTestedSize = sizeMB;
    }

    LOG() << "The biggest allocated VRAM buffer is " << std::dec
            << lastTestedSize << "MB" << std::endl;
    if (lastSize)
        *lastSize = lastTestedSize * 1024 * 1024;

    /* Make sure 3/4 vram can be allocated.*/
    EXPECT_GE(lastTestedSize * 4, vramSizeMB * 3);
    if (lastTestedSize * 16 < vramSizeMB * 15)
        WARN() << "The biggest allocated VRAM buffer size is smaller than the expected "
            << vramSizeMB * 15 / 16 << "MB" << std::endl;
}

void KFDMemoryTest::NumaNodeBind(const char *nodeStr) {
    if (numa_available() != -1) {
        int num_node = numa_num_task_nodes();

        if (num_node > 1) {
            struct bitmask *nodemask;

            LOG() << "NUMA total nodes " << num_node << ", bind to " << nodeStr << std::endl;

            nodemask = numa_parse_nodestring(nodeStr);
            if (nodemask) {
                numa_bind(nodemask);
                numa_free_nodemask(nodemask);
            }
        }
    }
}

/* BigBufferStressTest allocs, maps/unmaps, and frees the biggest possible system
 * buffers. Its size is found using binary search in the range (0, RAM SIZE) with
 * a granularity of 128M. Repeat the similar logic on local buffers (VRAM).
 * Finally, it allocs and maps 128M system buffers in a loop until it
 * fails, then unmaps and frees them afterwards.
 * Please note we limit the biggest possible system buffer to be smaller than
 * the RAM size. The reason is that the system buffer can make use of virtual
 * memory so that a system buffer could be very large even though the RAM size
 * is small. For example, on a typical Carrizo platform, the biggest allocated
 * system buffer could be more than 14G even though it only has 4G memory.
 * In that situation, it will take too much time to finish the test, because of
 * the onerous memory swap operation. So we limit the buffer size that way.
 */
TEST_F(KFDMemoryTest, BigBufferStressTest) {
    if (!is_dgpu()) {
        LOG() << "Skipping test: Running on APU fails and locks the system." << std::endl;
        return;
    }
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    HSAuint64 AlternateVAGPU;
    HsaMemMapFlags mapFlags = {0};
    int ret;

    HSAuint64 granularityMB = 128;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    /* Don't run on node 0 on multiple NUMA node machine because dma32 zone is on node 0,
     * Use all memory including dma32 zone on node 0 will cause TTM eviction to free dma32
     * zone for other devices which supports 32bit physical address. The eviction and
     * restore may retry if busy and cause queue timeout and test failure.
     */
    NumaNodeBind("!0");

    BigBufferSystemMemory(defaultGPUNode, granularityMB, NULL);

    BigBufferVRAM(defaultGPUNode, granularityMB, NULL);

    /* Repeatedly allocate and map big buffers in system memory until it fails,
     * then unmap and free them.
     */
#define ARRAY_ENTRIES 2048

    int i = 0, allocationCount = 0;
    unsigned int* pDb_array[ARRAY_ENTRIES];
    HSAuint64 block_size_mb = 128;
    HSAuint64 block_size = block_size_mb * 1024 * 1024;
    PM4Queue queue;

    /* Test 4 times to see if there is any memory leak.*/
    for (int repeat = 1; repeat < 5; repeat++) {
        ASSERT_SUCCESS(queue.Create(defaultGPUNode));

        for (i = 0; i < ARRAY_ENTRIES; i++) {
            ret = hsaKmtAllocMemory(0 /* system */, block_size, m_MemoryFlags,
                    reinterpret_cast<void**>(&pDb_array[i]));
            if (ret)
                break;

            ret = hsaKmtMapMemoryToGPUNodes(pDb_array[i], block_size,
                    &AlternateVAGPU, mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode));
            if (ret) {
                EXPECT_SUCCESS(hsaKmtFreeMemory(pDb_array[i], block_size));
                break;
            }
        }

        LOG() << "Allocated system buffers time " << std::dec << repeat << ": " << i << "x"
            << block_size_mb << "MB" << std::endl;

        if (allocationCount == 0)
            allocationCount = i;
        EXPECT_GE(i, allocationCount) << "There might be memory leak!" << std::endl;

        for (int j = 0; j < i; j++) {
            /* To see if GPU can access the memory correctly*/
            unsigned int *begin = pDb_array[j];
            *begin = 0;
            queue.PlaceAndSubmitPacket(
                    PM4WriteDataPacket(begin, 0xdeadbeaf));
            queue.Wait4PacketConsumption(NULL, 300000);
            EXPECT_TRUE(WaitOnValue(begin, 0xdeadbeaf));
        }

        EXPECT_SUCCESS(queue.Destroy());

        for (int j = 0; j < i; j++) {
            EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb_array[j]));
            EXPECT_SUCCESS(hsaKmtFreeMemory(pDb_array[j], block_size));
        }
    }

    /* Reset to run on all task nodes */
    NumaNodeBind("all");

    TEST_END
}

TEST_F(KFDMemoryTest, MMBench) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    unsigned testIndex, sizeIndex, memType, nMemTypes;
    const char *memTypeStrings[2] = {"SysMem", "VRAM"};
    const struct {
        unsigned size;
        unsigned num;
    } bufParams[] = {
        /* Buffer sizes in x16 increments. Limit memory usage to about
         * 1GB. For small sizes we use 1000 buffers, which means we
         * conveniently measure microseconds and report nanoseconds.
         */
        {PAGE_SIZE      , 1000},  /*  4KB */
        {PAGE_SIZE <<  4, 1000},  /* 64KB */
        {PAGE_SIZE <<  9,  500},  /*  2MB */
        {PAGE_SIZE << 13,   32},  /* 32MB */
        {PAGE_SIZE << 18,    1},  /*  1GB */
    };
    const unsigned nSizes = sizeof(bufParams) / sizeof(bufParams[0]);
    const unsigned nTests = nSizes << 2;
#define TEST_BUFSIZE(index) (bufParams[(index) % nSizes].size)
#define TEST_NBUFS(index)  (bufParams[(index) % nSizes].num)
#define TEST_MEMTYPE(index) ((index / nSizes) & 0x1)
#define TEST_SDMA(index)    (((index / nSizes) >> 1) & 0x1)

    void *bufs[1000];
    HSAuint64 start, end;
    unsigned i;
    HSAKMT_STATUS ret;
    HsaMemFlags memFlags = {0};
    HsaMemMapFlags mapFlags = {0};
    HSAuint64 altVa;

    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    bool is_all_large_bar = true;

    for (unsigned i = 0; i < gpuNodes.size(); i++) {
        if (!m_NodeInfo.IsGPUNodeLargeBar(gpuNodes.at(i))) {
                is_all_large_bar = false;
                break;
        }
    }

    LOG() << "Found VRAM of " << std::dec << vramSizeMB << "MB." << std::endl;

    if (vramSizeMB == 0)
        nMemTypes = 1;
    else
        nMemTypes = 2;

    /* Two SDMA queues to interleave user mode SDMA with memory
     * management on either SDMA engine. Make the queues long enough
     * to buffer at least nBufs x WriteData packets (7 dwords per
     * packet).
     */
    SDMAQueue sdmaQueue[2];
    ASSERT_SUCCESS(sdmaQueue[0].Create(defaultGPUNode, PAGE_SIZE*8));
    ASSERT_SUCCESS(sdmaQueue[1].Create(defaultGPUNode, PAGE_SIZE*8));
    HsaMemoryBuffer sdmaBuffer(PAGE_SIZE, 0); /* system memory */
#define INTERLEAVE_SDMA() do {                                          \
        if (interleaveSDMA) {                                           \
            sdmaQueue[0].PlaceAndSubmitPacket(                          \
                SDMAWriteDataPacket(sdmaBuffer.As<HSAuint32 *>(),       \
                                    0x12345678));                       \
            sdmaQueue[1].PlaceAndSubmitPacket(                          \
                SDMAWriteDataPacket(sdmaBuffer.As<HSAuint32 *>()+16,    \
                                    0x12345678));                       \
        }                                                               \
    } while (0)
#define IDLE_SDMA() do {                                                \
        if (interleaveSDMA) {                                           \
            sdmaQueue[0].Wait4PacketConsumption();                      \
            sdmaQueue[1].Wait4PacketConsumption();                      \
        }                                                               \
    } while (0)

    LOG() << "Test (avg. ns)\t    alloc   mapOne  umapOne   mapAll  umapAll     free" << std::endl;
    for (testIndex = 0; testIndex < nTests; testIndex++) {
        unsigned bufSize = TEST_BUFSIZE(testIndex);
        unsigned nBufs = TEST_NBUFS(testIndex);
        unsigned memType = TEST_MEMTYPE(testIndex);
        bool interleaveSDMA = TEST_SDMA(testIndex);
        HSAuint64 allocTime, map1Time, unmap1Time, mapAllTime, unmapAllTime, freeTime;
        HSAuint32 allocNode;

        if ((testIndex % nSizes) == 0)
            LOG() << "--------------------------------------------------------------------------" << std::endl;

        if (memType >= nMemTypes)
            continue;  // skip unsupported mem types

        if (memType == 0) {
            allocNode = 0;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 1;
            memFlags.ui32.NonPaged = 0;
        } else {
            allocNode = defaultGPUNode;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 0;
            memFlags.ui32.NonPaged = 1;
        }

        /* Allocation */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            ASSERT_SUCCESS(hsaKmtAllocMemory(allocNode, bufSize, memFlags,
                                             &bufs[i]));
            INTERLEAVE_SDMA();
        }
        allocTime = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Map to one GPU */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(bufs[i], bufSize,
                                                     &altVa, mapFlags, 1,
                                                     &defaultGPUNode));
            INTERLEAVE_SDMA();
        }
        map1Time = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Unmap from GPU */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(bufs[i]));
            INTERLEAVE_SDMA();
        }
        unmap1Time = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Map to all GPUs */
        if (is_all_large_bar) {
            start = GetSystemTickCountInMicroSec();
            for (i = 0; i < nBufs; i++) {
                ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(bufs[i], bufSize, &altVa));
                INTERLEAVE_SDMA();
            }
            mapAllTime = GetSystemTickCountInMicroSec() - start;
            IDLE_SDMA();

            /* Unmap from all GPUs */
            start = GetSystemTickCountInMicroSec();
            for (i = 0; i < nBufs; i++) {
                EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(bufs[i]));
                INTERLEAVE_SDMA();
            }
            unmapAllTime = GetSystemTickCountInMicroSec() - start;
            IDLE_SDMA();
        }

        /* Free */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            EXPECT_SUCCESS(hsaKmtFreeMemory(bufs[i], bufSize));
            INTERLEAVE_SDMA();
        }
        freeTime = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        allocTime = allocTime * 1000 / nBufs;
        map1Time = map1Time * 1000 / nBufs;
        unmap1Time = unmap1Time * 1000 / nBufs;
        mapAllTime = mapAllTime * 1000 / nBufs;
        unmapAllTime = unmapAllTime * 1000 / nBufs;
        freeTime = freeTime * 1000 / nBufs;

        unsigned bufSizeLog;
        char bufSizeUnit;
        if (bufSize < (1 << 20)) {
            bufSizeLog = bufSize >> 10;
            bufSizeUnit = 'K';
        } else if (bufSize < (1 << 30)) {
            bufSizeLog = bufSize >> 20;
            bufSizeUnit = 'M';
        } else {
            bufSizeLog = bufSize >> 30;
            bufSizeUnit = 'G';
        }

        LOG() << std::dec << std::setiosflags(std::ios::right)
              << std::setw(3) << bufSizeLog << bufSizeUnit << "-"
              << memTypeStrings[memType] << "-"
              << (interleaveSDMA ? "SDMA\t" : "noSDMA\t")
              << std::setw(9) << allocTime
              << std::setw(9) << map1Time
              << std::setw(9) << unmap1Time
              << std::setw(9) << mapAllTime
              << std::setw(9) << unmapAllTime
              << std::setw(9) << freeTime << std::endl;

#define MMBENCH_KEY_PREFIX memTypeStrings[memType] << "-" \
                           << (interleaveSDMA ? "SDMA" : "noSDMA") << "-" \
                           << (bufSize >> 10) << "K-"
        RECORD(allocTime) << MMBENCH_KEY_PREFIX << "alloc";
        RECORD(map1Time) << MMBENCH_KEY_PREFIX << "mapOne";
        RECORD(unmap1Time) << MMBENCH_KEY_PREFIX << "unmapOne";
        RECORD(mapAllTime) << MMBENCH_KEY_PREFIX << "mapAll";
        RECORD(unmapAllTime) << MMBENCH_KEY_PREFIX << "unmapAll";
        RECORD(freeTime) << MMBENCH_KEY_PREFIX << "free";
    }

    TEST_END
}

TEST_F(KFDMemoryTest, QueryPointerInfo) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    unsigned int bufSize = PAGE_SIZE * 8;  // CZ and Tonga need 8 pages
    HsaPointerInfo ptrInfo;
    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    HSAuint64 nGPU = gpuNodes.size();  // number of gpu nodes

    /* GraphicHandle is tested at KFDGraphicsInterop.RegisterGraphicsHandle */

    /*** Memory allocated on CPU node ***/
    HsaMemoryBuffer hostBuffer(bufSize, 0/*node*/, false, false/*local*/);
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(hostBuffer.As<void*>(), &ptrInfo));
    EXPECT_EQ(ptrInfo.Type, HSA_POINTER_ALLOCATED);
    EXPECT_EQ(ptrInfo.Node, 0);
    EXPECT_EQ(ptrInfo.MemFlags.Value, hostBuffer.Flags().Value);
    EXPECT_EQ(ptrInfo.CPUAddress, hostBuffer.As<void*>());
    EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)hostBuffer.As<void*>());
    EXPECT_EQ(ptrInfo.SizeInBytes, (HSAuint64)hostBuffer.Size());
    if (is_dgpu()) {
        EXPECT_EQ((HSAuint64)ptrInfo.NMappedNodes, nGPU);
        // Check NMappedNodes again after unmapping the memory
        hsaKmtUnmapMemoryToGPU(hostBuffer.As<void*>());
        hsaKmtQueryPointerInfo(hostBuffer.As<void*>(), &ptrInfo);
    }
    EXPECT_EQ((HSAuint64)ptrInfo.NMappedNodes, 0);

    /* Skip testing local memory if the platform does not have it */
    if (GetVramSize(defaultGPUNode)) {
        HsaMemoryBuffer localBuffer(bufSize, defaultGPUNode, false, true);
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo(localBuffer.As<void*>(), &ptrInfo));
        EXPECT_EQ(ptrInfo.Type, HSA_POINTER_ALLOCATED);
        EXPECT_EQ(ptrInfo.Node, defaultGPUNode);
        EXPECT_EQ(ptrInfo.MemFlags.Value, localBuffer.Flags().Value);
        EXPECT_EQ(ptrInfo.CPUAddress, localBuffer.As<void*>());
        EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)localBuffer.As<void*>());
        EXPECT_EQ(ptrInfo.SizeInBytes, (HSAuint64)localBuffer.Size());

        HSAuint32 *addr = localBuffer.As<HSAuint32 *>() + 4;
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo(reinterpret_cast<void *>(addr), &ptrInfo));
        EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)localBuffer.As<void*>());
    }

    /** Registered memory: user pointer */
    static volatile HSAuint32 mem[4];  // 8 bytes for register only and
                                       // 8 bytes for register to nodes
    HsaMemoryBuffer hsaBuffer((void *)(&mem[0]), sizeof(HSAuint32)*2);
    if (is_dgpu()) {  // APU doesn't use userptr
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo((void *)(&mem[0]), &ptrInfo));
        EXPECT_EQ(ptrInfo.Type, HSA_POINTER_REGISTERED_USER);
        EXPECT_EQ(ptrInfo.CPUAddress, &mem[0]);
        EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)hsaBuffer.As<void*>());
        EXPECT_EQ(ptrInfo.SizeInBytes, sizeof(HSAuint32)*2);
        EXPECT_EQ(ptrInfo.NRegisteredNodes, 0);
        EXPECT_EQ(ptrInfo.NMappedNodes, nGPU);
        // Register to nodes
        HSAuint32 nodes[nGPU];
        for (unsigned int i = 0; i < nGPU; i++)
            nodes[i] = gpuNodes.at(i);
        EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes((void *)(&mem[2]),
                                sizeof(HSAuint32)*2, nGPU, nodes));
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo((void *)(&mem[2]), &ptrInfo));
        EXPECT_EQ(ptrInfo.NRegisteredNodes, nGPU);
        EXPECT_SUCCESS(hsaKmtDeregisterMemory((void *)(&mem[2])));
    }

    /* Not a starting address, but an address inside the memory range
     * should also get the memory information
     */
    HSAuint32 *address = hostBuffer.As<HSAuint32 *>() + 1;
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(reinterpret_cast<void *>(address), &ptrInfo));
    EXPECT_EQ(ptrInfo.Type, HSA_POINTER_ALLOCATED);
    EXPECT_EQ(ptrInfo.CPUAddress, hostBuffer.As<void*>());
    if (is_dgpu()) {
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo((void *)(&mem[1]), &ptrInfo));
        EXPECT_EQ(ptrInfo.Type, HSA_POINTER_REGISTERED_USER);
        EXPECT_EQ(ptrInfo.CPUAddress, &mem[0]);
    }

    /*** Set user data ***/
    char userData[16] = "This is a test.";
    EXPECT_SUCCESS(hsaKmtSetMemoryUserData(hostBuffer.As<HSAuint32 *>(), reinterpret_cast<void *>(userData)));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(hostBuffer.As<void*>(), &ptrInfo));
    EXPECT_EQ(ptrInfo.UserData, (void *)userData);

    TEST_END
}

/* Linux OS-specific test for a debugger accessing HSA memory in a
 * debugged process.
 *
 * Allocates a system memory and a visible local memory buffer (if
 * possible). Forks a child process that PTRACE_ATTACHes to the parent
 * to access its memory like a debugger would. Child copies data in
 * the parent process using PTRACE_PEEKDATA and PTRACE_POKEDATA. After
 * the child terminates, the parent checks that the copy was
 * successful.
 */
TEST_F(KFDMemoryTest, PtraceAccess) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HsaMemFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;

    void *mem[2];
    unsigned i;

    /* Offset in the VRAM buffer to test crossing non-contiguous
     * buffer boundaries. The second access starting from offset
     * sizeof(HSAint64)+1 will cross a node boundary in a single access,
     * for node sizes of 4MB or smaller.
     */
    const HSAuint64 VRAM_OFFSET = (4 << 20) - 2 * sizeof(HSAint64);

    // Alloc system memory from node 0 and initialize it
    memFlags.ui32.NonPaged = 0;
    ASSERT_SUCCESS(hsaKmtAllocMemory(0, PAGE_SIZE*2, memFlags, &mem[0]));
    for (i = 0; i < 4*sizeof(HSAint64) + 4; i++) {
        (reinterpret_cast<HSAuint8 *>(mem[0]))[i] = i;            // source
        (reinterpret_cast<HSAuint8 *>(mem[0]))[PAGE_SIZE+i] = 0;  // destination
    }

    // Try to alloc local memory from GPU node
    memFlags.ui32.NonPaged = 1;
    if (m_NodeInfo.IsGPUNodeLargeBar(defaultGPUNode)) {
        EXPECT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE*2 + (4 << 20),
                                            memFlags, &mem[1]));
        mem[1] = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem[1]) + VRAM_OFFSET);
        for (i = 0; i < 4*sizeof(HSAint64) + 4; i++) {
            (reinterpret_cast<HSAuint8 *>(mem[1]))[i] = i;
            (reinterpret_cast<HSAuint8 *>(mem[1]))[PAGE_SIZE+i] = 0;
        }
    } else {
        LOG() << "Not testing local memory, it's invisible" << std::endl;
        mem[1] = NULL;
    }

    /* Allow any process to trace this one. If kernel is built without
     * Yama, this is not needed, and this call will fail.
     */
#ifdef PR_SET_PTRACER
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif

    // Find current pid so the child can trace it
    pid_t tracePid = getpid();

    // Fork the child
    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        int traceStatus;
        int err = 0, r;

        /* Child process: we catch any exceptions to make sure we detach
         * from the traced process, because terminating without detaching
         * leaves the traced process stopped.
         */
        r = ptrace(PTRACE_ATTACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_ATTACH failed: " << r << std::endl;
            exit(1);
        }
        try {
            do {
                waitpid(tracePid, &traceStatus, 0);
            } while (!WIFSTOPPED(traceStatus));

            for (i = 0; i < 4; i++) {
                // Test 4 different (mis-)alignments, leaving 1-byte gaps between longs
                HSAuint8 *addr = reinterpret_cast<HSAuint8 *>(reinterpret_cast<long *>(mem[0]) + i) + i;
                errno = 0;
                long data = ptrace(PTRACE_PEEKDATA, tracePid, addr, NULL);
                EXPECT_EQ(0, errno);
                EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, addr + PAGE_SIZE,
                                    reinterpret_cast<void *>(data)));

                if (mem[1] == NULL)
                    continue;

                addr = reinterpret_cast<HSAuint8 *>(reinterpret_cast<long *>(mem[1]) + i) + i;
                errno = 0;
                data = ptrace(PTRACE_PEEKDATA, tracePid, addr, NULL);
                EXPECT_EQ(0, errno);
                EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, addr + PAGE_SIZE,
                                reinterpret_cast<void *>(data)));
            }
        } catch (...) {
            err = 1;
        }
        r = ptrace(PTRACE_DETACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_DETACH failed: " << r << std::endl;
            exit(1);
        }
        exit(err);
    } else {
        int childStatus;

        // Parent process, just wait for the child to finish
        EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
        EXPECT_NE(0, WIFEXITED(childStatus));
        EXPECT_EQ(0, WEXITSTATUS(childStatus));
    }

    // Clear gaps in the source that should not have been copied
    (reinterpret_cast<uint8_t*>(mem[0]))[  sizeof(long)    ] = 0;
    (reinterpret_cast<uint8_t*>(mem[0]))[2*sizeof(long) + 1] = 0;
    (reinterpret_cast<uint8_t*>(mem[0]))[3*sizeof(long) + 2] = 0;
    (reinterpret_cast<uint8_t*>(mem[0]))[4*sizeof(long) + 3] = 0;
    // Check results
    EXPECT_EQ(0, memcmp(mem[0], reinterpret_cast<HSAuint8 *>(mem[0]) + PAGE_SIZE,
                        sizeof(long)*4 + 4));
    // Free memory
    EXPECT_SUCCESS(hsaKmtFreeMemory(mem[0], PAGE_SIZE*2));

    if (mem[1]) {
        (reinterpret_cast<uint8_t*>(mem[1]))[  sizeof(HSAint64)    ] = 0;
        (reinterpret_cast<uint8_t*>(mem[1]))[2*sizeof(HSAint64) + 1] = 0;
        (reinterpret_cast<uint8_t*>(mem[1]))[3*sizeof(HSAint64) + 2] = 0;
        (reinterpret_cast<uint8_t*>(mem[1]))[4*sizeof(HSAint64) + 3] = 0;
        EXPECT_EQ(0, memcmp(mem[1], reinterpret_cast<HSAuint8 *>(mem[1]) + PAGE_SIZE,
                            sizeof(HSAint64)*4 + 4));
        mem[1] = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem[1]) - VRAM_OFFSET);
        EXPECT_SUCCESS(hsaKmtFreeMemory(mem[1], PAGE_SIZE*2));
    }

    TEST_END
}

TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) {
    char *hsaDebug = getenv("HSA_DEBUG");

    if (!is_dgpu()) {
        LOG() << "Skipping test: There is no VRAM on APU." << std::endl;
        return;
    }

    if (!hsaDebug || !strcmp(hsaDebug, "0")) {
        LOG() << "Skipping test: HSA_DEBUG environment variable not set." << std::endl;
        return;
    }

    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HsaMemMapFlags mapFlags = {0};
    HsaMemFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    /* Allocate host not accessible vram */
    memFlags.ui32.HostAccess = 0;
    memFlags.ui32.NonPaged = 1;

    void *mem, *mem0, *mem1;
    unsigned size = PAGE_SIZE*2 + (4 << 20);
    HSAuint64 data[2] = {0xdeadbeefdeadbeef, 0xcafebabecafebabe};
    unsigned int data0[2] = {0xdeadbeef, 0xdeadbeef};
    unsigned int data1[2] = {0xcafebabe, 0xcafebabe};

    const HSAuint64 VRAM_OFFSET = (4 << 20) - sizeof(HSAuint64);

    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, size, memFlags, &mem));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(mem, size, NULL,
                                mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)));
    /* Set the word before 4M boundary to 0xdeadbeefdeadbeef
     * and the word after 4M boundary to 0xcafebabecafebabe
     */
    mem0 = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem) + VRAM_OFFSET);
    mem1 = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem) + VRAM_OFFSET + sizeof(HSAuint64));
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem0,
                                                  data0[0], data0[1]));
    queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem1,
                                                  data1[0], data1[1]));
    queue.PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(true, 0, 0));
    queue.Wait4PacketConsumption();

    /* Allow any process to trace this one. If kernel is built without
     * Yama, this is not needed, and this call will fail.
     */
#ifdef PR_SET_PTRACER
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif

    // Find out my pid so the child can trace it
    pid_t tracePid = getpid();

    // Fork the child
    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        int traceStatus;
        int err = 0, r;

        /* Child process: we catch any exceptions to make sure we detach
         * from the traced process, because terminating without detaching
         * leaves the traced process stopped.
         */
        r = ptrace(PTRACE_ATTACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_ATTACH failed: " << r << std::endl;
            exit(1);
        }
        try {
            do {
                waitpid(tracePid, &traceStatus, 0);
            } while (!WIFSTOPPED(traceStatus));

            /* Peek the memory */
            errno = 0;
            HSAint64 data0 = ptrace(PTRACE_PEEKDATA, tracePid, mem0, NULL);
            EXPECT_EQ(0, errno);
            EXPECT_EQ(data[0], data0);
            HSAint64 data1 = ptrace(PTRACE_PEEKDATA, tracePid, mem1, NULL);
            EXPECT_EQ(0, errno);
            EXPECT_EQ(data[1], data1);

            /* Swap mem0 and mem1 by poking */
            EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, mem0, reinterpret_cast<void *>(data[1])));
            EXPECT_EQ(0, errno);
            EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, mem1, reinterpret_cast<void *>(data[0])));
            EXPECT_EQ(0, errno);
        } catch (...) {
            err = 1;
        }
        r = ptrace(PTRACE_DETACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_DETACH failed: " << r << std::endl;
            exit(1);
        }
        exit(err);
    } else {
        int childStatus;

        // Parent process, just wait for the child to finish
        EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
        EXPECT_NE(0, WIFEXITED(childStatus));
        EXPECT_EQ(0, WEXITSTATUS(childStatus));
    }

    /* Use shader to read back data to check poke results */
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    // dstBuffer is cpu accessible gtt memory
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);
    m_pIsaGen->CompileShader((m_FamilyId >= FAMILY_AI) ? gfx9_ScratchCopyDword : gfx8_ScratchCopyDword,
            "ScratchCopyDword", isaBuffer);
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(mem0, dstBuffer.As<void*>());
    dispatch0.Submit(queue);
    dispatch0.Sync();
    EXPECT_EQ(data1[0], dstBuffer.As<unsigned int*>()[0]);

    Dispatch dispatch1(isaBuffer);
    dispatch1.SetArgs(mem1, dstBuffer.As<int*>());
    dispatch1.Submit(queue);
    dispatch1.Sync();
    WaitOnValue(dstBuffer.As<uint32_t *>(), data0[0]);
    EXPECT_EQ(data0[0], dstBuffer.As<unsigned int*>()[0]);

    // Clean up
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(mem));
    EXPECT_SUCCESS(hsaKmtFreeMemory(mem, size));
    EXPECT_SUCCESS(queue.Destroy());

    TEST_END
}

void CatchSignal(int IntrSignal) {
    LOG() << "Interrupt Signal " << std::dec << IntrSignal
          << " Received" << std::endl;
}

TEST_F(KFDMemoryTest, SignalHandling) {
    TEST_START(TESTPROFILE_RUNALL)

    if (!is_dgpu()) {
        LOG() << "Skipping test: Test not supported on APU." << std::endl;
        return;
    }

    unsigned int *nullPtr = NULL;
    unsigned int* pDb = NULL;
    struct sigaction sa;
    SDMAQueue queue;
    HSAuint64 size, sysMemSize;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    sa.sa_handler = CatchSignal;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;
    pid_t ParentPid = getpid();
    EXPECT_EQ(0, sigaction(SIGUSR1, &sa, NULL)) << "An error occurred while setting a signal handler";

    sysMemSize = GetSysMemSize();

    /* System (kernel) memory are limited to 3/8th System RAM
     * Try to allocate 1/4th System RAM
     */
    size = (sysMemSize >> 2) & ~(HSAuint64)(PAGE_SIZE - 1);

    ASSERT_SUCCESS(hsaKmtAllocMemory(0 /* system */, size, m_MemoryFlags, reinterpret_cast<void**>(&pDb)));
    // Verify that pDb is not null before it's being used
    EXPECT_NE(nullPtr, pDb) << "hsaKmtAllocMemory returned a null pointer";

    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        EXPECT_EQ(0, kill(ParentPid, SIGUSR1));
        exit(0);
    } else {
        LOG() << "Start Memory Mapping..." << std::endl;
        ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(pDb, size, NULL));
        LOG() << "Mapping finished" << std::endl;
        int childStatus;

        // Parent process, just wait for the child to finish
        EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
        EXPECT_NE(0, WIFEXITED(childStatus));
        EXPECT_EQ(0, WEXITSTATUS(childStatus));
    }

    pDb[0] = 0x02020202;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(pDb, 0x01010101) );
    queue.Wait4PacketConsumption();
    EXPECT_TRUE(WaitOnValue(pDb, 0x01010101));
    EXPECT_SUCCESS(queue.Destroy());

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
    // Release the buffers
    EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));

    TEST_END
}

TEST_F(KFDMemoryTest, CheckZeroInitializationSysMem) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    int ret;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HSAuint64 sysMemSizeMB = GetSysMemSize() >> 20;

    /* Testing system memory */
    HSAuint64 * pDb = NULL;

    HSAuint64 sysBufSizeMB = sysMemSizeMB >> 2;
    HSAuint64 sysBufSize = sysBufSizeMB * 1024 * 1024;

    int count = 5;

    LOG() << "Using " << std::dec << sysBufSizeMB
            << "MB system buffer to test " << std::dec << count
            << " times" << std::endl;

    unsigned int offset = 257;  // a constant offset, should be smaller than 512.
    unsigned int size = sysBufSize / sizeof(*pDb);

    while (count--) {
        ret = hsaKmtAllocMemory(0 /* system */, sysBufSize, m_MemoryFlags,
                                reinterpret_cast<void**>(&pDb));
        if (ret) {
            LOG() << "Failed to allocate system buffer of" << std::dec << sysBufSizeMB
                    << "MB" << std::endl;
            return;
        }

        /* Check the first 64 bits */
        EXPECT_EQ(0, pDb[0]);
        pDb[0] = 1;

        for (HSAuint64 i = offset; i < size;) {
            EXPECT_EQ(0, pDb[i]);
            pDb[i] = i + 1;  // set it to non zero

            i += 4096 / sizeof(*pDb);
        }

        /* check the last 64 bit */
        EXPECT_EQ(0, pDb[size-1]);
        pDb[size-1] = size;

        EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, sysBufSize));
    }

    TEST_END
}

static inline void access(volatile void *sd, int size, int rw) {
    /* Most likely sitting in cache*/
    static struct DUMMY {
        char dummy[1024];
    } dummy;

    while ((size -= sizeof(dummy)) >= 0) {
        if (rw == 0)
            dummy = *(struct DUMMY *)((char*)sd + size);
        else
            *(struct DUMMY *)((char*)sd + size) = dummy;
    }
}

/*
 * On large-bar system, test the visible vram access speed.
 * KFD is not allowed to alloc visible vram on non-largebar system.
 */
TEST_F(KFDMemoryTest, MMBandWidth) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    const unsigned nBufs = 1000; /* measure us, report ns */
    unsigned testIndex, sizeIndex, memType;
    const unsigned nMemTypes = 2;
    const char *memTypeStrings[nMemTypes] = {"SysMem", "VRAM"};
    const unsigned nSizes = 4;
    const unsigned bufSizes[nSizes] = {PAGE_SIZE, PAGE_SIZE*4, PAGE_SIZE*16, PAGE_SIZE*64};
    const unsigned nTests = nSizes * nMemTypes;
    const unsigned tmpBufferSize = PAGE_SIZE*64;
#define _TEST_BUFSIZE(index) (bufSizes[index % nSizes])
#define _TEST_MEMTYPE(index) ((index / nSizes) % nMemTypes)

    void *bufs[nBufs];
    HSAuint64 start;
    unsigned i;
    HSAKMT_STATUS ret;
    HsaMemFlags memFlags = {0};
    HsaMemMapFlags mapFlags = {0};

    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    LOG() << "Found VRAM of " << std::dec << vramSizeMB << "MB." << std::endl;

    if (!m_NodeInfo.IsGPUNodeLargeBar(defaultGPUNode) || !vramSizeMB) {
        LOG() << "Skipping test: Test requires a large bar GPU." << std::endl;
        return;
    }

    void *tmp = mmap(0,
            tmpBufferSize,
            PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_PRIVATE,
            -1,
            0);
    EXPECT_NE(tmp, MAP_FAILED);
    memset(tmp, 0, tmpBufferSize);

    LOG() << "Test (avg. ns)\t  memcpyRTime memcpyWTime accessRTime accessWTime" << std::endl;
    for (testIndex = 0; testIndex < nTests; testIndex++) {
        unsigned bufSize = _TEST_BUFSIZE(testIndex);
        unsigned memType = _TEST_MEMTYPE(testIndex);
        HSAuint64 mcpRTime, mcpWTime, accessRTime, accessWTime;
        HSAuint32 allocNode;

        if ((testIndex & (nSizes-1)) == 0)
            LOG() << "----------------------------------------------------------------------" << std::endl;

        if (memType == 0) {
            allocNode = 0;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 1;
            memFlags.ui32.NonPaged = 0;
        } else {
            /* Alloc visible vram*/
            allocNode = defaultGPUNode;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 1;
            memFlags.ui32.NonPaged = 1;
        }

        for (i = 0; i < nBufs; i++)
            ASSERT_SUCCESS(hsaKmtAllocMemory(allocNode, bufSize, memFlags,
                        &bufs[i]));

        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            memcpy(bufs[i], tmp, bufSize);
        }
        mcpWTime = GetSystemTickCountInMicroSec() - start;

        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            access(bufs[i], bufSize, 1);
        }
        accessWTime = GetSystemTickCountInMicroSec() - start;

        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            memcpy(tmp, bufs[i], bufSize);
        }
        mcpRTime = GetSystemTickCountInMicroSec() - start;

        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            access(bufs[i], bufSize, 0);
        }
        accessRTime = GetSystemTickCountInMicroSec() - start;

        for (i = 0; i < nBufs; i++)
            EXPECT_SUCCESS(hsaKmtFreeMemory(bufs[i], bufSize));

        LOG() << std::dec
            << std::right << std::setw(3) << (bufSize >> 10) << "K-"
            << std::left << std::setw(14) << memTypeStrings[memType]
            << std::right
            << std::setw(12) << mcpRTime
            << std::setw(12) << mcpWTime
            << std::setw(12) << accessRTime
            << std::setw(12) << accessWTime
            << std::endl;

#define MMBANDWIDTH_KEY_PREFIX memTypeStrings[memType] << "-" \
                               << (bufSize >> 10) << "K" << "-"
        RECORD(mcpRTime) << MMBANDWIDTH_KEY_PREFIX << "mcpRTime";
        RECORD(mcpWTime) << MMBANDWIDTH_KEY_PREFIX << "mcpWTime";
        RECORD(accessRTime) << MMBANDWIDTH_KEY_PREFIX << "accessRTime";
        RECORD(accessWTime) << MMBANDWIDTH_KEY_PREFIX << "accessWTime";
    }

    munmap(tmp, tmpBufferSize);

    TEST_END
}

/* For the purpose of testing HDP flush from CPU.
 * Use CPU to write to coherent vram and check
 * from shader.
 * Asic before gfx9 doesn't support user space
 * HDP flush so only run on vega10 and after.
 * This should only run on large bar system.
 */
TEST_F(KFDMemoryTest, HostHdpFlush) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    HsaMemFlags memoryFlags = m_MemoryFlags;
    /* buffer[0]: signal; buffer[1]: Input to shader; buffer[2]: Output to
     * shader
     */
    unsigned int *buffer = NULL;
    HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
    const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
    HSAuint32 *mmioBase = NULL;
    unsigned int *nullPtr = NULL;

    if (!pNodeProperties) {
        LOG() << "Failed to get gpu node properties." << std::endl;
        return;
    }

    if (m_FamilyId < FAMILY_AI) {
        LOG() << "Skipping test: Test requires gfx9 and later asics." << std::endl;
        return;
    }
    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    if (!m_NodeInfo.IsGPUNodeLargeBar(defaultGPUNode) || !vramSizeMB) {
        LOG() << "Skipping test: Test requires a large bar GPU." << std::endl;
        return;
    }

    HsaMemoryProperties *memoryProperties = new HsaMemoryProperties[pNodeProperties->NumMemoryBanks];
    EXPECT_SUCCESS(hsaKmtGetNodeMemoryProperties(defaultGPUNode, pNodeProperties->NumMemoryBanks,
                   memoryProperties));
    for (unsigned int bank = 0; bank < pNodeProperties->NumMemoryBanks; bank++) {
        if (memoryProperties[bank].HeapType == HSA_HEAPTYPE_MMIO_REMAP) {
            mmioBase = (unsigned int *)memoryProperties[bank].VirtualBaseAddress;
            break;
        }
    }
    ASSERT_NE(mmioBase, nullPtr) << "mmio base is NULL";

    memoryFlags.ui32.NonPaged = 1;
    memoryFlags.ui32.CoarseGrain = 0;
    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memoryFlags,
                   reinterpret_cast<void**>(&buffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));

    /* Signal is dead from the beginning*/
    buffer[0] = 0xdead;
    buffer[1] = 0xfeeb;
    buffer[2] = 0xfeed;
    /* Submit a shader to poll the signal*/
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9_CopyOnSignal,"CopyOnSignal", isaBuffer);
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(buffer, NULL);
    dispatch0.Submit(queue);

    buffer[1] = 0xbeef;
    /* Flush HDP */
    mmioBase[KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL/4] = 0x1;
    buffer[0] = 0xcafe;

    /* Check test result*/
    dispatch0.Sync();
    mmioBase[KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL/4] = 0x1;
    EXPECT_EQ(0xbeef, buffer[2]);

    // Clean up
    EXPECT_SUCCESS(queue.Destroy());
    delete [] memoryProperties;
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));

    TEST_END
}

/* Test HDP flush from device.
 * Use shader on device 1 to write vram of device 0
 * and flush HDP of device 0. Read vram from device 0
 * and write back to vram to check the result from CPU.
 * Asic before gfx9 doesn't support device HDP flush
 * so only run on vega10 and after.
 * This should only run on system with at least one
 * large bar node (which is used as device 0).
 */
TEST_F(KFDMemoryTest, DeviceHdpFlush) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    HsaMemFlags memoryFlags = m_MemoryFlags;
    /* buffer is physically on device 0.
     * buffer[0]: Use as signaling b/t devices;
     * buffer[1]: Device 1 write to buffer[1] and device 0 read it
     * buffer[2]: Device 0 copy buffer[1] to buffer[2] for CPU to check
     */
    unsigned int *buffer = NULL;
    const HsaNodeProperties *pNodeProperties;
    HSAuint32 *mmioBase = NULL;
    unsigned int *nullPtr = NULL;
    std::vector<HSAuint32> nodes;

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    if (gpuNodes.size() < 2) {
        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
        return;
    }

     /* Users can use "--node=gpu1 --dst_node=gpu2" to specify devices */
    if (g_TestDstNodeId != -1 && g_TestNodeId != -1) {
        nodes.push_back(g_TestNodeId);
        nodes.push_back(g_TestDstNodeId);
        if (!m_NodeInfo.IsGPUNodeLargeBar(nodes[0])) {
            LOG() << "Skipping test: first GPU specified is not a large bar GPU." << std::endl;
            return;
        }
        if (nodes[0] == nodes[1]) {
            LOG() << "Skipping test: Different GPUs must be specified (2 GPUs required)." << std::endl;
            return;
        }
    } else {
        HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
        if (!m_NodeInfo.IsGPUNodeLargeBar(defaultGPU)) {
            LOG() << "Skipping test: Default GPUs must be large bar." << std::endl;
            return;
        }
        nodes.push_back(defaultGPU);
        for (unsigned i = 0; i < gpuNodes.size(); i++)
            if (gpuNodes.at(i) != defaultGPU)
                nodes.push_back(gpuNodes.at(i));
        if (nodes.size() < 2) {
            LOG() << "Skipping test: At least 2 GPUs required." << std::endl;
            return;
        }
    }

    pNodeProperties = m_NodeInfo.GetNodeProperties(nodes[0]);
    if (!pNodeProperties) {
        LOG() << "Failed to get gpu node properties." << std::endl;
        return;
    }

    if (m_FamilyId < FAMILY_AI) {
        LOG() << "Skipping test: Test requires gfx9 and later asics." << std::endl;
        return;
    }

    HsaMemoryProperties *memoryProperties = new HsaMemoryProperties[pNodeProperties->NumMemoryBanks];
    EXPECT_SUCCESS(hsaKmtGetNodeMemoryProperties(nodes[0], pNodeProperties->NumMemoryBanks,
                   memoryProperties));
    for (unsigned int bank = 0; bank < pNodeProperties->NumMemoryBanks; bank++) {
        if (memoryProperties[bank].HeapType == HSA_HEAPTYPE_MMIO_REMAP) {
            mmioBase = (unsigned int *)memoryProperties[bank].VirtualBaseAddress;
            break;
        }
    }
    ASSERT_NE(mmioBase, nullPtr) << "mmio base is NULL";

    memoryFlags.ui32.NonPaged = 1;
    memoryFlags.ui32.CoarseGrain = 0;
    ASSERT_SUCCESS(hsaKmtAllocMemory(nodes[0], PAGE_SIZE, memoryFlags,
                   reinterpret_cast<void**>(&buffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));

    /* Signal is dead from the beginning*/
    buffer[0] = 0xdead;
    buffer[1] = 0xfeeb;
    buffer[2] = 0xfeeb;
    /* Submit shaders*/
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(nodes[0]));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9_CopyOnSignal, "CopyOnSignal", isaBuffer);
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer, NULL);
    dispatch.Submit(queue);

    PM4Queue queue0;
    ASSERT_SUCCESS(queue0.Create(nodes[1]));
    HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9_WriteAndSignal, "WriteAndSignal", isaBuffer0);
    Dispatch dispatch0(isaBuffer0);
    dispatch0.SetArgs(buffer, mmioBase);
    dispatch0.Submit(queue0);

    /* Check test result*/
    dispatch0.Sync();
    dispatch.Sync();
    EXPECT_EQ(0xbeef, buffer[2]);

    // Clean up
    EXPECT_SUCCESS(queue.Destroy());
    EXPECT_SUCCESS(queue0.Destroy());
    delete [] memoryProperties;
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));

    TEST_END
}