diff --git a/tests/kfdtest/CMakeLists.txt b/tests/kfdtest/CMakeLists.txt
index a8c9f30299..04e797e7d5 100644
--- a/tests/kfdtest/CMakeLists.txt
+++ b/tests/kfdtest/CMakeLists.txt
@@ -144,7 +144,8 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp
                 src/KFDHWSTest.cpp
                 src/KFDPerformanceTest.cpp
                 src/KFDPMTest.cpp
-
+                src/KFDSVMRangeTest.cpp
+                src/KFDSVMEvictTest.cpp
                 src/KFDRASTest.cpp
                 src/RDMATest.cpp)
diff --git a/tests/kfdtest/src/KFDBaseComponentTest.hpp b/tests/kfdtest/src/KFDBaseComponentTest.hpp
index 5fc61015e4..187d9fbd73 100644
--- a/tests/kfdtest/src/KFDBaseComponentTest.hpp
+++ b/tests/kfdtest/src/KFDBaseComponentTest.hpp
@@ -78,6 +78,14 @@ class KFDBaseComponentTest : public testing::Test {
     virtual void SetUp();
     // @brief Executed after every test that uses KFDBaseComponentTest class.
     virtual void TearDown();
+
+    // @brief Reports whether the default GPU node advertises the SVM API.
+    // Reads the Capability.ui32.SVMAPISupported bit from the default GPU
+    // node properties and logs "SVM API not supported" when it is clear,
+    // so tests can simply return early on a false result.
+    // @return true when the capability bit is set, false otherwise.
+    bool SVMAPISupported() {
+        bool supported = m_NodeInfo.HsaDefaultGPUNodeProperties()
+                                ->Capability.ui32.SVMAPISupported;
+        if (!supported)
+            LOG() << "SVM API not supported" << std::endl;
+        return supported;
+    }
 };
 
 extern KFDBaseComponentTest* g_baseTest;
diff --git a/tests/kfdtest/src/KFDSVMEvictTest.cpp b/tests/kfdtest/src/KFDSVMEvictTest.cpp
new file mode 100644
index 0000000000..c05e460c3d
--- /dev/null
+++ b/tests/kfdtest/src/KFDSVMEvictTest.cpp
@@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "KFDSVMEvictTest.hpp"
+#include
+#include
+#include
+#include "PM4Queue.hpp"
+#include "PM4Packet.hpp"
+#include "SDMAPacket.hpp"
+#include "SDMAQueue.hpp"
+#include "Dispatch.hpp"
+
+#define N_PROCESSES             (4)     /* number of processes running in parallel, at least 2 */
+#define ALLOCATE_BUF_SIZE_MB    (64)    /* size of every test buffer, in MB */
+#define ALLOCATE_RETRY_TIMES    (3)     /* retry budget for a failing buffer registration */
+
+/* Compute how many buffers of vramBufSize bytes each process should allocate.
+ *
+ * The total budget shared by all N_PROCESSES processes is
+ * min(sysMemSize / 3 + vramSize, 2 * vramSize); the result is that budget
+ * divided by (vramBufSize * N_PROCESSES), computed in pages.
+ *
+ * @param vramSize     VRAM size in bytes
+ * @param vramBufSize  size of one buffer in bytes
+ * @return number of buffers per process; 0 when the budget is smaller than
+ *         one buffer per process
+ */
+HSAint32 KFDSVMEvictTest::GetBufferCounter(HSAuint64 vramSize, HSAuint64 vramBufSize) {
+    HSAuint64 vramBufSizeInPages = vramBufSize >> PAGE_SHIFT;
+    HSAuint64 sysMemSize = GetSysMemSize();
+    HSAuint64 size, sizeInPages;
+    HSAuint32 count;
+
+    LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
+
+    /* use one third of total system memory for eviction buffer to test
+     * limit max allocate size to double of vramSize
+     * count is zero if not enough memory (sysMemSize/3 + vramSize) < (vramBufSize * N_PROCESSES)
+     */
+    size = sysMemSize / 3 + vramSize;
+    size = size > vramSize << 1 ? vramSize << 1 : size;
+    sizeInPages = size >> PAGE_SHIFT;
+    count = sizeInPages / (vramBufSizeInPages * N_PROCESSES);
+
+    return count;
+}
+
+/* Map and register up to count buffers of vramBufSize bytes each.
+ *
+ * Every buffer is an anonymous private mapping that is registered with
+ * RegisterSVMRange(), passing defaultGPUNode for both node arguments and
+ * no SVM flags. Registered buffers are appended to pBuffers.
+ *
+ * A failing registration is retried after a one second sleep. The retry
+ * counter is only reset by a successful registration; once the budget is
+ * used up the current mapping is released and allocation stops, so
+ * pBuffers may end up with fewer than count entries.
+ *
+ * A failing mmap raises a fatal gtest failure and returns immediately.
+ */
+void KFDSVMEvictTest::AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
+        std::vector &pBuffers) {
+    HSAuint64   totalMB;
+
+    totalMB = N_PROCESSES * count * (vramBufSize >> 20);
+    if (m_IsParent) {
+        LOG() << "Testing " << N_PROCESSES << "*" << count << "*" << (vramBufSize>>20) << "(="<< totalMB << ")MB" << std::endl;
+    }
+    HSAKMT_STATUS ret;
+    HSAuint32 retry = 0;
+
+    for (HSAuint32 i = 0; i < count; i++) {
+        m_pBuf = mmap(0, vramBufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+        /* mmap reports failure with MAP_FAILED, never with NULL */
+        ASSERT_NE(MAP_FAILED, m_pBuf);
+
+        m_Flags = (HSA_SVM_FLAGS)0;
+
+        ret = RegisterSVMRange(defaultGPUNode, m_pBuf, vramBufSize, defaultGPUNode, m_Flags);
+        while (ret != HSAKMT_STATUS_SUCCESS && retry++ <= ALLOCATE_RETRY_TIMES) {
+            printf("retry %u allocate vram\n", retry);
+
+            /* wait for 1 second to try allocate again */
+            sleep(1);
+            ret = RegisterSVMRange(defaultGPUNode, m_pBuf, vramBufSize, defaultGPUNode, m_Flags);
+        }
+
+        if (ret != HSAKMT_STATUS_SUCCESS) {
+            /* out of retries: release this mapping and stop allocating */
+            munmap(m_pBuf, vramBufSize);
+            break;
+        }
+
+        pBuffers.push_back(m_pBuf);
+        retry = 0;
+    }
+}
+
+/* Unmap every non-NULL buffer in pBuffers; each one is vramBufSize bytes.
+ * The vector itself is left untouched.
+ */
+void KFDSVMEvictTest::FreeBuffers(std::vector &pBuffers, HSAuint64 vramBufSize) {
+    for (HSAuint32 i = 0; i < pBuffers.size(); i++) {
+        m_pBuf = pBuffers[i];
+        if (m_pBuf != NULL)
+            munmap(m_pBuf, vramBufSize);
+    }
+}
+
+/* Fork nprocesses - 1 children so that nprocesses processes run the test.
+ *
+ * Each child re-runs TearDown()/SetUp(), forgets the inherited child pid
+ * list, clears m_IsParent and returns straight away. The parent records
+ * every child pid in m_ChildPids. m_psName is set to a per-process label
+ * that is used as a log prefix.
+ *
+ * A failing fork raises a fatal gtest failure and returns immediately.
+ */
+void KFDSVMEvictTest::ForkChildProcesses(int nprocesses) {
+    int i;
+
+    for (i = 0; i < nprocesses - 1; ++i) {
+        pid_t pid = fork();
+        ASSERT_GE(pid, 0);
+
+        if (pid == 0) {
+            /* Child process */
+            /* Cleanup file descriptors copied from parent process
+             * then call SetUp->hsaKmtOpenKFD to create new process
+             */
+            m_psName = "Test process " + std::to_string(i) + " ";
+            TearDown();
+            SetUp();
+            m_ChildPids.clear();
+            m_IsParent = false;
+            return;
+        }
+
+        /* Parent process */
+        m_ChildPids.push_back(pid);
+    }
+
+    /* the parent takes the last index, i == nprocesses - 1 */
+    m_psName = "Test process " + std::to_string(i) + " ";
+}
+
+/* In the parent: wait for every recorded child and assert that each one
+ * exited normally with status 0. A child is not counted as successful when
+ * waitpid itself fails. In a child: nothing to wait for.
+ *
+ * m_ChildStatus is set to HSAKMT_STATUS_SUCCESS unless the assertion on
+ * the children's exit status fails.
+ */
+void KFDSVMEvictTest::WaitChildProcesses() {
+    if (m_IsParent) {
+        /* only run by parent process */
+        int childExitOkNum = 0;
+        int size = m_ChildPids.size();
+
+        for (int i = 0; i < size; i++) {
+            pid_t pid = m_ChildPids.front();
+            int childStatus = 0;
+
+            /* only trust childStatus when waitpid really reaped this child */
+            if (waitpid(pid, &childStatus, 0) == pid &&
+                WIFEXITED(childStatus) && WEXITSTATUS(childStatus) == 0)
+                childExitOkNum++;
+
+            m_ChildPids.erase(m_ChildPids.begin());
+        }
+
+        ASSERT_EQ(childExitOkNum, size);
+    }
+
+    /* child process or parent process finished successfully */
+    m_ChildStatus = HSAKMT_STATUS_SUCCESS;
+}
+
+/* Evict and restore procedure basic test
+ *
+ * Use N_PROCESSES processes to allocate vram buf size larger than total vram size
+ *
+ * ALLOCATE_BUF_SIZE_MB buf allocation size
+ *
+ * number of buf per process is computed by GetBufferCounter():
+ *   min(sysMemSize / 3 + vramSize, 2 * vramSize) / (vramBufSize * N_PROCESSES)
+ * Total vram all processes allocated: 8GB for 4GB Fiji, and 20GB for 16GB Vega10
+ *
+ * many times of eviction and restore will happen:
+ * ttm will evict buffers of another process if not enough free vram
+ * process restore will evict buffers of another process
+ *
+ * Sometimes the allocate may fail (maybe that is normal)
+ * ALLOCATE_RETRY_TIMES max retry times to allocate
+ *
+ * This is basic test, no queue so vram are not used by GPU during test
+ *
+ * Todo:
+ *    - Synchronization between the processes, so they know for sure when
+ *        they are done allocating memory
+ */
+TEST_F(KFDSVMEvictTest, BasicTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (!SVMAPISupported())
+        return;
+
+    /* keep the node id signed: with an unsigned type the check below
+     * could never fail
+     */
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+    HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
+
+    HSAuint64 vramSize = GetVramSize(defaultGPUNode);
+
+    if (!vramSize) {
+        LOG() << "No VRAM found, skipping the test" << std::endl;
+        return;
+    } else {
+        LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
+    }
+
+    HSAuint32 count =
GetBufferCounter(vramSize, vramBufSize);
+    if (count == 0) {
+        LOG() << "Not enough system memory, skipping the test" << std::endl;
+        return;
+    }
+
+    /* Fork the child processes */
+    ForkChildProcesses(N_PROCESSES);
+
+    /* from here on the code runs in the parent and in every child */
+    std::vector pBuffers;
+    AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
+
+    /* wait for other processes to finish allocation, then free buffer */
+    sleep(ALLOCATE_RETRY_TIMES);
+
+    LOG() << m_psName << "free buffer" << std::endl;
+    FreeBuffers(pBuffers, vramBufSize);
+
+    /* parent: checks the exit status of the children; child: marks itself successful */
+    WaitChildProcesses();
+
+    TEST_END
+}
+
+/* Shader to read local buffers using multiple wavefronts in parallel
+ * until address buffer is filled with specific value 0x5678 by host program,
+ * then each wavefront fills value 0x5678 at corresponding result buffer and quit
+ *
+ * The quit request is detected by polling the first dword of the address
+ * buffer (s_load_dword from s[0:1] + 0) at the top of every pass.
+ *
+ * initial state:
+ *   s[0:1] - address buffer base address
+ *   s[2:3] - result buffer base address
+ *   s4 - workgroup id
+ *   v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
+ * registers:
+ *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
+ *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
+ *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
+ *   v[6:7] - local buf address used for read test
+ */
+static const char* gfx9_ReadMemory =
+"\
+    shader ReadMemory\n\
+    asic(GFX9)\n\
+    type(CS)\n\
+    \n\
+    // compute address of corresponding output buffer\n\
+    v_mov_b32 v0, s4 // use workgroup id as index\n\
+    v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
+    v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
+    v_mov_b32 v5, s3\n\
+    v_add_u32 v5, vcc_lo, v5\n\
+    \n\
+    // compute input buffer offset used to store corresponding local buffer address\n\
+    v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
+    v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
+    v_mov_b32 v3, s1\n\
+    v_add_u32 v3, vcc_lo, v3\n\
+    \n\
+    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
+    flat_load_dwordx2 v[6:7], v[2:3] slc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
+    \n\
+    v_mov_b32 v8, 0x5678\n\
+    s_movk_i32 s8, 0x5678\n\
+L_REPEAT:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
+    s_cmp_eq_i32 s16, s8\n\
+    s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
+    // loop read 64M local buffer starting at v[6:7]\n\
+    // every 4k page only read once\n\
+    v_mov_b32 v9, 0\n\
+    v_mov_b32 v10, 0x1000 // 4k page\n\
+    v_mov_b32 v11, 0x4000000 // 64M size\n\
+    v_mov_b32 v12, v6\n\
+    v_mov_b32 v13, v7\n\
+L_LOOP_READ:\n\
+    flat_load_dwordx2 v[14:15], v[12:13] slc\n\
+    v_add_u32 v9, v9, v10 \n\
+    v_add_co_u32 v12, vcc, v12, v10\n\
+    v_add_u32 v13, vcc_lo, v13\n\
+    v_cmp_lt_u32 vcc, v9, v11\n\
+    s_cbranch_vccnz L_LOOP_READ\n\
+    s_branch L_REPEAT\n\
+L_QUIT:\n\
+    flat_store_dword v[4:5], v8\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
+    s_endpgm\n\
+    end\n\
+";
+
+/* asic(VI) build of the ReadMemory shader. It uses the same registers,
+ * labels and 0x5678 quit value as the GFX9 text; the 64-bit address
+ * arithmetic is spelled with v_add_u32 / v_addc_u32 and an explicit vcc
+ * carry operand.
+ */
+static const char* gfx8_ReadMemory =
+"\
+    shader ReadMemory\n\
+    asic(VI)\n\
+    type(CS)\n\
+    \n\
+    // compute address of corresponding output buffer\n\
+    v_mov_b32 v0, s4 // use workgroup id as index\n\
+    v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
+    v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
+    v_mov_b32 v5, s3\n\
+    v_addc_u32 v5, vcc, v5, 0, vcc\n\
+    \n\
+    // compute input buffer offset used to store corresponding local buffer address\n\
+    v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
+    v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
+    v_mov_b32 v3, s1\n\
+    v_addc_u32 v3, vcc, v3, 0, vcc\n\
+    \n\
+    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
+    flat_load_dwordx2 v[6:7], v[2:3] slc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
+    \n\
+    v_mov_b32 v8, 0x5678\n\
+    s_movk_i32 s8, 0x5678\n\
+L_REPEAT:\n\
+    s_load_dword s16, s[0:1], 0x0 glc\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
+    s_cmp_eq_i32 s16, s8\n\
+    s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
+    // loop read 64M local buffer starting at v[6:7]\n\
+    // every 4k page only read once\n\
+    v_mov_b32 v9, 0\n\
+    v_mov_b32 v10, 0x1000 // 4k page\n\
+    v_mov_b32 v11, 0x4000000 // 64M size\n\
+    v_mov_b32 v12, v6\n\
+    v_mov_b32 v13, v7\n\
+L_LOOP_READ:\n\
+    flat_load_dwordx2 v[14:15], v[12:13] slc\n\
+    v_add_u32 v9, vcc, v9, v10 \n\
+    v_add_u32 v12, vcc, v12, v10\n\
+    v_addc_u32 v13, vcc, v13, 0, vcc\n\
+    v_cmp_lt_u32 vcc, v9, v11\n\
+    s_cbranch_vccnz L_LOOP_READ\n\
+    s_branch L_REPEAT\n\
+L_QUIT:\n\
+    flat_store_dword v[4:5], v8\n\
+    s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
+    s_endpgm\n\
+    end\n\
+";
+
+/* Return the ReadMemory shader source matching the GPU under test:
+ * the GFX9 text when m_FamilyId >= FAMILY_AI, the VI text otherwise.
+ */
+std::string KFDSVMEvictTest::CreateShader() {
+    if (m_FamilyId >= FAMILY_AI)
+        return gfx9_ReadMemory;
+    else
+        return gfx8_ReadMemory;
+}
+
+/* Evict and restore queue test
+ *
+ * N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored
+ * If a GPU vm page fault happens, the test shader stops and fails to write the specific value
+ * to the dest buffer, and the test reports a failure.
+ *
+ * Steps:
+ *   - fork N_PROCESSES processes, each process does the same below
+ *   - allocate local buffers, each buffer size is 64MB
+ *   - allocate zero initialized host access address buffer and result buffer
+ *     address buffer to pass address of local buffers to shader
+ *     result buffer to store shader output result
+ *   - submit queue to run ReadMemory shader
+ *   - shader start m_DimX wavefronts, each wavefront keep reading one local buffer
+ *   - notify shader to quit
+ *   - check result buffer with specific value to confirm all wavefronts quit normally
+ */
+TEST_F(KFDSVMEvictTest, QueueTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL)
+
+    if (!SVMAPISupported())
+        return;
+
+    /* keep the node id signed: with an unsigned type the check below
+     * could never fail
+     */
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+    HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
+
+    const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties();
+
+    /* Skip test for chip it doesn't have CWSR, which the test depends on */
+    if (m_FamilyId < FAMILY_VI || isTonga(pNodeProperties)) {
+        LOG() << std::hex << "Test is skipped for family ID 0x" << m_FamilyId << std::endl;
+        return;
+    }
+
+    HSAuint32 i;
+    HSAuint64 vramSize = GetVramSize(defaultGPUNode);
+
+    if (!vramSize) {
+        LOG() << "No VRAM found, skipping the test" << std::endl;
+        return;
+    } else {
+        LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB." << std::endl;
+    }
+
+    HSAuint32 count = GetBufferCounter(vramSize, vramBufSize);
+    if (count == 0) {
+        LOG() << "Not enough system memory, skipping the test" << std::endl;
+        return;
+    }
+    /* assert all buffer address can be stored within one page
+     * because addrBuffer is only one page of host memory
+     */
+    ASSERT_LE(count, PAGE_SIZE/sizeof(unsigned int *));
+
+    /* Fork the child processes */
+    ForkChildProcesses(N_PROCESSES);
+
+    /* from here on the code runs in the parent and in every child */
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
+    HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
+
+    std::vector pBuffers;
+    AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
+
+    /* one wavefront per buffer that was actually registered */
+    unsigned int wavefront_num = pBuffers.size();
+    LOG() << m_psName << "wavefront number " << wavefront_num << std::endl;
+
+    void **localBufAddr = addrBuffer.As();
+    unsigned int *result = resultBuffer.As();
+
+    /* publish the buffer addresses to the shader, one slot per wavefront */
+    for (i = 0; i < wavefront_num; i++)
+        *(localBufAddr + i) = pBuffers[i];
+
+    m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
+
+    PM4Queue pm4Queue;
+    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
+
+    Dispatch dispatch0(isaBuffer);
+    dispatch0.SetArgs(localBufAddr, result);
+    dispatch0.SetDim(wavefront_num, 1, 1);
+    /* submit the packet and start shader */
+    dispatch0.Submit(pm4Queue);
+
+    /* doing evict/restore queue test for 5 seconds while queue is running */
+    sleep(5);
+
+    /* LOG() << m_psName << "notify shader to quit" << std::endl; */
+    /* fill address buffer so shader quits */
+    addrBuffer.Fill(0x5678);
+
+    /* wait for shader to finish or timeout if shader has vm page fault */
+    dispatch0.SyncWithStatus(120000);
+
+    ASSERT_SUCCESS(pm4Queue.Destroy());
+    /* LOG() << m_psName << "free buffer" << std::endl; */
+    /* cleanup */
+    FreeBuffers(pBuffers, vramBufSize);
+
+    /* check if all wavefronts finish successfully */
+    for (i = 0; i < wavefront_num; i++)
+        ASSERT_EQ(0x5678, *(result + i));
+
+    /* parent: checks the exit status of the children; child: marks itself successful */
+    WaitChildProcesses();
+
+    TEST_END
+}
+
diff --git a/tests/kfdtest/src/KFDSVMEvictTest.hpp b/tests/kfdtest/src/KFDSVMEvictTest.hpp
new file mode 100644
index 0000000000..7749ac2dee
--- /dev/null
+++ b/tests/kfdtest/src/KFDSVMEvictTest.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __KFD_SVM_EVICT_TEST__H__
+#define __KFD_SVM_EVICT_TEST__H__
+
+#include
+#include
+#include "KFDLocalMemoryTest.hpp"
+#include "KFDBaseComponentTest.hpp"
+#include "IsaGenerator.hpp"
+
+// @class KFDSVMEvictTest
+// Test eviction and restore of SVM ranges using several processes that are
+// forked from the test process (see ForkChildProcesses).
+class KFDSVMEvictTest :  public KFDLocalMemoryTest {
+ public:
+    // Every member without a default constructor gets a defined start value;
+    // the initializer list follows the member declaration order.
+    KFDSVMEvictTest(void): m_Flags((HSA_SVM_FLAGS)0), m_pBuf(NULL),
+                           m_ChildStatus(HSAKMT_STATUS_ERROR), m_IsParent(true) {}
+
+    // A forked child terminates here with m_ChildStatus as exit code, which
+    // stays HSAKMT_STATUS_ERROR unless WaitChildProcesses() completed. The
+    // parent makes a last attempt to reap children that are still recorded.
+    ~KFDSVMEvictTest(void) {
+        if (!m_IsParent) {
+            /* child process has to exit
+             * otherwise gtest will continue other tests
+             */
+            exit(m_ChildStatus);
+        }
+
+        try {
+            WaitChildProcesses();
+        } catch (...) {}
+    }
+
+ protected:
+    // Returns the ReadMemory shader source for the GPU family under test.
+    std::string CreateShader();
+    // Maps and registers up to count buffers, appending them to pBuffers.
+    void AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
+                      std::vector &pBuffers);
+    // Unmaps every buffer in pBuffers.
+    void FreeBuffers(std::vector &pBuffers, HSAuint64 vramBufSize);
+    // Forks nprocesses - 1 children; each child returns with m_IsParent false.
+    void ForkChildProcesses(int nprocesses);
+    // Parent: waits for the children and asserts they all exited with 0.
+    void WaitChildProcesses();
+    // Number of buffers each process should allocate, 0 if memory is too small.
+    HSAint32 GetBufferCounter(HSAuint64 vramSize, HSAuint64 vramBufSize);
+
+ protected:  // members
+    std::string     m_psName;       // per-process label used as log prefix
+    std::vector     m_ChildPids;    // children forked by this process
+    HSA_SVM_FLAGS   m_Flags;        // flags of the most recent registration
+    void*           m_pBuf;         // buffer currently being mapped or freed
+    HSAKMT_STATUS   m_ChildStatus;  // exit code used by a child process
+    bool            m_IsParent;     // false in forked children
+};
+
+#endif  // __KFD_SVM_EVICT_TEST__H__
diff --git a/tests/kfdtest/src/KFDSVMRangeTest.cpp b/tests/kfdtest/src/KFDSVMRangeTest.cpp
new file mode 100644
index 0000000000..01243f8156
--- /dev/null
+++ b/tests/kfdtest/src/KFDSVMRangeTest.cpp
@@ -0,0 +1,1139 @@
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ */
+#include "KFDSVMRangeTest.hpp"
+#include
+#include
+#include "PM4Queue.hpp"
+#include "PM4Packet.hpp"
+#include "SDMAPacket.hpp"
+#include "SDMAQueue.hpp"
+#include "Dispatch.hpp"
+
+/* Runs the base fixture set-up and creates the ISA generator for the GPU
+ * family under test.
+ */
+void KFDSVMRangeTest::SetUp() {
+    ROUTINE_START
+
+    KFDBaseComponentTest::SetUp();
+
+    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
+
+    ROUTINE_END
+}
+
+/* Releases the ISA generator, then runs the base fixture tear-down. */
+void KFDSVMRangeTest::TearDown() {
+    ROUTINE_START
+
+    /* deleting a null pointer is a no-op, so no guard is needed */
+    delete m_pIsaGen;
+    m_pIsaGen = NULL;
+
+    KFDBaseComponentTest::TearDown();
+
+    ROUTINE_END
+}
+
+/* Copies one dword between two SVM ranges with the CopyDword shader and
+ * checks that the destination received the fill pattern.
+ */
+TEST_F(KFDSVMRangeTest, BasicSystemMemTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (!SVMAPISupported())
+        return;
+
+    PM4Queue queue;
+    unsigned int BufferSize = PAGE_SIZE;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (!GetVramSize(defaultGPUNode)) {
+        LOG() << "Skipping test: No VRAM found." << std::endl;
+        return;
+    }
+
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
+    HsaSVMRange srcSysBuffer(BufferSize, defaultGPUNode);
+    HsaSVMRange destSysBuffer(BufferSize, defaultGPUNode);
+
+    srcSysBuffer.Fill(0x01010101);
+
+    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    queue.SetSkipWaitConsump(0);
+
+    Dispatch dispatch(isaBuffer);
+
+    dispatch.SetArgs(srcSysBuffer.As(), destSysBuffer.As());
+    dispatch.Submit(queue);
+    dispatch.Sync(g_TestTimeOut);
+
+    EXPECT_SUCCESS(queue.Destroy());
+
+    EXPECT_EQ(destSysBuffer.As()[0], 0x01010101);
+
+    TEST_END
+}
+
+/* Reads the default attributes of a fresh SVM range and compares them with
+ * expectedDefaultResults, then sets inputAttributes and checks that reading
+ * them back returns what was set.
+ */
+TEST_F(KFDSVMRangeTest, SetGetAttributesTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL)
+
+    if (!SVMAPISupported())
+        return;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (m_FamilyId < FAMILY_AI) {
+        LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
+        return;
+    }
+
+    HSAuint32 i;
+    unsigned int BufSize = PAGE_SIZE;
+    HsaSVMRange *sysBuffer;
+    /* const, so the array below has a compile-time size instead of being
+     * a variable-length array
+     */
+    const HSAuint32 nAttributes = 5;
+    HSA_SVM_ATTRIBUTE outputAttributes[nAttributes];
+    HSA_SVM_ATTRIBUTE inputAttributes[] = {
+                                                {HSA_SVM_ATTR_PREFETCH_LOC, (HSAuint32)defaultGPUNode},
+                                                {HSA_SVM_ATTR_PREFERRED_LOC, (HSAuint32)defaultGPUNode},
+                                                {HSA_SVM_ATTR_SET_FLAGS,
+                                                 HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC | HSA_SVM_FLAG_COHERENT},
+                                                {HSA_SVM_ATTR_GRANULARITY, 0xFF},
+                                                {HSA_SVM_ATTR_ACCESS, (HSAuint32)defaultGPUNode},
+                                          };
+
+    /* one expected default per entry of inputAttributes, in the same order */
+    HSAuint32 expectedDefaultResults[] = {
+                                             INVALID_NODEID,
+                                             INVALID_NODEID,
+                                             HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_COHERENT,
+                                             9,
+                                             0,
+                                         };
+    HSAint32 enable = -1;
+    EXPECT_SUCCESS(hsaKmtGetXNACKMode(&enable));
+    //expectedDefaultResults[4] = (enable)?HSA_SVM_ATTR_ACCESS:HSA_SVM_ATTR_NO_ACCESS;
+    // FIXME: Waiting for KFD to implement retry faults on unregistered addresses
+    expectedDefaultResults[4] = HSA_SVM_ATTR_NO_ACCESS;
+    sysBuffer = new HsaSVMRange(BufSize);
+    char *pBuf = sysBuffer->As();
+
+    LOG() << "Get default atrributes" << std::endl;
+    memcpy(outputAttributes, inputAttributes, nAttributes * sizeof(HSA_SVM_ATTRIBUTE));
+    EXPECT_SUCCESS(hsaKmtSVMGetAttr(pBuf, BufSize,
+                                    nAttributes, outputAttributes));
+
+    /* access attributes are compared by type, all others by value */
+    for (i = 0; i < nAttributes; i++) {
+        if (outputAttributes[i].type == HSA_SVM_ATTR_ACCESS ||
+            outputAttributes[i].type == HSA_SVM_ATTR_ACCESS_IN_PLACE ||
+            outputAttributes[i].type == HSA_SVM_ATTR_NO_ACCESS)
+            EXPECT_EQ(outputAttributes[i].type, expectedDefaultResults[i]);
+        else
+            EXPECT_EQ(outputAttributes[i].value, expectedDefaultResults[i]);
+    }
+    LOG() << "Setting/Getting atrributes" << std::endl;
+    memcpy(outputAttributes, inputAttributes, nAttributes * sizeof(HSA_SVM_ATTRIBUTE));
+    EXPECT_SUCCESS(hsaKmtSVMSetAttr(pBuf, BufSize,
+                                    nAttributes, inputAttributes));
+    EXPECT_SUCCESS(hsaKmtSVMGetAttr(pBuf, BufSize,
+                                    nAttributes, outputAttributes));
+    for (i = 0; i < nAttributes; i++) {
+        if (outputAttributes[i].type == HSA_SVM_ATTR_ACCESS ||
+            outputAttributes[i].type == HSA_SVM_ATTR_ACCESS_IN_PLACE ||
+            outputAttributes[i].type == HSA_SVM_ATTR_NO_ACCESS)
+            EXPECT_EQ(inputAttributes[i].type, outputAttributes[i].type);
+        else
+            EXPECT_EQ(inputAttributes[i].value, outputAttributes[i].value);
+    }
+
+    /* the range was heap allocated above and is owned by this test */
+    delete sysBuffer;
+
+    TEST_END
+}
+
+/* Toggles the XNACK mode twice. Whenever a mode can be set, the test checks
+ * on every GPU node that setting it again fails with HSAKMT_STATUS_ERROR
+ * while a queue exists.
+ */
+TEST_F(KFDSVMRangeTest, XNACKModeTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (!SVMAPISupported())
+        return;
+
+    HSAuint32 i, j;
+    HSAint32 r;
+    PM4Queue queue;
+    HSAint32 enable = 0;
+    const std::vector gpuNodes = m_NodeInfo.GetNodesWithGPU();
+
+    EXPECT_SUCCESS(hsaKmtGetXNACKMode(&enable));
+    for (i = 0; i < 2; i++) {
+        enable = !enable;
+        r = hsaKmtSetXNACKMode(enable);
+        if (r == HSAKMT_STATUS_SUCCESS) {
+            LOG() << "XNACK mode: " << std::boolalpha << enable <<
+                     " supported" << std::endl;
+
+            for (j = 0; j < gpuNodes.size(); j++) {
+                LOG() << "Creating queue and try to set xnack mode on node: "
+                      << gpuNodes.at(j) << std::endl;
+                ASSERT_SUCCESS(queue.Create(gpuNodes.at(j)));
+                EXPECT_EQ(HSAKMT_STATUS_ERROR,
+                        hsaKmtSetXNACKMode(enable));
+                EXPECT_SUCCESS(queue.Destroy());
+            }
+        } else if (r == HSAKMT_STATUS_NOT_SUPPORTED) {
+            LOG() << "XNACK mode: " << std::boolalpha << enable <<
+                     " NOT supported" << std::endl;
+        }
+    }
+    TEST_END
+}
+
+/* Registering the fixed address 0x10000, which this test never mapped,
+ * must not succeed.
+ */
+TEST_F(KFDSVMRangeTest, InvalidRangeTest) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    if (!SVMAPISupported())
+        return;
+
+    HSAuint32 Flags;
+    HSAKMT_STATUS ret;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    Flags = HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_COHERENT;
+
+    ret = RegisterSVMRange(defaultGPUNode, reinterpret_cast(0x10000), 0x1000, 0, Flags);
+    EXPECT_NE(ret, HSAKMT_STATUS_SUCCESS);
+
+    TEST_END
+}
+
+/* Registers overlapping SVM ranges on top of an existing range in several
+ * layouts (cases 1 to 6 below) and releases them again.
+ */
+void KFDSVMRangeTest::SplitRangeTest(int defaultGPUNode, int prefetch_location) {
+    unsigned int BufSize = 16 * PAGE_SIZE;
+
+    if (!SVMAPISupported())
+
return;
+
+    HsaSVMRange *sysBuffer;
+    HsaSVMRange *sysBuffer2;
+    HsaSVMRange *sysBuffer3;
+
+    void *pBuf;
+
+    // case 1: second range is PAGE_SIZE bytes at offset 8192 inside the first
+    pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(reinterpret_cast(pBuf) + 8192, PAGE_SIZE, defaultGPUNode, prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer;
+    munmap(pBuf, BufSize);
+
+    // case 2.1: second range starts at offset 4096 and ends where the first ends
+    pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(reinterpret_cast(pBuf) + 4096, BufSize - 4096, defaultGPUNode,
+                                 prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer;
+    munmap(pBuf, BufSize);
+
+    // case 2.2: second range starts at offset 8192 and ends 8192 bytes after the first
+    pBuf = mmap(0, BufSize + 8192, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(reinterpret_cast(pBuf) + 8192, BufSize, defaultGPUNode, prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer;
+    munmap(pBuf, BufSize + 8192);
+
+    // case 3: second range starts with the first and ends 8192 bytes before it
+    pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(reinterpret_cast(pBuf), BufSize - 8192, defaultGPUNode, prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer;
+    munmap(pBuf, BufSize);
+
+    // case 4.1: second range is identical to the first
+    pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer;
+    munmap(pBuf, BufSize);
+
+    // case 4.2: second range starts with the first and ends 8192 bytes after it
+    pBuf = mmap(0, BufSize + 8192, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(pBuf, BufSize + 8192, defaultGPUNode, prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer;
+    munmap(pBuf, BufSize + 8192);
+
+    // case 5: third range covers the whole mapping, including two small ranges
+    pBuf = mmap(0, BufSize + 65536, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(reinterpret_cast(pBuf) + 8192, 8192, defaultGPUNode, prefetch_location);
+    sysBuffer2 = new HsaSVMRange(reinterpret_cast(pBuf) + 32768, 8192, defaultGPUNode, prefetch_location);
+    sysBuffer3 = new HsaSVMRange(pBuf, BufSize + 65536, defaultGPUNode, prefetch_location);
+    delete sysBuffer2;
+    delete sysBuffer3;
+    delete sysBuffer;
+    munmap(pBuf, BufSize + 65536);
+
+    // case 6, unregister after free
+    pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(reinterpret_cast(pBuf) + 8192, 8192, defaultGPUNode, prefetch_location);
+    munmap(pBuf, BufSize);
+    delete sysBuffer;
+}
+
+/* Runs SplitRangeTest with 0 as prefetch location. */
+TEST_F(KFDSVMRangeTest, SplitSystemRangeTest) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    if (!SVMAPISupported())
+        return;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (m_FamilyId < FAMILY_AI) {
+        LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
+        return;
+    }
+
+    SplitRangeTest(defaultGPUNode, 0);
+
+    TEST_END
+}
+
+/* Registers page-sized pieces of a stack array as SVM ranges, lets the GPU
+ * write to them, then forks so the pages become copy-on-write and checks
+ * that the GPU follows the parent's new copies.
+ */
+TEST_F(KFDSVMRangeTest, EvictSystemRangeTest) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    if (!SVMAPISupported())
+        return;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (m_FamilyId < FAMILY_AI) {
+        LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
+        return;
+    }
+
+    /* pBuf is the first page boundary above the start of stackData */
+    HSAuint32 stackData[2 * PAGE_SIZE] = {0};
+    char *pBuf = reinterpret_cast(((uint64_t)stackData + PAGE_SIZE) & ~(PAGE_SIZE - 1));
+    HSAuint32 *globalData = reinterpret_cast(pBuf);
+    /* offsets are indices into stackData, i.e. counted in dwords */
+    const unsigned dstOffset = ((uint64_t)pBuf + 2 * PAGE_SIZE - (uint64_t)stackData) / 4;
+    const unsigned sdmaOffset = dstOffset + PAGE_SIZE;
+
+    *globalData = 0xdeadbeef;
+
+    HsaSVMRange srcBuffer((globalData), PAGE_SIZE, defaultGPUNode);
+    HsaSVMRange dstBuffer(&stackData[dstOffset], PAGE_SIZE, defaultGPUNode);
+    HsaSVMRange sdmaBuffer(&stackData[sdmaOffset], PAGE_SIZE, defaultGPUNode);
+
+    /* Create PM4 and SDMA queues before fork+COW to test queue
+     * eviction and restore
+     */
+    PM4Queue pm4Queue;
+    SDMAQueue sdmaQueue;
+    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
+    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
+
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
+    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    Dispatch dispatch0(isaBuffer);
+    dispatch0.SetArgs(srcBuffer.As(), dstBuffer.As());
+    dispatch0.Submit(pm4Queue);
+    dispatch0.Sync(g_TestTimeOut);
+
+    sdmaQueue.PlaceAndSubmitPacket(SDMAWriteDataPacket(sdmaQueue.GetFamilyId(),
+            sdmaBuffer.As(), 0x12345678));
+
+    sdmaQueue.Wait4PacketConsumption();
+    EXPECT_TRUE(WaitOnValue(&stackData[sdmaOffset], 0x12345678));
+
+    /* Fork a child process to mark pages as COW */
+    pid_t pid = fork();
+    ASSERT_GE(pid, 0);
+    if (pid == 0) {
+        /* Child process waits for a SIGTERM from the parent. It can't
+         * make any write access to the stack because we want the
+         * parent to make the first write access and get a new copy. A
+         * busy loop is the safest way to do that, since any function
+         * call (e.g. sleep) would write to the stack.
+         */
+        while (1)
+        {}
+        WARN() << "Shouldn't get here!" << std::endl;
+        exit(0);
+    }
+
+    /* Parent process writes to COW page(s) and gets a new copy. MMU
+     * notifier needs to update the GPU mapping(s) for the test to
+     * pass.
+     */
+    *globalData = 0xD00BED00;
+    stackData[dstOffset] = 0xdeadbeef;
+    stackData[sdmaOffset] = 0xdeadbeef;
+
+    /* Terminate the child process before a possible test failure that
+     * would leave it spinning in the background indefinitely.
+     */
+    int status;
+    EXPECT_EQ(0, kill(pid, SIGTERM));
+    EXPECT_EQ(pid, waitpid(pid, &status, 0));
+    EXPECT_NE(0, WIFSIGNALED(status));
+    EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+    /* Now check that the GPU is accessing the correct page */
+    Dispatch dispatch1(isaBuffer);
+    dispatch1.SetArgs(srcBuffer.As(), dstBuffer.As());
+    dispatch1.Submit(pm4Queue);
+    dispatch1.Sync(g_TestTimeOut);
+
+    sdmaQueue.PlaceAndSubmitPacket(SDMAWriteDataPacket(sdmaQueue.GetFamilyId(),
+            sdmaBuffer.As(), 0xD0BED0BE));
+    sdmaQueue.Wait4PacketConsumption();
+
+    EXPECT_SUCCESS(pm4Queue.Destroy());
+    EXPECT_SUCCESS(sdmaQueue.Destroy());
+
+    EXPECT_EQ(0xD00BED00, *globalData);
+    EXPECT_EQ(0xD00BED00, stackData[dstOffset]);
+    EXPECT_EQ(0xD0BED0BE, stackData[sdmaOffset]);
+
+    TEST_END
+}
+
+/* Registers a 16 page mapping as one SVM range, unmaps 4 pages starting at
+ * offset 8192 and checks that the GPU can still read the fill pattern
+ * from the parts in front of and behind the hole.
+ */
+TEST_F(KFDSVMRangeTest, PartialUnmapSysMemTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (!SVMAPISupported())
+        return;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    unsigned int BufSize = 16 * PAGE_SIZE;
+    void *pBuf;
+
+    PM4Queue queue;
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
+    HsaSVMRange *sysBuffer;
+    HsaSVMRange destSysBuffer(BufSize, defaultGPUNode);
+
+    pBuf = mmap(0, BufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ASSERT_NE(MAP_FAILED, pBuf);
+    sysBuffer = new HsaSVMRange(pBuf, BufSize, defaultGPUNode, 0);
+    sysBuffer->Fill(0x01010101);
+
+    /* pBuf2 is the start of the hole, pBuf3 the first byte behind it */
+    char *pBuf2 = reinterpret_cast(pBuf) + 8192;
+    unsigned int Buf2Size = 4 * PAGE_SIZE;
+    char *pBuf3 = pBuf2 + Buf2Size;
+
+    munmap(pBuf2, Buf2Size);
+
+    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+
+    Dispatch dispatch(isaBuffer);
+    Dispatch dispatch2(isaBuffer);
+
+    dispatch.SetArgs(pBuf3, destSysBuffer.As());
+    dispatch.Submit(queue);
+    dispatch.Sync(g_TestTimeOut);
+    EXPECT_EQ(destSysBuffer.As()[0], 0x01010101);
+
+    dispatch2.SetArgs(pBuf, destSysBuffer.As());
+    dispatch2.Submit(queue);
+    dispatch2.Sync(g_TestTimeOut);
+
+    EXPECT_EQ(destSysBuffer.As()[0], 0x01010101);
+
+    EXPECT_SUCCESS(queue.Destroy());
+    /* the range object was heap allocated above and is owned by this test */
+    delete sysBuffer;
+    munmap(pBuf, BufSize);
+
+    TEST_END
+}
+
+/* Copies one dword from a system SVM range into an SVM range created with
+ * defaultGPUNode as extra argument, then from there into a second system
+ * range, and checks the fill pattern at the end of the chain.
+ */
+TEST_F(KFDSVMRangeTest, BasicVramTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (!SVMAPISupported())
+        return;
+
+    PM4Queue queue;
+    unsigned int BufferSize = PAGE_SIZE;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (!GetVramSize(defaultGPUNode)) {
+        LOG() << "Skipping test: No VRAM found." << std::endl;
+        return;
+    }
+
+    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
+    HsaSVMRange srcSysBuffer(BufferSize, defaultGPUNode);
+    HsaSVMRange locBuffer(BufferSize, defaultGPUNode, defaultGPUNode);
+    HsaSVMRange destSysBuffer(BufferSize, defaultGPUNode);
+
+    srcSysBuffer.Fill(0x01010101);
+
+    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
+    queue.SetSkipWaitConsump(0);
+
+    Dispatch dispatch(isaBuffer);
+    Dispatch dispatch2(isaBuffer);
+
+    dispatch.SetArgs(srcSysBuffer.As(), locBuffer.As());
+    dispatch.Submit(queue);
+    dispatch.Sync(g_TestTimeOut);
+
+    dispatch2.SetArgs(locBuffer.As(), destSysBuffer.As());
+    dispatch2.Submit(queue);
+    dispatch2.Sync(g_TestTimeOut);
+
+    EXPECT_SUCCESS(queue.Destroy());
+
+    EXPECT_EQ(destSysBuffer.As()[0], 0x01010101);
+    TEST_END
+}
+
+TEST_F(KFDSVMRangeTest, SplitVramRangeTest) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    if (!SVMAPISupported())
+        return;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (m_FamilyId < FAMILY_AI) {
+        LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "."
<< std::endl; + return; + } + + SplitRangeTest(defaultGPUNode, defaultGPUNode); + TEST_END +} + +TEST_F(KFDSVMRangeTest, PrefetchTest) { + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + unsigned int BufSize = 16 << 10; + HsaSVMRange *sysBuffer; + uint32_t node_id; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + sysBuffer = new HsaSVMRange(BufSize, defaultGPUNode); + char *pBuf = sysBuffer->As(); + /* Using invalid svm range to get prefetch node should return failed */ + delete sysBuffer; + EXPECT_SUCCESS(!SVMRangeGetPrefetchNode(pBuf, BufSize, &node_id)); + + sysBuffer = new HsaSVMRange(BufSize, defaultGPUNode); + pBuf = sysBuffer->As(); + char *pLocBuf = pBuf + BufSize / 2; + + EXPECT_SUCCESS(SVMRangeGetPrefetchNode(pBuf, BufSize, &node_id)); + EXPECT_EQ(node_id, 0); + + EXPECT_SUCCESS(SVMRangePrefetchToNode(pLocBuf, BufSize / 2, defaultGPUNode)); + + EXPECT_SUCCESS(SVMRangeGetPrefetchNode(pLocBuf, BufSize / 2, &node_id)); + EXPECT_EQ(node_id, defaultGPUNode); + + EXPECT_SUCCESS(SVMRangeGetPrefetchNode(pBuf, BufSize, &node_id)); + EXPECT_EQ(node_id, 0xffffffff); + delete sysBuffer; + + TEST_END +} + +TEST_F(KFDSVMRangeTest, MigrateTest) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + if (m_FamilyId < FAMILY_AI) { + LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl; + return; + } + + if (!GetVramSize(defaultGPUNode)) { + LOG() << "Skipping test: No VRAM found." 
<< std::endl; + return; + } + + HSAuint32 migrateRepeat = 8; + unsigned int BufferSize = 16 << 20; + + HsaSVMRange DataBuffer(BufferSize, defaultGPUNode); + HSAuint32 *pData = DataBuffer.As(); + + HsaSVMRange SysBuffer(BufferSize, defaultGPUNode); + HSAuint32 *pBuf = SysBuffer.As(); + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, 0)); + + HsaSVMRange SysBuffer2(BufferSize, defaultGPUNode); + HSAuint32 *pBuf2 = SysBuffer2.As(); + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf2, BufferSize, 0)); + + SDMAQueue sdmaQueue; + ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + + for (HSAuint32 i = 0; i < BufferSize / 4; i++) + pData[i] = i; + + while (migrateRepeat--) { + /* Migrate from ram to vram */ + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, defaultGPUNode)); + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf2, BufferSize, defaultGPUNode)); + /* Update content in migrated buffer in vram */ + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pBuf, pData, BufferSize)); + sdmaQueue.Wait4PacketConsumption(); + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pBuf2, pData, BufferSize)); + sdmaQueue.Wait4PacketConsumption(); + + /* Migrate from vram to ram + * CPU access the buffer migrated to vram have page fault + * page fault trigger migration from vram back to ram + * so SysBuffer should have same value as in vram + */ + for (HSAuint32 i = 0; i < BufferSize / 4; i++) { + ASSERT_EQ(i, pBuf[i]); + ASSERT_EQ(i, pBuf2[i]); + } + } + + /* If xnack off, after migrating back to ram, GPU mapping should be updated to ram + * test if shade can read from ram + * If xnack on, GPU mapping should be cleared, test if GPU vm fault can update + * page table and shade can read from ram. 
+ */ + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pBuf, pData, BufferSize)); + sdmaQueue.Wait4PacketConsumption(); + for (HSAuint32 i = 0; i < BufferSize / 4; i++) + ASSERT_EQ(i, pBuf[i]); + + TEST_END +} + +/* + * The test changes migration granularity, then trigger CPU page fault to migrate + * the svm range from vram to ram. + * Check the dmesg driver output to confirm the number of CPU page fault is correct + * based on granularity. + * + * For example, this is BufferPages = 5, while granularity change from 2 to 0 + * [ 292.623498] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597ee000 + * [ 292.623727] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597f0000 + * [ 292.724414] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597ee000 + * [ 292.724824] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597f0000 + * [ 292.725094] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597f2000 + * [ 292.728186] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597ee000 + * [ 292.729171] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597ef000 + * [ 292.729576] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597f0000 + * [ 292.730010] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597f1000 + * [ 292.730931] amdgpu:svm_migrate_to_ram:744: CPU page fault address 0x7f22597f2000 + */ +TEST_F(KFDSVMRangeTest, MigrateGranularityTest) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + if (m_FamilyId < FAMILY_AI) { + LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl; + return; + } + + if (!GetVramSize(defaultGPUNode)) { + LOG() << "Skipping test: No VRAM found." 
<< std::endl; + return; + } + + HSAuint64 BufferPages = 16384; + HSAuint64 BufferSize = BufferPages * PAGE_SIZE; + HsaSVMRange SysBuffer(BufferSize, defaultGPUNode); + HSAint32 *pBuf = SysBuffer.As(); + + HsaSVMRange SysBuffer2(BufferSize, defaultGPUNode); + HSAint32 *pBuf2 = SysBuffer2.As(); + + HSAint32 Granularity; + + SDMAQueue sdmaQueue; + ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + + for (Granularity = 0; (1ULL << Granularity) <= BufferPages; Granularity++); + for (HSAuint32 i = 0; i < BufferPages; i++) + pBuf2[i * PAGE_SIZE / 4] = i; + + while (Granularity--) { + /* Prefetch the entire range to vram */ + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, defaultGPUNode)); + EXPECT_SUCCESS(SVMRangSetGranularity(pBuf, BufferSize, Granularity)); + + /* Change Buffer content in vram, then migrate it back to ram */ + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pBuf, pBuf2, BufferSize)); + sdmaQueue.Wait4PacketConsumption(); + + /* Migrate from vram to ram */ + for (HSAuint32 i = 0; i < BufferPages; i++) + ASSERT_EQ(i, pBuf[i * PAGE_SIZE / 4]); + } + TEST_END +} + +TEST_F(KFDSVMRangeTest, MigrateLargeBufTest) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + PM4Queue queue; + HSAuint64 AlternateVAGPU; + unsigned long BufferSize = 1L << 30; + unsigned long maxSDMASize = 128L << 20; /* IB size is 4K */ + unsigned long Size, i; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + if (!GetVramSize(defaultGPUNode)) { + LOG() << "Skipping test: No VRAM found." 
<< std::endl; + return; + } + + HsaSVMRange SysBuffer(BufferSize, defaultGPUNode); + SysBuffer.Fill(0x1); + + HsaSVMRange SysBuffer2(BufferSize, defaultGPUNode); + SysBuffer2.Fill(0x2); + + /* Migrate from ram to vram + * using same address to register to GPU to trigger migration + * so LocalBuffer will have same value as SysBuffer + */ + HsaSVMRange LocalBuffer(SysBuffer.As(), BufferSize, defaultGPUNode, defaultGPUNode); + + SDMAQueue sdmaQueue; + + ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + for (i = 0; i < BufferSize; i += Size) { + Size = (BufferSize - i) > maxSDMASize ? maxSDMASize : (BufferSize - i); + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + SysBuffer2.As() + i, LocalBuffer.As() + i, Size)); + sdmaQueue.Wait4PacketConsumption(); + } + + /* Check content in migrated buffer in vram */ + for (i = 0; i < BufferSize / 4; i += 1024) + ASSERT_EQ(0x1, SysBuffer2.As()[i]); + + /* Change LocalBuffer content in vram, then migrate it back to ram */ + SysBuffer2.Fill(0x3); + + for (i = 0; i < BufferSize; i += Size) { + Size = (BufferSize - i) > maxSDMASize ? maxSDMASize : (BufferSize - i); + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + LocalBuffer.As() + i, SysBuffer2.As() + i, Size)); + sdmaQueue.Wait4PacketConsumption(); + } + + /* Migrate from vram to ram + * CPU access the buffer migrated to vram have page fault + * page fault trigger migration from vram back to ram + * so SysBuffer should have same value as in LocalBuffer + */ + EXPECT_SUCCESS(SVMRangSetGranularity(SysBuffer.As(), BufferSize, 30)); + for (i = 0; i < BufferSize / 4; i += 1024) + ASSERT_EQ(0x3, SysBuffer.As()[i]); + + /* After migrating back to ram, GPU mapping should be updated to ram + * test if shade can read from ram + */ + SysBuffer.Fill(0x4); + + for (i = 0; i < BufferSize; i += Size) { + Size = (BufferSize - i) > maxSDMASize ? 
maxSDMASize : (BufferSize - i); + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + SysBuffer2.As() + i, LocalBuffer.As() + i, Size)); + sdmaQueue.Wait4PacketConsumption(); + } + + for (i = 0; i < BufferSize / 4; i += 1024) + ASSERT_EQ(0x4, SysBuffer2.As()[i]); + + TEST_END +} + +TEST_F(KFDSVMRangeTest, MigratePolicyTest) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + if (m_FamilyId < FAMILY_AI) { + LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl; + return; + } + + if (!GetVramSize(defaultGPUNode)) { + LOG() << "Skipping test: No VRAM found." << std::endl; + return; + } + + unsigned long BufferSize = 1UL << 20; + + HsaSVMRange DataBuffer(BufferSize, defaultGPUNode); + HSAuint64 *pData = DataBuffer.As(); + + HsaSVMRange SysBuffer(BufferSize, defaultGPUNode); + HSAuint64 *pBuf = SysBuffer.As(); + + SDMAQueue sdmaQueue; + ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + + for (HSAuint64 i = 0; i < BufferSize / 8; i++) + pData[i] = i; + + /* Prefetch to migrate from ram to vram */ + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, defaultGPUNode)); + + /* Update content in migrated buffer in vram */ + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pBuf, pData, BufferSize)); + sdmaQueue.Wait4PacketConsumption(NULL, HSA_EVENTTIMEOUT_INFINITE); + + /* Migrate from vram to ram + * CPU access the buffer migrated to vram have page fault + * page fault trigger migration from vram back to ram + * so SysBuffer should have same value as in vram + */ + for (HSAuint64 i = 0; i < BufferSize / 8; i++) { + ASSERT_EQ(i, pBuf[i]); + /* Update buf */ + pBuf[i] = i + 1; + } + + /* Migrate from ram to vram if xnack on + * If xnack off, after migrating back 
to ram, GPU mapping should be updated to ram + * test if shade can read from ram + * If xnack on, GPU mapping should be cleared, test if GPU vm fault can update + * page table and shade can read from ram. + */ +//#define USE_PM4_QUEUE_TRIGGER_VM_FAULT +#ifdef USE_PM4_QUEUE_TRIGGER_VM_FAULT + HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); + PM4Queue queue; + m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); + + for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) { + Dispatch dispatch(isaBuffer); + + dispatch.SetArgs(pBuf + i, pData + i); + dispatch.Submit(queue); + dispatch.Sync(HSA_EVENTTIMEOUT_INFINITE); + } +#else + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pData, pBuf, BufferSize)); + sdmaQueue.Wait4PacketConsumption(NULL, HSA_EVENTTIMEOUT_INFINITE); +#endif + + for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) + ASSERT_EQ(i + 1, pData[i]); + + ASSERT_SUCCESS(sdmaQueue.Destroy()); + + TEST_END +} + +/* Multiple GPU migration test + * + * Steps: + * 1. Prefetch pBuf, pData to all GPUs, to test migration from GPU to GPU + * 2. Use sdma queue on all GPUs, to copy data from pBuf to pData + * 3. Check pData data + * + * Notes: + * With xnack on, step 2 will have retry fault on pBuf, to migrate from GPU to GPU, + * retry fault on pData, to migrate from CPU to GPU + * + * With xnack off, pBuf and pData should prefetch to CPU to ensure multiple GPU access + * + * step3 migrate pData from GPU to CPU + * + * Test will skip if only one GPU found + */ +TEST_F(KFDSVMRangeTest, MultiGPUMigrationTest) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + if (m_FamilyId < FAMILY_AI) { + LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." 
<< std::endl; + return; + } + + const std::vector gpuNodes = m_NodeInfo.GetNodesWithGPU(); + if (gpuNodes.size() < 2) { + LOG() << "Skipping test: at least two GPUs needed." << std::endl; + return; + } + + unsigned long BufferSize = 1UL << 20; + + HsaSVMRange SysBuffer(BufferSize, defaultGPUNode); + HSAuint64 *pBuf = SysBuffer.As(); + HsaSVMRange DataBuffer(BufferSize, defaultGPUNode); + HSAuint64 *pData = DataBuffer.As(); + + SDMAQueue sdmaQueue; + + for (HSAuint64 i = 0; i < BufferSize / 8; i++) + pBuf[i] = i; + + for (HSAuint64 gpuidx = 0; gpuidx < gpuNodes.size(); gpuidx++) { + EXPECT_SUCCESS(SVMRangeMapToNode(pBuf, BufferSize, gpuNodes.at(gpuidx))); + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, gpuNodes.at(gpuidx))); + + EXPECT_SUCCESS(SVMRangeMapToNode(pData, BufferSize, gpuNodes.at(gpuidx))); + EXPECT_SUCCESS(SVMRangePrefetchToNode(pData, BufferSize, gpuNodes.at(gpuidx))); + } + + for (HSAuint64 gpuidx = 0; gpuidx < gpuNodes.size(); gpuidx++) { + ASSERT_SUCCESS(sdmaQueue.Create(gpuNodes.at(gpuidx))); + + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pData, pBuf, BufferSize)); + sdmaQueue.Wait4PacketConsumption(); + + for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) + ASSERT_EQ(i, pData[i]); + + EXPECT_SUCCESS(sdmaQueue.Destroy()); + } + + TEST_END +} + +/* Multiple GPU access in place test + * + * Steps: + * 1. Prefetch pBuf, pData to all GPUs, with ACCESS_IN_PLACE on GPUs + * 2. Use sdma queue on all GPUs, to copy data from pBuf to pData + * 3. Prefetch pData to CPU, check pData data + * + * Notes: + * With xnack on, step 2 will have retry fault on pBuf, to migrate from GPU to GPU. + * If multiple GPU on xGMI same hive, there should not have retry fault on pBuf + * because mapping should update to another GPU vram through xGMI + * + * With xnack off, pBuf and pData should prefetch to CPU to ensure multiple GPU access + * + * step3 migrate pData from GPU to CPU, should not have retry fault on GPUs. 
+ *
+ * Test will skip if only one GPU found
+ */
+TEST_F(KFDSVMRangeTest, MultiGPUAccessInPlaceTest) {
+    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
+    TEST_START(TESTPROFILE_RUNALL);
+
+    if (!SVMAPISupported())
+        return;
+
+    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
+
+    if (m_FamilyId < FAMILY_AI) {
+        LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." << std::endl;
+        return;
+    }
+
+    /* The test only makes sense when the range can move between two GPUs. */
+    const auto &gpuNodes = m_NodeInfo.GetNodesWithGPU();
+    if (gpuNodes.size() < 2) {
+        LOG() << "Skipping test: at least two GPUs needed." << std::endl;
+        return;
+    }
+
+    unsigned long BufferSize = 1UL << 20;
+
+    /* pBuf is the copy source, pData the copy destination. */
+    HsaSVMRange SysBuffer(BufferSize, defaultGPUNode);
+    HSAuint64 *pBuf = SysBuffer.As<HSAuint64 *>();
+    HsaSVMRange DataBuffer(BufferSize, defaultGPUNode);
+    HSAuint64 *pData = DataBuffer.As<HSAuint64 *>();
+
+    SDMAQueue sdmaQueue;
+
+    /* Each 64-bit word holds its own index so the copy can be verified. */
+    for (HSAuint64 i = 0; i < BufferSize / 8; i++)
+        pBuf[i] = i;
+
+    /* Step 1: set ACCESS_IN_PLACE for every GPU and prefetch both ranges to it. */
+    for (HSAuint64 gpuidx = 0; gpuidx < gpuNodes.size(); gpuidx++) {
+        EXPECT_SUCCESS(SVMRangeMapInPlaceToNode(pBuf, BufferSize, gpuNodes.at(gpuidx)));
+        EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, gpuNodes.at(gpuidx)));
+
+        EXPECT_SUCCESS(SVMRangeMapInPlaceToNode(pData, BufferSize, gpuNodes.at(gpuidx)));
+        EXPECT_SUCCESS(SVMRangePrefetchToNode(pData, BufferSize, gpuNodes.at(gpuidx)));
+    }
+
+    /* Steps 2 and 3: copy pBuf to pData with an SDMA queue on each GPU in
+     * turn, then sample every 512th word of pData from the CPU.
+     */
+    for (HSAuint64 gpuidx = 0; gpuidx < gpuNodes.size(); gpuidx++) {
+        ASSERT_SUCCESS(sdmaQueue.Create(gpuNodes.at(gpuidx)));
+
+        sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(),
+                                                          pData, pBuf, BufferSize));
+        sdmaQueue.Wait4PacketConsumption();
+
+        for (HSAuint64 i = 0; i < BufferSize / 8; i += 512)
+            ASSERT_EQ(i, pData[i]);
+
+        EXPECT_SUCCESS(sdmaQueue.Destroy());
+    }
+
+    TEST_END
+}
+
+/* Multiple thread migration test
+ *
+ * 2 threads do migration at same time to test range migration race condition handling.
+ *
+ * Steps:
+ * 1. 
register 128MB range on system memory, don't map to GPU, 128MB is max size to put in + * sdma queue 4KB IB buffer. + * 2. one thread prefetch range to GPU, another thread use sdma queue to access range at same + * time to generate retry vm fault to migrate range to GPU + * 3. one thread prefetch range to CPU, another thread read range to generate CPU page fault + * to migrate range to CPU at same time + * 4. loop test step 2 and 3 twice, to random CPU/GPU fault and prefetch migration order + */ +struct ReadThreadParams { + HSAuint64* pBuf; + HSAint64 BufferSize; + int defaultGPUNode; +}; + +unsigned int CpuReadThread(void* p) { + struct ReadThreadParams* pArgs = reinterpret_cast(p); + + for (HSAuint64 i = 0; i < pArgs->BufferSize / 8; i += 512) + EXPECT_EQ(i, pArgs->pBuf[i]); + return 0; +} + +unsigned int GpuReadThread(void* p) { + struct ReadThreadParams* pArgs = reinterpret_cast(p); + + EXPECT_SUCCESS(SVMRangePrefetchToNode(pArgs->pBuf, pArgs->BufferSize, pArgs->defaultGPUNode)); + return 0; +} + +TEST_F(KFDSVMRangeTest, MultiThreadMigrationTest) { + TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); + TEST_START(TESTPROFILE_RUNALL); + + if (!SVMAPISupported()) + return; + + int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); + ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; + + if (m_FamilyId < FAMILY_AI) { + LOG() << std::hex << "Skipping test: No svm range support for family ID 0x" << m_FamilyId << "." 
<< std::endl; + return; + } + + unsigned long test_loops = 2; + unsigned long BufferSize = 1UL << 27; + HsaSVMRange SysBuffer(BufferSize, defaultGPUNode); + HSAuint64 *pBuf = SysBuffer.As(); + HsaSVMRange DataBuffer(BufferSize, defaultGPUNode); + HSAuint64 *pData = DataBuffer.As(); + SDMAQueue sdmaQueue; + uint64_t threadId; + struct ReadThreadParams params; + + params.pBuf = pBuf; + params.BufferSize = BufferSize; + params.defaultGPUNode = defaultGPUNode; + + EXPECT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); + + for (HSAuint64 i = 0; i < BufferSize / 8; i++) + pBuf[i] = i; + + for (HSAuint64 i = 0; i < test_loops; i++) { + /* 2 threads migrate to GPU */ + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), + pData, pBuf, BufferSize)); + ASSERT_EQ(true, StartThread(&GpuReadThread, ¶ms, threadId)); + sdmaQueue.Wait4PacketConsumption(); + WaitForThread(threadId); + + /* 2 threads migrate to cpu */ + ASSERT_EQ(true, StartThread(&CpuReadThread, ¶ms, threadId)); + EXPECT_SUCCESS(SVMRangePrefetchToNode(pBuf, BufferSize, 0)); + WaitForThread(threadId); + } + + EXPECT_SUCCESS(sdmaQueue.Destroy()); + + TEST_END +} diff --git a/tests/kfdtest/src/KFDSVMRangeTest.hpp b/tests/kfdtest/src/KFDSVMRangeTest.hpp new file mode 100644 index 0000000000..88bddd94a8 --- /dev/null +++ b/tests/kfdtest/src/KFDSVMRangeTest.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef __KFD_SVMRANGE_TEST__H__ +#define __KFD_SVMRANGE_TEST__H__ + +#include + +#include "IsaGenerator.hpp" +#include "KFDBaseComponentTest.hpp" + +class KFDSVMRangeTest : public KFDBaseComponentTest { + public: + KFDSVMRangeTest() :m_pIsaGen(NULL) {} + ~KFDSVMRangeTest() {} + void SplitRangeTest(int defaultGPUNode, int prefetch_location); + + protected: + virtual void SetUp(); + virtual void TearDown(); + + protected: // Members + IsaGenerator* m_pIsaGen; +}; + +#endif // __KFD_LOCALMEMORY_TEST__H__ diff --git a/tests/kfdtest/src/KFDTestUtil.cpp b/tests/kfdtest/src/KFDTestUtil.cpp index 8651eaa51d..e52e7b496b 100644 --- a/tests/kfdtest/src/KFDTestUtil.cpp +++ b/tests/kfdtest/src/KFDTestUtil.cpp @@ -22,9 +22,9 @@ */ #include "KFDTestUtil.hpp" - #include #include +#include #include #include #include "BaseQueue.hpp" @@ -694,3 +694,181 @@ const bool HsaNodeInfo::IsNodeXGMItoCPU(int node) const { return ret; } + +HSAKMT_STATUS RegisterSVMRange(HSAuint32 GPUNode, void *MemoryAddress, + HSAuint64 SizeInBytes, HSAuint32 PrefetchNode, + HSAuint32 SVMFlags) { + HSA_SVM_ATTRIBUTE *attrs; + HSAuint64 s_attr; + HSAuint32 nattr; + HSAKMT_STATUS r; + + nattr = 4; + s_attr = sizeof(*attrs) * nattr; + attrs = (HSA_SVM_ATTRIBUTE *)alloca(s_attr); + + attrs[0].type = HSA_SVM_ATTR_PREFETCH_LOC; + attrs[0].value = PrefetchNode; + attrs[1].type = HSA_SVM_ATTR_PREFERRED_LOC; + attrs[1].value = PrefetchNode; + attrs[2].type = HSA_SVM_ATTR_SET_FLAGS; + attrs[2].value = SVMFlags; + attrs[3].type = HSA_SVM_ATTR_ACCESS; + attrs[3].value = GPUNode; + + r = hsaKmtSVMSetAttr(MemoryAddress, SizeInBytes, nattr, attrs); + if (r) { + LOG() << "set range attrs failed" << std::endl; + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS SVMRangeGetPrefetchNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 *PrefetchNode) { + HSA_SVM_ATTRIBUTE attr; + int r; + + attr.type = HSA_SVM_ATTR_PREFETCH_LOC; + attr.value = 0; + + r = 
hsaKmtSVMGetAttr(MemoryAddress, SizeInBytes, 1, &attr); + if (r) { + LOG() << "get prefetch node failed" << std::endl; + return HSAKMT_STATUS_ERROR; + } + + *PrefetchNode = attr.value; + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS SVMRangePrefetchToNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 PrefetchNode) { + HSA_SVM_ATTRIBUTE attr; + int r; + + attr.type = HSA_SVM_ATTR_PREFETCH_LOC; + attr.value = PrefetchNode; + + r = hsaKmtSVMSetAttr(MemoryAddress, SizeInBytes, 1, &attr); + if (r) { + LOG() << "set prefetch node failed" << std::endl; + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS SVMRangeMapToNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 NodeID) { + HSA_SVM_ATTRIBUTE attr; + int r; + + attr.type = HSA_SVM_ATTR_ACCESS; + attr.value = NodeID; + + r = hsaKmtSVMSetAttr(MemoryAddress, SizeInBytes, 1, &attr); + if (r) { + LOG() << "set map to node failed" << std::endl; + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS SVMRangeMapInPlaceToNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 NodeID) { + HSA_SVM_ATTRIBUTE attr; + int r; + + attr.type = HSA_SVM_ATTR_ACCESS_IN_PLACE; + attr.value = NodeID; + + r = hsaKmtSVMSetAttr(MemoryAddress, SizeInBytes, 1, &attr); + if (r) { + LOG() << "set map in place to node failed" << std::endl; + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HSAKMT_STATUS SVMRangSetGranularity(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 Granularity) { + HSA_SVM_ATTRIBUTE attr; + int r; + + attr.type = HSA_SVM_ATTR_GRANULARITY; + attr.value = Granularity; + + r = hsaKmtSVMSetAttr(MemoryAddress, SizeInBytes, 1, &attr); + if (r) { + LOG() << "set granularity failed" << std::endl; + return HSAKMT_STATUS_ERROR; + } + + return HSAKMT_STATUS_SUCCESS; +} + +HsaSVMRange::HsaSVMRange(HSAuint64 size, HSAuint32 GPUNode) : + HsaSVMRange(NULL, size, GPUNode, 0) {} + 
+HsaSVMRange::HsaSVMRange(HSAuint64 size) : + HsaSVMRange(NULL, size, 0, 0, true) {} + +HsaSVMRange::HsaSVMRange(HSAuint64 size, HSAuint32 GPUNode, HSAuint32 PrefetchNode) : + HsaSVMRange(NULL, size, GPUNode, PrefetchNode) {} + +HsaSVMRange::HsaSVMRange(void *addr, HSAuint64 size, HSAuint32 GPUNode, HSAuint32 PrefetchNode, + bool noRegister, bool isLocal, bool isExec, bool isReadOnly): + m_Size(size), + m_pUser(addr), + m_Local(isLocal), + m_Node(PrefetchNode), + m_SelfAllocated(false) { + if (!m_pUser) { + m_pUser = mmap(0, m_Size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NOTNULL(m_pUser); + m_SelfAllocated = true; + } + + if (m_Local) + m_Flags = HSA_SVM_FLAG_HOST_ACCESS; + else + m_Flags = HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_COHERENT; + + if (isReadOnly) + m_Flags |= HSA_SVM_FLAG_GPU_RO; + if (isExec) + m_Flags |= HSA_SVM_FLAG_GPU_EXEC; + + if (!noRegister) + EXPECT_SUCCESS(RegisterSVMRange(GPUNode, m_pUser, m_Size, PrefetchNode, m_Flags)); +} + +HsaSVMRange::~HsaSVMRange() { + if (m_pUser != NULL) { + if (m_SelfAllocated) + munmap(m_pUser, m_Size); + m_pUser = NULL; + } +} + +void HsaSVMRange::Fill(HSAuint32 value, HSAuint64 offset, HSAuint64 size) { + HSAuint64 i; + HSAuint32 *ptr = NULL; + + size = size ? size : m_Size; + EXPECT_EQ((size & (sizeof(HSAuint32) - 1)), 0) << "Not word aligned. 
Call Fill(unsigned char)"; + ASSERT_TRUE(size + offset <= m_Size) << "Buffer Overflow" << std::endl; + + if (m_pUser != NULL) + ptr = reinterpret_cast(reinterpret_cast(m_pUser) + offset); + + ASSERT_NOTNULL(ptr); + + for (i = 0; i < size / sizeof(HSAuint32); i++) + ptr[i] = value; +} diff --git a/tests/kfdtest/src/KFDTestUtil.hpp b/tests/kfdtest/src/KFDTestUtil.hpp index 2076e27df0..ee88ed3909 100644 --- a/tests/kfdtest/src/KFDTestUtil.hpp +++ b/tests/kfdtest/src/KFDTestUtil.hpp @@ -121,8 +121,49 @@ class HsaMemoryBuffer { unsigned int m_Node; HSAuint64 m_MappedNodes; }; +HSAKMT_STATUS RegisterSVMRange(HSAuint32 GPUNode, void *MemoryAddress, + HSAuint64 SizeInBytes, HSAuint32 PrefetchNode, + HSAuint32 SVMFlags); +HSAKMT_STATUS SVMRangeGetPrefetchNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 *PrefetchNode); +HSAKMT_STATUS SVMRangePrefetchToNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 PrefetchNode); +HSAKMT_STATUS SVMRangeMapToNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 NodeID); +HSAKMT_STATUS SVMRangeMapInPlaceToNode(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 NodeID); +HSAKMT_STATUS SVMRangSetGranularity(void *MemoryAddress, HSAuint64 SizeInBytes, + HSAuint32 Granularity); +class HsaSVMRange { + public: + HsaSVMRange(HSAuint64 size, HSAuint32 GPUNode); + HsaSVMRange(HSAuint64 size, HSAuint32 GPUNode, HSAuint32 PreferredNode); + HsaSVMRange(HSAuint64 size); + HsaSVMRange(void *addr, HSAuint64 size, HSAuint32 GPUNode, HSAuint32 PreferredNode = 0, + bool noRegister = false, bool isLocal = false, bool isExec = false, + bool isReadOnly = false); + template + RetType As() { + return reinterpret_cast(m_pUser); + } + template + const RetType As() const { + return reinterpret_cast(m_pUser); + } + ~HsaSVMRange(); + + void Fill(HSAuint32 value, HSAuint64 offset = 0, HSAuint64 size = 0); + + private: + HSAuint32 m_Flags; + HSAuint64 m_Size; + void* m_pUser; + bool m_SelfAllocated; + bool m_Local; + unsigned int 
m_Node; +}; class HsaInteropMemoryBuffer { public: