/* * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
* */ #include "KFDLocalMemoryTest.hpp" #include "PM4Queue.hpp" #include "PM4Packet.hpp" #include "SDMAPacket.hpp" #include "SDMAQueue.hpp" #include "Dispatch.hpp" // All tests are marked by their serial number in the QCM FDD void KFDLocalMemoryTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); m_pIsaGen = IsaGenerator::Create(m_FamilyId); ROUTINE_END } void KFDLocalMemoryTest::TearDown() { ROUTINE_START if (m_pIsaGen) delete m_pIsaGen; m_pIsaGen = NULL; KFDBaseComponentTest::TearDown(); ROUTINE_END } TEST_F(KFDLocalMemoryTest, BasicTest) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); PM4Queue queue; HSAuint64 AlternateVAGPU; unsigned int BufferSize = PAGE_SIZE; int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; if (!GetVramSize(defaultGPUNode)) { LOG() << "Skipping test: No VRAM found." << std::endl; return; } HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer srcSysBuffer(BufferSize, defaultGPUNode, false); HsaMemoryBuffer destSysBuffer(BufferSize, defaultGPUNode); HsaMemoryBuffer srcLocalBuffer(BufferSize, defaultGPUNode, false, true); HsaMemoryBuffer dstLocalBuffer(BufferSize, defaultGPUNode, false, true); srcSysBuffer.Fill(0x01010101); m_pIsaGen->GetCopyDwordIsa(isaBuffer); ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(srcLocalBuffer.As(), srcLocalBuffer.Size(), &AlternateVAGPU)); ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(dstLocalBuffer.As(), dstLocalBuffer.Size(), &AlternateVAGPU)); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.SetSkipWaitConsump(0); Dispatch dispatch(isaBuffer); dispatch.SetArgs(srcSysBuffer.As(), srcLocalBuffer.As()); dispatch.Submit(queue); dispatch.Sync(g_TestTimeOut); dispatch.SetArgs(srcLocalBuffer.As(), dstLocalBuffer.As()); dispatch.Submit(queue); dispatch.Sync(g_TestTimeOut); dispatch.SetArgs(dstLocalBuffer.As(), destSysBuffer.As()); dispatch.Submit(queue); dispatch.Sync(g_TestTimeOut); 
ASSERT_SUCCESS(queue.Destroy()); ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(srcLocalBuffer.As())); ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(dstLocalBuffer.As())); ASSERT_EQ(destSysBuffer.As()[0], 0x01010101); TEST_END } TEST_F(KFDLocalMemoryTest, VerifyContentsAfterUnmapAndMap) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); PM4Queue queue; HSAuint64 AlternateVAGPU; unsigned int BufferSize = PAGE_SIZE; int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; if (!GetVramSize(defaultGPUNode)) { LOG() << "Skipping test: No VRAM found." << std::endl; return; } HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer SysBufferA(BufferSize, defaultGPUNode, false); HsaMemoryBuffer SysBufferB(BufferSize, defaultGPUNode, true); HsaMemoryBuffer LocalBuffer(BufferSize, defaultGPUNode, true, true); SysBufferA.Fill(0x01010101); m_pIsaGen->GetCopyDwordIsa(isaBuffer); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.SetSkipWaitConsump(0); if (!is_dgpu()) ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(LocalBuffer.As(), LocalBuffer.Size(), &AlternateVAGPU)); Dispatch dispatch(isaBuffer); dispatch.SetArgs(SysBufferA.As(), LocalBuffer.As()); dispatch.Submit(queue); dispatch.Sync(g_TestTimeOut); ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(LocalBuffer.As())); ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(LocalBuffer.As(), LocalBuffer.Size(), &AlternateVAGPU)); dispatch.SetArgs(LocalBuffer.As(), SysBufferB.As()); dispatch.Submit(queue); dispatch.Sync(g_TestTimeOut); ASSERT_SUCCESS(queue.Destroy()); ASSERT_EQ(SysBufferB.As()[0], 0x01010101); if (!is_dgpu()) ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(LocalBuffer.As())); TEST_END } /* Deliberately fragment GPUVM aperture to fill up address space * * General idea: Allocate buffers, but don't map them to GPU. This * will reserve virtual address space without pinning physical * memory. It should allow using more address space than physically * available memory. 
*
 * Even without pinning memory, TTM will still commit memory at
 * allocation time and swap out movable buffers to system memory or
 * even the hard drive, if it needs to. So we can't allocate arbitrary
 * amounts of virtual memory.
 *
 * Strategy to maximize the amount of allocated, fragmented address
 * space while keeping the amount of committed memory bounded at all
 * times:
 *
 * 1. Allocate N blocks of a given size, initially 1 page
 * 2. Free every other block, creating holes in the address space.
 *    This frees up half the memory
 * 3. Allocate N/4 blocks of 2-pages each. This requires as much
 *    memory as was freed in step 2. The block size is bigger than
 *    the 1-page holes, so new address space will be used.
 * 4. Free half the blocks just allocated, and half of the
 *    remaining blocks of step 1. This creates 3-page holes between
 *    the 1-page blocks from step 1, and 2-page holes between the
 *    2-page blocks from step 3. It frees up half of the total
 *    memory.
 * 5. Double the block size to 4, divide number of blocks by 2.
 *    Again, this will require the amount of memory freed in step 4.
 *    The block size 4 is bigger than the biggest hole (3 pages).
 * 6. Free half the memory again, creating 7-page holes between
 *    1-page blocks, 6-page holes between 2-page blocks, and 4-page
 *    holes between 4-page blocks.
 *
 * Repeat, doubling block size and halving number of blocks in each
 * iteration. Each iteration starts and ends with half the total
 * memory free. Because the block size is always bigger than the
 * biggest hole, each iteration increases the amount of address space
 * occupied by half the total memory size. Once the block size reaches
 * half of the free memory (1/4 of total memory) the limit is reached.
 *
 * With 2^n pages available memory, n * 2^(n-1) pages of address space
 * can be reserved. At the end of that process, half the memory will
 * be free.
* * Total memory | Fragmented address space * order | pages | size | pages | size | ratio * ------+-------+------+-------+-------+------- * 2 | 4 | 16K | 4 | 16K | 1 * 3 | 8 | 32K | 12 | 48K | 1.5 * 4 | 16 | 64K | 32 | 128K | 2 * 5 | 32 | 128K | 80 | 320K | 2.5 * 6 | 64 | 256K | 192 | 768K | 3 * 7 | 128 | 512K | 448 | 1.75M | 3.5 * 8 | 256 | 1M | 1M | 4M | 4 * 9 | 512 | 2M | 2.25M | 9M | 4.5 * 10 | 1K | 4M | 5M | 20M | 5 * 11 | 2K | 8M | 11M | 44M | 5.5 * 12 | 4K | 16M | 24M | 96M | 6 * 13 | 8K | 32M | 52M | 208M | 6.5 * 14 | 16K | 64M | 112M | 448M | 7 * 15 | 32K | 128M | 240M | 960M | 7.5 * 16 | 64K | 256M | 512M | 2G | 8 * 17 | 128K | 512M | 1088M | 4.25G | 8.5 * 18 | 256K | 1G | 2.25G | 9G | 9 * 19 | 512K | 2G | 4.75G | 19G | 9.5 * 20 | 1M | 4G | 10G | 40G | 10 */ TEST_F(KFDLocalMemoryTest, Fragmentation) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; HSAuint64 fbSize; fbSize = GetVramSize(defaultGPUNode); if (!fbSize) { LOG() << "Skipping test: No VRAM found." << std::endl; return; } else { LOG() << "Found VRAM of " << std::dec << (fbSize >> 20) << "MB." << std::endl; } /* Use up to half of available memory. Using more results in * excessive memory movement in TTM and slows down the test too * much. maxOrder is the size of the biggest block that will be * allocated. It's 1/4 of the usable memory, so 1/8 the total FB * size in pages. * * Use 8x bigger page size on dGPU to match Tonga alignment * workaround. Also nicely matches the 8x bigger GPUVM address * space on AMDGPU compared to RADEON. */ unsigned pageSize = is_dgpu() ? 
PAGE_SIZE*8 : PAGE_SIZE; fbSize /= pageSize; unsigned maxOrder = 0; // Limit maxOrder up to 14 so this test doesn't run longer than 10 mins while (((fbSize >> maxOrder) >= 16) && (maxOrder < 14)) maxOrder++; /* Queue and memory used by the shader copy tests */ HsaMemoryBuffer sysBuffer(PAGE_SIZE, defaultGPUNode, false); PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); m_pIsaGen->GetCopyDwordIsa(isaBuffer); /* Allocate and test memory using the strategy explained at the top */ HSAKMT_STATUS status; HsaMemFlags memFlags = {0}; memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; memFlags.ui32.HostAccess = 0; memFlags.ui32.NonPaged = 1; struct { void **pointers; unsigned long nPages; } pages[maxOrder+1]; unsigned order, o; unsigned long p; HSAuint64 size; unsigned value = 0; memset(pages, 0, sizeof(pages)); for (order = 0; order <= maxOrder; order++) { // At maxOrder, block sizes is 1/4 of available memory pages[order].nPages = 1UL << (maxOrder - order + 2); // At order != 0, 1/2 the memory is already allocated if (order > 0) pages[order].nPages >>= 1; // Allocate page pointers pages[order].pointers = new void *[pages[order].nPages]; EXPECT_NE((void **)NULL, pages[order].pointers) << "Couldn't allocate memory for " << pages[order].nPages << " pointers at order " << order << std::endl; if (!pages[order].pointers) { pages[order].nPages = 0; break; } /* Allocate buffers and access the start and end of every one: * 1. Copy from sysBuffer[0] to start of block * 2. Copy from start of block to end of block * 3. Copy from end of block to sysBuffer[1] * 4. 
Compare results */ size = (HSAuint64)(1 << order) * pageSize; LOG() << std::dec << "Trying to allocate " << pages[order].nPages << " order " << order << " blocks " << std::endl; for (p = 0; p < pages[order].nPages; p++) { status = hsaKmtAllocMemory(defaultGPUNode, size, memFlags, &pages[order].pointers[p]); if (status != HSAKMT_STATUS_SUCCESS) { EXPECT_EQ(HSAKMT_STATUS_NO_MEMORY, status); pages[order].nPages = p; break; } void *bufferEnd = reinterpret_cast(reinterpret_cast(pages[order].pointers[p]) + size - sizeof(unsigned)); sysBuffer.As()[0] = ++value; status = hsaKmtMapMemoryToGPU(pages[order].pointers[p], size, NULL); if (status != HSAKMT_STATUS_SUCCESS) { ASSERT_SUCCESS(hsaKmtFreeMemory(pages[order].pointers[p], size)); pages[order].nPages = p; break; } Dispatch dispatch1(isaBuffer); dispatch1.SetArgs(sysBuffer.As(), pages[order].pointers[p]); dispatch1.Submit(queue); // no sync needed for multiple GPU dispatches to the same queue Dispatch dispatch2(isaBuffer); dispatch2.SetArgs(pages[order].pointers[p], bufferEnd); dispatch2.Submit(queue); // no sync needed for multiple GPU dispatches to the same queue Dispatch dispatch3(isaBuffer); dispatch3.SetArgs(bufferEnd, reinterpret_cast(&(sysBuffer.As()[1]))); dispatch3.Submit(queue); dispatch3.Sync(g_TestTimeOut); EXPECT_EQ(value, sysBuffer.As()[1]); EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pages[order].pointers[p])); } LOG() << " Got " << pages[order].nPages << ", end of last block addr: " << reinterpret_cast(reinterpret_cast(pages[order].pointers[p-1]) + size - 1) << std::endl; // Now free half the memory for (o = 0; o <= order; o++) { unsigned long step = 1UL << (order - o + 1); unsigned long offset = (step >> 1) - 1; size = (HSAuint64)(1 << o) * pageSize; LOG() << " Freeing every " << step << "th order " << o << " block starting with " << offset << std::endl; for (p = offset; p < pages[o].nPages; p += step) { ASSERT_NE((void **)NULL, pages[o].pointers[p]); EXPECT_SUCCESS(hsaKmtFreeMemory(pages[o].pointers[p], 
size)); pages[o].pointers[p] = NULL; } } } /* Clean up */ for (order = 0; order <= maxOrder; order++) { if (pages[order].pointers == NULL) continue; size = (HSAuint64)(1 << order) * pageSize; for (p = 0; p < pages[order].nPages; p++) if (pages[order].pointers[p] != NULL) EXPECT_SUCCESS(hsaKmtFreeMemory(pages[order].pointers[p], size)); delete[] pages[order].pointers; } ASSERT_SUCCESS(queue.Destroy()); TEST_END } TEST_F(KFDLocalMemoryTest, CheckZeroInitializationVram) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; /* Testing VRAM */ HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20; if (!vramSizeMB) { LOG() << "Skipping test: No VRAM found." << std::endl; return; } HSAuint64 vramBufSizeMB = vramSizeMB >> 2; /* limit the buffer size in order not to overflow the SDMA queue buffer. */ if (vramBufSizeMB > 1024) { vramBufSizeMB = 1024; } HSAuint64 vramBufSize = vramBufSizeMB * 1024 * 1024; /* Make sure the entire VRAM is used at least once */ int count = (vramSizeMB + vramBufSizeMB - 1) / vramBufSizeMB + 1; LOG() << "Using " << std::dec << vramBufSizeMB << "MB VRAM buffer to test " << std::dec << count << " times"<< std::endl; SDMAQueue sdmaQueue; ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode, 8 * PAGE_SIZE)); HsaMemoryBuffer tmpBuffer(PAGE_SIZE, 0, true /* zero */); volatile HSAuint32 *tmp = tmpBuffer.As(); unsigned int offset = 2060; // a constant offset, should be 4 aligned. 
while (count--) { HsaMemoryBuffer localBuffer(vramBufSize, defaultGPUNode, false, true); EXPECT_TRUE(localBuffer.IsPattern(0, 0, sdmaQueue, tmp)); for (HSAuint64 i = offset; i < vramBufSize;) { EXPECT_TRUE(localBuffer.IsPattern(i, 0, sdmaQueue, tmp)); i += 4096; } /* Checking last 4 bytes */ EXPECT_TRUE(localBuffer.IsPattern(vramBufSize - 4, 0, sdmaQueue, tmp)); localBuffer.Fill(0xABCDEFFF, sdmaQueue); } TEST_END } TEST_F(KFDLocalMemoryTest, MapVramToGPUNodesTest) { TEST_START(TESTPROFILE_RUNALL); HSAint32 src_node; HSAint32 dst_node; HsaPointerInfo info; const std::vector gpuNodes = m_NodeInfo.GetNodesWithGPU(); if (gpuNodes.size() < 2) { LOG() << "Skipping test: Test requires at least two GPUs." << std::endl; return; } if (g_TestDstNodeId != -1 && g_TestNodeId != -1) { src_node = g_TestNodeId; dst_node = g_TestDstNodeId; } else { int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); dst_node = m_NodeInfo.FindLargeBarGPUNode(); if (dst_node < 0) { LOG() << "Skipping test: Test requires at least one large bar GPU." 
<< std::endl; return; } if (dst_node != defaultGPUNode) { /* at least one node should be defaultGPUNode */ src_node = defaultGPUNode; } else { for (auto node : gpuNodes) { if (node != dst_node) { src_node = node; break; } } } } LOG() << "Testing from GPU " << src_node << " to GPU " << dst_node << std::endl; void *shared_addr; HSAuint32 nodes[] = { (HSAuint32)src_node, (HSAuint32)dst_node }; HsaMemFlags memFlags = {0}; memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB; memFlags.ui32.HostAccess = 1; memFlags.ui32.NonPaged = 1; memFlags.ui32.ExecuteAccess = 1; HsaMemMapFlags mapFlags = {0}; EXPECT_SUCCESS(hsaKmtAllocMemory(nodes[1], PAGE_SIZE, memFlags, &shared_addr)); EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes(shared_addr, PAGE_SIZE, 2, nodes)); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 2, nodes)); EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info)); EXPECT_EQ(info.NRegisteredNodes, 2); EXPECT_EQ(info.NMappedNodes, 2); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[0])); EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info)); EXPECT_EQ(info.NRegisteredNodes, 2); EXPECT_EQ(info.NMappedNodes, 1); EXPECT_EQ(info.MappedNodes[0], nodes[0]); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[1])); EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info)); EXPECT_EQ(info.NRegisteredNodes, 2); EXPECT_EQ(info.NMappedNodes, 1); EXPECT_EQ(info.MappedNodes[0], nodes[1]); EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(shared_addr)); EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info)); EXPECT_EQ(info.NRegisteredNodes, 2); EXPECT_EQ(info.NMappedNodes, 0); EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes(shared_addr, PAGE_SIZE, NULL, mapFlags, 1, &nodes[0])); EXPECT_SUCCESS(hsaKmtQueryPointerInfo(shared_addr, &info)); EXPECT_EQ(info.NRegisteredNodes, 2); EXPECT_EQ(info.NMappedNodes, 1); EXPECT_EQ(info.MappedNodes[0], nodes[0]); 
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(shared_addr)); EXPECT_SUCCESS(hsaKmtFreeMemory(shared_addr, PAGE_SIZE)); TEST_END }