Files
rocm-systems/tests/kfdtest/src/KFDMemoryTest.cpp
T

2059 líneas
75 KiB
C++
Original Vista normal Histórico

2018-07-23 14:45:44 -04:00
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "KFDMemoryTest.hpp"
#include <sys/prctl.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
#include <numa.h>
2018-07-23 14:45:44 -04:00
#include <vector>
#include "Dispatch.hpp"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAQueue.hpp"
#include "SDMAPacket.hpp"
2019-04-30 15:32:01 -05:00
#include "linux/kfd_ioctl.h"
2018-07-23 14:45:44 -04:00
/* Shader source for VI ASICs: moves s0..s3 into v0..v3, programs
 * flat_scratch with fixed values, then loads one dword from the address in
 * v[0:1] and stores it to the address in v[2:3].
 */
const char* gfx8_ScratchCopyDword =
    "shader ScratchCopyDword\n"
    "asic(VI)\n"
    "type(CS)\n"
    "/*copy the parameters from scalar registers to vector registers*/\n"
    "v_mov_b32 v0, s0\n"
    "v_mov_b32 v1, s1\n"
    "v_mov_b32 v2, s2\n"
    "v_mov_b32 v3, s3\n"
    "/*set up the scratch parameters. This assumes a single 16-reg block.*/\n"
    "s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n"
    "s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n"
    "/*copy a dword between the passed addresses*/\n"
    "flat_load_dword v4, v[0:1] slc\n"
    "s_waitcnt vmcnt(0)&lgkmcnt(0)\n"
    "flat_store_dword v[2:3], v4 slc\n"
    "\n"
    "s_endpgm\n"
    "\n"
    "end\n";
/* Shader source for GFX9 ASICs: same dword copy as the VI variant, but
 * flat_scratch_lo/hi are loaded from s4/s5 instead of fixed values.
 */
const char* gfx9_ScratchCopyDword =
    "shader ScratchCopyDword\n"
    "asic(GFX9)\n"
    "type(CS)\n"
    "/*copy the parameters from scalar registers to vector registers*/\n"
    "v_mov_b32 v0, s0\n"
    "v_mov_b32 v1, s1\n"
    "v_mov_b32 v2, s2\n"
    "v_mov_b32 v3, s3\n"
    "/*set up the scratch parameters. This assumes a single 16-reg block.*/\n"
    "s_mov_b32 flat_scratch_lo, s4\n"
    "s_mov_b32 flat_scratch_hi, s5\n"
    "/*copy a dword between the passed addresses*/\n"
    "flat_load_dword v4, v[0:1] slc\n"
    "s_waitcnt vmcnt(0)&lgkmcnt(0)\n"
    "flat_store_dword v[2:3], v4 slc\n"
    "\n"
    "s_endpgm\n"
    "\n"
    "end\n";
2019-07-18 19:34:55 -04:00
/* Shader source for GFX10 ASICs: same dword copy as the GFX9 variant, built
 * for wave_size(32), with the flat scratch registers written from s4/s5
 * through s_setreg_b32.
 */
const char* gfx10_ScratchCopyDword =
    "shader ScratchCopyDword\n"
    "asic(GFX10)\n"
    "type(CS)\n"
    "wave_size(32)\n"
    "/*copy the parameters from scalar registers to vector registers*/\n"
    "v_mov_b32 v0, s0\n"
    "v_mov_b32 v1, s1\n"
    "v_mov_b32 v2, s2\n"
    "v_mov_b32 v3, s3\n"
    "/*set up the scratch parameters. This assumes a single 16-reg block.*/\n"
    "s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s4\n"
    "s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s5\n"
    "/*copy a dword between the passed addresses*/\n"
    "flat_load_dword v4, v[0:1] slc\n"
    "s_waitcnt vmcnt(0)&lgkmcnt(0)\n"
    "flat_store_dword v[2:3], v4 slc\n"
    "\n"
    "s_endpgm\n"
    "\n"
    "end\n";
2018-07-23 14:45:44 -04:00
/* Continuously poll the src buffer and check its value.
 * Once the src buffer holds the specific value 0x5678 (written by the host
 * program), write the same value (0x5678) to the dst buffer and quit.
 *
 * The source carries no asic() directive, only wave_size(32).
 */
const char* gfx9_PollMemory =
    "shader ReadMemory\n"
    "wave_size(32)\n"
    "type(CS)\n"
    "/* Assume src address in s0, s1 and dst address in s2, s3*/\n"
    "s_movk_i32 s18, 0x5678\n"
    "LOOP:\n"
    "s_load_dword s16, s[0:1], 0x0 glc\n"
    "s_cmp_eq_i32 s16, s18\n"
    "s_cbranch_scc0 LOOP\n"
    "s_store_dword s18, s[2:3], 0x0 glc\n"
    "s_endpgm\n"
    "end\n";
2019-04-30 15:32:01 -05:00
/* Input: A buffer of at least 3 dwords.
 * DW0: used as a signal. 0xcafe means it is signaled
 * DW1: Input buffer for device to read.
 * DW2: Output buffer for device to write.
 * This shader continuously polls the signal dword (DW0).
 * Once the signal is observed, it copies the input dword (DW1)
 * to the output dword (DW2).
 */
const char* gfx9_CopyOnSignal =
    "shader CopyOnSignal\n"
    "wave_size(32)\n"
    "type(CS)\n"
    "/* Assume input buffer in s0, s1 */\n"
    "s_mov_b32 s18, 0xcafe\n"
    "POLLSIGNAL:\n"
    "s_load_dword s16, s[0:1], 0x0 glc\n"
    "s_cmp_eq_i32 s16, s18\n"
    "s_cbranch_scc0 POLLSIGNAL\n"
    "s_load_dword s17, s[0:1], 0x4 glc\n"
    "s_waitcnt vmcnt(0) & lgkmcnt(0)\n"
    "s_store_dword s17, s[0:1], 0x8 glc\n"
    "s_waitcnt vmcnt(0) & lgkmcnt(0)\n"
    "s_endpgm\n"
    "end\n";
2019-05-30 16:09:06 -05:00
/* Input0: A buffer of at least 2 dwords.
 * DW0: used as a signal. Write 0xcafe to signal
 * DW1: Write to this buffer for other device to read.
 * Input1: mmio base address
 *
 * Store order in the shader: 0xbeef to DW1, then 0x1 to the address in
 * s[2:3], and finally 0xcafe to DW0.
 */
const char* gfx9_WriteAndSignal =
    "shader WriteAndSignal\n"
    "wave_size(32)\n"
    "type(CS)\n"
    "/* Assume input buffer in s0, s1 */\n"
    "s_mov_b32 s18, 0xbeef\n"
    "s_store_dword s18, s[0:1], 0x4 glc\n"
    "s_mov_b32 s18, 0x1\n"
    "s_store_dword s18, s[2:3], 0 glc\n"
    "s_mov_b32 s18, 0xcafe\n"
    "s_store_dword s18, s[0:1], 0x0 glc\n"
    "s_endpgm\n"
    "end\n";
// The gfx9_PollMemory, gfx9_CopyOnSignal and gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10.
2019-07-18 19:34:55 -04:00
2018-07-23 14:45:44 -04:00
void KFDMemoryTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
// Fixture teardown: releases the ISA generator created in SetUp, then runs
// the base-class teardown.
void KFDMemoryTest::TearDown() {
    ROUTINE_START

    // delete on a null pointer is a no-op, so no guard is needed here
    delete m_pIsaGen;
    m_pIsaGen = NULL;

    KFDBaseComponentTest::TearDown();

    ROUTINE_END
}
#include <sys/mman.h>
2018-08-13 09:03:31 -04:00
/* Shift x left by 30 bits, i.e. convert a count of GiB into bytes.
 * The shift is evaluated in the type of x, so pass a 64-bit operand
 * (e.g. GB(1024ULL)) when the result does not fit in an int.
 */
#define GB(x) ((x) << 30)
2018-07-23 14:45:44 -04:00
/*
 * Try to map as much system memory as possible to the GPU
 * to see if KFD supports 1TB memory correctly or not.
 * After this test case, we can observe if there are any side effects.
 * NOTICE: There are memory usage limit checks in hsa/kfd according to the total
 * physical system memory.
 *
 * A single anonymous region of (1TB / nObjects) bytes is mmap'ed once and then
 * registered and mapped up to nObjects times, each time starting one byte
 * further into the region, until a register or map call fails.
 */
TEST_F(KFDMemoryTest, MMapLarge) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL)

    if (!is_dgpu()) {
        LOG() << "Skipping test: Test not supported on APU." << std::endl;
        return;
    }

    /* Keep the node id signed for the check: on an unsigned value the
     * ">= 0" assertion could never fail.
     */
    const int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
    HSAuint32 gpuNode = static_cast<HSAuint32>(defaultGPUNode);

    const HSAuint64 nObjects = 1 << 14;

    /* Held in a vector so the storage is released on every exit path,
     * including the early return taken by a failing ASSERT_* below.
     */
    std::vector<HSAuint64> AlternateVAGPU(nObjects);
    HsaMemMapFlags mapFlags = {0};

    /* Test up to 1TB memory */
    const HSAuint64 s = GB(1024ULL) / nObjects;
    char *addr = reinterpret_cast<char*>(mmap(0, s, PROT_READ | PROT_WRITE,
                                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
    ASSERT_NE(addr, MAP_FAILED);
    memset(addr, 0, s);

    /* Allocate 1024GB, aka 1TB */
    HSAuint64 i = 0;
    for (; i < nObjects; i++) {
        if (hsaKmtRegisterMemory(addr + i, s - i))
            break;
        if (hsaKmtMapMemoryToGPUNodes(addr + i, s - i, &AlternateVAGPU[i],
                                      mapFlags, 1, &gpuNode)) {
            /* Undo the registration of the object that failed to map */
            hsaKmtDeregisterMemory(addr + i);
            break;
        }
    }

    LOG() << "Successfully registered and mapped " << (i * s >> 30)
          << "GB system memory to gpu" << std::endl;
    RECORD(i * s >> 30) << "Mmap-SysMem-Size";

    /* Tear down in reverse order, using the GPU VA returned by each map call */
    while (i--) {
        EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void*>(AlternateVAGPU[i])));
        EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void*>(AlternateVAGPU[i])));
    }

    EXPECT_EQ(0, munmap(addr, s));

    TEST_END
}
2018-08-14 09:52:31 -04:00
/* Keep memory mapped to default node
 * Keep mapping/unmapping memory to/from non-default node
 * A shader running on default node consistently accesses
 * memory - make sure memory is always accessible by default,
 * i.e. there is no gpu vm fault.
 * Synchronization b/t host program and shader:
 * 1. Host initializes src and dst buffer to 0
 * 2. Shader keeps reading src buffer and check value
 * 3. Host writes src buffer to 0x5678 to indicate quit, polling dst until it becomes 0x5678
 * 4. Shader write dst buffer to 0x5678 after src changes to 0x5678, then quits
 * 5. Host program quits after dst becomes 0x5678
 * Need at least two gpu nodes to run the test. The default node has to be a gfx9 node,
 * otherwise, test is skipped. Use kfdtest --node=$$ to specify the default node
 * This test case is introduced as a side-result of investigation of SWDEV-134798, which
 * is a gpu vm fault while running rocr conformance test. Here we try to simulate the
 * same test behaviour.
 */
TEST_F(KFDMemoryTest, MapUnmapToNodes) {
    TEST_START(TESTPROFILE_RUNALL)

    if (m_FamilyId < FAMILY_AI) {
        LOG() << "Skipping test: Test requires gfx9 and later asics." << std::endl;
        return;
    }

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    if (gpuNodes.size() < 2) {
        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
        return;
    }

    /* Signed on purpose: an unsigned node id would make the ">= 0" check vacuous */
    const int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    LOG() << "default GPU node" << defaultGPUNode << std::endl;
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    /* Pick the first GPU node that is not the default one */
    int nondefaultNode = -1;
    for (size_t i = 0; i < gpuNodes.size(); i++) {
        if (gpuNodes.at(i) != defaultGPUNode) {
            nondefaultNode = gpuNodes.at(i);
            break;
        }
    }
    ASSERT_GE(nondefaultNode, 0) << "failed to find a non-default GPU Node";

    /* mapNodes[0] is always the default node, so mapping the first 1 or 2
     * entries keeps the default node mapped in both cases.
     */
    HSAuint32 mapNodes[2] = {static_cast<HSAuint32>(defaultGPUNode),
                             static_cast<HSAuint32>(nondefaultNode)};

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    HsaMemoryBuffer srcBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);

    m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);

    PM4Queue pm4Queue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));

    /* Start the polling shader; it keeps reading srcBuffer while the loop below remaps it */
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
    dispatch0.Submit(pm4Queue);

    HsaMemMapFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;

    for (unsigned i = 0; i < (1 << 14); i++) {
        /* Alternate between 1 node (default only) and 2 nodes (default +
         * non-default) every 32 iterations. The parentheses are required:
         * '+' binds tighter than '&', so "(i>>5)&1+1" evaluates as
         * "(i>>5)&2" and yields 0 or 2 instead of 1 or 2.
         */
        const unsigned numNodes = ((i >> 5) & 1) + 1;
        hsaKmtMapMemoryToGPUNodes(srcBuffer.As<void*>(), PAGE_SIZE, NULL, memFlags, numNodes, mapNodes);
    }

    /* Fill src buffer so shader quits */
    srcBuffer.Fill(0x5678);
    WaitOnValue(dstBuffer.As<uint32_t *>(), 0x5678);
    EXPECT_EQ(*dstBuffer.As<uint32_t *>(), 0x5678);

    EXPECT_SUCCESS(pm4Queue.Destroy());

    TEST_END
}
2018-08-14 09:52:31 -04:00
// Basic test of hsaKmtMapMemoryToGPU and hsaKmtUnmapMemoryToGPU
2018-07-23 14:45:44 -04:00
TEST_F(KFDMemoryTest , MapMemoryToGPU) {
TEST_START(TESTPROFILE_RUNALL)
unsigned int *nullPtr = NULL;
unsigned int* pDb = NULL;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
2018-08-13 09:03:31 -04:00
ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
reinterpret_cast<void**>(&pDb)));
2018-07-23 14:45:44 -04:00
// verify that pDb is not null before it's being used
ASSERT_NE(nullPtr, pDb) << "hsaKmtAllocMemory returned a null pointer";
ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(pDb, PAGE_SIZE, NULL));
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
2018-07-23 14:45:44 -04:00
// Release the buffers
EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, PAGE_SIZE));
2018-07-23 14:45:44 -04:00
TEST_END
}
2018-08-14 09:52:31 -04:00
// Following tests are for hsaKmtAllocMemory with invalid params
2018-07-23 14:45:44 -04:00
TEST_F(KFDMemoryTest, InvalidMemoryPointerAlloc) {
TEST_START(TESTPROFILE_RUNALL)
EXPECT_EQ(HSAKMT_STATUS_INVALID_PARAMETER, hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, NULL));
TEST_END
}
// A zero-byte request must be rejected with HSAKMT_STATUS_INVALID_PARAMETER.
TEST_F(KFDMemoryTest, ZeroMemorySizeAlloc) {
    TEST_START(TESTPROFILE_RUNALL)

    unsigned int *unusedBuf = NULL;
    void **allocOut = reinterpret_cast<void**>(&unusedBuf);

    EXPECT_EQ(HSAKMT_STATUS_INVALID_PARAMETER,
              hsaKmtAllocMemory(0 /* system */, 0, m_MemoryFlags, allocOut));

    TEST_END
}
2018-08-14 09:52:31 -04:00
// Basic test for hsaKmtAllocMemory
2018-07-23 14:45:44 -04:00
TEST_F(KFDMemoryTest, MemoryAlloc) {
TEST_START(TESTPROFILE_RUNALL)
unsigned int* pDb = NULL;
2018-08-13 09:03:31 -04:00
EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pDb)));
2018-07-23 14:45:44 -04:00
TEST_END
}
// APU-only test: the GPU writes two dwords through a PM4 packet into memory
// obtained from VirtualAllocMemory (not allocated or registered through
// hsaKmt* in this test), and the host waits for both values to appear.
TEST_F(KFDMemoryTest, AccessPPRMem) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    if (is_dgpu()) {
        LOG() << "Skipping test: Test requires APU." << std::endl;
        return;
    }

    unsigned int *destBuf = reinterpret_cast<unsigned int *>(
        VirtualAllocMemory(NULL, PAGE_SIZE, MEM_READ | MEM_WRITE));
    // NOTE(review): assumes VirtualAllocMemory reports failure as NULL - confirm
    unsigned int *const nullBuf = NULL;
    ASSERT_NE(nullBuf, destBuf) << "VirtualAllocMemory returned a null pointer";

    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));

    queue.PlaceAndSubmitPacket(PM4WriteDataPacket(destBuf,
                                                  0xABCDEF09, 0x12345678));
    queue.Wait4PacketConsumption();

    // Check the wait results; ignoring them would let the test pass even if
    // the GPU write never became visible to the host.
    EXPECT_TRUE(WaitOnValue(destBuf, 0xABCDEF09));
    EXPECT_TRUE(WaitOnValue(destBuf + 1, 0x12345678));

    EXPECT_SUCCESS(queue.Destroy());

    /* This sleep hides the dmesg PPR message storm on Raven, which happens
     * when the CPU buffer is freed before the excessive PPRs are all
     * consumed by IOMMU HW. Because of that, a kernel driver workaround
     * is put in place to address that, so we don't need to wait here.
     */
    // sleep(5);

    VirtualFreeMemory(destBuf, PAGE_SIZE);

    TEST_END
}
// Linux OS-specific Test for registering OS allocated memory.
// Registers a static variable and two stack dwords for GPU access, forks so
// the pages become copy-on-write, writes to them in the parent, and then
// checks that GPU accesses land on the parent's current copies.
TEST_F(KFDMemoryTest, MemoryRegister) {
    const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties();
    if (isTonga(pNodeProperties)) {
        LOG() << "Skipping test: Workaround in thunk for Tonga causes failure." << std::endl;
        return;
    }

    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    /* Different unaligned memory locations to be mapped for GPU
     * access:
     *
     * - initialized data segment (file backed)
     * - stack (anonymous memory)
     *
     * Separate them enough so they are in different cache lines
     * (64-byte = 16-dword).
     */
    static volatile HSAuint32 globalData = 0xdeadbeef;
    const unsigned dstIdx = 0;
    const unsigned sdmaIdx = 16;
    volatile HSAuint32 stackData[sdmaIdx + 1] = {0};

    HsaMemoryBuffer srcBuffer(static_cast<void *>(const_cast<HSAuint32 *>(&globalData)),
                              sizeof(HSAuint32));
    HsaMemoryBuffer dstBuffer(static_cast<void *>(const_cast<HSAuint32 *>(&stackData[dstIdx])),
                              sizeof(HSAuint32));
    HsaMemoryBuffer sdmaBuffer(static_cast<void *>(const_cast<HSAuint32 *>(&stackData[sdmaIdx])),
                               sizeof(HSAuint32));

    /* Create PM4 and SDMA queues before fork+COW to test queue
     * eviction and restore
     */
    PM4Queue pm4Queue;
    SDMAQueue sdmaQueue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->GetCopyDwordIsa(isaBuffer);

    /* First submit just so the queues are not empty, and to get the
     * TLB populated (in case we need to flush TLBs somewhere after
     * updating the page tables)
     */
    Dispatch warmupDispatch(isaBuffer);
    warmupDispatch.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
    warmupDispatch.Submit(pm4Queue);
    warmupDispatch.Sync(g_TestTimeOut);

    sdmaQueue.PlaceAndSubmitPacket(
        SDMAWriteDataPacket(sdmaQueue.GetFamilyId(), sdmaBuffer.As<HSAuint32 *>(), 0x12345678));
    sdmaQueue.Wait4PacketConsumption();
    EXPECT_TRUE(WaitOnValue(&stackData[sdmaIdx], 0x12345678));

    /* Fork a child process to mark pages as COW */
    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        /* Child process waits for a SIGTERM from the parent. It can't
         * make any write access to the stack because we want the
         * parent to make the first write access and get a new copy. A
         * busy loop is the safest way to do that, since any function
         * call (e.g. sleep) would write to the stack.
         */
        while (1)
            {}
        WARN() << "Shouldn't get here!" << std::endl;
        exit(0);
    }

    /* Parent process writes to COW page(s) and gets a new copy. MMU
     * notifier needs to update the GPU mapping(s) for the test to
     * pass.
     */
    globalData = 0xD00BED00;
    stackData[dstIdx] = 0xdeadbeef;
    stackData[sdmaIdx] = 0xdeadbeef;

    /* Terminate the child process before a possible test failure that
     * would leave it spinning in the background indefinitely.
     */
    int childStatus;
    EXPECT_EQ(0, kill(childPid, SIGTERM));
    EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
    EXPECT_NE(0, WIFSIGNALED(childStatus));
    EXPECT_EQ(SIGTERM, WTERMSIG(childStatus));

    /* Now check that the GPU is accessing the correct page */
    Dispatch verifyDispatch(isaBuffer);
    verifyDispatch.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
    verifyDispatch.Submit(pm4Queue);
    verifyDispatch.Sync(g_TestTimeOut);

    sdmaQueue.PlaceAndSubmitPacket(
        SDMAWriteDataPacket(sdmaQueue.GetFamilyId(), sdmaBuffer.As<HSAuint32 *>(), 0xD0BED0BE));
    sdmaQueue.Wait4PacketConsumption();

    EXPECT_SUCCESS(pm4Queue.Destroy());
    EXPECT_SUCCESS(sdmaQueue.Destroy());

    /* The dispatch copies globalData into stackData[dstIdx]; SDMA wrote stackData[sdmaIdx] */
    EXPECT_EQ(0xD00BED00, globalData);
    EXPECT_EQ(0xD00BED00, stackData[dstIdx]);
    EXPECT_EQ(0xD0BED0BE, stackData[sdmaIdx]);

    TEST_END
}
// Registers and maps the same host address several times:
//  - same address with different sizes must yield different GPU VAs;
//  - same address with the same size must yield the same GPU VA, and the
//    memory must stay GPU-accessible after one of the two registrations has
//    been unmapped and deregistered.
TEST_F(KFDMemoryTest, MemoryRegisterSamePtr) {
    if (!is_dgpu()) {
        LOG() << "Skipping test: Will run on APU once APU+dGPU supported." << std::endl;
        return;
    }

    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    HSAuint64 nGPU = gpuNodes.size();  // number of gpu nodes

    static volatile HSAuint32 mem[4];

    /* Initialized so that a failed (non-fatal) EXPECT below cannot lead to
     * indeterminate values being passed to the unmap/deregister calls.
     */
    HSAuint64 gpuva1 = 0, gpuva2 = 0;

    /* Same address, different size */
    EXPECT_SUCCESS(hsaKmtRegisterMemory((void *)&mem[0], sizeof(HSAuint32)*2));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPU((void *)&mem[0], sizeof(HSAuint32)*2,
                                        &gpuva1));
    EXPECT_SUCCESS(hsaKmtRegisterMemory((void *)&mem[0], sizeof(HSAuint32)));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPU((void *)&mem[0], sizeof(HSAuint32),
                                        &gpuva2));
    EXPECT_TRUE(gpuva1 != gpuva2);
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva1)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva1)));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva2)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva2)));

    /* Same address, same size */
    HsaMemMapFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;

    /* A vector instead of a variable-length array, which is not standard C++ */
    std::vector<HSAuint32> nodeList(gpuNodes.begin(), gpuNodes.end());
    ASSERT_FALSE(nodeList.empty()) << "no GPU nodes found";
    HSAuint32 *nodes = &nodeList[0];

    EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes((void *)&mem[2],
                                               sizeof(HSAuint32)*2, nGPU, nodes));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes((void *)&mem[2],
                                             sizeof(HSAuint32) * 2,
                                             &gpuva1, memFlags, nGPU, nodes));
    EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes((void *)&mem[2],
                                               sizeof(HSAuint32) * 2, nGPU, nodes));
    EXPECT_SUCCESS(hsaKmtMapMemoryToGPUNodes((void *)&mem[2],
                                             sizeof(HSAuint32) * 2,
                                             &gpuva2, memFlags, nGPU, nodes));
    EXPECT_EQ(gpuva1, gpuva2);

    /* Drop the first of the two identical registrations */
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva1)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva1)));

    /* Confirm that we still have access to the memory, mem[2] */
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    mem[2] = 0x0;
    queue.PlaceAndSubmitPacket(PM4WriteDataPacket(reinterpret_cast<unsigned int *>(gpuva2),
                                                  0xdeadbeef));
    queue.PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0));
    queue.Wait4PacketConsumption();
    EXPECT_EQ(true, WaitOnValue((unsigned int *)(&mem[2]), 0xdeadbeef));
    EXPECT_SUCCESS(queue.Destroy());

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(reinterpret_cast<void *>(gpuva2)));
    EXPECT_SUCCESS(hsaKmtDeregisterMemory(reinterpret_cast<void *>(gpuva2)));

    TEST_END
}
2018-08-14 09:52:31 -04:00
/* FlatScratchAccess
* Since HsaMemoryBuffer has to be associated with a specific GPU node, this function in the current form
* will not work for multiple GPU nodes. For now test only one default GPU node.
* TODO: Generalize it to support multiple nodes
*/
2018-07-23 14:45:44 -04:00
/* Size in bytes of one scratch slice (0x10000 = 64 KiB) */
#define SCRATCH_SLICE_SIZE 0x10000
/* Number of slices making up the scratch buffer */
#define SCRATCH_SLICE_NUM 3
/* Total scratch buffer size in bytes: all slices back to back */
#define SCRATCH_SIZE (SCRATCH_SLICE_NUM * SCRATCH_SLICE_SIZE)
/* Byte offset of slice i from the start of the scratch buffer */
#define SCRATCH_SLICE_OFFSET(i) ((i) * SCRATCH_SLICE_SIZE)
// Exercises scratch memory on the default GPU node:
//  1. maps/unmaps individual slices of a scratch buffer in different orders;
//  2. copies a dword from a source buffer to the scratch aperture and from
//     there to a destination buffer with two shader dispatches, then checks
//     the destination holds the source pattern.
TEST_F(KFDMemoryTest, FlatScratchAccess) {
    TEST_START(TESTPROFILE_RUNALL)

    if (m_FamilyId == FAMILY_CI || m_FamilyId == FAMILY_KV) {
        LOG() << "Skipping test: VI-based shader not supported on other ASICs." << std::endl;
        return;
    }

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    HsaMemoryBuffer scratchBuffer(SCRATCH_SIZE, defaultGPUNode, false/*zero*/, false/*local*/,
                                  false/*exec*/, true /*scratch*/);

    // Unmap scratch for sub-allocation mapping tests
    ASSERT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<void*>()));

    // Map and unmap a few slices in different order: 2-0-1, 0-2-1
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(2),
                                        SCRATCH_SLICE_SIZE, NULL));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(0),
                                        SCRATCH_SLICE_SIZE, NULL));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(1),
                                        SCRATCH_SLICE_SIZE, NULL));

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(1)));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(2)));
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(scratchBuffer.As<char*>() + SCRATCH_SLICE_OFFSET(0)));

    // Map everything for test below
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(scratchBuffer.As<char*>(), SCRATCH_SIZE, NULL));

    HsaMemoryBuffer srcMemBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer dstMemBuffer(PAGE_SIZE, defaultGPUNode);

    // Initialize the srcBuffer to some fixed value
    srcMemBuffer.Fill(0x01010101);

    // Pick the shader variant matching the ASIC family
    const char *pScratchCopyDword;
    if (m_FamilyId < FAMILY_AI)
        pScratchCopyDword = gfx8_ScratchCopyDword;
    else if (m_FamilyId < FAMILY_NV)
        pScratchCopyDword = gfx9_ScratchCopyDword;
    else
        pScratchCopyDword = gfx10_ScratchCopyDword;

    m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);

    const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);

    /* TODO: Add support to all GPU Nodes.
     * The loop over the system nodes is removed as the test can be executed only on GPU nodes. This
     * also requires changes to be made to all the HsaMemoryBuffer variables defined above, as
     * HsaMemoryBuffer is now associated with a Node.
     */
    if (pNodeProperties != NULL) {
        // Get the aperture of the scratch buffer.
        // Held in a vector so the array is released even when a fatal
        // ASSERT_* inside the loop returns early.
        std::vector<HsaMemoryProperties> memoryProperties(pNodeProperties->NumMemoryBanks);
        HsaMemoryProperties *pMemoryProperties =
            memoryProperties.empty() ? NULL : &memoryProperties[0];
        EXPECT_SUCCESS(hsaKmtGetNodeMemoryProperties(defaultGPUNode, pNodeProperties->NumMemoryBanks,
                                                     pMemoryProperties));

        for (size_t bank = 0; bank < memoryProperties.size(); bank++) {
            if (memoryProperties[bank].HeapType != HSA_HEAPTYPE_GPU_SCRATCH)
                continue;

            int numWaves = 4;  // WAVES must be >= # SE
            int waveSize = 1;  // Amount of space used by each wave in units of 256 dwords

            PM4Queue queue;
            ASSERT_SUCCESS(queue.Create(defaultGPUNode));

            HSAuint64 scratchApertureAddr = memoryProperties[bank].VirtualBaseAddress;

            // Dispatch 1: copy from the source memory buffer to the scratch aperture
            Dispatch dispatchSrcToScratch(isaBuffer);
            dispatchSrcToScratch.SetArgs(srcMemBuffer.As<void*>(), reinterpret_cast<void*>(scratchApertureAddr));
            dispatchSrcToScratch.SetDim(1, 1, 1);
            dispatchSrcToScratch.SetScratch(numWaves, waveSize, scratchBuffer.As<uint64_t>());
            dispatchSrcToScratch.Submit(queue);
            dispatchSrcToScratch.Sync();

            // Dispatch 2: copy from the scratch aperture to the destination buffer
            Dispatch dispatchScratchToDst(isaBuffer);
            dispatchScratchToDst.SetArgs(reinterpret_cast<void*>(scratchApertureAddr), dstMemBuffer.As<void*>());
            dispatchScratchToDst.SetDim(1, 1, 1);
            dispatchScratchToDst.SetScratch(numWaves, waveSize, scratchBuffer.As<uint64_t>());
            dispatchScratchToDst.Submit(queue);
            dispatchScratchToDst.Sync();

            // Check that the scratch buffer contents were correctly copied over to the system memory buffer
            EXPECT_EQ(dstMemBuffer.As<unsigned int*>()[0], 0x01010101);

            // Destroy explicitly, as the other tests in this file do
            EXPECT_SUCCESS(queue.Destroy());
        }
    }

    TEST_END
}
// Queries the tile configuration of the default GPU node and logs every
// field that hsaKmtGetTileConfig filled in.
TEST_F(KFDMemoryTest, GetTileConfigTest) {
    TEST_START(TESTPROFILE_RUNALL)

    const unsigned int maxTileConfigs = 32;
    const unsigned int maxMacroTileConfigs = 16;
    HSAuint32 tile_config[maxTileConfigs] = {0};
    HSAuint32 macro_tile_config[maxMacroTileConfigs] = {0};
    unsigned int i;

    // Hand the two arrays and their capacities to the query
    HsaGpuTileConfig config = {0};
    config.TileConfig = tile_config;
    config.MacroTileConfig = macro_tile_config;
    config.NumTileConfigs = maxTileConfigs;
    config.NumMacroTileConfigs = maxMacroTileConfigs;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    // Same validity check as the other tests in this file
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    ASSERT_SUCCESS(hsaKmtGetTileConfig(defaultGPUNode, &config));

    // Never index past the local arrays, whatever counts come back
    const unsigned int numTileConfigs =
        config.NumTileConfigs < maxTileConfigs ? config.NumTileConfigs : maxTileConfigs;
    const unsigned int numMacroTileConfigs =
        config.NumMacroTileConfigs < maxMacroTileConfigs ? config.NumMacroTileConfigs : maxMacroTileConfigs;

    LOG() << "tile_config:" << std::endl;
    for (i = 0; i < numTileConfigs; i++)
        LOG() << "\t" << std::dec << i << ": 0x" << std::hex
              << tile_config[i] << std::endl;

    LOG() << "macro_tile_config:" << std::endl;
    for (i = 0; i < numMacroTileConfigs; i++)
        LOG() << "\t" << std::dec << i << ": 0x" << std::hex
              << macro_tile_config[i] << std::endl;

    LOG() << "gb_addr_config: 0x" << std::hex << config.GbAddrConfig
          << std::endl;
    LOG() << "num_banks: 0x" << std::hex << config.NumBanks << std::endl;
    LOG() << "num_ranks: 0x" << std::hex << config.NumRanks << std::endl;

    TEST_END
}
void KFDMemoryTest::BigBufferSystemMemory(int defaultGPUNode, HSAuint64 granularityMB,
HSAuint64 *lastSize) {
HSAuint64 sysMemSizeMB;
HsaMemMapFlags mapFlags = {0};
HSAuint64 AlternateVAGPU;
int ret;
sysMemSizeMB = GetSysMemSize() >> 20;
LOG() << "Found System Memory of " << std::dec << sysMemSizeMB
<< "MB" << std::endl;
/* Testing big buffers in system memory */
unsigned int * pDb = NULL;
HSAuint64 lowMB = 0;
HSAuint64 highMB = (sysMemSizeMB + granularityMB - 1) & ~(granularityMB - 1);
HSAuint64 sizeMB;
HSAuint64 size = 0;
HSAuint64 lastTestedSize = 0;
while (highMB - lowMB > granularityMB) {
sizeMB = (lowMB + highMB) / 2;
size = sizeMB * 1024 * 1024;
ret = hsaKmtAllocMemory(0 /* system */, size, m_MemoryFlags,
2018-08-13 09:03:31 -04:00
reinterpret_cast<void**>(&pDb));
2018-07-23 14:45:44 -04:00
if (ret) {
highMB = sizeMB;
continue;
}
ret = hsaKmtMapMemoryToGPUNodes(pDb, size, &AlternateVAGPU,
2018-08-13 09:03:31 -04:00
mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode));
2018-07-23 14:45:44 -04:00
if (ret) {
EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));
2018-07-23 14:45:44 -04:00
highMB = sizeMB;
continue;
}
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));
2018-07-23 14:45:44 -04:00
lowMB = sizeMB;
lastTestedSize = sizeMB;
}
2018-08-14 09:52:31 -04:00
/* Save the biggest allocated system buffer for signal handling test */
2018-07-23 14:45:44 -04:00
LOG() << "The biggest allocated system buffer is " << std::dec
<< lastTestedSize << "MB" << std::endl;
if (lastSize)
*lastSize = lastTestedSize * 1024 *1024;
}
void KFDMemoryTest::BigBufferVRAM(int defaultGPUNode, HSAuint64 granularityMB,
HSAuint64 *lastSize) {
HSAuint64 AlternateVAGPU;
int ret;
HSAuint64 vramSizeMB;
HsaMemFlags memFlags;
HsaMemMapFlags mapFlags = {0};
vramSizeMB = GetVramSize(defaultGPUNode) >> 20;
LOG() << "Found VRAM of " << std::dec << vramSizeMB << "MB." << std::endl;
/* Testing big buffers in VRAM */
unsigned int * pDb = NULL;
HSAuint64 lowMB = 0;
HSAuint64 highMB = (vramSizeMB + granularityMB - 1) & ~(granularityMB - 1);
HSAuint64 sizeMB;
HSAuint64 size = 0;
HSAuint64 lastTestedSize = 0;
memset(&memFlags, 0, sizeof(memFlags));
memFlags.ui32.HostAccess = 0;
memFlags.ui32.NonPaged = 1;
while (highMB - lowMB > granularityMB) {
sizeMB = (lowMB + highMB) / 2;
size = sizeMB * 1024 * 1024;
ret = hsaKmtAllocMemory(defaultGPUNode, size, memFlags,
2018-08-13 09:03:31 -04:00
reinterpret_cast<void**>(&pDb));
2018-07-23 14:45:44 -04:00
if (ret) {
highMB = sizeMB;
continue;
}
ret = hsaKmtMapMemoryToGPUNodes(pDb, size, &AlternateVAGPU,
2018-08-13 09:03:31 -04:00
mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode));
2018-07-23 14:45:44 -04:00
if (ret) {
EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));
2018-07-23 14:45:44 -04:00
highMB = sizeMB;
continue;
}
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));
2018-07-23 14:45:44 -04:00
lowMB = sizeMB;
lastTestedSize = sizeMB;
}
LOG() << "The biggest allocated VRAM buffer is " << std::dec
<< lastTestedSize << "MB" << std::endl;
if (lastSize)
*lastSize = lastTestedSize * 1024 * 1024;
/* Make sure 3/4 vram can be allocated.*/
EXPECT_GE(lastTestedSize * 4, vramSizeMB * 3);
if (lastTestedSize * 16 < vramSizeMB * 15)
WARN() << "The biggest allocated VRAM buffer size is smaller than the expected "
<< vramSizeMB * 15 / 16 << "MB" << std::endl;
2018-07-23 14:45:44 -04:00
}
/* Bind the calling task to the NUMA nodes described by nodeStr.
 * Does nothing when numa_available() reports -1, when
 * numa_num_task_nodes() reports at most one node, or when nodeStr cannot be
 * parsed into a node mask.
 */
void KFDMemoryTest::NumaNodeBind(const char *nodeStr) {
    if (numa_available() == -1)
        return;

    const int nodeCount = numa_num_task_nodes();
    if (nodeCount <= 1)
        return;

    LOG() << "NUMA total nodes " << nodeCount << ", bind to " << nodeStr << std::endl;

    struct bitmask *mask = numa_parse_nodestring(nodeStr);
    if (!mask)
        return;

    numa_bind(mask);
    numa_free_nodemask(mask);
}
2018-07-23 14:45:44 -04:00
/* BigBufferStressTest allocs, maps/unmaps, and frees the biggest possible system
* buffers. Its size is found using binary search in the range (0, RAM SIZE) with
* a granularity of 128M. Repeat the similar logic on local buffers (VRAM).
* Finally, it allocs and maps 128M system buffers in a loop until it
* fails, then unmaps and frees them afterwards.
* Please note we limit the biggest possible system buffer to be smaller than
* the RAM size. The reason is that the system buffer can make use of virtual
* memory so that a system buffer could be very large even though the RAM size
* is small. For example, on a typical Carrizo platform, the biggest allocated
* system buffer could be more than 14G even though it only has 4G memory.
* In that situation, it will take too much time to finish the test, because of
2018-08-14 09:52:31 -04:00
* the onerous memory swap operation. So we limit the buffer size that way.
*/
2018-07-23 14:45:44 -04:00
TEST_F(KFDMemoryTest, BigBufferStressTest) {
    if (!is_dgpu()) {
        LOG() << "Skipping test: Running on APU fails and locks the system." << std::endl;
        return;
    }

    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    HSAuint64 AlternateVAGPU;
    HsaMemMapFlags mapFlags = {0};
    int status;
    HSAuint64 granularityMB = 128;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    /* On a machine with multiple NUMA nodes, stay off node 0: the dma32 zone
     * lives there. Using all memory including the dma32 zone on node 0 causes
     * TTM eviction to free the dma32 zone for other devices which support
     * 32bit physical addresses. The eviction and restore may retry if busy
     * and cause queue timeout and test failure.
     */
    NumaNodeBind("!0");

    BigBufferSystemMemory(defaultGPUNode, granularityMB, NULL);
    BigBufferVRAM(defaultGPUNode, granularityMB, NULL);

    /* Repeatedly allocate and map big buffers in system memory until it fails,
     * then unmap and free them.
     */
#define ARRAY_ENTRIES 2048

    int nBufs = 0, firstPassCount = 0;
    unsigned int* sysBufs[ARRAY_ENTRIES];
    HSAuint64 blockSizeMB = 128;
    HSAuint64 blockSize = blockSizeMB * 1024 * 1024;
    PM4Queue queue;

    /* Test 4 times to see if there is any memory leak.*/
    for (int pass = 1; pass <= 4; pass++) {
        ASSERT_SUCCESS(queue.Create(defaultGPUNode));

        /* Allocate and map until either step fails or the array is full */
        nBufs = 0;
        while (nBufs < ARRAY_ENTRIES) {
            status = hsaKmtAllocMemory(0 /* system */, blockSize, m_MemoryFlags,
                                       reinterpret_cast<void**>(&sysBufs[nBufs]));
            if (status)
                break;

            status = hsaKmtMapMemoryToGPUNodes(sysBufs[nBufs], blockSize,
                                               &AlternateVAGPU, mapFlags, 1,
                                               reinterpret_cast<HSAuint32 *>(&defaultGPUNode));
            if (status) {
                /* The buffer that could not be mapped is not counted */
                EXPECT_SUCCESS(hsaKmtFreeMemory(sysBufs[nBufs], blockSize));
                break;
            }

            nBufs++;
        }

        LOG() << "Allocated system buffers time " << std::dec << pass << ": " << nBufs << "x"
              << blockSizeMB << "MB" << std::endl;

        /* The first pass sets the baseline; later passes must not get fewer buffers */
        if (firstPassCount == 0)
            firstPassCount = nBufs;
        EXPECT_GE(nBufs, firstPassCount) << "There might be memory leak!" << std::endl;

        /* To see if GPU can access the memory correctly*/
        for (int idx = 0; idx < nBufs; idx++) {
            unsigned int *firstDword = sysBufs[idx];
            *firstDword = 0;
            queue.PlaceAndSubmitPacket(PM4WriteDataPacket(firstDword, 0xdeadbeaf));
            queue.Wait4PacketConsumption(NULL, 300000);
            EXPECT_TRUE(WaitOnValue(firstDword, 0xdeadbeaf));
        }

        EXPECT_SUCCESS(queue.Destroy());

        for (int idx = 0; idx < nBufs; idx++) {
            EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(sysBufs[idx]));
            EXPECT_SUCCESS(hsaKmtFreeMemory(sysBufs[idx], blockSize));
        }
    }

    /* Reset to run on all task nodes */
    NumaNodeBind("all");

    TEST_END
}
/* MMBench times the basic memory management calls (alloc, map to one GPU,
 * unmap, map to all GPUs, unmap, free) for several buffer sizes, for system
 * memory and VRAM, each with and without user mode SDMA traffic interleaved.
 * Every phase is timed in microseconds over all buffers and reported as
 * average nanoseconds per buffer.
 */
TEST_F(KFDMemoryTest, MMBench) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    unsigned testIndex, nMemTypes;
    const char *memTypeStrings[2] = {"SysMem", "VRAM"};
    const struct {
        unsigned size;
        unsigned num;
    } bufParams[] = {
        /* Buffer sizes grow by x16 or x32 per row. Limit memory usage to
         * about 1GB per row. For small sizes we use 1000 buffers, which
         * means we conveniently measure microseconds and report nanoseconds.
         */
        {PAGE_SIZE      , 1000},  /*  4KB */
        {PAGE_SIZE <<  4, 1000},  /* 64KB */
        {PAGE_SIZE <<  9,  500},  /*  2MB */
        {PAGE_SIZE << 13,   32},  /* 32MB */
        {PAGE_SIZE << 18,    1},  /*  1GB */
    };
    const unsigned nSizes = sizeof(bufParams) / sizeof(bufParams[0]);
    /* Four groups of nSizes tests: {SysMem, VRAM} x {noSDMA, SDMA} */
    const unsigned nTests = nSizes << 2;
#define TEST_BUFSIZE(index) (bufParams[(index) % nSizes].size)
#define TEST_NBUFS(index)   (bufParams[(index) % nSizes].num)
#define TEST_MEMTYPE(index) ((index / nSizes) & 0x1)
#define TEST_SDMA(index)    (((index / nSizes) >> 1) & 0x1)

    void *bufs[1000];
    HSAuint64 start;
    unsigned i;
    HsaMemFlags memFlags = {0};
    HsaMemMapFlags mapFlags = {0};
    HSAuint64 altVa;

    /* Validate the node id while it is still signed; comparing an unsigned
     * value against 0 with ASSERT_GE can never fail.
     */
    const int gpuNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(gpuNode, 0) << "failed to get default GPU Node";
    HSAuint32 defaultGPUNode = static_cast<HSAuint32>(gpuNode);

    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    bool is_all_large_bar = true;

    for (unsigned i = 0; i < gpuNodes.size(); i++) {
        if (!m_NodeInfo.IsGPUNodeLargeBar(gpuNodes.at(i))) {
            is_all_large_bar = false;
            break;
        }
    }

    LOG() << "Found VRAM of " << std::dec << vramSizeMB << "MB." << std::endl;

    if (vramSizeMB == 0)
        nMemTypes = 1;
    else
        nMemTypes = 2;

    /* Two SDMA queues to interleave user mode SDMA with memory
     * management on either SDMA engine. Make the queues long enough
     * to buffer at least nBufs x WriteData packets (7 dwords per
     * packet).
     */
    SDMAQueue sdmaQueue[2];
    ASSERT_SUCCESS(sdmaQueue[0].Create(defaultGPUNode, PAGE_SIZE*8));
    ASSERT_SUCCESS(sdmaQueue[1].Create(defaultGPUNode, PAGE_SIZE*8));
    HsaMemoryBuffer sdmaBuffer(PAGE_SIZE, 0); /* system memory */
#define INTERLEAVE_SDMA() do {                                          \
        if (interleaveSDMA) {                                           \
            sdmaQueue[0].PlaceAndSubmitPacket(                          \
                SDMAWriteDataPacket(sdmaQueue[0].GetFamilyId(), sdmaBuffer.As<HSAuint32 *>(), \
                                    0x12345678));                       \
            sdmaQueue[1].PlaceAndSubmitPacket(                          \
                SDMAWriteDataPacket(sdmaQueue[1].GetFamilyId(), sdmaBuffer.As<HSAuint32 *>()+16, \
                                    0x12345678));                       \
        }                                                               \
    } while (0)
#define IDLE_SDMA() do {                                                \
        if (interleaveSDMA) {                                           \
            sdmaQueue[0].Wait4PacketConsumption();                      \
            sdmaQueue[1].Wait4PacketConsumption();                      \
        }                                                               \
    } while (0)

    LOG() << "Test (avg. ns)\t    alloc   mapOne  umapOne   mapAll  umapAll     free" << std::endl;

    for (testIndex = 0; testIndex < nTests; testIndex++) {
        unsigned bufSize = TEST_BUFSIZE(testIndex);
        unsigned nBufs = TEST_NBUFS(testIndex);
        unsigned memType = TEST_MEMTYPE(testIndex);
        bool interleaveSDMA = TEST_SDMA(testIndex);
        HSAuint64 allocTime, map1Time, unmap1Time, freeTime;
        /* The map-all phases are skipped unless every GPU is large-BAR;
         * start at 0 so the report never reads an indeterminate value.
         */
        HSAuint64 mapAllTime = 0, unmapAllTime = 0;
        HSAuint32 allocNode;

        if ((testIndex % nSizes) == 0)
            LOG() << "--------------------------------------------------------------------------" << std::endl;

        if (memType >= nMemTypes)
            continue;  // skip unsupported mem types

        if (memType == 0) {
            allocNode = 0;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 1;
            memFlags.ui32.NonPaged = 0;
        } else {
            allocNode = defaultGPUNode;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 0;
            memFlags.ui32.NonPaged = 1;
        }

        /* Allocation */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            ASSERT_SUCCESS(hsaKmtAllocMemory(allocNode, bufSize, memFlags,
                                             &bufs[i]));
            INTERLEAVE_SDMA();
        }
        allocTime = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Map to one GPU */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(bufs[i], bufSize,
                                                     &altVa, mapFlags, 1,
                                                     &defaultGPUNode));
            INTERLEAVE_SDMA();
        }
        map1Time = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Unmap from GPU */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(bufs[i]));
            INTERLEAVE_SDMA();
        }
        unmap1Time = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Map to all GPUs */
        if (is_all_large_bar) {
            start = GetSystemTickCountInMicroSec();
            for (i = 0; i < nBufs; i++) {
                ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(bufs[i], bufSize, &altVa));
                INTERLEAVE_SDMA();
            }
            mapAllTime = GetSystemTickCountInMicroSec() - start;
            IDLE_SDMA();

            /* Unmap from all GPUs */
            start = GetSystemTickCountInMicroSec();
            for (i = 0; i < nBufs; i++) {
                EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(bufs[i]));
                INTERLEAVE_SDMA();
            }
            unmapAllTime = GetSystemTickCountInMicroSec() - start;
            IDLE_SDMA();
        }

        /* Free */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            EXPECT_SUCCESS(hsaKmtFreeMemory(bufs[i], bufSize));
            INTERLEAVE_SDMA();
        }
        freeTime = GetSystemTickCountInMicroSec() - start;
        IDLE_SDMA();

        /* Convert total microseconds into average nanoseconds per buffer */
        allocTime = allocTime * 1000 / nBufs;
        map1Time = map1Time * 1000 / nBufs;
        unmap1Time = unmap1Time * 1000 / nBufs;
        mapAllTime = mapAllTime * 1000 / nBufs;
        unmapAllTime = unmapAllTime * 1000 / nBufs;
        freeTime = freeTime * 1000 / nBufs;

        /* Pick the largest unit (K/M/G) that keeps the printed size >= 1 */
        unsigned bufSizeLog;
        char bufSizeUnit;
        if (bufSize < (1 << 20)) {
            bufSizeLog = bufSize >> 10;
            bufSizeUnit = 'K';
        } else if (bufSize < (1 << 30)) {
            bufSizeLog = bufSize >> 20;
            bufSizeUnit = 'M';
        } else {
            bufSizeLog = bufSize >> 30;
            bufSizeUnit = 'G';
        }

        LOG() << std::dec << std::setiosflags(std::ios::right)
              << std::setw(3) << bufSizeLog << bufSizeUnit << "-"
              << memTypeStrings[memType] << "-"
              << (interleaveSDMA ? "SDMA\t" : "noSDMA\t")
              << std::setw(9) << allocTime
              << std::setw(9) << map1Time
              << std::setw(9) << unmap1Time
              << std::setw(9) << mapAllTime
              << std::setw(9) << unmapAllTime
              << std::setw(9) << freeTime << std::endl;

#define MMBENCH_KEY_PREFIX memTypeStrings[memType] << "-" \
                           << (interleaveSDMA ? "SDMA" : "noSDMA") << "-" \
                           << (bufSize >> 10) << "K-"
        RECORD(allocTime) << MMBENCH_KEY_PREFIX << "alloc";
        RECORD(map1Time) << MMBENCH_KEY_PREFIX << "mapOne";
        RECORD(unmap1Time) << MMBENCH_KEY_PREFIX << "unmapOne";
        RECORD(mapAllTime) << MMBENCH_KEY_PREFIX << "mapAll";
        RECORD(unmapAllTime) << MMBENCH_KEY_PREFIX << "unmapAll";
        RECORD(freeTime) << MMBENCH_KEY_PREFIX << "free";
    }

    TEST_END
}
/* QueryPointerInfo checks the data returned by hsaKmtQueryPointerInfo for
 * host memory, local memory (when the node has VRAM), registered user
 * pointers, addresses inside a buffer, and user data attached to a buffer.
 */
TEST_F(KFDMemoryTest, QueryPointerInfo) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    unsigned int bufSize = PAGE_SIZE * 8;  // CZ and Tonga need 8 pages
    HsaPointerInfo ptrInfo;
    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    HSAuint64 nGPU = gpuNodes.size();  // number of gpu nodes

    /* GraphicHandle is tested at KFDGraphicsInterop.RegisterGraphicsHandle */

    /*** Memory allocated on CPU node ***/
    HsaMemoryBuffer hostBuffer(bufSize, 0/*node*/, false, false/*local*/);
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(hostBuffer.As<void*>(), &ptrInfo));
    EXPECT_EQ(ptrInfo.Type, HSA_POINTER_ALLOCATED);
    EXPECT_EQ(ptrInfo.Node, 0);
    EXPECT_EQ(ptrInfo.MemFlags.Value, hostBuffer.Flags().Value);
    EXPECT_EQ(ptrInfo.CPUAddress, hostBuffer.As<void*>());
    EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)hostBuffer.As<void*>());
    EXPECT_EQ(ptrInfo.SizeInBytes, (HSAuint64)hostBuffer.Size());
    if (is_dgpu()) {
        EXPECT_EQ((HSAuint64)ptrInfo.NMappedNodes, nGPU);
        // Check NMappedNodes again after unmapping the memory.
        // Both calls are checked so the assertion below never runs on stale data.
        EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(hostBuffer.As<void*>()));
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo(hostBuffer.As<void*>(), &ptrInfo));
    }
    EXPECT_EQ((HSAuint64)ptrInfo.NMappedNodes, 0);

    /* Skip testing local memory if the platform does not have it */
    if (GetVramSize(defaultGPUNode)) {
        HsaMemoryBuffer localBuffer(bufSize, defaultGPUNode, false, true);
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo(localBuffer.As<void*>(), &ptrInfo));
        EXPECT_EQ(ptrInfo.Type, HSA_POINTER_ALLOCATED);
        EXPECT_EQ(ptrInfo.Node, defaultGPUNode);
        EXPECT_EQ(ptrInfo.MemFlags.Value, localBuffer.Flags().Value);
        EXPECT_EQ(ptrInfo.CPUAddress, localBuffer.As<void*>());
        EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)localBuffer.As<void*>());
        EXPECT_EQ(ptrInfo.SizeInBytes, (HSAuint64)localBuffer.Size());

        /* An address inside the buffer must resolve to the buffer start */
        HSAuint32 *addr = localBuffer.As<HSAuint32 *>() + 4;
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo(reinterpret_cast<void *>(addr), &ptrInfo));
        EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)localBuffer.As<void*>());
    }

    /** Registered memory: user pointer */
    static volatile HSAuint32 mem[4];  // 8 bytes for register only and
                                       // 8 bytes for register to nodes
    HsaMemoryBuffer hsaBuffer((void *)(&mem[0]), sizeof(HSAuint32)*2);
    if (is_dgpu()) {  // APU doesn't use userptr
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo((void *)(&mem[0]), &ptrInfo));
        EXPECT_EQ(ptrInfo.Type, HSA_POINTER_REGISTERED_USER);
        EXPECT_EQ(ptrInfo.CPUAddress, &mem[0]);
        EXPECT_EQ(ptrInfo.GPUAddress, (HSAuint64)hsaBuffer.As<void*>());
        EXPECT_EQ(ptrInfo.SizeInBytes, sizeof(HSAuint32)*2);
        EXPECT_EQ(ptrInfo.NRegisteredNodes, 0);
        EXPECT_EQ(ptrInfo.NMappedNodes, nGPU);

        // Register to nodes. A std::vector replaces the variable-length
        // array, which is not standard C++.
        std::vector<HSAuint32> nodes(gpuNodes.begin(), gpuNodes.end());
        EXPECT_SUCCESS(hsaKmtRegisterMemoryToNodes((void *)(&mem[2]),
                                                   sizeof(HSAuint32)*2, nGPU, nodes.data()));
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo((void *)(&mem[2]), &ptrInfo));
        EXPECT_EQ(ptrInfo.NRegisteredNodes, nGPU);
        EXPECT_SUCCESS(hsaKmtDeregisterMemory((void *)(&mem[2])));
    }

    /* Not a starting address, but an address inside the memory range
     * should also get the memory information
     */
    HSAuint32 *address = hostBuffer.As<HSAuint32 *>() + 1;
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(reinterpret_cast<void *>(address), &ptrInfo));
    EXPECT_EQ(ptrInfo.Type, HSA_POINTER_ALLOCATED);
    EXPECT_EQ(ptrInfo.CPUAddress, hostBuffer.As<void*>());
    if (is_dgpu()) {
        EXPECT_SUCCESS(hsaKmtQueryPointerInfo((void *)(&mem[1]), &ptrInfo));
        EXPECT_EQ(ptrInfo.Type, HSA_POINTER_REGISTERED_USER);
        EXPECT_EQ(ptrInfo.CPUAddress, &mem[0]);
    }

    /*** Set user data ***/
    char userData[16] = "This is a test.";
    EXPECT_SUCCESS(hsaKmtSetMemoryUserData(hostBuffer.As<HSAuint32 *>(), reinterpret_cast<void *>(userData)));
    EXPECT_SUCCESS(hsaKmtQueryPointerInfo(hostBuffer.As<void*>(), &ptrInfo));
    EXPECT_EQ(ptrInfo.UserData, (void *)userData);

    TEST_END
}
/* Linux OS-specific test for a debugger accessing HSA memory in a
 * debugged process.
 *
 * Allocates a system memory buffer and a visible local memory buffer (if
 * possible). Forks a child process that PTRACE_ATTACHes to the parent
 * to access its memory like a debugger would. The child copies data in
 * the parent process using PTRACE_PEEKDATA and PTRACE_POKEDATA. After
 * the child terminates, the parent checks that the copy was
 * successful.
 */
TEST_F(KFDMemoryTest, PtraceAccess) {
    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HsaMemFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    memFlags.ui32.HostAccess = 1;

    /* mem[0]: system memory, mem[1]: visible VRAM (NULL when not tested) */
    void *mem[2] = {NULL, NULL};
    unsigned i;

    /* Offset in the VRAM buffer to test crossing non-contiguous
     * buffer boundaries. The second access starting from offset
     * sizeof(HSAint64)+1 will cross a node boundary in a single access,
     * for node sizes of 4MB or smaller.
     */
    const HSAuint64 VRAM_OFFSET = (4 << 20) - 2 * sizeof(HSAint64);
    /* One size for both allocating and freeing the VRAM buffer */
    const HSAuint64 VRAM_BUF_SIZE = PAGE_SIZE*2 + (4 << 20);

    // Alloc system memory from node 0 and initialize it
    memFlags.ui32.NonPaged = 0;
    ASSERT_SUCCESS(hsaKmtAllocMemory(0, PAGE_SIZE*2, memFlags, &mem[0]));
    for (i = 0; i < 4*sizeof(HSAint64) + 4; i++) {
        (reinterpret_cast<HSAuint8 *>(mem[0]))[i] = i;            // source
        (reinterpret_cast<HSAuint8 *>(mem[0]))[PAGE_SIZE+i] = 0;  // destination
    }

    // Try to alloc local memory from GPU node
    memFlags.ui32.NonPaged = 1;
    if (m_NodeInfo.IsGPUNodeLargeBar(defaultGPUNode)) {
        const HSAKMT_STATUS vramStatus = hsaKmtAllocMemory(defaultGPUNode, VRAM_BUF_SIZE,
                                                           memFlags, &mem[1]);
        EXPECT_SUCCESS(vramStatus);
        if (vramStatus == HSAKMT_STATUS_SUCCESS) {
            mem[1] = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem[1]) + VRAM_OFFSET);
            for (i = 0; i < 4*sizeof(HSAint64) + 4; i++) {
                (reinterpret_cast<HSAuint8 *>(mem[1]))[i] = i;
                (reinterpret_cast<HSAuint8 *>(mem[1]))[PAGE_SIZE+i] = 0;
            }
        } else {
            /* Failure is already reported; never touch an invalid pointer */
            mem[1] = NULL;
        }
    } else {
        LOG() << "Not testing local memory, it's invisible" << std::endl;
        mem[1] = NULL;
    }

    /* Allow any process to trace this one. If kernel is built without
     * Yama, this is not needed, and this call will fail.
     */
#ifdef PR_SET_PTRACER
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif

    // Find current pid so the child can trace it
    pid_t tracePid = getpid();

    // Fork the child
    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        int traceStatus;
        int err = 0, r;

        /* Child process: we catch any exceptions to make sure we detach
         * from the traced process, because terminating without detaching
         * leaves the traced process stopped.
         */
        r = ptrace(PTRACE_ATTACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_ATTACH failed: " << r << std::endl;
            exit(1);
        }
        try {
            do {
                waitpid(tracePid, &traceStatus, 0);
            } while (!WIFSTOPPED(traceStatus));

            for (i = 0; i < 4; i++) {
                // Test 4 different (mis-)alignments, leaving 1-byte gaps between longs
                HSAuint8 *addr = reinterpret_cast<HSAuint8 *>(reinterpret_cast<long *>(mem[0]) + i) + i;
                errno = 0;
                long data = ptrace(PTRACE_PEEKDATA, tracePid, addr, NULL);
                EXPECT_EQ(0, errno);
                EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, addr + PAGE_SIZE,
                                    reinterpret_cast<void *>(data)));

                if (mem[1] == NULL)
                    continue;

                addr = reinterpret_cast<HSAuint8 *>(reinterpret_cast<long *>(mem[1]) + i) + i;
                errno = 0;
                data = ptrace(PTRACE_PEEKDATA, tracePid, addr, NULL);
                EXPECT_EQ(0, errno);
                EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, addr + PAGE_SIZE,
                                    reinterpret_cast<void *>(data)));
            }
        } catch (...) {
            err = 1;
        }
        r = ptrace(PTRACE_DETACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_DETACH failed: " << r << std::endl;
            exit(1);
        }
        exit(err);
    } else {
        int childStatus;

        // Parent process, just wait for the child to finish
        EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
        EXPECT_NE(0, WIFEXITED(childStatus));
        EXPECT_EQ(0, WEXITSTATUS(childStatus));
    }

    // Clear gaps in the source that should not have been copied
    (reinterpret_cast<uint8_t*>(mem[0]))[  sizeof(long)    ] = 0;
    (reinterpret_cast<uint8_t*>(mem[0]))[2*sizeof(long) + 1] = 0;
    (reinterpret_cast<uint8_t*>(mem[0]))[3*sizeof(long) + 2] = 0;
    (reinterpret_cast<uint8_t*>(mem[0]))[4*sizeof(long) + 3] = 0;
    // Check results
    EXPECT_EQ(0, memcmp(mem[0], reinterpret_cast<HSAuint8 *>(mem[0]) + PAGE_SIZE,
                        sizeof(long)*4 + 4));
    // Free memory
    EXPECT_SUCCESS(hsaKmtFreeMemory(mem[0], PAGE_SIZE*2));

    if (mem[1]) {
        (reinterpret_cast<uint8_t*>(mem[1]))[  sizeof(HSAint64)    ] = 0;
        (reinterpret_cast<uint8_t*>(mem[1]))[2*sizeof(HSAint64) + 1] = 0;
        (reinterpret_cast<uint8_t*>(mem[1]))[3*sizeof(HSAint64) + 2] = 0;
        (reinterpret_cast<uint8_t*>(mem[1]))[4*sizeof(HSAint64) + 3] = 0;
        EXPECT_EQ(0, memcmp(mem[1], reinterpret_cast<HSAuint8 *>(mem[1]) + PAGE_SIZE,
                            sizeof(HSAint64)*4 + 4));
        // Undo the offset and free with the size the buffer was allocated with
        mem[1] = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem[1]) - VRAM_OFFSET);
        EXPECT_SUCCESS(hsaKmtFreeMemory(mem[1], VRAM_BUF_SIZE));
    }

    TEST_END
}
/* PtraceAccessInvisibleVram checks that a tracer can peek and poke VRAM that
 * is not host accessible. The GPU writes one known 64-bit word on each side
 * of the 4MB boundary of the buffer, a forked child attaches with ptrace,
 * verifies both words and swaps them, and the parent reads the result back
 * with a shader.
 */
TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) {
    char *hsaDebug = getenv("HSA_DEBUG");

    if (!is_dgpu()) {
        LOG() << "Skipping test: There is no VRAM on APU." << std::endl;
        return;
    }

    if (!hsaDebug || !strcmp(hsaDebug, "0")) {
        LOG() << "Skipping test: HSA_DEBUG environment variable not set." << std::endl;
        return;
    }

    TEST_START(TESTPROFILE_RUNALL)

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HsaMemMapFlags mapFlags = {0};
    HsaMemFlags memFlags = {0};
    memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    /* Allocate host not accessible vram */
    memFlags.ui32.HostAccess = 0;
    memFlags.ui32.NonPaged = 1;

    void *mem, *mem0, *mem1;
    unsigned size = PAGE_SIZE*2 + (4 << 20);
    HSAuint64 data[2] = {0xdeadbeefdeadbeef, 0xcafebabecafebabe};
    unsigned int data0[2] = {0xdeadbeef, 0xdeadbeef};
    unsigned int data1[2] = {0xcafebabe, 0xcafebabe};

    const HSAuint64 VRAM_OFFSET = (4 << 20) - sizeof(HSAuint64);

    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, size, memFlags, &mem));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(mem, size, NULL,
                   mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)));

    /* Set the word before 4M boundary to 0xdeadbeefdeadbeef
     * and the word after 4M boundary to 0xcafebabecafebabe
     */
    mem0 = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem) + VRAM_OFFSET);
    mem1 = reinterpret_cast<void *>(reinterpret_cast<HSAuint8 *>(mem) + VRAM_OFFSET + sizeof(HSAuint64));

    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem0,
                                                  data0[0], data0[1]));
    queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int *)mem1,
                                                  data1[0], data1[1]));
    queue.PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, true, 0, 0));
    queue.Wait4PacketConsumption();

    /* Allow any process to trace this one. If kernel is built without
     * Yama, this is not needed, and this call will fail.
     */
#ifdef PR_SET_PTRACER
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif

    // Find out my pid so the child can trace it
    pid_t tracePid = getpid();

    // Fork the child
    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        int traceStatus;
        int err = 0, r;

        /* Child process: we catch any exceptions to make sure we detach
         * from the traced process, because terminating without detaching
         * leaves the traced process stopped.
         */
        r = ptrace(PTRACE_ATTACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_ATTACH failed: " << r << std::endl;
            exit(1);
        }
        try {
            do {
                waitpid(tracePid, &traceStatus, 0);
            } while (!WIFSTOPPED(traceStatus));

            /* Peek the memory. A peek returns the word itself, so errno is
             * the only error indication and must be cleared before each call.
             */
            errno = 0;
            HSAint64 peeked0 = ptrace(PTRACE_PEEKDATA, tracePid, mem0, NULL);
            EXPECT_EQ(0, errno);
            EXPECT_EQ(data[0], static_cast<HSAuint64>(peeked0));

            errno = 0;
            HSAint64 peeked1 = ptrace(PTRACE_PEEKDATA, tracePid, mem1, NULL);
            EXPECT_EQ(0, errno);
            EXPECT_EQ(data[1], static_cast<HSAuint64>(peeked1));

            /* Swap mem0 and mem1 by poking */
            errno = 0;
            EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, mem0, reinterpret_cast<void *>(data[1])));
            EXPECT_EQ(0, errno);

            errno = 0;
            EXPECT_EQ(0, ptrace(PTRACE_POKEDATA, tracePid, mem1, reinterpret_cast<void *>(data[0])));
            EXPECT_EQ(0, errno);
        } catch (...) {
            err = 1;
        }
        r = ptrace(PTRACE_DETACH, tracePid, NULL, NULL);
        if (r) {
            WARN() << "PTRACE_DETACH failed: " << r << std::endl;
            exit(1);
        }
        exit(err);
    } else {
        int childStatus;

        // Parent process, just wait for the child to finish
        EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
        EXPECT_NE(0, WIFEXITED(childStatus));
        EXPECT_EQ(0, WEXITSTATUS(childStatus));
    }

    /* Use shader to read back data to check poke results */
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    // dstBuffer is cpu accessible gtt memory
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);

    /* Pick the shader source matching the ASIC family */
    const char *pScratchCopyDword;
    if (m_FamilyId < FAMILY_AI)
        pScratchCopyDword = gfx8_ScratchCopyDword;
    else if (m_FamilyId < FAMILY_NV)
        pScratchCopyDword = gfx9_ScratchCopyDword;
    else
        pScratchCopyDword = gfx10_ScratchCopyDword;

    m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);

    /* After the swap, mem0 must hold the 0xcafebabe pattern ... */
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(mem0, dstBuffer.As<void*>());
    dispatch0.Submit(queue);
    dispatch0.Sync();
    EXPECT_EQ(data1[0], dstBuffer.As<unsigned int*>()[0]);

    /* ... and mem1 the 0xdeadbeef pattern */
    Dispatch dispatch1(isaBuffer);
    dispatch1.SetArgs(mem1, dstBuffer.As<int*>());
    dispatch1.Submit(queue);
    dispatch1.Sync();
    WaitOnValue(dstBuffer.As<uint32_t *>(), data0[0]);
    EXPECT_EQ(data0[0], dstBuffer.As<unsigned int*>()[0]);

    // Clean up
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(mem));
    EXPECT_SUCCESS(hsaKmtFreeMemory(mem, size));
    EXPECT_SUCCESS(queue.Destroy());

    TEST_END
}
/* Signal handler that only logs the number of the signal it received.
 * NOTE(review): LOG() is presumably stream based and therefore not
 * async-signal-safe - confirm this is acceptable for the test.
 */
void CatchSignal(int IntrSignal) {
    LOG() << "Interrupt Signal " << std::dec << IntrSignal << " Received" << std::endl;
}
/* SignalHandling maps a quarter of system RAM to the GPU while a forked child
 * sends SIGUSR1 to this process, then checks with an SDMA write that the
 * mapped buffer is usable.
 */
TEST_F(KFDMemoryTest, SignalHandling) {
    TEST_START(TESTPROFILE_RUNALL)

    if (!is_dgpu()) {
        LOG() << "Skipping test: Test not supported on APU." << std::endl;
        return;
    }

    unsigned int *nullPtr = NULL;
    unsigned int* pDb = NULL;
    struct sigaction sa;
    SDMAQueue queue;
    HSAuint64 size, sysMemSize;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    sa.sa_handler = CatchSignal;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;

    pid_t ParentPid = getpid();
    EXPECT_EQ(0, sigaction(SIGUSR1, &sa, NULL)) << "An error occurred while setting a signal handler";

    sysMemSize = GetSysMemSize();

    /* System (kernel) memory are limited to 3/8th System RAM
     * Try to allocate 1/4th System RAM
     */
    size = (sysMemSize >> 2) & ~(HSAuint64)(PAGE_SIZE - 1);
    ASSERT_SUCCESS(hsaKmtAllocMemory(0 /* system */, size, m_MemoryFlags, reinterpret_cast<void**>(&pDb)));
    // Verify that pDb is not null before it's being used. This has to be
    // fatal because pDb is dereferenced below.
    ASSERT_NE(nullPtr, pDb) << "hsaKmtAllocMemory returned a null pointer";

    pid_t childPid = fork();
    ASSERT_GE(childPid, 0);
    if (childPid == 0) {
        /* Child: signal the parent and report a failed kill() through the
         * exit status, which is the only thing the parent checks.
         */
        const int killRet = kill(ParentPid, SIGUSR1);
        EXPECT_EQ(0, killRet);
        exit(killRet == 0 ? 0 : 1);
    } else {
        LOG() << "Start Memory Mapping..." << std::endl;
        ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(pDb, size, NULL));
        LOG() << "Mapping finished" << std::endl;

        int childStatus;

        // Parent process, just wait for the child to finish
        EXPECT_EQ(childPid, waitpid(childPid, &childStatus, 0));
        EXPECT_NE(0, WIFEXITED(childStatus));
        EXPECT_EQ(0, WEXITSTATUS(childStatus));
    }

    /* Check that the GPU can write the mapped buffer */
    pDb[0] = 0x02020202;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.PlaceAndSubmitPacket(SDMAWriteDataPacket(queue.GetFamilyId(), pDb, 0x01010101) );
    queue.Wait4PacketConsumption();
    EXPECT_TRUE(WaitOnValue(pDb, 0x01010101));
    EXPECT_SUCCESS(queue.Destroy());

    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(pDb));
    // Release the buffers
    EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, size));

    TEST_END
}
/* CheckZeroInitializationSysMem repeatedly allocates a system buffer of a
 * quarter of system RAM and checks that the first word, one word per 4KB
 * and the last word read as zero. Every checked word is overwritten with a
 * non-zero value before the buffer is freed.
 */
TEST_F(KFDMemoryTest, CheckZeroInitializationSysMem) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    int ret;

    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    HSAuint64 sysMemSizeMB = GetSysMemSize() >> 20;

    /* Testing system memory */
    HSAuint64 * pDb = NULL;

    HSAuint64 sysBufSizeMB = sysMemSizeMB >> 2;
    HSAuint64 sysBufSize = sysBufSizeMB * 1024 * 1024;

    int count = 5;

    LOG() << "Using " << std::dec << sysBufSizeMB
            << "MB system buffer to test " << std::dec << count
            << " times" << std::endl;

    unsigned int offset = 257;  // a constant offset, should be smaller than 512.
    /* Number of 64-bit words in the buffer. Kept 64-bit wide: a 32-bit
     * count wraps once the buffer reaches 32GB.
     */
    HSAuint64 size = sysBufSize / sizeof(*pDb);

    while (count--) {
        ret = hsaKmtAllocMemory(0 /* system */, sysBufSize, m_MemoryFlags,
                                reinterpret_cast<void**>(&pDb));
        if (ret) {
            LOG() << "Failed to allocate system buffer of " << std::dec << sysBufSizeMB
                    << "MB" << std::endl;
            return;
        }

        /* Check the first 64 bits */
        EXPECT_EQ(0, pDb[0]);
        pDb[0] = 1;

        /* Check one word at a fixed offset for every 4KB step */
        for (HSAuint64 i = offset; i < size;) {
            EXPECT_EQ(0, pDb[i]);
            pDb[i] = i + 1;  // set it to non zero

            i += 4096 / sizeof(*pDb);
        }

        /* check the last 64 bit */
        EXPECT_EQ(0, pDb[size-1]);
        pDb[size-1] = size;

        EXPECT_SUCCESS(hsaKmtFreeMemory(pDb, sysBufSize));
    }

    TEST_END
}
/* Touch a buffer in 1024-byte chunks, walking from its end towards its start.
 *
 * sd:   start of the buffer
 * size: buffer size in bytes; a leading remainder smaller than one chunk is
 *       not touched
 * rw:   0 copies each chunk of the buffer into a static scratch chunk (read),
 *       any other value copies the scratch chunk into the buffer (write)
 */
static inline void access(volatile void *sd, int size, int rw) {
    struct DUMMY {
        char dummy[1024];
    };
    /* Most likely sitting in cache*/
    static DUMMY scratch;

    const int chunkBytes = static_cast<int>(sizeof(scratch));
    char *const base = const_cast<char *>(static_cast<volatile char *>(sd));

    for (int pos = size - chunkBytes; pos >= 0; pos -= chunkBytes) {
        DUMMY *chunk = reinterpret_cast<DUMMY *>(base + pos);

        if (rw != 0)
            *chunk = scratch;
        else
            scratch = *chunk;
    }
}
/*
 * On a large-BAR system, test the access speed of visible VRAM.
 * KFD is not allowed to allocate visible VRAM on a non-large-BAR system.
 */
TEST_F(KFDMemoryTest, MMBandWidth) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    const unsigned nBufs = 1000; /* measure us, report ns */
    unsigned testIndex;
    const unsigned nMemTypes = 2;
    const char *memTypeStrings[nMemTypes] = {"SysMem", "VRAM"};
    const unsigned nSizes = 4;
    const unsigned bufSizes[nSizes] = {PAGE_SIZE, PAGE_SIZE*4, PAGE_SIZE*16, PAGE_SIZE*64};
    const unsigned nTests = nSizes * nMemTypes;
    /* Scratch buffer as large as the biggest test buffer */
    const unsigned tmpBufferSize = PAGE_SIZE*64;
#define _TEST_BUFSIZE(index) (bufSizes[index % nSizes])
#define _TEST_MEMTYPE(index) ((index / nSizes) % nMemTypes)

    void *bufs[nBufs];
    HSAuint64 start;
    unsigned i;
    HsaMemFlags memFlags = {0};

    /* Validate the node id while it is still signed; comparing an unsigned
     * value against 0 with ASSERT_GE can never fail.
     */
    const int gpuNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(gpuNode, 0) << "failed to get default GPU Node";
    HSAuint32 defaultGPUNode = static_cast<HSAuint32>(gpuNode);

    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;

    LOG() << "Found VRAM of " << std::dec << vramSizeMB << "MB." << std::endl;

    if (!m_NodeInfo.IsGPUNodeLargeBar(defaultGPUNode) || !vramSizeMB) {
        LOG() << "Skipping test: Test requires a large bar GPU." << std::endl;
        return;
    }

    void *tmp = mmap(0,
            tmpBufferSize,
            PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_PRIVATE,
            -1,
            0);
    /* Fatal on failure: tmp is written right below and used by every copy */
    ASSERT_NE(tmp, MAP_FAILED);
    memset(tmp, 0, tmpBufferSize);

    LOG() << "Test (avg. ns)\t  memcpyRTime memcpyWTime accessRTime accessWTime" << std::endl;
    for (testIndex = 0; testIndex < nTests; testIndex++) {
        unsigned bufSize = _TEST_BUFSIZE(testIndex);
        unsigned memType = _TEST_MEMTYPE(testIndex);
        HSAuint64 mcpRTime, mcpWTime, accessRTime, accessWTime;
        HSAuint32 allocNode;

        /* Separator line at the start of each memory type */
        if ((testIndex % nSizes) == 0)
            LOG() << "----------------------------------------------------------------------" << std::endl;

        if (memType == 0) {
            allocNode = 0;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 1;
            memFlags.ui32.NonPaged = 0;
        } else {
            /* Alloc visible vram*/
            allocNode = defaultGPUNode;
            memFlags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
            memFlags.ui32.HostAccess = 1;
            memFlags.ui32.NonPaged = 1;
        }

        for (i = 0; i < nBufs; i++)
            ASSERT_SUCCESS(hsaKmtAllocMemory(allocNode, bufSize, memFlags,
                        &bufs[i]));

        /* memcpy into the test buffers */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            memcpy(bufs[i], tmp, bufSize);
        }
        mcpWTime = GetSystemTickCountInMicroSec() - start;

        /* chunked writes into the test buffers */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            access(bufs[i], bufSize, 1);
        }
        accessWTime = GetSystemTickCountInMicroSec() - start;

        /* memcpy out of the test buffers */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            memcpy(tmp, bufs[i], bufSize);
        }
        mcpRTime = GetSystemTickCountInMicroSec() - start;

        /* chunked reads from the test buffers */
        start = GetSystemTickCountInMicroSec();
        for (i = 0; i < nBufs; i++) {
            access(bufs[i], bufSize, 0);
        }
        accessRTime = GetSystemTickCountInMicroSec() - start;

        for (i = 0; i < nBufs; i++)
            EXPECT_SUCCESS(hsaKmtFreeMemory(bufs[i], bufSize));

        LOG() << std::dec
            << std::right << std::setw(3) << (bufSize >> 10) << "K-"
            << std::left << std::setw(14) << memTypeStrings[memType]
            << std::right
            << std::setw(12) << mcpRTime
            << std::setw(12) << mcpWTime
            << std::setw(12) << accessRTime
            << std::setw(12) << accessWTime
            << std::endl;

#define MMBANDWIDTH_KEY_PREFIX memTypeStrings[memType] << "-" \
                               << (bufSize >> 10) << "K" << "-"
        RECORD(mcpRTime) << MMBANDWIDTH_KEY_PREFIX << "mcpRTime";
        RECORD(mcpWTime) << MMBANDWIDTH_KEY_PREFIX << "mcpWTime";
        RECORD(accessRTime) << MMBANDWIDTH_KEY_PREFIX << "accessRTime";
        RECORD(accessWTime) << MMBANDWIDTH_KEY_PREFIX << "accessWTime";
    }

    munmap(tmp, tmpBufferSize);

    TEST_END
}
2019-04-30 15:32:01 -05:00
/* Test HDP flush from the CPU.
 * The CPU writes to coherent vram, flushes HDP through the remapped MMIO
 * register and then releases a shader that consumes the written data.
 * Asics before gfx9 don't support user space HDP flush, so this only runs
 * on vega10 and later.
 * This should only run on a large bar system.
 */
TEST_F(KFDMemoryTest, HostHdpFlush) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    HsaMemFlags memoryFlags = m_MemoryFlags;
    /* buffer[0]: signal; buffer[1]: Input to shader; buffer[2]: Output to
     * shader
     */
    unsigned int *buffer = NULL;

    /* Keep the node id signed so the check below can actually fail: with an
     * unsigned variable "defaultGPUNode >= 0" is always true.
     */
    const HSAint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";

    const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
    HSAuint32 *mmioBase = NULL;
    unsigned int *nullPtr = NULL;

    if (!pNodeProperties) {
        LOG() << "Failed to get gpu node properties." << std::endl;
        return;
    }

    if (m_FamilyId < FAMILY_AI) {
        LOG() << "Skipping test: Test requires gfx9 and later asics." << std::endl;
        return;
    }

    HSAuint64 vramSizeMB = GetVramSize(defaultGPUNode) >> 20;
    if (!m_NodeInfo.IsGPUNodeLargeBar(defaultGPUNode) || !vramSizeMB) {
        LOG() << "Skipping test: Test requires a large bar GPU." << std::endl;
        return;
    }

    /* Owned by a vector so that it is released on every exit path, including
     * the early returns taken by the fatal ASSERT_* macros below. The entries
     * are value-initialized, so a failed query never leaves indeterminate
     * data to be scanned.
     */
    std::vector<HsaMemoryProperties> memoryProperties(pNodeProperties->NumMemoryBanks);
    EXPECT_SUCCESS(hsaKmtGetNodeMemoryProperties(defaultGPUNode, pNodeProperties->NumMemoryBanks,
                                                 memoryProperties.data()));

    /* Locate the MMIO remap heap; its base address is used to reach the
     * HDP flush register from user space.
     */
    for (unsigned int bank = 0; bank < memoryProperties.size(); bank++) {
        if (memoryProperties[bank].HeapType == HSA_HEAPTYPE_MMIO_REMAP) {
            mmioBase = reinterpret_cast<HSAuint32 *>(memoryProperties[bank].VirtualBaseAddress);
            break;
        }
    }
    ASSERT_NE(mmioBase, nullPtr) << "mmio base is NULL";

    /* Non-paged, non-coarse-grained allocation on the GPU node */
    memoryFlags.ui32.NonPaged = 1;
    memoryFlags.ui32.CoarseGrain = 0;
    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, memoryFlags,
                                     reinterpret_cast<void**>(&buffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));

    /* Signal is dead from the beginning*/
    buffer[0] = 0xdead;
    buffer[1] = 0xfeeb;
    buffer[2] = 0xfeed;

    /* Submit a shader to poll the signal*/
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9_CopyOnSignal, "CopyOnSignal", isaBuffer);

    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(buffer, NULL);
    dispatch0.Submit(queue);

    /* Order matters here: write the data, flush HDP, then raise the signal */
    buffer[1] = 0xbeef;
    /* Flush HDP */
    mmioBase[KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL/4] = 0x1;
    buffer[0] = 0xcafe;

    /* Check test result*/
    dispatch0.Sync();
    /* Flush HDP once more before the CPU reads the shader's output back */
    mmioBase[KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL/4] = 0x1;
    EXPECT_EQ(0xbeef, buffer[2]);

    // Clean up
    EXPECT_SUCCESS(queue.Destroy());
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));

    TEST_END
}
2019-05-30 16:09:06 -05:00
/* Test HDP flush from device.
 * Use a shader on device 1 to write vram of device 0
 * and flush HDP of device 0. A shader on device 0 then
 * reads that vram and writes it back to vram so the
 * result can be checked from the CPU.
 * Asics before gfx9 don't support device HDP flush
 * so only run on vega10 and after.
 * This should only run on a system with at least one
 * large bar node (which is used as device 0).
 */
TEST_F(KFDMemoryTest, DeviceHdpFlush) {
    TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
    TEST_START(TESTPROFILE_RUNALL);

    HsaMemFlags memoryFlags = m_MemoryFlags;
    /* buffer is physically on device 0.
     * buffer[0]: Use as signaling b/t devices;
     * buffer[1]: Device 1 write to buffer[1] and device 0 read it
     * buffer[2]: Device 0 copy buffer[1] to buffer[2] for CPU to check
     */
    unsigned int *buffer = NULL;
    const HsaNodeProperties *pNodeProperties;
    HSAuint32 *mmioBase = NULL;
    unsigned int *nullPtr = NULL;
    /* nodes[0] is device 0 (owns the buffer), nodes[1] is device 1 */
    std::vector<HSAuint32> nodes;

    const std::vector<int> gpuNodes = m_NodeInfo.GetNodesWithGPU();
    if (gpuNodes.size() < 2) {
        LOG() << "Skipping test: At least two GPUs are required." << std::endl;
        return;
    }

    /* Users can use "--node=gpu1 --dst_node=gpu2" to specify devices */
    if (g_TestDstNodeId != -1 && g_TestNodeId != -1) {
        nodes.push_back(g_TestNodeId);
        nodes.push_back(g_TestDstNodeId);

        if (!m_NodeInfo.IsGPUNodeLargeBar(nodes[0])) {
            LOG() << "Skipping test: first GPU specified is not a large bar GPU." << std::endl;
            return;
        }
        if (nodes[0] == nodes[1]) {
            LOG() << "Skipping test: Different GPUs must be specified (2 GPUs required)." << std::endl;
            return;
        }
    } else {
        HSAint32 defaultGPU = m_NodeInfo.HsaDefaultGPUNode();
        /* Same sanity check as HostHdpFlush: never use a negative node id */
        ASSERT_GE(defaultGPU, 0) << "failed to get default GPU Node";
        if (!m_NodeInfo.IsGPUNodeLargeBar(defaultGPU)) {
            LOG() << "Skipping test: Default GPUs must be large bar." << std::endl;
            return;
        }
        nodes.push_back(defaultGPU);
        for (unsigned i = 0; i < gpuNodes.size(); i++)
            if (gpuNodes.at(i) != defaultGPU)
                nodes.push_back(gpuNodes.at(i));
        if (nodes.size() < 2) {
            LOG() << "Skipping test: At least 2 GPUs required." << std::endl;
            return;
        }
    }

    pNodeProperties = m_NodeInfo.GetNodeProperties(nodes[0]);
    if (!pNodeProperties) {
        LOG() << "Failed to get gpu node properties." << std::endl;
        return;
    }

    /* NOTE(review): m_FamilyId presumably describes the default GPU; confirm
     * it is also the right check when the nodes are user-specified.
     */
    if (m_FamilyId < FAMILY_AI) {
        LOG() << "Skipping test: Test requires gfx9 and later asics." << std::endl;
        return;
    }

    /* Owned by a vector so that it is released on every exit path, including
     * the early returns taken by the fatal ASSERT_* macros below. The entries
     * are value-initialized, so a failed query never leaves indeterminate
     * data to be scanned.
     */
    std::vector<HsaMemoryProperties> memoryProperties(pNodeProperties->NumMemoryBanks);
    EXPECT_SUCCESS(hsaKmtGetNodeMemoryProperties(nodes[0], pNodeProperties->NumMemoryBanks,
                                                 memoryProperties.data()));

    /* Locate the MMIO remap heap of device 0; its base address is handed to
     * the device 1 shader below.
     */
    for (unsigned int bank = 0; bank < memoryProperties.size(); bank++) {
        if (memoryProperties[bank].HeapType == HSA_HEAPTYPE_MMIO_REMAP) {
            mmioBase = reinterpret_cast<HSAuint32 *>(memoryProperties[bank].VirtualBaseAddress);
            break;
        }
    }
    ASSERT_NE(mmioBase, nullPtr) << "mmio base is NULL";

    /* Non-paged, non-coarse-grained allocation on device 0 */
    memoryFlags.ui32.NonPaged = 1;
    memoryFlags.ui32.CoarseGrain = 0;
    ASSERT_SUCCESS(hsaKmtAllocMemory(nodes[0], PAGE_SIZE, memoryFlags,
                                     reinterpret_cast<void**>(&buffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(buffer, PAGE_SIZE, NULL));

    /* Signal is dead from the beginning*/
    buffer[0] = 0xdead;
    buffer[1] = 0xfeeb;
    buffer[2] = 0xfeeb;

    /* Submit shaders*/
    /* Device 0: CopyOnSignal shader */
    PM4Queue copyQueue;
    ASSERT_SUCCESS(copyQueue.Create(nodes[0]));
    HsaMemoryBuffer copyIsaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9_CopyOnSignal, "CopyOnSignal", copyIsaBuffer);

    Dispatch copyDispatch(copyIsaBuffer);
    copyDispatch.SetArgs(buffer, NULL);
    copyDispatch.Submit(copyQueue);

    /* Device 1: WriteAndSignal shader, given the buffer and the MMIO base */
    PM4Queue writeQueue;
    ASSERT_SUCCESS(writeQueue.Create(nodes[1]));
    HsaMemoryBuffer writeIsaBuffer(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/);
    m_pIsaGen->CompileShader(gfx9_WriteAndSignal, "WriteAndSignal", writeIsaBuffer);

    Dispatch writeDispatch(writeIsaBuffer);
    writeDispatch.SetArgs(buffer, mmioBase);
    writeDispatch.Submit(writeQueue);

    /* Check test result*/
    writeDispatch.Sync();
    copyDispatch.Sync();
    EXPECT_EQ(0xbeef, buffer[2]);

    // Clean up
    EXPECT_SUCCESS(copyQueue.Destroy());
    EXPECT_SUCCESS(writeQueue.Destroy());
    EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(buffer));
    EXPECT_SUCCESS(hsaKmtFreeMemory(buffer, PAGE_SIZE));

    TEST_END
}