86a68b2774
Fixes assembler error. The SP3 backend if already set to FamilyId. Change-Id: I7721a555b05688b16993a03242a765694594825a Signed-off-by: Philip Yang <Philip.Yang@amd.com>
452 строки
16 KiB
C++
452 строки
16 KiB
C++
/*
|
|
* Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
*/
|
|
|
|
#include "KFDSVMEvictTest.hpp"
|
|
#include <sys/mman.h>
|
|
#include <vector>
|
|
#include <string>
|
|
#include "PM4Queue.hpp"
|
|
#include "PM4Packet.hpp"
|
|
#include "SDMAPacket.hpp"
|
|
#include "SDMAQueue.hpp"
|
|
#include "Dispatch.hpp"
|
|
|
|
#define N_PROCESSES (4) /* number of processes running in parallel, at least 2 */
|
|
#define ALLOCATE_BUF_SIZE_MB (64)
|
|
#define ALLOCATE_RETRY_TIMES (3)
|
|
|
|
HSAint32 KFDSVMEvictTest::GetBufferCounter(HSAuint64 vramSize, HSAuint64 vramBufSize) {
|
|
HSAuint64 vramBufSizeInPages = vramBufSize >> PAGE_SHIFT;
|
|
HSAuint64 sysMemSize = GetSysMemSize();
|
|
HSAuint64 size, sizeInPages;
|
|
HSAuint32 count;
|
|
|
|
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
|
|
|
|
/* use one third of total system memory for eviction buffer to test
|
|
* limit max allocate size to duoble of vramSize
|
|
* count is zero if not enough memory (sysMemSize/3 + vramSize) < (vramBufSize * N_PROCESSES)
|
|
*/
|
|
size = sysMemSize / 3 + vramSize;
|
|
size = size > vramSize << 1 ? vramSize << 1 : size;
|
|
sizeInPages = size >> PAGE_SHIFT;
|
|
count = sizeInPages / (vramBufSizeInPages * N_PROCESSES);
|
|
|
|
return count;
|
|
}
|
|
|
|
void KFDSVMEvictTest::AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
|
|
std::vector<void *> &pBuffers) {
|
|
HSAuint64 totalMB;
|
|
|
|
totalMB = N_PROCESSES * count * (vramBufSize >> 20);
|
|
if (m_IsParent) {
|
|
LOG() << "Testing " << N_PROCESSES << "*" << count << "*" << (vramBufSize>>20) << "(="<< totalMB << ")MB" << std::endl;
|
|
}
|
|
HSAKMT_STATUS ret;
|
|
HSAuint32 retry = 0;
|
|
|
|
for (HSAuint32 i = 0; i < count; i++) {
|
|
m_pBuf = mmap(0, vramBufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
|
EXPECT_NOTNULL(m_pBuf);
|
|
|
|
m_Flags = (HSA_SVM_FLAGS)0;
|
|
retry:
|
|
ret = RegisterSVMRange(defaultGPUNode, m_pBuf, vramBufSize, defaultGPUNode, m_Flags);
|
|
if (ret == HSAKMT_STATUS_SUCCESS) {
|
|
pBuffers.push_back(m_pBuf);
|
|
retry = 0;
|
|
} else {
|
|
if (retry++ > ALLOCATE_RETRY_TIMES) {
|
|
munmap(m_pBuf, vramBufSize);
|
|
break;
|
|
}
|
|
printf("retry %d allocate vram\n", retry);
|
|
|
|
/* wait for 1 second to try allocate again */
|
|
sleep(1);
|
|
goto retry;
|
|
}
|
|
}
|
|
}
|
|
|
|
void KFDSVMEvictTest::FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize) {
|
|
for (HSAuint32 i = 0; i < pBuffers.size(); i++) {
|
|
m_pBuf = pBuffers[i];
|
|
if (m_pBuf != NULL)
|
|
munmap(m_pBuf, vramBufSize);
|
|
}
|
|
}
|
|
|
|
void KFDSVMEvictTest::ForkChildProcesses(int nprocesses) {
|
|
int i;
|
|
|
|
for (i = 0; i < nprocesses - 1; ++i) {
|
|
pid_t pid = fork();
|
|
ASSERT_GE(pid, 0);
|
|
|
|
if (pid == 0) {
|
|
/* Child process */
|
|
/* Cleanup file descriptors copied from parent process
|
|
* then call SetUp->hsaKmtOpenKFD to create new process
|
|
*/
|
|
m_psName = "Test process " + std::to_string(i) + " ";
|
|
TearDown();
|
|
SetUp();
|
|
m_ChildPids.clear();
|
|
m_IsParent = false;
|
|
return;
|
|
}
|
|
|
|
/* Parent process */
|
|
m_ChildPids.push_back(pid);
|
|
}
|
|
|
|
m_psName = "Test process " + std::to_string(i) + " ";
|
|
}
|
|
|
|
void KFDSVMEvictTest::WaitChildProcesses() {
|
|
if (m_IsParent) {
|
|
/* only run by parent process */
|
|
int childStatus;
|
|
int childExitOkNum = 0;
|
|
int size = m_ChildPids.size();
|
|
|
|
for (HSAuint32 i = 0; i < size; i++) {
|
|
pid_t pid = m_ChildPids.front();
|
|
|
|
waitpid(pid, &childStatus, 0);
|
|
if (WIFEXITED(childStatus) == 1 && WEXITSTATUS(childStatus) == 0)
|
|
childExitOkNum++;
|
|
|
|
m_ChildPids.erase(m_ChildPids.begin());
|
|
}
|
|
|
|
ASSERT_EQ(childExitOkNum, size);
|
|
}
|
|
|
|
/* child process or parent process finished successfullly */
|
|
m_ChildStatus = HSAKMT_STATUS_SUCCESS;
|
|
}
|
|
|
|
/* Evict and restore procedure basic test
|
|
*
|
|
* Use N_PROCESSES processes to allocate vram buf size larger than total vram size
|
|
*
|
|
* ALLOCATE_BUF_SIZE_MB buf allocation size
|
|
*
|
|
* number of buf is equal to (vramSizeMB / (vramBufSizeMB * N_PROCESSES) ) + 8
|
|
* Total vram all processes allocated: 8GB for 4GB Fiji, and 20GB for 16GB Vega10
|
|
*
|
|
* many times of eviction and restore will happen:
|
|
* ttm will evict buffers of another process if not enough free vram
|
|
* process restore will evict buffers of another process
|
|
*
|
|
* Sometimes the allocate may fail (maybe that is normal)
|
|
* ALLOCATE_RETRY_TIMES max retry times to allocate
|
|
*
|
|
* This is basic test, no queue so vram are not used by GPU during test
|
|
*
|
|
* Todo:
|
|
* - Synchronization between the processes, so they know for sure when
|
|
* they are done allocating memory
|
|
*/
|
|
TEST_F(KFDSVMEvictTest, BasicTest) {
|
|
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
|
|
TEST_START(TESTPROFILE_RUNALL);
|
|
|
|
if (!SVMAPISupported())
|
|
return;
|
|
|
|
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
|
|
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
|
|
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
|
|
|
|
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
|
|
|
|
if (!vramSize) {
|
|
LOG() << "No VRAM found, skipping the test" << std::endl;
|
|
return;
|
|
} else {
|
|
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
|
|
}
|
|
|
|
HSAuint32 count = GetBufferCounter(vramSize, vramBufSize);
|
|
if (count == 0) {
|
|
LOG() << "Not enough system memory, skipping the test" << std::endl;
|
|
return;
|
|
}
|
|
|
|
/* Fork the child processes */
|
|
ForkChildProcesses(N_PROCESSES);
|
|
|
|
std::vector<void *> pBuffers;
|
|
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
|
|
|
|
/* wait for other processes to finish allocation, then free buffer */
|
|
sleep(ALLOCATE_RETRY_TIMES);
|
|
|
|
LOG() << m_psName << "free buffer" << std::endl;
|
|
FreeBuffers(pBuffers, vramBufSize);
|
|
|
|
WaitChildProcesses();
|
|
|
|
TEST_END
|
|
}
|
|
|
|
/* Shader to read local buffers using multiple wavefronts in parallel
|
|
* until address buffer is filled with specific value 0x5678 by host program,
|
|
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
|
|
*
|
|
* initial state:
|
|
* s[0:1] - address buffer base address
|
|
* s[2:3] - result buffer base address
|
|
* s4 - workgroup id
|
|
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
|
|
* registers:
|
|
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
|
|
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
|
|
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
|
|
* v[6:7] - local buf address used for read test
|
|
*/
|
|
static const char* gfx9_ReadMemory =
|
|
"\
|
|
shader ReadMemory\n\
|
|
type(CS)\n\
|
|
\n\
|
|
// compute address of corresponding output buffer\n\
|
|
v_mov_b32 v0, s4 // use workgroup id as index\n\
|
|
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
|
|
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
|
|
v_mov_b32 v5, s3\n\
|
|
v_add_u32 v5, vcc_lo, v5\n\
|
|
\n\
|
|
// compute input buffer offset used to store corresponding local buffer address\n\
|
|
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
|
|
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
|
|
v_mov_b32 v3, s1\n\
|
|
v_add_u32 v3, vcc_lo, v3\n\
|
|
\n\
|
|
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
|
|
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
\n\
|
|
v_mov_b32 v8, 0x5678\n\
|
|
s_movk_i32 s8, 0x5678\n\
|
|
L_REPEAT:\n\
|
|
s_load_dword s16, s[0:1], 0x0 glc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
s_cmp_eq_i32 s16, s8\n\
|
|
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
|
|
// loop read 64M local buffer starting at v[6:7]\n\
|
|
// every 4k page only read once\n\
|
|
v_mov_b32 v9, 0\n\
|
|
v_mov_b32 v10, 0x1000 // 4k page\n\
|
|
v_mov_b32 v11, 0x4000000 // 64M size\n\
|
|
v_mov_b32 v12, v6\n\
|
|
v_mov_b32 v13, v7\n\
|
|
L_LOOP_READ:\n\
|
|
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
|
|
v_add_u32 v9, v9, v10 \n\
|
|
v_add_co_u32 v12, vcc, v12, v10\n\
|
|
v_add_u32 v13, vcc_lo, v13\n\
|
|
v_cmp_lt_u32 vcc, v9, v11\n\
|
|
s_cbranch_vccnz L_LOOP_READ\n\
|
|
s_branch L_REPEAT\n\
|
|
L_QUIT:\n\
|
|
flat_store_dword v[4:5], v8\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
|
|
s_endpgm\n\
|
|
end\n\
|
|
";
|
|
|
|
static const char* gfx8_ReadMemory =
|
|
"\
|
|
shader ReadMemory\n\
|
|
asic(VI)\n\
|
|
type(CS)\n\
|
|
\n\
|
|
// compute address of corresponding output buffer\n\
|
|
v_mov_b32 v0, s4 // use workgroup id as index\n\
|
|
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
|
|
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
|
|
v_mov_b32 v5, s3\n\
|
|
v_addc_u32 v5, vcc, v5, 0, vcc\n\
|
|
\n\
|
|
// compute input buffer offset used to store corresponding local buffer address\n\
|
|
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
|
|
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
|
|
v_mov_b32 v3, s1\n\
|
|
v_addc_u32 v3, vcc, v3, 0, vcc\n\
|
|
\n\
|
|
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
|
|
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
\n\
|
|
v_mov_b32 v8, 0x5678\n\
|
|
s_movk_i32 s8, 0x5678\n\
|
|
L_REPEAT:\n\
|
|
s_load_dword s16, s[0:1], 0x0 glc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
s_cmp_eq_i32 s16, s8\n\
|
|
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
|
|
// loop read 64M local buffer starting at v[6:7]\n\
|
|
// every 4k page only read once\n\
|
|
v_mov_b32 v9, 0\n\
|
|
v_mov_b32 v10, 0x1000 // 4k page\n\
|
|
v_mov_b32 v11, 0x4000000 // 64M size\n\
|
|
v_mov_b32 v12, v6\n\
|
|
v_mov_b32 v13, v7\n\
|
|
L_LOOP_READ:\n\
|
|
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
|
|
v_add_u32 v9, vcc, v9, v10 \n\
|
|
v_add_u32 v12, vcc, v12, v10\n\
|
|
v_addc_u32 v13, vcc, v13, 0, vcc\n\
|
|
v_cmp_lt_u32 vcc, v9, v11\n\
|
|
s_cbranch_vccnz L_LOOP_READ\n\
|
|
s_branch L_REPEAT\n\
|
|
L_QUIT:\n\
|
|
flat_store_dword v[4:5], v8\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
|
|
s_endpgm\n\
|
|
end\n\
|
|
";
|
|
|
|
std::string KFDSVMEvictTest::CreateShader() {
|
|
if (m_FamilyId >= FAMILY_AI)
|
|
return gfx9_ReadMemory;
|
|
else
|
|
return gfx8_ReadMemory;
|
|
}
|
|
|
|
/* Evict and restore queue test
|
|
*
|
|
* N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored
|
|
* If GPU vm page fault happens, then test shader will stop and failed to write specific value
|
|
* at dest buffer. Test will report failed.
|
|
*
|
|
* Steps:
|
|
* - fork N_PROCESSES processes, each process does the same below
|
|
* - allocate local buffers, each buffer size is 64MB
|
|
* - allocate zero initialized host access address buffer and result buffer
|
|
* address buffer to pass address of local buffers to shader
|
|
* result buffer to store shader output result
|
|
* - submit queue to run ReadMemory shader
|
|
* - shader start m_DimX wavefronts, each wavefront keep reading one local buffer
|
|
* - notify shader to quit
|
|
* - check result buffer with specific value to confirm all wavefronts quit normally
|
|
*/
|
|
TEST_F(KFDSVMEvictTest, QueueTest) {
|
|
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
|
|
TEST_START(TESTPROFILE_RUNALL)
|
|
|
|
if (!SVMAPISupported())
|
|
return;
|
|
|
|
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
|
|
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
|
|
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
|
|
|
|
const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties();
|
|
|
|
/* Skip test for chip it doesn't have CWSR, which the test depends on */
|
|
if (m_FamilyId < FAMILY_VI || isTonga(pNodeProperties) || m_FamilyId >= FAMILY_NV) {
|
|
LOG() << std::hex << "Test is skipped for family ID 0x" << m_FamilyId << std::endl;
|
|
return;
|
|
}
|
|
|
|
HSAuint32 i;
|
|
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
|
|
|
|
if (!vramSize) {
|
|
LOG() << "No VRAM found, skipping the test" << std::endl;
|
|
return;
|
|
} else {
|
|
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB." << std::endl;
|
|
}
|
|
|
|
HSAuint32 count = GetBufferCounter(vramSize, vramBufSize);
|
|
if (count == 0) {
|
|
LOG() << "Not enough system memory, skipping the test" << std::endl;
|
|
return;
|
|
}
|
|
/* assert all buffer address can be stored within one page
|
|
* because only one page host memory srcBuf is allocated
|
|
*/
|
|
ASSERT_LE(count, PAGE_SIZE/sizeof(unsigned int *));
|
|
|
|
/* Fork the child processes */
|
|
ForkChildProcesses(N_PROCESSES);
|
|
|
|
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
|
|
HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
|
|
HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
|
|
|
|
std::vector<void *> pBuffers;
|
|
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
|
|
|
|
unsigned int wavefront_num = pBuffers.size();
|
|
LOG() << m_psName << "wavefront number " << wavefront_num << std::endl;
|
|
|
|
void **localBufAddr = addrBuffer.As<void **>();
|
|
unsigned int *result = resultBuffer.As<uint32_t *>();
|
|
|
|
for (i = 0; i < wavefront_num; i++)
|
|
*(localBufAddr + i) = pBuffers[i];
|
|
|
|
m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
|
|
|
|
PM4Queue pm4Queue;
|
|
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
|
|
|
|
Dispatch dispatch0(isaBuffer);
|
|
dispatch0.SetArgs(localBufAddr, result);
|
|
dispatch0.SetDim(wavefront_num, 1, 1);
|
|
/* submit the packet and start shader */
|
|
dispatch0.Submit(pm4Queue);
|
|
|
|
/* doing evict/restore queue test for 5 seconds while queue is running */
|
|
sleep(5);
|
|
|
|
/* LOG() << m_psName << "notify shader to quit" << std::endl; */
|
|
/* fill address buffer so shader quits */
|
|
addrBuffer.Fill(0x5678);
|
|
|
|
/* wait for shader to finish or timeout if shade has vm page fault */
|
|
dispatch0.SyncWithStatus(120000);
|
|
|
|
ASSERT_SUCCESS(pm4Queue.Destroy());
|
|
/* LOG() << m_psName << "free buffer" << std::endl; */
|
|
/* cleanup */
|
|
FreeBuffers(pBuffers, vramBufSize);
|
|
|
|
/* check if all wavefronts finish successfully */
|
|
for (i = 0; i < wavefront_num; i++)
|
|
ASSERT_EQ(0x5678, *(result + i));
|
|
|
|
WaitChildProcesses();
|
|
|
|
TEST_END
|
|
}
|
|
|