Files
rocm-systems/tests/kfdtest/src/KFDEvictTest.cpp
T
Felix Kuehling 8baf02e80b kfdtest: Allow some CS to fail in EvictTest
amdgpu_cs_submit can fail intermittently if another process has too much
memory reserved at the time. Allow a small percental of command
submissions to fail to make the test more robust.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Change-Id: If9f62b2b6f67be71420016d4e38d4dd6b6bca9a5
2021-05-03 11:01:35 -04:00

694 lines
23 KiB
C++

/*
* Copyright (C) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <vector>
#include <string>
#include "KFDEvictTest.hpp"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"
#include "SDMAPacket.hpp"
#include "SDMAQueue.hpp"
#include "Dispatch.hpp"
#define N_PROCESSES (2) /* Number of processes running in parallel, must be at least 2 */
#define ALLOCATE_BUF_SIZE_MB (64)
#define ALLOCATE_RETRY_TIMES (3)
#define SDMA_NOP 0x0
void KFDEvictTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDEvictTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
}
void KFDEvictTest::AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
std::vector<void *> &pBuffers) {
HSAuint64 totalMB;
totalMB = N_PROCESSES*count*(vramBufSize>>20);
if (m_IsParent) {
LOG() << "Allocating " << N_PROCESSES << "*" << count << "*" << (vramBufSize>>20) << "(="
<< totalMB << ")MB VRAM in KFD" << std::endl;
}
HsaMemMapFlags mapFlags = {0};
HSAKMT_STATUS ret;
HSAuint32 retry = 0;
m_Flags.Value = 0;
m_Flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
m_Flags.ui32.HostAccess = 0;
m_Flags.ui32.NonPaged = 1;
for (HSAuint32 i = 0; i < count; ) {
ret = hsaKmtAllocMemory(defaultGPUNode, vramBufSize, m_Flags, &m_pBuf);
if (ret == HSAKMT_STATUS_SUCCESS) {
if (is_dgpu()) {
if (hsaKmtMapMemoryToGPUNodes(m_pBuf, vramBufSize, NULL,
mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)) == HSAKMT_STATUS_ERROR) {
EXPECT_SUCCESS(hsaKmtFreeMemory(m_pBuf, vramBufSize));
break;
}
}
pBuffers.push_back(m_pBuf);
i++;
retry = 0;
} else {
if (retry++ > ALLOCATE_RETRY_TIMES) {
break;
}
/* Wait for 1 second to try allocate again */
sleep(1);
}
}
}
void KFDEvictTest::FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize) {
for (HSAuint32 i = 0; i < pBuffers.size(); i++) {
m_pBuf = pBuffers[i];
if (m_pBuf != NULL) {
if (is_dgpu())
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(m_pBuf));
EXPECT_SUCCESS(hsaKmtFreeMemory(m_pBuf, vramBufSize));
}
}
}
void KFDEvictTest::AllocAmdgpuBo(int rn, HSAuint64 vramBufSize, amdgpu_bo_handle &handle) {
struct amdgpu_bo_alloc_request alloc;
alloc.alloc_size = vramBufSize / N_PROCESSES;
alloc.phys_alignment = PAGE_SIZE;
alloc.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
alloc.flags = AMDGPU_GEM_CREATE_VRAM_CLEARED;
if (m_IsParent) {
LOG() << "Allocating " << N_PROCESSES << "*" << (vramBufSize >> 20) / N_PROCESSES << "(="
<< (vramBufSize >> 20) << ")MB VRAM in GFX" << std::endl;
}
ASSERT_EQ(0, amdgpu_bo_alloc(m_RenderNodes[rn].device_handle, &alloc, &handle));
}
void KFDEvictTest::FreeAmdgpuBo(amdgpu_bo_handle handle) {
ASSERT_EQ(0, amdgpu_bo_free(handle));
}
static int amdgpu_bo_alloc_and_map(amdgpu_device_handle dev, unsigned size,
unsigned alignment, unsigned heap, uint64_t flags,
amdgpu_bo_handle *bo, void **cpu, uint64_t *mc_address,
amdgpu_va_handle *va_handle) {
struct amdgpu_bo_alloc_request request = {};
amdgpu_bo_handle buf_handle;
amdgpu_va_handle handle;
uint64_t vmc_addr;
int r;
request.alloc_size = size;
request.phys_alignment = alignment;
request.preferred_heap = heap;
request.flags = flags;
r = amdgpu_bo_alloc(dev, &request, &buf_handle);
if (r)
return r;
r = amdgpu_va_range_alloc(dev,
amdgpu_gpu_va_range_general,
size, alignment, 0, &vmc_addr,
&handle, 0);
if (r)
goto error_va_alloc;
r = amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_MAP);
if (r)
goto error_va_map;
r = amdgpu_bo_cpu_map(buf_handle, cpu);
if (r)
goto error_cpu_map;
*bo = buf_handle;
*mc_address = vmc_addr;
*va_handle = handle;
return 0;
error_cpu_map:
amdgpu_bo_cpu_unmap(buf_handle);
error_va_map:
amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_UNMAP);
error_va_alloc:
amdgpu_bo_free(buf_handle);
return r;
}
static inline int amdgpu_bo_unmap_and_free(amdgpu_bo_handle bo, amdgpu_va_handle va_handle,
uint64_t mc_addr, uint64_t size) {
amdgpu_bo_cpu_unmap(bo);
amdgpu_bo_va_op(bo, 0, size, mc_addr, 0, AMDGPU_VA_OP_UNMAP);
amdgpu_va_range_free(va_handle);
amdgpu_bo_free(bo);
return 0;
}
static inline int amdgpu_get_bo_list(amdgpu_device_handle dev, amdgpu_bo_handle bo1,
amdgpu_bo_handle bo2, amdgpu_bo_list_handle *list) {
amdgpu_bo_handle resources[] = {bo1, bo2};
return amdgpu_bo_list_create(dev, bo2 ? 2 : 1, resources, NULL, list);
}
void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handle,
PM4Queue *computeQueue = NULL) {
amdgpu_context_handle contextHandle;
amdgpu_bo_handle ibResultHandle;
void *ibResultCpu;
uint64_t ibResultMcAddress;
struct amdgpu_cs_request ibsRequest;
struct amdgpu_cs_ib_info ibInfo;
struct amdgpu_cs_fence fenceStatus;
amdgpu_bo_list_handle boList;
amdgpu_va_handle vaHandle;
uint32_t *ptr;
uint32_t expired;
unsigned failCount = 0;
ASSERT_EQ(0, amdgpu_cs_ctx_create(m_RenderNodes[rn].device_handle, &contextHandle));
ASSERT_EQ(0, amdgpu_bo_alloc_and_map(m_RenderNodes[rn].device_handle,
PAGE_SIZE, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT, 0,
&ibResultHandle, &ibResultCpu,
&ibResultMcAddress, &vaHandle));
ASSERT_EQ(0, amdgpu_get_bo_list(m_RenderNodes[rn].device_handle, ibResultHandle, handle,
&boList));
/* Fill Nop cammands in IB */
ptr = reinterpret_cast<uint32_t *>(ibResultCpu);
for (int i = 0; i < 16; i++)
ptr[i] = SDMA_NOP;
memset(&ibInfo, 0, sizeof(struct amdgpu_cs_ib_info));
ibInfo.ib_mc_address = ibResultMcAddress;
ibInfo.size = 16;
memset(&ibsRequest, 0, sizeof(struct amdgpu_cs_request));
ibsRequest.ip_type = AMDGPU_HW_IP_DMA;
ibsRequest.ring = 0;
ibsRequest.number_of_ibs = 1;
ibsRequest.ibs = &ibInfo;
ibsRequest.resources = boList;
ibsRequest.fence_info.handle = NULL;
memset(&fenceStatus, 0, sizeof(struct amdgpu_cs_fence));
for (int i = 0; i < 100; i++) {
int r = amdgpu_cs_submit(contextHandle, 0, &ibsRequest, 1);
Delay(50);
if (r) {
failCount++;
ASSERT_LE(failCount, 2);
continue;
}
fenceStatus.context = contextHandle;
fenceStatus.ip_type = AMDGPU_HW_IP_DMA;
fenceStatus.ip_instance = 0;
fenceStatus.ring = 0;
fenceStatus.fence = ibsRequest.seq_no;
EXPECT_EQ(0, amdgpu_cs_query_fence_status(&fenceStatus,
g_TestTimeOut*1000000,
0, &expired));
if (!expired)
WARN() << "CS did not signal completion" << std::endl;
/* If a compute queue is given, submit a short compute job
* every 16 loops (about once a second). If the process was
* evicted, restore can take quite long.
*/
if (computeQueue && (i & 0xf) == 0) {
computeQueue->PlaceAndSubmitPacket(PM4NopPacket());
computeQueue->Wait4PacketConsumption(NULL, 10000);
}
}
EXPECT_EQ(0, amdgpu_bo_list_destroy(boList));
EXPECT_EQ(0, amdgpu_bo_unmap_and_free(ibResultHandle, vaHandle,
ibResultMcAddress, PAGE_SIZE));
EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
}
/* Shader to read local buffers using multiple wavefronts in parallel
* until address buffer is filled with specific value 0x5678 by host program,
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
*
* Initial state:
* s[0:1] - address buffer base address
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
* Registers:
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v[6:7] - local buf address used for read test
*
* This shader can be used by gfx9 and gfx10
*
*/
static const char* gfx9_ReadMemory =
"\
shader ReadMemory\n\
wave_size(32)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_add_co_u32 v5, vcc, v5, vcc_lo\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_add_co_u32 v3, vcc, v3, vcc_lo\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_co_u32 v9, vcc, v9, v10 \n\
v_add_co_u32 v12, vcc, v12, v10\n\
v_add_co_u32 v13, vcc, v13, vcc_lo\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
static const char* gfx8_ReadMemory =
"\
shader ReadMemory\n\
asic(VI)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_addc_u32 v5, vcc, v5, 0, vcc\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_addc_u32 v3, vcc, v3, 0, vcc\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_u32 v9, vcc, v9, v10 \n\
v_add_u32 v12, vcc, v12, v10\n\
v_addc_u32 v13, vcc, v13, 0, vcc\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
std::string KFDEvictTest::CreateShader() {
if (m_FamilyId < FAMILY_AI)
return gfx8_ReadMemory;
else
return gfx9_ReadMemory;
}
/* Evict and restore procedure basic test
*
* Use N_PROCESSES processes to allocate vram buf size larger than total vram size
*
* ALLOCATE_BUF_SIZE_MB buf allocation size
*
* buf is equal to (vramSizeMB / (vramBufSizeMB * N_PROCESSES) ) + 8
* Total vram all processes allocated: 8GB for 4GB Fiji, and 20GB for 16GB Vega10
*
* Eviction and restore will happen many times:
* ttm will evict buffers of another process if there is not enough free vram
* process restore will evict buffers of another process
*
* Sometimes the allocation may fail (maybe that is normal)
* ALLOCATE_RETRY_TIMES max retry times to allocate
*
* This is basic test with no queue, so vram is not used by the GPU during test
*
* TODO:
* - Synchronization between the processes, so they know for sure when
* they are done allocating memory
*/
TEST_F(KFDEvictTest, BasicTest) {
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
TEST_START(TESTPROFILE_RUNALL);
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
HSAuint64 sysMemSize = GetSysMemSize();
if (!vramSize) {
LOG() << "Skipping test: No VRAM found." << std::endl;
return;
}
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
// Use 7/8 of VRAM between all processes
HSAuint64 testSize = vramSize * 7 / 8;
HSAuint32 count = testSize / (vramBufSize * N_PROCESSES);
if (count == 0) {
LOG() << "Skipping test: Not enough system memory available." << std::endl;
return;
}
/* Fork the child processes */
ForkChildProcesses(N_PROCESSES);
int rn = FindDRMRenderNode(defaultGPUNode);
if (rn < 0) {
LOG() << "Skipping test: Could not find render node for default GPU." << std::endl;
WaitChildProcesses();
return;
}
std::vector<void *> pBuffers;
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
/* Allocate gfx vram size of at most one third system memory */
HSAuint64 size = sysMemSize / 3 < testSize ? sysMemSize / 3 : testSize;
amdgpu_bo_handle handle;
AllocAmdgpuBo(rn, size, handle);
AmdgpuCommandSubmissionSdmaNop(rn, handle);
FreeAmdgpuBo(handle);
LOG() << m_psName << "free buffer" << std::endl;
FreeBuffers(pBuffers, vramBufSize);
WaitChildProcesses();
TEST_END
}
/* Evict and restore queue test
*
* N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored
* If GPU vm page fault happens, then test shader will stop and failed to write specific value
* at dest buffer. Test will report failed.
*
* Steps:
* - fork N_PROCESSES processes, each process does the same below
* - allocate local buffers, each buffer size is 64MB
* - allocate zero initialized host access address buffer and result buffer
* address buffer to pass address of local buffers to shader
* result buffer to store shader output result
* - submit queue to run ReadMemory shader
* - shader start m_DimX wavefronts, each wavefront keep reading one local buffer
* - notify shader to quit
* - check result buffer with specific value to confirm all wavefronts quit normally
*/
TEST_F(KFDEvictTest, QueueTest) {
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
TEST_START(TESTPROFILE_RUNALL)
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties();
/* Skip test for chip if it doesn't have CWSR, which the test depends on */
if (m_FamilyId < FAMILY_VI || isTonga(pNodeProperties)) {
LOG() << std::hex << "Skipping test: No CWSR present for family ID 0x" << m_FamilyId << "." << std::endl;
return;
}
HSAuint32 i;
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
HSAuint64 sysMemSize = GetSysMemSize();
if (!vramSize) {
LOG() << "Skipping test: No VRAM found." << std::endl;
return;
}
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
// Use 7/8 of VRAM between all processes
HSAuint64 testSize = vramSize * 7 / 8;
HSAuint32 count = testSize / (vramBufSize * N_PROCESSES);
if (count == 0) {
LOG() << "Skipping test: Not enough system memory available." << std::endl;
return;
}
/* Assert all buffer address can be stored within one page
* because only one page host memory srcBuf is allocated
*/
ASSERT_LE(count, PAGE_SIZE/sizeof(unsigned int *));
/* Fork the child processes */
ForkChildProcesses(N_PROCESSES);
int rn = FindDRMRenderNode(defaultGPUNode);
if (rn < 0) {
LOG() << "Skipping test: Could not find render node for default GPU." << std::endl;
WaitChildProcesses();
return;
}
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
PM4Queue pm4Queue;
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
Dispatch dispatch0(isaBuffer);
std::vector<void *> pBuffers;
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
/* Allocate gfx vram size of at most one third system memory */
HSAuint64 size = sysMemSize / 3 < testSize ? sysMemSize / 3 : testSize;
amdgpu_bo_handle handle;
AllocAmdgpuBo(rn, size, handle);
unsigned int wavefront_num = pBuffers.size();
LOG() << m_psName << "wavefront number " << wavefront_num << std::endl;
void **localBufAddr = addrBuffer.As<void **>();
unsigned int *result = resultBuffer.As<uint32_t *>();
for (i = 0; i < wavefront_num; i++)
*(localBufAddr + i) = pBuffers[i];
dispatch0.SetArgs(localBufAddr, result);
dispatch0.SetDim(wavefront_num, 1, 1);
/* Submit the packet and start shader */
dispatch0.Submit(pm4Queue);
AmdgpuCommandSubmissionSdmaNop(rn, handle);
/* Uncomment this line for debugging */
// LOG() << m_psName << "notify shader to quit" << std::endl;
/* Fill address buffer so shader quits */
addrBuffer.Fill(0x5678);
/* Wait for shader to finish or timeout if shader has vm page fault */
EXPECT_EQ(0, dispatch0.SyncWithStatus(120000));
EXPECT_SUCCESS(pm4Queue.Destroy());
FreeAmdgpuBo(handle);
/* Uncomment this line for debugging */
// LOG() << m_psName << "free buffer" << std::endl;
/* Cleanup */
FreeBuffers(pBuffers, vramBufSize);
/* Check if all wavefronts finished successfully */
for (i = 0; i < wavefront_num; i++)
EXPECT_EQ(0x5678, *(result + i));
WaitChildProcesses();
TEST_END
}
/* Evict a queue running in bursts, so that the process has a chance
* to be idle when restored but the queue needs to resume to perform
* more work later. This test is designed to stress the idle process
* eviction optimization in KFD that leaves idle processes evicted
* until the next time the doorbell page is accessed.
*/
TEST_F(KFDEvictTest, BurstyTest) {
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
TEST_START(TESTPROFILE_RUNALL);
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
HSAuint64 sysMemSize = GetSysMemSize();
if (!vramSize) {
LOG() << "Skipping test: No VRAM found." << std::endl;
return;
}
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
// Use 7/8 of VRAM between all processes
HSAuint64 testSize = vramSize * 7 / 8;
HSAuint32 count = testSize / (vramBufSize * N_PROCESSES);
if (count == 0) {
LOG() << "Skipping test: Not enough system memory available." << std::endl;
return;
}
/* Fork the child processes */
ForkChildProcesses(N_PROCESSES);
int rn = FindDRMRenderNode(defaultGPUNode);
if (rn < 0) {
LOG() << "Skipping test: Could not find render node for default GPU." << std::endl;
WaitChildProcesses();
return;
}
PM4Queue pm4Queue;
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
std::vector<void *> pBuffers;
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
/* Allocate gfx vram size of at most one third system memory */
HSAuint64 size = sysMemSize / 3 < testSize ? sysMemSize / 3 : testSize;
amdgpu_bo_handle handle;
AllocAmdgpuBo(rn, size, handle);
AmdgpuCommandSubmissionSdmaNop(rn, handle, &pm4Queue);
FreeAmdgpuBo(handle);
LOG() << m_psName << "free buffer" << std::endl;
FreeBuffers(pBuffers, vramBufSize);
EXPECT_SUCCESS(pm4Queue.Destroy());
WaitChildProcesses();
TEST_END
}