8baf02e80b
amdgpu_cs_submit can fail intermittently if another process has too much memory reserved at the time. Allow a small percental of command submissions to fail to make the test more robust. Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Change-Id: If9f62b2b6f67be71420016d4e38d4dd6b6bca9a5
694 lines
23 KiB
C++
694 lines
23 KiB
C++
/*
|
|
* Copyright (C) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
*/
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include "KFDEvictTest.hpp"
|
|
#include "PM4Queue.hpp"
|
|
#include "PM4Packet.hpp"
|
|
#include "SDMAPacket.hpp"
|
|
#include "SDMAQueue.hpp"
|
|
#include "Dispatch.hpp"
|
|
|
|
#define N_PROCESSES (2) /* Number of processes running in parallel, must be at least 2 */
|
|
#define ALLOCATE_BUF_SIZE_MB (64)
|
|
#define ALLOCATE_RETRY_TIMES (3)
|
|
|
|
#define SDMA_NOP 0x0
|
|
|
|
void KFDEvictTest::SetUp() {
|
|
ROUTINE_START
|
|
|
|
KFDBaseComponentTest::SetUp();
|
|
|
|
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
|
|
|
|
ROUTINE_END
|
|
}
|
|
|
|
void KFDEvictTest::TearDown() {
|
|
ROUTINE_START
|
|
|
|
if (m_pIsaGen)
|
|
delete m_pIsaGen;
|
|
m_pIsaGen = NULL;
|
|
|
|
KFDBaseComponentTest::TearDown();
|
|
|
|
ROUTINE_END
|
|
}
|
|
|
|
void KFDEvictTest::AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
|
|
std::vector<void *> &pBuffers) {
|
|
HSAuint64 totalMB;
|
|
|
|
totalMB = N_PROCESSES*count*(vramBufSize>>20);
|
|
if (m_IsParent) {
|
|
LOG() << "Allocating " << N_PROCESSES << "*" << count << "*" << (vramBufSize>>20) << "(="
|
|
<< totalMB << ")MB VRAM in KFD" << std::endl;
|
|
}
|
|
|
|
HsaMemMapFlags mapFlags = {0};
|
|
HSAKMT_STATUS ret;
|
|
HSAuint32 retry = 0;
|
|
|
|
m_Flags.Value = 0;
|
|
m_Flags.ui32.PageSize = HSA_PAGE_SIZE_4KB;
|
|
m_Flags.ui32.HostAccess = 0;
|
|
m_Flags.ui32.NonPaged = 1;
|
|
|
|
for (HSAuint32 i = 0; i < count; ) {
|
|
ret = hsaKmtAllocMemory(defaultGPUNode, vramBufSize, m_Flags, &m_pBuf);
|
|
if (ret == HSAKMT_STATUS_SUCCESS) {
|
|
if (is_dgpu()) {
|
|
if (hsaKmtMapMemoryToGPUNodes(m_pBuf, vramBufSize, NULL,
|
|
mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)) == HSAKMT_STATUS_ERROR) {
|
|
EXPECT_SUCCESS(hsaKmtFreeMemory(m_pBuf, vramBufSize));
|
|
break;
|
|
}
|
|
}
|
|
pBuffers.push_back(m_pBuf);
|
|
|
|
i++;
|
|
retry = 0;
|
|
} else {
|
|
if (retry++ > ALLOCATE_RETRY_TIMES) {
|
|
break;
|
|
}
|
|
|
|
/* Wait for 1 second to try allocate again */
|
|
sleep(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
void KFDEvictTest::FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize) {
|
|
for (HSAuint32 i = 0; i < pBuffers.size(); i++) {
|
|
m_pBuf = pBuffers[i];
|
|
if (m_pBuf != NULL) {
|
|
if (is_dgpu())
|
|
EXPECT_SUCCESS(hsaKmtUnmapMemoryToGPU(m_pBuf));
|
|
EXPECT_SUCCESS(hsaKmtFreeMemory(m_pBuf, vramBufSize));
|
|
}
|
|
}
|
|
}
|
|
|
|
void KFDEvictTest::AllocAmdgpuBo(int rn, HSAuint64 vramBufSize, amdgpu_bo_handle &handle) {
|
|
struct amdgpu_bo_alloc_request alloc;
|
|
|
|
alloc.alloc_size = vramBufSize / N_PROCESSES;
|
|
alloc.phys_alignment = PAGE_SIZE;
|
|
alloc.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
|
|
alloc.flags = AMDGPU_GEM_CREATE_VRAM_CLEARED;
|
|
|
|
if (m_IsParent) {
|
|
LOG() << "Allocating " << N_PROCESSES << "*" << (vramBufSize >> 20) / N_PROCESSES << "(="
|
|
<< (vramBufSize >> 20) << ")MB VRAM in GFX" << std::endl;
|
|
}
|
|
ASSERT_EQ(0, amdgpu_bo_alloc(m_RenderNodes[rn].device_handle, &alloc, &handle));
|
|
}
|
|
|
|
void KFDEvictTest::FreeAmdgpuBo(amdgpu_bo_handle handle) {
|
|
ASSERT_EQ(0, amdgpu_bo_free(handle));
|
|
}
|
|
|
|
static int amdgpu_bo_alloc_and_map(amdgpu_device_handle dev, unsigned size,
|
|
unsigned alignment, unsigned heap, uint64_t flags,
|
|
amdgpu_bo_handle *bo, void **cpu, uint64_t *mc_address,
|
|
amdgpu_va_handle *va_handle) {
|
|
struct amdgpu_bo_alloc_request request = {};
|
|
amdgpu_bo_handle buf_handle;
|
|
amdgpu_va_handle handle;
|
|
uint64_t vmc_addr;
|
|
int r;
|
|
|
|
request.alloc_size = size;
|
|
request.phys_alignment = alignment;
|
|
request.preferred_heap = heap;
|
|
request.flags = flags;
|
|
|
|
r = amdgpu_bo_alloc(dev, &request, &buf_handle);
|
|
if (r)
|
|
return r;
|
|
|
|
r = amdgpu_va_range_alloc(dev,
|
|
amdgpu_gpu_va_range_general,
|
|
size, alignment, 0, &vmc_addr,
|
|
&handle, 0);
|
|
if (r)
|
|
goto error_va_alloc;
|
|
|
|
r = amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_MAP);
|
|
if (r)
|
|
goto error_va_map;
|
|
|
|
r = amdgpu_bo_cpu_map(buf_handle, cpu);
|
|
if (r)
|
|
goto error_cpu_map;
|
|
|
|
*bo = buf_handle;
|
|
*mc_address = vmc_addr;
|
|
*va_handle = handle;
|
|
|
|
return 0;
|
|
|
|
error_cpu_map:
|
|
amdgpu_bo_cpu_unmap(buf_handle);
|
|
|
|
error_va_map:
|
|
amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_UNMAP);
|
|
|
|
error_va_alloc:
|
|
amdgpu_bo_free(buf_handle);
|
|
return r;
|
|
}
|
|
|
|
static inline int amdgpu_bo_unmap_and_free(amdgpu_bo_handle bo, amdgpu_va_handle va_handle,
|
|
uint64_t mc_addr, uint64_t size) {
|
|
amdgpu_bo_cpu_unmap(bo);
|
|
amdgpu_bo_va_op(bo, 0, size, mc_addr, 0, AMDGPU_VA_OP_UNMAP);
|
|
amdgpu_va_range_free(va_handle);
|
|
amdgpu_bo_free(bo);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int amdgpu_get_bo_list(amdgpu_device_handle dev, amdgpu_bo_handle bo1,
|
|
amdgpu_bo_handle bo2, amdgpu_bo_list_handle *list) {
|
|
amdgpu_bo_handle resources[] = {bo1, bo2};
|
|
|
|
return amdgpu_bo_list_create(dev, bo2 ? 2 : 1, resources, NULL, list);
|
|
}
|
|
|
|
void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handle,
|
|
PM4Queue *computeQueue = NULL) {
|
|
amdgpu_context_handle contextHandle;
|
|
amdgpu_bo_handle ibResultHandle;
|
|
void *ibResultCpu;
|
|
uint64_t ibResultMcAddress;
|
|
struct amdgpu_cs_request ibsRequest;
|
|
struct amdgpu_cs_ib_info ibInfo;
|
|
struct amdgpu_cs_fence fenceStatus;
|
|
amdgpu_bo_list_handle boList;
|
|
amdgpu_va_handle vaHandle;
|
|
uint32_t *ptr;
|
|
uint32_t expired;
|
|
unsigned failCount = 0;
|
|
|
|
ASSERT_EQ(0, amdgpu_cs_ctx_create(m_RenderNodes[rn].device_handle, &contextHandle));
|
|
|
|
ASSERT_EQ(0, amdgpu_bo_alloc_and_map(m_RenderNodes[rn].device_handle,
|
|
PAGE_SIZE, PAGE_SIZE,
|
|
AMDGPU_GEM_DOMAIN_GTT, 0,
|
|
&ibResultHandle, &ibResultCpu,
|
|
&ibResultMcAddress, &vaHandle));
|
|
|
|
ASSERT_EQ(0, amdgpu_get_bo_list(m_RenderNodes[rn].device_handle, ibResultHandle, handle,
|
|
&boList));
|
|
|
|
/* Fill Nop cammands in IB */
|
|
ptr = reinterpret_cast<uint32_t *>(ibResultCpu);
|
|
for (int i = 0; i < 16; i++)
|
|
ptr[i] = SDMA_NOP;
|
|
|
|
memset(&ibInfo, 0, sizeof(struct amdgpu_cs_ib_info));
|
|
ibInfo.ib_mc_address = ibResultMcAddress;
|
|
ibInfo.size = 16;
|
|
|
|
memset(&ibsRequest, 0, sizeof(struct amdgpu_cs_request));
|
|
ibsRequest.ip_type = AMDGPU_HW_IP_DMA;
|
|
ibsRequest.ring = 0;
|
|
ibsRequest.number_of_ibs = 1;
|
|
ibsRequest.ibs = &ibInfo;
|
|
ibsRequest.resources = boList;
|
|
ibsRequest.fence_info.handle = NULL;
|
|
|
|
memset(&fenceStatus, 0, sizeof(struct amdgpu_cs_fence));
|
|
for (int i = 0; i < 100; i++) {
|
|
int r = amdgpu_cs_submit(contextHandle, 0, &ibsRequest, 1);
|
|
|
|
Delay(50);
|
|
if (r) {
|
|
failCount++;
|
|
ASSERT_LE(failCount, 2);
|
|
continue;
|
|
}
|
|
|
|
fenceStatus.context = contextHandle;
|
|
fenceStatus.ip_type = AMDGPU_HW_IP_DMA;
|
|
fenceStatus.ip_instance = 0;
|
|
fenceStatus.ring = 0;
|
|
fenceStatus.fence = ibsRequest.seq_no;
|
|
|
|
EXPECT_EQ(0, amdgpu_cs_query_fence_status(&fenceStatus,
|
|
g_TestTimeOut*1000000,
|
|
0, &expired));
|
|
if (!expired)
|
|
WARN() << "CS did not signal completion" << std::endl;
|
|
|
|
/* If a compute queue is given, submit a short compute job
|
|
* every 16 loops (about once a second). If the process was
|
|
* evicted, restore can take quite long.
|
|
*/
|
|
if (computeQueue && (i & 0xf) == 0) {
|
|
computeQueue->PlaceAndSubmitPacket(PM4NopPacket());
|
|
computeQueue->Wait4PacketConsumption(NULL, 10000);
|
|
}
|
|
}
|
|
|
|
EXPECT_EQ(0, amdgpu_bo_list_destroy(boList));
|
|
|
|
EXPECT_EQ(0, amdgpu_bo_unmap_and_free(ibResultHandle, vaHandle,
|
|
ibResultMcAddress, PAGE_SIZE));
|
|
|
|
EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
|
|
}
|
|
|
|
/* Shader to read local buffers using multiple wavefronts in parallel
|
|
* until address buffer is filled with specific value 0x5678 by host program,
|
|
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
|
|
*
|
|
* Initial state:
|
|
* s[0:1] - address buffer base address
|
|
* s[2:3] - result buffer base address
|
|
* s4 - workgroup id
|
|
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
|
|
* Registers:
|
|
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
|
|
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
|
|
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
|
|
* v[6:7] - local buf address used for read test
|
|
*
|
|
* This shader can be used by gfx9 and gfx10
|
|
*
|
|
*/
|
|
|
|
static const char* gfx9_ReadMemory =
|
|
"\
|
|
shader ReadMemory\n\
|
|
wave_size(32)\n\
|
|
type(CS)\n\
|
|
\n\
|
|
// compute address of corresponding output buffer\n\
|
|
v_mov_b32 v0, s4 // use workgroup id as index\n\
|
|
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
|
|
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
|
|
v_mov_b32 v5, s3\n\
|
|
v_add_co_u32 v5, vcc, v5, vcc_lo\n\
|
|
\n\
|
|
// compute input buffer offset used to store corresponding local buffer address\n\
|
|
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
|
|
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
|
|
v_mov_b32 v3, s1\n\
|
|
v_add_co_u32 v3, vcc, v3, vcc_lo\n\
|
|
\n\
|
|
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
|
|
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
\n\
|
|
v_mov_b32 v8, 0x5678\n\
|
|
s_movk_i32 s8, 0x5678\n\
|
|
L_REPEAT:\n\
|
|
s_load_dword s16, s[0:1], 0x0 glc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
s_cmp_eq_i32 s16, s8\n\
|
|
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
|
|
// loop read 64M local buffer starting at v[6:7]\n\
|
|
// every 4k page only read once\n\
|
|
v_mov_b32 v9, 0\n\
|
|
v_mov_b32 v10, 0x1000 // 4k page\n\
|
|
v_mov_b32 v11, 0x4000000 // 64M size\n\
|
|
v_mov_b32 v12, v6\n\
|
|
v_mov_b32 v13, v7\n\
|
|
L_LOOP_READ:\n\
|
|
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
|
|
v_add_co_u32 v9, vcc, v9, v10 \n\
|
|
v_add_co_u32 v12, vcc, v12, v10\n\
|
|
v_add_co_u32 v13, vcc, v13, vcc_lo\n\
|
|
v_cmp_lt_u32 vcc, v9, v11\n\
|
|
s_cbranch_vccnz L_LOOP_READ\n\
|
|
s_branch L_REPEAT\n\
|
|
L_QUIT:\n\
|
|
flat_store_dword v[4:5], v8\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
|
|
s_endpgm\n\
|
|
end\n\
|
|
";
|
|
|
|
static const char* gfx8_ReadMemory =
|
|
"\
|
|
shader ReadMemory\n\
|
|
asic(VI)\n\
|
|
type(CS)\n\
|
|
\n\
|
|
// compute address of corresponding output buffer\n\
|
|
v_mov_b32 v0, s4 // use workgroup id as index\n\
|
|
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
|
|
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
|
|
v_mov_b32 v5, s3\n\
|
|
v_addc_u32 v5, vcc, v5, 0, vcc\n\
|
|
\n\
|
|
// compute input buffer offset used to store corresponding local buffer address\n\
|
|
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
|
|
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
|
|
v_mov_b32 v3, s1\n\
|
|
v_addc_u32 v3, vcc, v3, 0, vcc\n\
|
|
\n\
|
|
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
|
|
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
\n\
|
|
v_mov_b32 v8, 0x5678\n\
|
|
s_movk_i32 s8, 0x5678\n\
|
|
L_REPEAT:\n\
|
|
s_load_dword s16, s[0:1], 0x0 glc\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
|
|
s_cmp_eq_i32 s16, s8\n\
|
|
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
|
|
// loop read 64M local buffer starting at v[6:7]\n\
|
|
// every 4k page only read once\n\
|
|
v_mov_b32 v9, 0\n\
|
|
v_mov_b32 v10, 0x1000 // 4k page\n\
|
|
v_mov_b32 v11, 0x4000000 // 64M size\n\
|
|
v_mov_b32 v12, v6\n\
|
|
v_mov_b32 v13, v7\n\
|
|
L_LOOP_READ:\n\
|
|
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
|
|
v_add_u32 v9, vcc, v9, v10 \n\
|
|
v_add_u32 v12, vcc, v12, v10\n\
|
|
v_addc_u32 v13, vcc, v13, 0, vcc\n\
|
|
v_cmp_lt_u32 vcc, v9, v11\n\
|
|
s_cbranch_vccnz L_LOOP_READ\n\
|
|
s_branch L_REPEAT\n\
|
|
L_QUIT:\n\
|
|
flat_store_dword v[4:5], v8\n\
|
|
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
|
|
s_endpgm\n\
|
|
end\n\
|
|
";
|
|
|
|
std::string KFDEvictTest::CreateShader() {
|
|
if (m_FamilyId < FAMILY_AI)
|
|
return gfx8_ReadMemory;
|
|
else
|
|
return gfx9_ReadMemory;
|
|
}
|
|
|
|
/* Evict and restore procedure basic test
|
|
*
|
|
* Use N_PROCESSES processes to allocate vram buf size larger than total vram size
|
|
*
|
|
* ALLOCATE_BUF_SIZE_MB buf allocation size
|
|
*
|
|
* buf is equal to (vramSizeMB / (vramBufSizeMB * N_PROCESSES) ) + 8
|
|
* Total vram all processes allocated: 8GB for 4GB Fiji, and 20GB for 16GB Vega10
|
|
*
|
|
* Eviction and restore will happen many times:
|
|
* ttm will evict buffers of another process if there is not enough free vram
|
|
* process restore will evict buffers of another process
|
|
*
|
|
* Sometimes the allocation may fail (maybe that is normal)
|
|
* ALLOCATE_RETRY_TIMES max retry times to allocate
|
|
*
|
|
* This is basic test with no queue, so vram is not used by the GPU during test
|
|
*
|
|
* TODO:
|
|
* - Synchronization between the processes, so they know for sure when
|
|
* they are done allocating memory
|
|
*/
|
|
TEST_F(KFDEvictTest, BasicTest) {
|
|
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
|
|
TEST_START(TESTPROFILE_RUNALL);
|
|
|
|
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
|
|
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
|
|
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
|
|
|
|
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
|
|
HSAuint64 sysMemSize = GetSysMemSize();
|
|
|
|
if (!vramSize) {
|
|
LOG() << "Skipping test: No VRAM found." << std::endl;
|
|
return;
|
|
}
|
|
|
|
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
|
|
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
|
|
|
|
// Use 7/8 of VRAM between all processes
|
|
HSAuint64 testSize = vramSize * 7 / 8;
|
|
HSAuint32 count = testSize / (vramBufSize * N_PROCESSES);
|
|
|
|
if (count == 0) {
|
|
LOG() << "Skipping test: Not enough system memory available." << std::endl;
|
|
return;
|
|
}
|
|
|
|
/* Fork the child processes */
|
|
ForkChildProcesses(N_PROCESSES);
|
|
|
|
int rn = FindDRMRenderNode(defaultGPUNode);
|
|
if (rn < 0) {
|
|
LOG() << "Skipping test: Could not find render node for default GPU." << std::endl;
|
|
WaitChildProcesses();
|
|
return;
|
|
}
|
|
|
|
std::vector<void *> pBuffers;
|
|
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
|
|
|
|
/* Allocate gfx vram size of at most one third system memory */
|
|
HSAuint64 size = sysMemSize / 3 < testSize ? sysMemSize / 3 : testSize;
|
|
amdgpu_bo_handle handle;
|
|
AllocAmdgpuBo(rn, size, handle);
|
|
|
|
AmdgpuCommandSubmissionSdmaNop(rn, handle);
|
|
|
|
FreeAmdgpuBo(handle);
|
|
LOG() << m_psName << "free buffer" << std::endl;
|
|
FreeBuffers(pBuffers, vramBufSize);
|
|
|
|
WaitChildProcesses();
|
|
|
|
TEST_END
|
|
}
|
|
|
|
/* Evict and restore queue test
|
|
*
|
|
* N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored
|
|
* If GPU vm page fault happens, then test shader will stop and failed to write specific value
|
|
* at dest buffer. Test will report failed.
|
|
*
|
|
* Steps:
|
|
* - fork N_PROCESSES processes, each process does the same below
|
|
* - allocate local buffers, each buffer size is 64MB
|
|
* - allocate zero initialized host access address buffer and result buffer
|
|
* address buffer to pass address of local buffers to shader
|
|
* result buffer to store shader output result
|
|
* - submit queue to run ReadMemory shader
|
|
* - shader start m_DimX wavefronts, each wavefront keep reading one local buffer
|
|
* - notify shader to quit
|
|
* - check result buffer with specific value to confirm all wavefronts quit normally
|
|
*/
|
|
TEST_F(KFDEvictTest, QueueTest) {
|
|
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
|
|
TEST_START(TESTPROFILE_RUNALL)
|
|
|
|
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
|
|
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
|
|
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
|
|
|
|
const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties();
|
|
|
|
/* Skip test for chip if it doesn't have CWSR, which the test depends on */
|
|
if (m_FamilyId < FAMILY_VI || isTonga(pNodeProperties)) {
|
|
LOG() << std::hex << "Skipping test: No CWSR present for family ID 0x" << m_FamilyId << "." << std::endl;
|
|
return;
|
|
}
|
|
|
|
HSAuint32 i;
|
|
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
|
|
HSAuint64 sysMemSize = GetSysMemSize();
|
|
|
|
if (!vramSize) {
|
|
LOG() << "Skipping test: No VRAM found." << std::endl;
|
|
return;
|
|
}
|
|
|
|
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
|
|
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
|
|
|
|
// Use 7/8 of VRAM between all processes
|
|
HSAuint64 testSize = vramSize * 7 / 8;
|
|
HSAuint32 count = testSize / (vramBufSize * N_PROCESSES);
|
|
|
|
if (count == 0) {
|
|
LOG() << "Skipping test: Not enough system memory available." << std::endl;
|
|
return;
|
|
}
|
|
/* Assert all buffer address can be stored within one page
|
|
* because only one page host memory srcBuf is allocated
|
|
*/
|
|
ASSERT_LE(count, PAGE_SIZE/sizeof(unsigned int *));
|
|
|
|
/* Fork the child processes */
|
|
ForkChildProcesses(N_PROCESSES);
|
|
|
|
int rn = FindDRMRenderNode(defaultGPUNode);
|
|
if (rn < 0) {
|
|
LOG() << "Skipping test: Could not find render node for default GPU." << std::endl;
|
|
WaitChildProcesses();
|
|
return;
|
|
}
|
|
|
|
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
|
|
HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
|
|
HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
|
|
|
|
m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
|
|
|
|
PM4Queue pm4Queue;
|
|
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
|
|
|
|
Dispatch dispatch0(isaBuffer);
|
|
|
|
std::vector<void *> pBuffers;
|
|
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
|
|
|
|
/* Allocate gfx vram size of at most one third system memory */
|
|
HSAuint64 size = sysMemSize / 3 < testSize ? sysMemSize / 3 : testSize;
|
|
amdgpu_bo_handle handle;
|
|
AllocAmdgpuBo(rn, size, handle);
|
|
|
|
unsigned int wavefront_num = pBuffers.size();
|
|
LOG() << m_psName << "wavefront number " << wavefront_num << std::endl;
|
|
|
|
void **localBufAddr = addrBuffer.As<void **>();
|
|
unsigned int *result = resultBuffer.As<uint32_t *>();
|
|
|
|
for (i = 0; i < wavefront_num; i++)
|
|
*(localBufAddr + i) = pBuffers[i];
|
|
|
|
dispatch0.SetArgs(localBufAddr, result);
|
|
dispatch0.SetDim(wavefront_num, 1, 1);
|
|
/* Submit the packet and start shader */
|
|
dispatch0.Submit(pm4Queue);
|
|
|
|
AmdgpuCommandSubmissionSdmaNop(rn, handle);
|
|
|
|
/* Uncomment this line for debugging */
|
|
// LOG() << m_psName << "notify shader to quit" << std::endl;
|
|
|
|
/* Fill address buffer so shader quits */
|
|
addrBuffer.Fill(0x5678);
|
|
|
|
/* Wait for shader to finish or timeout if shader has vm page fault */
|
|
EXPECT_EQ(0, dispatch0.SyncWithStatus(120000));
|
|
|
|
EXPECT_SUCCESS(pm4Queue.Destroy());
|
|
|
|
FreeAmdgpuBo(handle);
|
|
|
|
/* Uncomment this line for debugging */
|
|
// LOG() << m_psName << "free buffer" << std::endl;
|
|
|
|
/* Cleanup */
|
|
FreeBuffers(pBuffers, vramBufSize);
|
|
|
|
/* Check if all wavefronts finished successfully */
|
|
for (i = 0; i < wavefront_num; i++)
|
|
EXPECT_EQ(0x5678, *(result + i));
|
|
|
|
WaitChildProcesses();
|
|
|
|
TEST_END
|
|
}
|
|
|
|
/* Evict a queue running in bursts, so that the process has a chance
|
|
* to be idle when restored but the queue needs to resume to perform
|
|
* more work later. This test is designed to stress the idle process
|
|
* eviction optimization in KFD that leaves idle processes evicted
|
|
* until the next time the doorbell page is accessed.
|
|
*/
|
|
TEST_F(KFDEvictTest, BurstyTest) {
|
|
TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX);
|
|
TEST_START(TESTPROFILE_RUNALL);
|
|
|
|
HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
|
|
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
|
|
HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024;
|
|
|
|
HSAuint64 vramSize = GetVramSize(defaultGPUNode);
|
|
HSAuint64 sysMemSize = GetSysMemSize();
|
|
|
|
if (!vramSize) {
|
|
LOG() << "Skipping test: No VRAM found." << std::endl;
|
|
return;
|
|
}
|
|
|
|
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
|
|
LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl;
|
|
|
|
// Use 7/8 of VRAM between all processes
|
|
HSAuint64 testSize = vramSize * 7 / 8;
|
|
HSAuint32 count = testSize / (vramBufSize * N_PROCESSES);
|
|
|
|
if (count == 0) {
|
|
LOG() << "Skipping test: Not enough system memory available." << std::endl;
|
|
return;
|
|
}
|
|
|
|
/* Fork the child processes */
|
|
ForkChildProcesses(N_PROCESSES);
|
|
|
|
int rn = FindDRMRenderNode(defaultGPUNode);
|
|
if (rn < 0) {
|
|
LOG() << "Skipping test: Could not find render node for default GPU." << std::endl;
|
|
WaitChildProcesses();
|
|
return;
|
|
}
|
|
|
|
PM4Queue pm4Queue;
|
|
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
|
|
|
|
std::vector<void *> pBuffers;
|
|
AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers);
|
|
|
|
/* Allocate gfx vram size of at most one third system memory */
|
|
HSAuint64 size = sysMemSize / 3 < testSize ? sysMemSize / 3 : testSize;
|
|
amdgpu_bo_handle handle;
|
|
AllocAmdgpuBo(rn, size, handle);
|
|
|
|
AmdgpuCommandSubmissionSdmaNop(rn, handle, &pm4Queue);
|
|
|
|
FreeAmdgpuBo(handle);
|
|
LOG() << m_psName << "free buffer" << std::endl;
|
|
FreeBuffers(pBuffers, vramBufSize);
|
|
|
|
EXPECT_SUCCESS(pm4Queue.Destroy());
|
|
|
|
WaitChildProcesses();
|
|
|
|
TEST_END
|
|
}
|