kfdtest: Make eviction tests more robust

- Run more graphics command submissions with a shorter delay between
  them
- Synchronize after every graphics command submission
- Include the big VRAM BO in the BOList of the command submission
  to trigger more evictions
- In QueueTest, run AMDGPU command submissions concurrently with the
  compute shader on the user-mode queue
- Submit AMDGPU commands to GFX queue instead of compute queue to
  avoid deadlocks between user-mode and kernel-mode queues on the
  same pipe
- Allocate slightly less memory from KFD to avoid allocation errors
  due to fragmentation or memory leaks in previous tests
- Run only two processes, which maximizes the number of KFD evictions
  (probably because of the lower chance of evicting non-KFD BOs)

Change-Id: If05d53f5fcf690b6488998a3f933f120ddaa71ee
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>


[ROCm/ROCR-Runtime commit: c8d823eb10]
This commit is contained in:
Felix Kuehling
2019-04-30 23:34:18 -04:00
parent dbfa65a604
commit 4b8a5ead52
2 changed files with 26 additions and 25 deletions
@@ -30,7 +30,7 @@
#include "SDMAQueue.hpp"
#include "Dispatch.hpp"
#define N_PROCESSES (8) /* Number of processes running in parallel, must be at least 2 */
#define N_PROCESSES (2) /* Number of processes running in parallel, must be at least 2 */
#define ALLOCATE_BUF_SIZE_MB (64)
#define ALLOCATE_RETRY_TIMES (3)
@@ -176,7 +176,7 @@ static inline int amdgpu_get_bo_list(amdgpu_device_handle dev, amdgpu_bo_handle
return amdgpu_bo_list_create(dev, bo2 ? 2 : 1, resources, NULL, list);
}
void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn, amdgpu_bo_handle handle) {
amdgpu_context_handle contextHandle;
amdgpu_bo_handle ibResultHandle;
void *ibResultCpu;
@@ -197,7 +197,7 @@ void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
&ibResultHandle, &ibResultCpu,
&ibResultMcAddress, &vaHandle));
ASSERT_EQ(0, amdgpu_get_bo_list(m_RenderNodes[rn].device_handle, ibResultHandle, NULL,
ASSERT_EQ(0, amdgpu_get_bo_list(m_RenderNodes[rn].device_handle, ibResultHandle, handle,
&boList));
/* Fill Nop commands in IB */
@@ -210,7 +210,7 @@ void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
ibInfo.size = 16;
memset(&ibsRequest, 0, sizeof(struct amdgpu_cs_request));
ibsRequest.ip_type = AMDGPU_HW_IP_COMPUTE;
ibsRequest.ip_type = AMDGPU_HW_IP_GFX;
ibsRequest.ring = 0;
ibsRequest.number_of_ibs = 1;
ibsRequest.ibs = &ibInfo;
@@ -218,21 +218,23 @@ void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
ibsRequest.fence_info.handle = NULL;
memset(&fenceStatus, 0, sizeof(struct amdgpu_cs_fence));
for (int i = 0; i < ALLOCATE_RETRY_TIMES; i++) {
for (int i = 0; i < 100; i++) {
ASSERT_EQ(0, amdgpu_cs_submit(contextHandle, 0, &ibsRequest, 1));
sleep(1);
Delay(50);
fenceStatus.context = contextHandle;
fenceStatus.ip_type = AMDGPU_HW_IP_GFX;
fenceStatus.ip_instance = 0;
fenceStatus.ring = 0;
fenceStatus.fence = ibsRequest.seq_no;
EXPECT_EQ(0, amdgpu_cs_query_fence_status(&fenceStatus,
g_TestTimeOut*1000000,
0, &expired));
if (!expired)
WARN() << "CS did not signal completion" << std::endl;
}
fenceStatus.context = contextHandle;
fenceStatus.ip_type = AMDGPU_HW_IP_COMPUTE;
fenceStatus.ip_instance = 0;
fenceStatus.ring = 0;
fenceStatus.fence = ibsRequest.seq_no;
EXPECT_EQ(0, amdgpu_cs_query_fence_status(&fenceStatus,
g_TestTimeOut,
0, &expired));
EXPECT_EQ(0, amdgpu_bo_list_destroy(boList));
EXPECT_EQ(0, amdgpu_bo_unmap_and_free(ibResultHandle, vaHandle,
@@ -331,7 +333,8 @@ TEST_F(KFDEvictTest, BasicTest) {
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
}
HSAint32 count = vramSize / vramBufSize / N_PROCESSES;
// Use 7/8 of VRAM between all processes
HSAuint32 count = vramSize * 7 / (8* vramBufSize * N_PROCESSES);
LOG() << "Found System RAM of " << std::dec << (GetSysMemSize() >> 20) << "MB" << std::endl;
@@ -353,7 +356,7 @@ TEST_F(KFDEvictTest, BasicTest) {
amdgpu_bo_handle handle;
AllocAmdgpuBo(rn, size, handle);
AmdgpuCommandSubmissionComputeNop(rn);
AmdgpuCommandSubmissionComputeNop(rn, handle);
FreeAmdgpuBo(handle);
LOG() << m_psName << "free buffer" << std::endl;
@@ -533,7 +536,8 @@ TEST_F(KFDEvictTest, QueueTest) {
LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB." << std::endl;
}
HSAuint32 count = vramSize / vramBufSize / N_PROCESSES;
// Use 7/8 of VRAM between all processes
HSAuint32 count = vramSize * 7 / (8 * vramBufSize * N_PROCESSES);
LOG() << "Found System RAM of " << std::dec << (GetSysMemSize() >> 20) << "MB" << std::endl;
@@ -568,8 +572,6 @@ TEST_F(KFDEvictTest, QueueTest) {
amdgpu_bo_handle handle;
AllocAmdgpuBo(rn, size, handle);
AmdgpuCommandSubmissionComputeNop(rn);
unsigned int wavefront_num = pBuffers.size();
LOG() << m_psName << "wavefront number " << wavefront_num << std::endl;
@@ -590,8 +592,7 @@ TEST_F(KFDEvictTest, QueueTest) {
/* Submit the packet and start shader */
dispatch0.Submit(pm4Queue);
/* Doing evict/restore queue test for 5 seconds while queue is running */
sleep(5);
AmdgpuCommandSubmissionComputeNop(rn, handle);
/* Uncomment this line for debugging */
// LOG() << m_psName << "notify shader to quit" << std::endl;
@@ -600,7 +601,7 @@ TEST_F(KFDEvictTest, QueueTest) {
addrBuffer.Fill(0x5678);
/* Wait for shader to finish or timeout if shader has vm page fault */
dispatch0.SyncWithStatus(120000);
EXPECT_EQ(0, dispatch0.SyncWithStatus(120000));
EXPECT_SUCCESS(pm4Queue.Destroy());
@@ -56,7 +56,7 @@ class KFDEvictTest : public KFDLocalMemoryTest {
void FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize);
void AllocAmdgpuBo(int rn, HSAuint64 vramBufSize, amdgpu_bo_handle &handle);
void FreeAmdgpuBo(amdgpu_bo_handle handle);
void AmdgpuCommandSubmissionComputeNop(int rn);
void AmdgpuCommandSubmissionComputeNop(int rn, amdgpu_bo_handle handle);
void ForkChildProcesses(int nprocesses);
void WaitChildProcesses();