kfdtest: Make eviction tests more robust

- Run more graphics command submissions with a shorter delay between them
- Synchronize after every graphics command submission
- Include the big VRAM BO in the BOList of the command submission to trigger more evictions
- In QueueTest, run AMDGPU command submissions concurrently with the compute shader on the user-mode queue
- Submit AMDGPU commands to the GFX queue instead of the compute queue to avoid deadlocks between user-mode and kernel-mode queues on the same pipe
- Allocate slightly less memory from KFD to avoid allocation errors due to fragmentation or memory leaks in previous tests
- Run only two processes, which maximizes the number of KFD evictions (probably because of lower chances of evicting non-KFD BOs)

Change-Id: If05d53f5fcf690b6488998a3f933f120ddaa71ee
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

[ROCm/ROCR-Runtime commit: c8d823eb10]
2019-04-30 23:34:18 -04:00
commit 4b8a5ead52
@@ -30,7 +30,7 @@
 #include "SDMAQueue.hpp"
 #include "Dispatch.hpp"

-#define N_PROCESSES             (8)     /* Number of processes running in parallel, must be at least 2 */
+#define N_PROCESSES             (2)     /* Number of processes running in parallel, must be at least 2 */
 #define ALLOCATE_BUF_SIZE_MB    (64)
 #define ALLOCATE_RETRY_TIMES    (3)

@@ -176,7 +176,7 @@ static inline int amdgpu_get_bo_list(amdgpu_device_handle dev, amdgpu_bo_handle
    return amdgpu_bo_list_create(dev, bo2 ? 2 : 1, resources, NULL, list);
 }

-void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
+void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn, amdgpu_bo_handle handle) {
    amdgpu_context_handle contextHandle;
    amdgpu_bo_handle ibResultHandle;
    void *ibResultCpu;
@@ -197,7 +197,7 @@ void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
        &ibResultHandle, &ibResultCpu,
        &ibResultMcAddress, &vaHandle));

-    ASSERT_EQ(0, amdgpu_get_bo_list(m_RenderNodes[rn].device_handle, ibResultHandle, NULL,
+    ASSERT_EQ(0, amdgpu_get_bo_list(m_RenderNodes[rn].device_handle, ibResultHandle, handle,
        &boList));

    /* Fill Nop cammands in IB */
@@ -210,7 +210,7 @@ void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
    ibInfo.size = 16;

    memset(&ibsRequest, 0, sizeof(struct amdgpu_cs_request));
-    ibsRequest.ip_type = AMDGPU_HW_IP_COMPUTE;
+    ibsRequest.ip_type = AMDGPU_HW_IP_GFX;
    ibsRequest.ring = 0;
    ibsRequest.number_of_ibs = 1;
    ibsRequest.ibs = &ibInfo;
@@ -218,21 +218,23 @@ void KFDEvictTest::AmdgpuCommandSubmissionComputeNop(int rn) {
    ibsRequest.fence_info.handle = NULL;

    memset(&fenceStatus, 0, sizeof(struct amdgpu_cs_fence));
-    for (int i = 0; i < ALLOCATE_RETRY_TIMES; i++) {
+    for (int i = 0; i < 100; i++) {
        ASSERT_EQ(0, amdgpu_cs_submit(contextHandle, 0, &ibsRequest, 1));
-        sleep(1);
+        Delay(50);
+
+        fenceStatus.context = contextHandle;
+        fenceStatus.ip_type = AMDGPU_HW_IP_GFX;
+        fenceStatus.ip_instance = 0;
+        fenceStatus.ring = 0;
+        fenceStatus.fence = ibsRequest.seq_no;
+
+        EXPECT_EQ(0, amdgpu_cs_query_fence_status(&fenceStatus,
+                                                  g_TestTimeOut*1000000,
+                                                  0, &expired));
+        if (!expired)
+            WARN() << "CS did not signal completion" << std::endl;
    }

-    fenceStatus.context = contextHandle;
-    fenceStatus.ip_type = AMDGPU_HW_IP_COMPUTE;
-    fenceStatus.ip_instance = 0;
-    fenceStatus.ring = 0;
-    fenceStatus.fence = ibsRequest.seq_no;
-
-    EXPECT_EQ(0, amdgpu_cs_query_fence_status(&fenceStatus,
-        g_TestTimeOut,
-        0, &expired));
-
    EXPECT_EQ(0, amdgpu_bo_list_destroy(boList));

    EXPECT_EQ(0, amdgpu_bo_unmap_and_free(ibResultHandle, vaHandle,
@@ -331,7 +333,8 @@ TEST_F(KFDEvictTest, BasicTest) {
        LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl;
    }

-    HSAint32 count = vramSize / vramBufSize / N_PROCESSES;
+    // Use 7/8 of VRAM between all processes
+    HSAuint32 count = vramSize * 7 / (8* vramBufSize * N_PROCESSES);

    LOG() << "Found System RAM of " << std::dec << (GetSysMemSize() >> 20) << "MB" << std::endl;

@@ -353,7 +356,7 @@ TEST_F(KFDEvictTest, BasicTest) {
    amdgpu_bo_handle handle;
    AllocAmdgpuBo(rn, size, handle);

-    AmdgpuCommandSubmissionComputeNop(rn);
+    AmdgpuCommandSubmissionComputeNop(rn, handle);

    FreeAmdgpuBo(handle);
    LOG() << m_psName << "free buffer" << std::endl;
@@ -533,7 +536,8 @@ TEST_F(KFDEvictTest, QueueTest) {
        LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB." << std::endl;
    }

-    HSAuint32 count = vramSize / vramBufSize / N_PROCESSES;
+    // Use 7/8 of VRAM between all processes
+    HSAuint32 count = vramSize * 7 / (8 * vramBufSize * N_PROCESSES);

    LOG() << "Found System RAM of " << std::dec << (GetSysMemSize() >> 20) << "MB" << std::endl;

@@ -568,8 +572,6 @@ TEST_F(KFDEvictTest, QueueTest) {
    amdgpu_bo_handle handle;
    AllocAmdgpuBo(rn, size, handle);

-    AmdgpuCommandSubmissionComputeNop(rn);
-
    unsigned int wavefront_num = pBuffers.size();
    LOG() << m_psName << "wavefront number " << wavefront_num << std::endl;

@@ -590,8 +592,7 @@ TEST_F(KFDEvictTest, QueueTest) {
    /* Submit the packet and start shader */
    dispatch0.Submit(pm4Queue);

-    /* Doing evict/restore queue test for 5 seconds while queue is running */
-    sleep(5);
+    AmdgpuCommandSubmissionComputeNop(rn, handle);

    /* Uncomment this line for debugging */
    // LOG() << m_psName << "notify shader to quit" << std::endl;
@@ -600,7 +601,7 @@ TEST_F(KFDEvictTest, QueueTest) {
    addrBuffer.Fill(0x5678);

    /* Wait for shader to finish or timeout if shader has vm page fault */
-    dispatch0.SyncWithStatus(120000);
+    EXPECT_EQ(0, dispatch0.SyncWithStatus(120000));

    EXPECT_SUCCESS(pm4Queue.Destroy());

@@ -56,7 +56,7 @@ class KFDEvictTest :  public KFDLocalMemoryTest {
    void FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize);
    void AllocAmdgpuBo(int rn, HSAuint64 vramBufSize, amdgpu_bo_handle &handle);
    void FreeAmdgpuBo(amdgpu_bo_handle handle);
-    void AmdgpuCommandSubmissionComputeNop(int rn);
+    void AmdgpuCommandSubmissionComputeNop(int rn, amdgpu_bo_handle handle);
    void ForkChildProcesses(int nprocesses);
    void WaitChildProcesses();