kfdtest: Update KFDEvictTest to LLVM Asm

- Reformat shaders for legibility - Move assembly processes to from IsaGen (CompileShader) to Assembler (RunAssembleBuf) Signed-off-by: Graham Sider <Graham.Sider@amd.com> Change-Id: I7333d0e45ccd3f43690a2a01227f89a6e04fcecb
2021-09-24 18:47:40 -04:00
@@ -36,23 +36,132 @@

 #define SDMA_NOP  0x0

+/* Shader to read local buffers using multiple wavefronts in parallel
+ * until address buffer is filled with specific value 0x5678 by host program,
+ * then each wavefront fills value 0x5678 at corresponding result buffer and quit
+ *
+ * Initial state:
+ *   s[0:1] - address buffer base address
+ *   s[2:3] - result buffer base address
+ *   s4 - workgroup id
+ *   v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
+ * Registers:
+ *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
+ *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
+ *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
+ *   v[6:7] - local buf address used for read test
+ *
+ *    This shader can be used by gfx9 and gfx10
+ *
+ */
+
+static const char* ReadMemoryIsa_gfx9 = R"(
+        .text
+        // Compute address of corresponding output buffer
+        v_mov_b32       v0, s4                  // use workgroup id as index
+        v_lshlrev_b32   v0, 2, v0               // v0 *= 4
+        v_add_co_u32    v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4
+        v_mov_b32       v5, s3
+        v_add_co_u32    v5, vcc, v5, vcc_lo
+        // Compute input buffer offset used to store corresponding local buffer address
+        v_lshlrev_b32   v0, 1, v0               // v0 *= 8
+        v_add_co_u32    v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8
+        v_mov_b32       v3, s1
+        v_add_co_u32    v3, vcc, v3, vcc_lo
+        // load 64bit local buffer address stored at v[2:3] to v[6:7]
+        flat_load_dwordx2   v[6:7], v[2:3] slc
+        s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish
+        v_mov_b32       v8, 0x5678
+        s_movk_i32      s8, 0x5678
+        L_REPEAT:
+        s_load_dword    s16, s[0:1], 0x0 glc
+        s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish
+        s_cmp_eq_i32    s16, s8
+        s_cbranch_scc1  L_QUIT                  // if notified to quit by host
+        // Loop read 64M local buffer starting at v[6:7]
+        // every 4k page only read once
+        v_mov_b32       v9, 0
+        v_mov_b32       v10, 0x1000             // 4k page
+        v_mov_b32       v11, 0x4000000          // 64M size
+        v_mov_b32       v12, v6
+        v_mov_b32       v13, v7
+        L_LOOP_READ:
+        flat_load_dwordx2   v[14:15], v[12:13] slc
+        v_add_co_u32    v9, vcc, v9, v10
+        v_add_co_u32    v12, vcc, v12, v10
+        v_add_co_u32    v13, vcc, v13, vcc_lo
+        v_cmp_lt_u32    vcc, v9, v11
+        s_cbranch_vccnz L_LOOP_READ
+        s_branch        L_REPEAT
+        L_QUIT:
+        flat_store_dword v[4:5], v8
+        s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish
+        s_endpgm
+)";
+
+static const char* ReadMemoryIsa_gfx8 = R"(
+        .text
+        // Compute address of corresponding output buffer
+        v_mov_b32       v0, s4                  // use workgroup id as index
+        v_lshlrev_b32   v0, 2, v0               // v0 *= 4
+        v_add_u32       v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4
+        v_mov_b32       v5, s3
+        v_addc_u32      v5, vcc, v5, 0, vcc
+        // Compute input buffer offset used to store corresponding local buffer address
+        v_lshlrev_b32   v0, 1, v0               // v0 *= 8
+        v_add_u32       v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8
+        v_mov_b32       v3, s1
+        v_addc_u32      v3, vcc, v3, 0, vcc
+        // Load 64bit local buffer address stored at v[2:3] to v[6:7]
+        flat_load_dwordx2   v[6:7], v[2:3] slc
+        s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish
+        v_mov_b32       v8, 0x5678
+        s_movk_i32      s8, 0x5678
+        L_REPEAT:
+        s_load_dword    s16, s[0:1], 0x0 glc
+        s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish
+        s_cmp_eq_i32    s16, s8
+        s_cbranch_scc1  L_QUIT // if notified to quit by host
+        // Loop read 64M local buffer starting at v[6:7]
+        // every 4k page only read once
+        v_mov_b32       v9, 0
+        v_mov_b32       v10, 0x1000             // 4k page
+        v_mov_b32       v11, 0x4000000          // 64M size
+        v_mov_b32       v12, v6
+        v_mov_b32       v13, v7
+        L_LOOP_READ:
+        flat_load_dwordx2   v[14:15], v[12:13] slc
+        v_add_u32       v9, vcc, v9, v10
+        v_add_u32       v12, vcc, v12, v10
+        v_addc_u32      v13, vcc, v13, 0, vcc
+        v_cmp_lt_u32    vcc, v9, v11
+        s_cbranch_vccnz L_LOOP_READ
+        s_branch        L_REPEAT
+        L_QUIT:
+        flat_store_dword v[4:5], v8
+        s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish
+        s_endpgm
+)";
+
+std::string KFDEvictTest::CreateShader() {
+    if (m_FamilyId < FAMILY_AI)
+        return ReadMemoryIsa_gfx8;
+    else
+        return ReadMemoryIsa_gfx9;
+}
+
+
 void KFDEvictTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDEvictTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -286,136 +395,6 @@ void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handl
    EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
 }

-/* Shader to read local buffers using multiple wavefronts in parallel
- * until address buffer is filled with specific value 0x5678 by host program,
- * then each wavefront fills value 0x5678 at corresponding result buffer and quit
- *
- * Initial state:
- *   s[0:1] - address buffer base address
- *   s[2:3] - result buffer base address
- *   s4 - workgroup id
- *   v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
- * Registers:
- *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
- *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
- *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
- *   v[6:7] - local buf address used for read test
- *
- *    This shader can be used by gfx9 and gfx10
- *
- */
-
-static const char* gfx9_ReadMemory =
-"\
-    shader ReadMemory\n\
-    wave_size(32)\n\
-    type(CS)\n\
-    \n\
-    // compute address of corresponding output buffer\n\
-    v_mov_b32       v0, s4                  // use workgroup id as index\n\
-    v_lshlrev_b32   v0, 2, v0               // v0 *= 4\n\
-    v_add_co_u32    v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4\n\
-    v_mov_b32       v5, s3\n\
-    v_add_co_u32    v5, vcc, v5, vcc_lo\n\
-    \n\
-    // compute input buffer offset used to store corresponding local buffer address\n\
-    v_lshlrev_b32   v0, 1, v0               // v0 *= 8\n\
-    v_add_co_u32    v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8\n\
-    v_mov_b32       v3, s1\n\
-    v_add_co_u32    v3, vcc, v3, vcc_lo\n\
-    \n\
-    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
-    flat_load_dwordx2   v[6:7], v[2:3] slc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    \n\
-    v_mov_b32       v8, 0x5678\n\
-    s_movk_i32      s8, 0x5678\n\
-L_REPEAT:\n\
-    s_load_dword    s16, s[0:1], 0x0 glc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    s_cmp_eq_i32    s16, s8\n\
-    s_cbranch_scc1  L_QUIT                  // if notified to quit by host\n\
-    // loop read 64M local buffer starting at v[6:7]\n\
-    // every 4k page only read once\n\
-    v_mov_b32       v9, 0\n\
-    v_mov_b32       v10, 0x1000             // 4k page\n\
-    v_mov_b32       v11, 0x4000000          // 64M size\n\
-    v_mov_b32       v12, v6\n\
-    v_mov_b32       v13, v7\n\
-L_LOOP_READ:\n\
-    flat_load_dwordx2   v[14:15], v[12:13] slc\n\
-    v_add_co_u32    v9, vcc, v9, v10 \n\
-    v_add_co_u32    v12, vcc, v12, v10\n\
-    v_add_co_u32    v13, vcc, v13, vcc_lo\n\
-    v_cmp_lt_u32    vcc, v9, v11\n\
-    s_cbranch_vccnz L_LOOP_READ\n\
-    s_branch        L_REPEAT\n\
-L_QUIT:\n\
-    flat_store_dword v[4:5], v8\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish\n\
-    s_endpgm\n\
-    end\n\
-";
-
-static const char* gfx8_ReadMemory =
-"\
-    shader ReadMemory\n\
-    asic(VI)\n\
-    type(CS)\n\
-    \n\
-    // compute address of corresponding output buffer\n\
-    v_mov_b32       v0, s4                  // use workgroup id as index\n\
-    v_lshlrev_b32   v0, 2, v0               // v0 *= 4\n\
-    v_add_u32       v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4\n\
-    v_mov_b32       v5, s3\n\
-    v_addc_u32      v5, vcc, v5, 0, vcc\n\
-    \n\
-    // compute input buffer offset used to store corresponding local buffer address\n\
-    v_lshlrev_b32   v0, 1, v0               // v0 *= 8\n\
-    v_add_u32       v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8\n\
-    v_mov_b32       v3, s1\n\
-    v_addc_u32      v3, vcc, v3, 0, vcc\n\
-    \n\
-    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
-    flat_load_dwordx2   v[6:7], v[2:3] slc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    \n\
-    v_mov_b32       v8, 0x5678\n\
-    s_movk_i32      s8, 0x5678\n\
-L_REPEAT:\n\
-    s_load_dword    s16, s[0:1], 0x0 glc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    s_cmp_eq_i32    s16, s8\n\
-    s_cbranch_scc1  L_QUIT                  // if notified to quit by host\n\
-    // loop read 64M local buffer starting at v[6:7]\n\
-    // every 4k page only read once\n\
-    v_mov_b32       v9, 0\n\
-    v_mov_b32       v10, 0x1000             // 4k page\n\
-    v_mov_b32       v11, 0x4000000          // 64M size\n\
-    v_mov_b32       v12, v6\n\
-    v_mov_b32       v13, v7\n\
-L_LOOP_READ:\n\
-    flat_load_dwordx2   v[14:15], v[12:13] slc\n\
-    v_add_u32       v9, vcc, v9, v10 \n\
-    v_add_u32       v12, vcc, v12, v10\n\
-    v_addc_u32      v13, vcc, v13, 0, vcc\n\
-    v_cmp_lt_u32    vcc, v9, v11\n\
-    s_cbranch_vccnz L_LOOP_READ\n\
-    s_branch        L_REPEAT\n\
-L_QUIT:\n\
-    flat_store_dword v[4:5], v8\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish\n\
-    s_endpgm\n\
-    end\n\
-";
-
-std::string KFDEvictTest::CreateShader() {
-    if (m_FamilyId < FAMILY_AI)
-        return gfx8_ReadMemory;
-    else
-        return gfx9_ReadMemory;
-}
-
 /* Evict and restore procedure basic test
 *
 * Use N_PROCESSES processes to allocate vram buf size larger than total vram size
@@ -567,7 +546,7 @@ TEST_F(KFDEvictTest, QueueTest) {
    HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);

-    m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As<char*>()));

    PM4Queue pm4Queue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -27,15 +27,13 @@
 #include <string>
 #include <vector>
 #include "KFDMultiProcessTest.hpp"
-#include "IsaGenerator.hpp"
 #include "PM4Queue.hpp"

 // @class KFDEvictTest
 // Test eviction and restore procedure using two processes
 class KFDEvictTest :  public KFDMultiProcessTest {
 public:
-    KFDEvictTest(void): m_pIsaGen(NULL) {}
-
+    KFDEvictTest(void) {}
    ~KFDEvictTest(void) {}

 protected:
@@ -52,7 +50,6 @@ class KFDEvictTest :  public KFDMultiProcessTest {
                                           PM4Queue *computeQueue);

 protected:  // Members
-    IsaGenerator*   m_pIsaGen;
    HsaMemFlags     m_Flags;
    void*           m_pBuf;
 };