From 798de4f4467cb4e4e3ff174bcf9a7dfb7f3f6361 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:47:40 -0400 Subject: [PATCH] kfdtest: Update KFDEvictTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes to from IsaGen (CompileShader) to Assembler (RunAssembleBuf) Signed-off-by: Graham Sider Change-Id: I7333d0e45ccd3f43690a2a01227f89a6e04fcecb [ROCm/ROCR-Runtime commit: b44d6762bd84afbf01420bc243ceb63a065556ca] --- .../tests/kfdtest/src/KFDEvictTest.cpp | 253 ++++++++---------- .../tests/kfdtest/src/KFDEvictTest.hpp | 5 +- 2 files changed, 117 insertions(+), 141 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp index 7ec86bc8bd..bf721238c8 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp @@ -36,23 +36,132 @@ #define SDMA_NOP 0x0 +/* Shader to read local buffers using multiple wavefronts in parallel + * until address buffer is filled with specific value 0x5678 by host program, + * then each wavefront fills value 0x5678 at corresponding result buffer and quit + * + * Initial state: + * s[0:1] - address buffer base address + * s[2:3] - result buffer base address + * s4 - workgroup id + * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 + * Registers: + * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X + * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 + * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 + * v[6:7] - local buf address used for read test + * + * This shader can be used by gfx9 and gfx10 + * + */ + +static const char* ReadMemoryIsa_gfx9 = R"( + .text + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_add_co_u32 v5, vcc, v5, vcc_lo + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_add_co_u32 v3, vcc, v3, vcc_lo + // load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_co_u32 v9, vcc, v9, v10 + v_add_co_u32 v12, vcc, v12, v10 + v_add_co_u32 v13, vcc, v13, vcc_lo + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; + +static const char* ReadMemoryIsa_gfx8 = R"( + .text + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_addc_u32 v5, vcc, v5, 0, vcc + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_addc_u32 v3, vcc, v3, 0, vcc + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_u32 v9, vcc, v9, v10 + v_add_u32 v12, vcc, v12, v10 + v_addc_u32 v13, vcc, v13, 0, vcc + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; + +std::string KFDEvictTest::CreateShader() { + if (m_FamilyId < FAMILY_AI) + return ReadMemoryIsa_gfx8; + else + return ReadMemoryIsa_gfx9; +} + + void KFDEvictTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDEvictTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -286,136 +395,6 @@ void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handl EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle)); } -/* Shader to read local buffers using multiple wavefronts in parallel - * until address buffer is filled with specific value 0x5678 by host program, - * then each wavefront fills value 0x5678 at corresponding result buffer and quit - * - * Initial state: - * s[0:1] - address buffer base address - * s[2:3] - result buffer base address - * s4 - workgroup id - * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 - * Registers: - * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X - * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 - * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 - * v[6:7] - local buf address used for read test - * - * This shader can be used by gfx9 and gfx10 - * - */ - -static const char* gfx9_ReadMemory = -"\ - shader ReadMemory\n\ - wave_size(32)\n\ - type(CS)\n\ - \n\ - // compute address of corresponding output buffer\n\ - v_mov_b32 v0, s4 // use workgroup id as index\n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ - v_mov_b32 v5, s3\n\ - v_add_co_u32 v5, vcc, v5, vcc_lo\n\ - \n\ - // compute input buffer offset used to store corresponding local buffer address\n\ - v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ - v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ - v_mov_b32 v3, s1\n\ - v_add_co_u32 v3, vcc, v3, vcc_lo\n\ - \n\ - // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ - flat_load_dwordx2 v[6:7], v[2:3] slc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - \n\ - v_mov_b32 v8, 0x5678\n\ - s_movk_i32 s8, 0x5678\n\ -L_REPEAT:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - s_cmp_eq_i32 s16, s8\n\ - s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ - // loop read 64M local buffer starting at v[6:7]\n\ - // every 4k page only read once\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0x1000 // 4k page\n\ - v_mov_b32 v11, 0x4000000 // 64M size\n\ - v_mov_b32 v12, v6\n\ - v_mov_b32 v13, v7\n\ -L_LOOP_READ:\n\ - flat_load_dwordx2 v[14:15], v[12:13] slc\n\ - v_add_co_u32 v9, vcc, v9, v10 \n\ - v_add_co_u32 v12, vcc, v12, v10\n\ - v_add_co_u32 v13, vcc, v13, vcc_lo\n\ - v_cmp_lt_u32 vcc, v9, v11\n\ - s_cbranch_vccnz L_LOOP_READ\n\ - s_branch L_REPEAT\n\ -L_QUIT:\n\ - flat_store_dword v[4:5], v8\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ - s_endpgm\n\ - end\n\ -"; - -static const char* gfx8_ReadMemory = -"\ - shader ReadMemory\n\ - asic(VI)\n\ - type(CS)\n\ - \n\ - // compute address of corresponding output buffer\n\ - v_mov_b32 v0, s4 // use workgroup id as index\n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ - v_mov_b32 v5, s3\n\ - v_addc_u32 v5, vcc, v5, 0, vcc\n\ - \n\ - // compute input buffer offset used to store corresponding local buffer address\n\ - v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ - v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ - v_mov_b32 v3, s1\n\ - v_addc_u32 v3, vcc, v3, 0, vcc\n\ - \n\ - // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ - flat_load_dwordx2 v[6:7], v[2:3] slc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - \n\ - v_mov_b32 v8, 0x5678\n\ - s_movk_i32 s8, 0x5678\n\ -L_REPEAT:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - s_cmp_eq_i32 s16, s8\n\ - s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ - // loop read 64M local buffer starting at v[6:7]\n\ - // every 4k page only read once\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0x1000 // 4k page\n\ - v_mov_b32 v11, 0x4000000 // 64M size\n\ - v_mov_b32 v12, v6\n\ - v_mov_b32 v13, v7\n\ -L_LOOP_READ:\n\ - flat_load_dwordx2 v[14:15], v[12:13] slc\n\ - v_add_u32 v9, vcc, v9, v10 \n\ - v_add_u32 v12, vcc, v12, v10\n\ - v_addc_u32 v13, vcc, v13, 0, vcc\n\ - v_cmp_lt_u32 vcc, v9, v11\n\ - s_cbranch_vccnz L_LOOP_READ\n\ - s_branch L_REPEAT\n\ -L_QUIT:\n\ - flat_store_dword v[4:5], v8\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ - s_endpgm\n\ - end\n\ -"; - -std::string KFDEvictTest::CreateShader() { - if (m_FamilyId < FAMILY_AI) - return gfx8_ReadMemory; - else - return gfx9_ReadMemory; -} - /* Evict and restore procedure basic test * * Use N_PROCESSES processes to allocate vram buf size larger than total vram size @@ -567,7 +546,7 @@ TEST_F(KFDEvictTest, QueueTest) { HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode); - m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As())); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp index 2b838a5388..d70aada6b4 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp @@ -27,15 +27,13 @@ #include #include #include "KFDMultiProcessTest.hpp" -#include "IsaGenerator.hpp" #include "PM4Queue.hpp" // @class KFDEvictTest // Test eviction and restore procedure using two processes class KFDEvictTest : public KFDMultiProcessTest { public: - KFDEvictTest(void): m_pIsaGen(NULL) {} - + KFDEvictTest(void) {} ~KFDEvictTest(void) {} protected: @@ -52,7 +50,6 @@ class KFDEvictTest : public KFDMultiProcessTest { PM4Queue *computeQueue); protected: // Members - IsaGenerator* m_pIsaGen; HsaMemFlags m_Flags; void* m_pBuf; };