kfdtest: Update KFDEvictTest to LLVM Asm

- Reformat shaders for legibility
- Move assembly processes to from IsaGen (CompileShader) to Assembler
(RunAssembleBuf)

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
Change-Id: I7333d0e45ccd3f43690a2a01227f89a6e04fcecb
Bu işleme şunda yer alıyor:
Graham Sider
2021-09-24 18:47:40 -04:00
işlemeyi yapan: Harish Kasiviswanathan
ebeveyn c845b976d0
işleme b44d6762bd
2 değiştirilmiş dosya ile 117 ekleme ve 141 silme
+116 -137
Dosyayı Görüntüle
@@ -36,23 +36,132 @@
#define SDMA_NOP 0x0
/* Shader to read local buffers using multiple wavefronts in parallel
* until address buffer is filled with specific value 0x5678 by host program,
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
*
* Initial state:
* s[0:1] - address buffer base address
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
* Registers:
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v[6:7] - local buf address used for read test
*
* This shader can be used by gfx9 and gfx10
*
*/
static const char* ReadMemoryIsa_gfx9 = R"(
.text
// Compute address of corresponding output buffer
v_mov_b32 v0, s4 // use workgroup id as index
v_lshlrev_b32 v0, 2, v0 // v0 *= 4
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4
v_mov_b32 v5, s3
v_add_co_u32 v5, vcc, v5, vcc_lo
// Compute input buffer offset used to store corresponding local buffer address
v_lshlrev_b32 v0, 1, v0 // v0 *= 8
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8
v_mov_b32 v3, s1
v_add_co_u32 v3, vcc, v3, vcc_lo
// load 64bit local buffer address stored at v[2:3] to v[6:7]
flat_load_dwordx2 v[6:7], v[2:3] slc
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
v_mov_b32 v8, 0x5678
s_movk_i32 s8, 0x5678
L_REPEAT:
s_load_dword s16, s[0:1], 0x0 glc
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
s_cmp_eq_i32 s16, s8
s_cbranch_scc1 L_QUIT // if notified to quit by host
// Loop read 64M local buffer starting at v[6:7]
// every 4k page only read once
v_mov_b32 v9, 0
v_mov_b32 v10, 0x1000 // 4k page
v_mov_b32 v11, 0x4000000 // 64M size
v_mov_b32 v12, v6
v_mov_b32 v13, v7
L_LOOP_READ:
flat_load_dwordx2 v[14:15], v[12:13] slc
v_add_co_u32 v9, vcc, v9, v10
v_add_co_u32 v12, vcc, v12, v10
v_add_co_u32 v13, vcc, v13, vcc_lo
v_cmp_lt_u32 vcc, v9, v11
s_cbranch_vccnz L_LOOP_READ
s_branch L_REPEAT
L_QUIT:
flat_store_dword v[4:5], v8
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish
s_endpgm
)";
static const char* ReadMemoryIsa_gfx8 = R"(
.text
// Compute address of corresponding output buffer
v_mov_b32 v0, s4 // use workgroup id as index
v_lshlrev_b32 v0, 2, v0 // v0 *= 4
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4
v_mov_b32 v5, s3
v_addc_u32 v5, vcc, v5, 0, vcc
// Compute input buffer offset used to store corresponding local buffer address
v_lshlrev_b32 v0, 1, v0 // v0 *= 8
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8
v_mov_b32 v3, s1
v_addc_u32 v3, vcc, v3, 0, vcc
// Load 64bit local buffer address stored at v[2:3] to v[6:7]
flat_load_dwordx2 v[6:7], v[2:3] slc
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
v_mov_b32 v8, 0x5678
s_movk_i32 s8, 0x5678
L_REPEAT:
s_load_dword s16, s[0:1], 0x0 glc
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
s_cmp_eq_i32 s16, s8
s_cbranch_scc1 L_QUIT // if notified to quit by host
// Loop read 64M local buffer starting at v[6:7]
// every 4k page only read once
v_mov_b32 v9, 0
v_mov_b32 v10, 0x1000 // 4k page
v_mov_b32 v11, 0x4000000 // 64M size
v_mov_b32 v12, v6
v_mov_b32 v13, v7
L_LOOP_READ:
flat_load_dwordx2 v[14:15], v[12:13] slc
v_add_u32 v9, vcc, v9, v10
v_add_u32 v12, vcc, v12, v10
v_addc_u32 v13, vcc, v13, 0, vcc
v_cmp_lt_u32 vcc, v9, v11
s_cbranch_vccnz L_LOOP_READ
s_branch L_REPEAT
L_QUIT:
flat_store_dword v[4:5], v8
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish
s_endpgm
)";
std::string KFDEvictTest::CreateShader() {
if (m_FamilyId < FAMILY_AI)
return ReadMemoryIsa_gfx8;
else
return ReadMemoryIsa_gfx9;
}
void KFDEvictTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDEvictTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -286,136 +395,6 @@ void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handl
EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
}
/* Shader to read local buffers using multiple wavefronts in parallel
* until address buffer is filled with specific value 0x5678 by host program,
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
*
* Initial state:
* s[0:1] - address buffer base address
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
* Registers:
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v[6:7] - local buf address used for read test
*
* This shader can be used by gfx9 and gfx10
*
*/
static const char* gfx9_ReadMemory =
"\
shader ReadMemory\n\
wave_size(32)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_add_co_u32 v5, vcc, v5, vcc_lo\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_add_co_u32 v3, vcc, v3, vcc_lo\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_co_u32 v9, vcc, v9, v10 \n\
v_add_co_u32 v12, vcc, v12, v10\n\
v_add_co_u32 v13, vcc, v13, vcc_lo\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
static const char* gfx8_ReadMemory =
"\
shader ReadMemory\n\
asic(VI)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_addc_u32 v5, vcc, v5, 0, vcc\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_addc_u32 v3, vcc, v3, 0, vcc\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_u32 v9, vcc, v9, v10 \n\
v_add_u32 v12, vcc, v12, v10\n\
v_addc_u32 v13, vcc, v13, 0, vcc\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
std::string KFDEvictTest::CreateShader() {
if (m_FamilyId < FAMILY_AI)
return gfx8_ReadMemory;
else
return gfx9_ReadMemory;
}
/* Evict and restore procedure basic test
*
* Use N_PROCESSES processes to allocate vram buf size larger than total vram size
@@ -567,7 +546,7 @@ TEST_F(KFDEvictTest, QueueTest) {
HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As<char*>()));
PM4Queue pm4Queue;
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
+1 -4
Dosyayı Görüntüle
@@ -27,15 +27,13 @@
#include <string>
#include <vector>
#include "KFDMultiProcessTest.hpp"
#include "IsaGenerator.hpp"
#include "PM4Queue.hpp"
// @class KFDEvictTest
// Test eviction and restore procedure using two processes
class KFDEvictTest : public KFDMultiProcessTest {
public:
KFDEvictTest(void): m_pIsaGen(NULL) {}
KFDEvictTest(void) {}
~KFDEvictTest(void) {}
protected:
@@ -52,7 +50,6 @@ class KFDEvictTest : public KFDMultiProcessTest {
PM4Queue *computeQueue);
protected: // Members
IsaGenerator* m_pIsaGen;
HsaMemFlags m_Flags;
void* m_pBuf;
};