kfdtest: Support blit kernel copy (#677)

Add support for blit kernel copy.
Add GpuMemCopyTest test for KFDQMTest.
This commit is contained in:
Alysa Liu
2026-01-07 16:48:11 -05:00
committed by GitHub
parent 7178747ebc
commit 5be4fddf06
6 files changed, with 186 additions and 6 deletions
@@ -37,6 +37,9 @@
#include "Dispatch.hpp"
const unsigned int FILL_VALUE = 0x01010101;
const unsigned int INIT_VALUE = 0x5A5A5A5A;
extern unsigned int g_TestGPUsNum;
void KFDQMTest::SetUp() {
@@ -1711,7 +1714,7 @@ TEST_F(KFDQMTest, QueuePriorityOnSamePipe) {
TEST_END
}
void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node) {
void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* arg0, void* arg1, int node) {
PM4Queue queue;
if (node == -1)
@@ -1720,7 +1723,7 @@ void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, vo
ASSERT_GE_GPU(node, 0, node) << "failed to get GPU Node";
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(pSrcBuf, pDstBuf);
dispatch.SetArgs(arg0, arg1);
dispatch.SetDim(1, 1, 1);
ASSERT_SUCCESS_GPU(queue.Create(node), node);
@@ -1765,13 +1768,13 @@ void KFDQMTest::SimpleWriteDispatch(int gpuNode) {
HsaMemoryBuffer srcBuffer(PAGE_SIZE, gpuNode, false);
HsaMemoryBuffer destBuffer(PAGE_SIZE, gpuNode);
srcBuffer.Fill(0x01010101);
srcBuffer.Fill(FILL_VALUE);
ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()),gpuNode);
SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>(), gpuNode);
EXPECT_EQ(destBuffer.As<unsigned int*>()[0], 0x01010101);
EXPECT_EQ(destBuffer.As<unsigned int*>()[0], FILL_VALUE);
}
@@ -1785,6 +1788,63 @@ TEST_F(KFDQMTest, SimpleWriteDispatch) {
TEST_END
}
void KFDQMTest::GpuMemCopyTest(int gpuNode) {
HSAuint32 m_FamilyId = GetFamilyIdFromNodeId(gpuNode);
if (m_FamilyId < FAMILY_AR) {
LOG() << "Skipping test: MultipleWordsDispatch test not yet available for this family id." << std::endl;
return;
}
const size_t bufSize = PAGE_SIZE;
HsaMemoryBuffer srcBuffer(bufSize, gpuNode, false);
HsaMemoryBuffer dstBuffer(bufSize, gpuNode, false);
HsaMemoryBuffer dstLocalBuffer(bufSize, gpuNode, false, true);
HsaMemoryBuffer verifyBuffer(bufSize, gpuNode, false);
srcBuffer.Fill(FILL_VALUE, 0, bufSize);
// SDMA copy
dstBuffer.Fill(INIT_VALUE, 0, bufSize);
ASSERT_TRUE(GPUMemCopy(dstBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, true));
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
EXPECT_EQ(dstBuffer.As<unsigned int*>()[i], FILL_VALUE);
// Blit kernel copy
dstBuffer.Fill(INIT_VALUE, 0, bufSize);
ASSERT_TRUE(GPUMemCopy(dstBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, false));
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
EXPECT_EQ(dstBuffer.As<unsigned int*>()[i], FILL_VALUE);
// SDMA copy to local memory
verifyBuffer.Fill(INIT_VALUE, 0, bufSize);
ASSERT_TRUE(GPUMemCopy(dstLocalBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, true));
ASSERT_TRUE(GPUMemCopy(verifyBuffer.As<void*>(), dstLocalBuffer.As<void*>(), bufSize, gpuNode, true));
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
EXPECT_EQ(verifyBuffer.As<unsigned int*>()[i], FILL_VALUE);
// Blit kernel copy to local memory
verifyBuffer.Fill(INIT_VALUE, 0, bufSize);
ASSERT_TRUE(GPUMemCopy(dstLocalBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, false));
ASSERT_TRUE(GPUMemCopy(verifyBuffer.As<void*>(), dstLocalBuffer.As<void*>(), bufSize, gpuNode, false));
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
EXPECT_EQ(verifyBuffer.As<unsigned int*>()[i], FILL_VALUE);
}
// GoogleTest entry point: hands the per-node GpuMemCopyTest body to
// KFDTestLaunch, which supplies the GPU node id to run on.
TEST_F(KFDQMTest, GpuMemCopyTest) {
    TEST_START(TESTPROFILE_RUNALL);

    auto runOnNode = [this](int node) { GpuMemCopyTest(node); };
    ASSERT_SUCCESS(KFDTestLaunch(runOnNode));

    TEST_END;
}
void KFDQMTest::MultipleCpQueuesStressDispatch(int gpuNode) {
Assembler* m_pAsm;
@@ -102,12 +102,12 @@ class KFDQMTest : public KFDBaseComponentTest {
void PM4EventInterrupt(int gpuNode);
void SdmaEventInterrupt(int gpuNode);
void GPUDoorbellWrite(int gpuNode);
void GpuMemCopyTest(int gpuNode);
protected:
virtual void SetUp();
virtual void TearDown();
void SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node = -1);
void SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* arg0, void* arg1, int node = -1);
HSAint64 TimeConsumedwithCUMask(int node, uint32_t *mask, uint32_t mask_count);
HSAint64 GetAverageTimeConsumedwithCUMask(int node, uint32_t *mask, uint32_t mask_count, int iterations);
void testQueuePriority(int gpuNode, bool isSamePipe);
@@ -28,6 +28,8 @@
#include <algorithm>
#include <vector>
#include "BaseQueue.hpp"
#include "PM4Queue.hpp"
#include "SDMAQueue.hpp"
#include "Dispatch.hpp"
#include "SDMAPacket.hpp"
@@ -274,6 +276,48 @@ HSAuint64 GetSystemTickCountInMicroSec() {
return t.tv_sec * 1000000ULL + t.tv_usec;
}
bool GPUMemCopy(void* dst, void* src, size_t size, unsigned int node, bool useSdma) {
if (useSdma) {
SDMAQueue sdmaQueue;
if (sdmaQueue.Create(node) != HSAKMT_STATUS_SUCCESS)
return false;
sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), dst, src, size));
sdmaQueue.Wait4PacketConsumption();
sdmaQueue.Destroy();
return true;
} else {
PM4Queue pm4Queue;
if (pm4Queue.Create(node) != HSAKMT_STATUS_SUCCESS)
return false;
HsaNodeProperties nodeProperties;
if (hsaKmtGetNodeProperties(node, &nodeProperties) != HSAKMT_STATUS_SUCCESS)
return false;
Assembler pAsm(GetGfxVersion(&nodeProperties));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true, false, true);
if (pAsm.RunAssembleBuf(CopyWordsIsa, isaBuffer.As<char*>()) != HSAKMT_STATUS_SUCCESS)
return false;
HsaMemoryBuffer addrBuffer(PAGE_SIZE, node);
void **localBufAddr = addrBuffer.As<void **>();
localBufAddr[0] = src;
localBufAddr[1] = dst;
HsaMemoryBuffer sizeBuffer(PAGE_SIZE, node);
unsigned int *pSize = sizeBuffer.As<unsigned int *>();
*pSize = static_cast<unsigned int>(size / sizeof(unsigned int));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(localBufAddr, pSize);
dispatch.SetDim(1, 1, 1);
dispatch.Submit(pm4Queue);
dispatch.Sync();
pm4Queue.Destroy();
return true;
}
}
const HsaMemoryBuffer HsaMemoryBuffer::Null;
HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, bool isLocal, bool isExec,
@@ -29,6 +29,8 @@
#include "OSWrapper.hpp"
#include "GoogleTestExtension.hpp"
#include "hsakmt/hsakmt.h"
#include "Assemble.hpp"
#include "ShaderStore.hpp"
class BaseQueue;
#define ARRAY_SIZE(_x) (sizeof(_x)/sizeof(_x[0]))
@@ -65,6 +67,11 @@ void GetHwQueueInfo(const HsaNodeProperties *props,
HSAuint64 GetSystemTickCountInMicroSec();
// Copy @size bytes from @src to @dst.
//
// @node    GPU node whose queue performs the copy.
// @useSdma true  - submit a copy packet on an SDMA queue;
//          false - dispatch the CopyWordsIsa shader on a PM4 queue. That
//                  path transfers whole 32-bit words, so @size should be a
//                  multiple of 4 when it is used.
//
// Returns false if the copy could not be set up (for example, queue
// creation failed); true otherwise.
bool GPUMemCopy(
void* dst, void* src, size_t size, unsigned int node,
bool useSdma);
class HsaMemoryBuffer {
public:
static const HsaMemoryBuffer Null;
@@ -157,6 +157,74 @@ const char *CopyDwordIsa =
s_endpgm
)";
/*
 * CopyWordsIsa: single-lane dword-by-dword memory copy.
 *
 * Inputs (presumably the two Dispatch::SetArgs() arguments; confirm against
 * Dispatch):
 *   s[0:1] - address of a two-entry table; the 64-bit source address is read
 *            from offset 0 into v[0:1] and the 64-bit destination address
 *            from offset 8 into v[6:7].
 *   s[2:3] - address of a 32-bit dword count, loaded into v8.
 *
 * Loop: load one dword from v[0:1], store it to v[6:7], advance both
 * addresses by 4, increment the counter v9, and repeat while v9 < v8.
 *
 * Caveats visible in the code below:
 *   - The loop body runs before the count is compared, so one dword is
 *     copied even when the count is 0.
 *   - The "+8" and "+4" address updates are 32-bit adds on the low half only
 *     (v2/v0/v6); no carry is propagated into the high half.
 *
 * NOTE(review): v_cmp_lt_u32 is written without an explicit destination and
 * the branch tests VCC - confirm the assembler accepts this form on every
 * targeted generation.
 */
const char *CopyWordsIsa =
SHADER_START
SHADER_MACROS_FLAT
R"(
v_mov_b32 v2, s0
v_mov_b32 v3, s1
.if (.amdgcn.gfx_generation_number >= 12)
FLAT_LOAD_DWORDX2_NSS v[0:1], v[2:3] scope:SCOPE_DEV
.else
FLAT_LOAD_DWORDX2_NSS v[0:1], v[2:3] slc
.endif
s_waitcnt vmcnt(0) & lgkmcnt(0)
.if (.amdgcn.gfx_generation_number >= 10)
v_add_nc_u32 v4, 8, v2
.else
v_add_u32 v4, 8, v2
.endif
v_mov_b32 v5, v3
.if (.amdgcn.gfx_generation_number >= 12)
FLAT_LOAD_DWORDX2_NSS v[6:7], v[4:5] scope:SCOPE_DEV
.else
FLAT_LOAD_DWORDX2_NSS v[6:7], v[4:5] slc
.endif
s_waitcnt vmcnt(0) & lgkmcnt(0)
v_mov_b32 v8, s2
v_mov_b32 v9, s3
.if (.amdgcn.gfx_generation_number >= 12)
FLAT_LOAD_DWORD_NSS v10, v[8:9] scope:SCOPE_DEV
.else
FLAT_LOAD_DWORD_NSS v10, v[8:9] slc
.endif
s_waitcnt vmcnt(0) & lgkmcnt(0)
v_mov_b32 v8, v10
v_mov_b32 v9, 0
LOOP:
.if (.amdgcn.gfx_generation_number >= 12)
FLAT_LOAD_DWORD_NSS v10, v[0:1] scope:SCOPE_SYS
s_wait_loadcnt 0
FLAT_STORE_DWORD_NSS v[6:7], v10 scope:SCOPE_SYS
.else
FLAT_LOAD_DWORD_NSS v10, v[0:1] glc slc
s_waitcnt vmcnt(0) & lgkmcnt(0)
FLAT_STORE_DWORD_NSS v[6:7], v10 glc slc
.endif
.if (.amdgcn.gfx_generation_number >= 10)
v_add_nc_u32 v0, 4, v0
v_add_nc_u32 v6, 4, v6
v_add_nc_u32 v9, 1, v9
.else
v_add_u32 v0, 4, v0
v_add_u32 v6, 4, v6
v_add_u32 v9, 1, v9
.endif
v_cmp_lt_u32 v9, v8
s_cbranch_vccnz LOOP
s_endpgm
)";
const char *InfiniteLoopIsa =
SHADER_START
R"(
@@ -32,6 +32,7 @@ extern const std::vector<const char*> ShaderList;
/* Common */
extern const char *NoopIsa;
extern const char *CopyDwordIsa;
extern const char *CopyWordsIsa;
extern const char *InfiniteLoopIsa;
extern const char *AtomicIncIsa;