From 5be4fddf065c8820c442bd9128b2f9785585499f Mon Sep 17 00:00:00 2001 From: Alysa Liu Date: Wed, 7 Jan 2026 16:48:11 -0500 Subject: [PATCH] kfdtest: Support blit kernel copy (#677) Add support for blit kernel copy. Add GpuMemCopyTest test for KFDQMTest. --- .../libhsakmt/tests/kfdtest/src/KFDQMTest.cpp | 68 +++++++++++++++++-- .../libhsakmt/tests/kfdtest/src/KFDQMTest.hpp | 4 +- .../tests/kfdtest/src/KFDTestUtil.cpp | 44 ++++++++++++ .../tests/kfdtest/src/KFDTestUtil.hpp | 7 ++ .../tests/kfdtest/src/ShaderStore.cpp | 68 +++++++++++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 1 + 6 files changed, 186 insertions(+), 6 deletions(-) diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.cpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.cpp index 3d90aeaf43..e1c56e40a8 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.cpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.cpp @@ -37,6 +37,9 @@ #include "Dispatch.hpp" +const unsigned int FILL_VALUE = 0x01010101; +const unsigned int INIT_VALUE = 0x5A5A5A5A; + extern unsigned int g_TestGPUsNum; void KFDQMTest::SetUp() { @@ -1711,7 +1714,7 @@ TEST_F(KFDQMTest, QueuePriorityOnSamePipe) { TEST_END } -void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node) { +void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* arg0, void* arg1, int node) { PM4Queue queue; if (node == -1) @@ -1720,7 +1723,7 @@ void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, vo ASSERT_GE_GPU(node, 0, node) << "failed to get GPU Node"; Dispatch dispatch(isaBuffer); - dispatch.SetArgs(pSrcBuf, pDstBuf); + dispatch.SetArgs(arg0, arg1); dispatch.SetDim(1, 1, 1); ASSERT_SUCCESS_GPU(queue.Create(node), node); @@ -1765,13 +1768,13 @@ void KFDQMTest::SimpleWriteDispatch(int gpuNode) { HsaMemoryBuffer srcBuffer(PAGE_SIZE, gpuNode, false); HsaMemoryBuffer destBuffer(PAGE_SIZE, gpuNode); - 
srcBuffer.Fill(0x01010101); + srcBuffer.Fill(FILL_VALUE); ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As()),gpuNode); SyncDispatch(isaBuffer, srcBuffer.As(), destBuffer.As(), gpuNode); - EXPECT_EQ(destBuffer.As()[0], 0x01010101); + EXPECT_EQ(destBuffer.As()[0], FILL_VALUE); } @@ -1785,6 +1788,63 @@ TEST_F(KFDQMTest, SimpleWriteDispatch) { TEST_END } +void KFDQMTest::GpuMemCopyTest(int gpuNode) { + + HSAuint32 m_FamilyId = GetFamilyIdFromNodeId(gpuNode); + if (m_FamilyId < FAMILY_AR) { + LOG() << "Skipping test: GpuMemCopyTest not yet available for this family id." << std::endl; + return; + } + + const size_t bufSize = PAGE_SIZE; + HsaMemoryBuffer srcBuffer(bufSize, gpuNode, false); + HsaMemoryBuffer dstBuffer(bufSize, gpuNode, false); + HsaMemoryBuffer dstLocalBuffer(bufSize, gpuNode, false, true); + HsaMemoryBuffer verifyBuffer(bufSize, gpuNode, false); + + srcBuffer.Fill(FILL_VALUE, 0, bufSize); + + // SDMA copy + dstBuffer.Fill(INIT_VALUE, 0, bufSize); + ASSERT_TRUE(GPUMemCopy(dstBuffer.As(), srcBuffer.As(), bufSize, gpuNode, true)); + + for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i) + EXPECT_EQ(dstBuffer.As()[i], FILL_VALUE); + + // Blit kernel copy + dstBuffer.Fill(INIT_VALUE, 0, bufSize); + ASSERT_TRUE(GPUMemCopy(dstBuffer.As(), srcBuffer.As(), bufSize, gpuNode, false)); + + for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i) + EXPECT_EQ(dstBuffer.As()[i], FILL_VALUE); + + // SDMA copy to local memory + verifyBuffer.Fill(INIT_VALUE, 0, bufSize); + ASSERT_TRUE(GPUMemCopy(dstLocalBuffer.As(), srcBuffer.As(), bufSize, gpuNode, true)); + ASSERT_TRUE(GPUMemCopy(verifyBuffer.As(), dstLocalBuffer.As(), bufSize, gpuNode, true)); + + for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i) + EXPECT_EQ(verifyBuffer.As()[i], FILL_VALUE); + + // Blit kernel copy to local memory + verifyBuffer.Fill(INIT_VALUE, 0, bufSize); + ASSERT_TRUE(GPUMemCopy(dstLocalBuffer.As(), srcBuffer.As(), bufSize, gpuNode, false)); + 
ASSERT_TRUE(GPUMemCopy(verifyBuffer.As(), dstLocalBuffer.As(), bufSize, gpuNode, false)); + + for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i) + EXPECT_EQ(verifyBuffer.As()[i], FILL_VALUE); +} + +TEST_F(KFDQMTest, GpuMemCopyTest) { + TEST_START(TESTPROFILE_RUNALL); + + ASSERT_SUCCESS(KFDTestLaunch([this](int gpuNode) { + this->GpuMemCopyTest(gpuNode); + })); + + TEST_END; +} + void KFDQMTest::MultipleCpQueuesStressDispatch(int gpuNode) { Assembler* m_pAsm; diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.hpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.hpp index cc8a726de0..f1da2e15f7 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.hpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDQMTest.hpp @@ -102,12 +102,12 @@ class KFDQMTest : public KFDBaseComponentTest { void PM4EventInterrupt(int gpuNode); void SdmaEventInterrupt(int gpuNode); void GPUDoorbellWrite(int gpuNode); + void GpuMemCopyTest(int gpuNode); protected: virtual void SetUp(); virtual void TearDown(); - - void SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node = -1); + void SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* arg0, void* arg1, int node = -1); HSAint64 TimeConsumedwithCUMask(int node, uint32_t *mask, uint32_t mask_count); HSAint64 GetAverageTimeConsumedwithCUMask(int node, uint32_t *mask, uint32_t mask_count, int iterations); void testQueuePriority(int gpuNode, bool isSamePipe); diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.cpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.cpp index bebac15860..e04a266ee4 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.cpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.cpp @@ -28,6 +28,8 @@ #include #include #include "BaseQueue.hpp" +#include "PM4Queue.hpp" +#include "SDMAQueue.hpp" #include "Dispatch.hpp" #include "SDMAPacket.hpp" @@ 
-274,6 +276,48 @@ HSAuint64 GetSystemTickCountInMicroSec() { return t.tv_sec * 1000000ULL + t.tv_usec; } +bool GPUMemCopy(void* dst, void* src, size_t size, unsigned int node, bool useSdma) { + if (useSdma) { + SDMAQueue sdmaQueue; + if (sdmaQueue.Create(node) != HSAKMT_STATUS_SUCCESS) + return false; + sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), dst, src, size)); + sdmaQueue.Wait4PacketConsumption(); + sdmaQueue.Destroy(); + return true; + } else { + PM4Queue pm4Queue; + if (pm4Queue.Create(node) != HSAKMT_STATUS_SUCCESS) + return false; + + HsaNodeProperties nodeProperties; + if (hsaKmtGetNodeProperties(node, &nodeProperties) != HSAKMT_STATUS_SUCCESS) + return false; + Assembler pAsm(GetGfxVersion(&nodeProperties)); + + HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true, false, true); + if (pAsm.RunAssembleBuf(CopyWordsIsa, isaBuffer.As()) != HSAKMT_STATUS_SUCCESS) + return false; + + HsaMemoryBuffer addrBuffer(PAGE_SIZE, node); + void **localBufAddr = addrBuffer.As(); + localBufAddr[0] = src; + localBufAddr[1] = dst; + + HsaMemoryBuffer sizeBuffer(PAGE_SIZE, node); + unsigned int *pSize = sizeBuffer.As(); + *pSize = static_cast(size / sizeof(unsigned int)); + + Dispatch dispatch(isaBuffer); + dispatch.SetArgs(localBufAddr, pSize); + dispatch.SetDim(1, 1, 1); + dispatch.Submit(pm4Queue); + dispatch.Sync(); + pm4Queue.Destroy(); + return true; + } +} + const HsaMemoryBuffer HsaMemoryBuffer::Null; HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, bool isLocal, bool isExec, diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.hpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.hpp index 3e8cc8c9af..0d489b3a1e 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.hpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/KFDTestUtil.hpp @@ -29,6 +29,8 @@ #include "OSWrapper.hpp" #include "GoogleTestExtension.hpp" #include "hsakmt/hsakmt.h" +#include 
"Assemble.hpp" +#include "ShaderStore.hpp" class BaseQueue; #define ARRAY_SIZE(_x) (sizeof(_x)/sizeof(_x[0])) @@ -65,6 +67,11 @@ void GetHwQueueInfo(const HsaNodeProperties *props, HSAuint64 GetSystemTickCountInMicroSec(); +// Copy @size bytes from @src to @dst on @node: SDMA queue if @useSdma, else a PM4 blit kernel that copies whole dwords. +bool GPUMemCopy( + void* dst, void* src, size_t size, unsigned int node, + bool useSdma); + class HsaMemoryBuffer { public: static const HsaMemoryBuffer Null; diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.cpp index ce2a3c8a83..79f2332d6a 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.cpp @@ -157,6 +157,74 @@ const char *CopyDwordIsa = s_endpgm )"; +const char *CopyWordsIsa = + SHADER_START + SHADER_MACROS_FLAT + R"( + v_mov_b32 v2, s0 + v_mov_b32 v3, s1 + + .if (.amdgcn.gfx_generation_number >= 12) + FLAT_LOAD_DWORDX2_NSS v[0:1], v[2:3] scope:SCOPE_DEV + .else + FLAT_LOAD_DWORDX2_NSS v[0:1], v[2:3] slc + .endif + s_waitcnt vmcnt(0) & lgkmcnt(0) + + .if (.amdgcn.gfx_generation_number >= 10) + v_add_nc_u32 v4, 8, v2 + .else + v_add_u32 v4, 8, v2 + .endif + + v_mov_b32 v5, v3 + .if (.amdgcn.gfx_generation_number >= 12) + FLAT_LOAD_DWORDX2_NSS v[6:7], v[4:5] scope:SCOPE_DEV + .else + FLAT_LOAD_DWORDX2_NSS v[6:7], v[4:5] slc + .endif + s_waitcnt vmcnt(0) & lgkmcnt(0) + + v_mov_b32 v8, s2 + v_mov_b32 v9, s3 + .if (.amdgcn.gfx_generation_number >= 12) + FLAT_LOAD_DWORD_NSS v10, v[8:9] scope:SCOPE_DEV + .else + FLAT_LOAD_DWORD_NSS v10, v[8:9] slc + .endif + s_waitcnt vmcnt(0) & lgkmcnt(0) + v_mov_b32 v8, v10 + + v_mov_b32 v9, 0 + + LOOP: + .if (.amdgcn.gfx_generation_number >= 12) + FLAT_LOAD_DWORD_NSS v10, v[0:1] scope:SCOPE_SYS + s_wait_loadcnt 0 + FLAT_STORE_DWORD_NSS v[6:7], v10 scope:SCOPE_SYS + .else + FLAT_LOAD_DWORD_NSS v10, v[0:1] glc slc + 
s_waitcnt vmcnt(0) & lgkmcnt(0) + FLAT_STORE_DWORD_NSS v[6:7], v10 glc 
slc + .endif + + .if (.amdgcn.gfx_generation_number >= 10) + v_add_nc_u32 v0, 4, v0 + v_add_nc_u32 v6, 4, v6 + v_add_nc_u32 v9, 1, v9 + .else + v_add_u32 v0, 4, v0 + v_add_u32 v6, 4, v6 + + v_add_u32 v9, 1, v9 + .endif + + v_cmp_lt_u32 v9, v8 + s_cbranch_vccnz LOOP + + s_endpgm +)"; + const char *InfiniteLoopIsa = SHADER_START R"( diff --git a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.hpp index 2e470ab216..b13da9613c 100644 --- a/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/libhsakmt/tests/kfdtest/src/ShaderStore.hpp @@ -32,6 +32,7 @@ extern const std::vector ShaderList; /* Common */ extern const char *NoopIsa; extern const char *CopyDwordIsa; +extern const char *CopyWordsIsa; extern const char *InfiniteLoopIsa; extern const char *AtomicIncIsa;