kfdtest: Support blit kernel copy (#677)
Add support for blit kernel copy and add a GpuMemCopyTest case to KFDQMTest.
This commit is contained in:
@@ -37,6 +37,9 @@
|
||||
|
||||
#include "Dispatch.hpp"
|
||||
|
||||
const unsigned int FILL_VALUE = 0x01010101;
|
||||
const unsigned int INIT_VALUE = 0x5A5A5A5A;
|
||||
|
||||
extern unsigned int g_TestGPUsNum;
|
||||
|
||||
void KFDQMTest::SetUp() {
|
||||
@@ -1711,7 +1714,7 @@ TEST_F(KFDQMTest, QueuePriorityOnSamePipe) {
|
||||
TEST_END
|
||||
}
|
||||
|
||||
void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node) {
|
||||
void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* arg0, void* arg1, int node) {
|
||||
PM4Queue queue;
|
||||
|
||||
if (node == -1)
|
||||
@@ -1720,7 +1723,7 @@ void KFDQMTest::SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, vo
|
||||
ASSERT_GE_GPU(node, 0, node) << "failed to get GPU Node";
|
||||
|
||||
Dispatch dispatch(isaBuffer);
|
||||
dispatch.SetArgs(pSrcBuf, pDstBuf);
|
||||
dispatch.SetArgs(arg0, arg1);
|
||||
dispatch.SetDim(1, 1, 1);
|
||||
|
||||
ASSERT_SUCCESS_GPU(queue.Create(node), node);
|
||||
@@ -1765,13 +1768,13 @@ void KFDQMTest::SimpleWriteDispatch(int gpuNode) {
|
||||
HsaMemoryBuffer srcBuffer(PAGE_SIZE, gpuNode, false);
|
||||
HsaMemoryBuffer destBuffer(PAGE_SIZE, gpuNode);
|
||||
|
||||
srcBuffer.Fill(0x01010101);
|
||||
srcBuffer.Fill(FILL_VALUE);
|
||||
|
||||
ASSERT_SUCCESS_GPU(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()),gpuNode);
|
||||
|
||||
SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>(), gpuNode);
|
||||
|
||||
EXPECT_EQ(destBuffer.As<unsigned int*>()[0], 0x01010101);
|
||||
EXPECT_EQ(destBuffer.As<unsigned int*>()[0], FILL_VALUE);
|
||||
|
||||
}
|
||||
|
||||
@@ -1785,6 +1788,63 @@ TEST_F(KFDQMTest, SimpleWriteDispatch) {
|
||||
TEST_END
|
||||
}
|
||||
|
||||
void KFDQMTest::GpuMemCopyTest(int gpuNode) {
|
||||
|
||||
HSAuint32 m_FamilyId = GetFamilyIdFromNodeId(gpuNode);
|
||||
if (m_FamilyId < FAMILY_AR) {
|
||||
LOG() << "Skipping test: MultipleWordsDispatch test not yet available for this family id." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t bufSize = PAGE_SIZE;
|
||||
HsaMemoryBuffer srcBuffer(bufSize, gpuNode, false);
|
||||
HsaMemoryBuffer dstBuffer(bufSize, gpuNode, false);
|
||||
HsaMemoryBuffer dstLocalBuffer(bufSize, gpuNode, false, true);
|
||||
HsaMemoryBuffer verifyBuffer(bufSize, gpuNode, false);
|
||||
|
||||
srcBuffer.Fill(FILL_VALUE, 0, bufSize);
|
||||
|
||||
// SDMA copy
|
||||
dstBuffer.Fill(INIT_VALUE, 0, bufSize);
|
||||
ASSERT_TRUE(GPUMemCopy(dstBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, true));
|
||||
|
||||
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
|
||||
EXPECT_EQ(dstBuffer.As<unsigned int*>()[i], FILL_VALUE);
|
||||
|
||||
// Blit kernel copy
|
||||
dstBuffer.Fill(INIT_VALUE, 0, bufSize);
|
||||
ASSERT_TRUE(GPUMemCopy(dstBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, false));
|
||||
|
||||
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
|
||||
EXPECT_EQ(dstBuffer.As<unsigned int*>()[i], FILL_VALUE);
|
||||
|
||||
// SDMA copy to local memory
|
||||
verifyBuffer.Fill(INIT_VALUE, 0, bufSize);
|
||||
ASSERT_TRUE(GPUMemCopy(dstLocalBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, true));
|
||||
ASSERT_TRUE(GPUMemCopy(verifyBuffer.As<void*>(), dstLocalBuffer.As<void*>(), bufSize, gpuNode, true));
|
||||
|
||||
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
|
||||
EXPECT_EQ(verifyBuffer.As<unsigned int*>()[i], FILL_VALUE);
|
||||
|
||||
// Blit kernel copy to local memory
|
||||
verifyBuffer.Fill(INIT_VALUE, 0, bufSize);
|
||||
ASSERT_TRUE(GPUMemCopy(dstLocalBuffer.As<void*>(), srcBuffer.As<void*>(), bufSize, gpuNode, false));
|
||||
ASSERT_TRUE(GPUMemCopy(verifyBuffer.As<void*>(), dstLocalBuffer.As<void*>(), bufSize, gpuNode, false));
|
||||
|
||||
for (size_t i = 0; i < bufSize / sizeof(unsigned int); ++i)
|
||||
EXPECT_EQ(verifyBuffer.As<unsigned int*>()[i], FILL_VALUE);
|
||||
}
|
||||
|
||||
// gtest entry point: hands KFDQMTest::GpuMemCopyTest to KFDTestLaunch as the
// per-node test body and requires the launch to report success.
TEST_F(KFDQMTest, GpuMemCopyTest) {
    TEST_START(TESTPROFILE_RUNALL);

    auto perNodeBody = [this](int gpuNode) {
        GpuMemCopyTest(gpuNode);
    };
    ASSERT_SUCCESS(KFDTestLaunch(perNodeBody));

    TEST_END;
}
|
||||
|
||||
void KFDQMTest::MultipleCpQueuesStressDispatch(int gpuNode) {
|
||||
|
||||
Assembler* m_pAsm;
|
||||
|
||||
@@ -102,12 +102,12 @@ class KFDQMTest : public KFDBaseComponentTest {
|
||||
void PM4EventInterrupt(int gpuNode);
|
||||
void SdmaEventInterrupt(int gpuNode);
|
||||
void GPUDoorbellWrite(int gpuNode);
|
||||
void GpuMemCopyTest(int gpuNode);
|
||||
|
||||
protected:
|
||||
virtual void SetUp();
|
||||
virtual void TearDown();
|
||||
|
||||
void SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* pSrcBuf, void* pDstBuf, int node = -1);
|
||||
void SyncDispatch(const HsaMemoryBuffer& isaBuffer, void* arg0, void* arg1, int node = -1);
|
||||
HSAint64 TimeConsumedwithCUMask(int node, uint32_t *mask, uint32_t mask_count);
|
||||
HSAint64 GetAverageTimeConsumedwithCUMask(int node, uint32_t *mask, uint32_t mask_count, int iterations);
|
||||
void testQueuePriority(int gpuNode, bool isSamePipe);
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "BaseQueue.hpp"
|
||||
#include "PM4Queue.hpp"
|
||||
#include "SDMAQueue.hpp"
|
||||
#include "Dispatch.hpp"
|
||||
#include "SDMAPacket.hpp"
|
||||
|
||||
@@ -274,6 +276,48 @@ HSAuint64 GetSystemTickCountInMicroSec() {
|
||||
return t.tv_sec * 1000000ULL + t.tv_usec;
|
||||
}
|
||||
|
||||
bool GPUMemCopy(void* dst, void* src, size_t size, unsigned int node, bool useSdma) {
|
||||
if (useSdma) {
|
||||
SDMAQueue sdmaQueue;
|
||||
if (sdmaQueue.Create(node) != HSAKMT_STATUS_SUCCESS)
|
||||
return false;
|
||||
sdmaQueue.PlaceAndSubmitPacket(SDMACopyDataPacket(sdmaQueue.GetFamilyId(), dst, src, size));
|
||||
sdmaQueue.Wait4PacketConsumption();
|
||||
sdmaQueue.Destroy();
|
||||
return true;
|
||||
} else {
|
||||
PM4Queue pm4Queue;
|
||||
if (pm4Queue.Create(node) != HSAKMT_STATUS_SUCCESS)
|
||||
return false;
|
||||
|
||||
HsaNodeProperties nodeProperties;
|
||||
if (hsaKmtGetNodeProperties(node, &nodeProperties) != HSAKMT_STATUS_SUCCESS)
|
||||
return false;
|
||||
Assembler pAsm(GetGfxVersion(&nodeProperties));
|
||||
|
||||
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true, false, true);
|
||||
if (pAsm.RunAssembleBuf(CopyWordsIsa, isaBuffer.As<char*>()) != HSAKMT_STATUS_SUCCESS)
|
||||
return false;
|
||||
|
||||
HsaMemoryBuffer addrBuffer(PAGE_SIZE, node);
|
||||
void **localBufAddr = addrBuffer.As<void **>();
|
||||
localBufAddr[0] = src;
|
||||
localBufAddr[1] = dst;
|
||||
|
||||
HsaMemoryBuffer sizeBuffer(PAGE_SIZE, node);
|
||||
unsigned int *pSize = sizeBuffer.As<unsigned int *>();
|
||||
*pSize = static_cast<unsigned int>(size / sizeof(unsigned int));
|
||||
|
||||
Dispatch dispatch(isaBuffer);
|
||||
dispatch.SetArgs(localBufAddr, pSize);
|
||||
dispatch.SetDim(1, 1, 1);
|
||||
dispatch.Submit(pm4Queue);
|
||||
dispatch.Sync();
|
||||
pm4Queue.Destroy();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
const HsaMemoryBuffer HsaMemoryBuffer::Null;
|
||||
|
||||
HsaMemoryBuffer::HsaMemoryBuffer(HSAuint64 size, unsigned int node, bool zero, bool isLocal, bool isExec,
|
||||
|
||||
@@ -29,6 +29,8 @@
|
||||
#include "OSWrapper.hpp"
|
||||
#include "GoogleTestExtension.hpp"
|
||||
#include "hsakmt/hsakmt.h"
|
||||
#include "Assemble.hpp"
|
||||
#include "ShaderStore.hpp"
|
||||
|
||||
class BaseQueue;
|
||||
#define ARRAY_SIZE(_x) (sizeof(_x)/sizeof(_x[0]))
|
||||
@@ -65,6 +67,11 @@ void GetHwQueueInfo(const HsaNodeProperties *props,
|
||||
|
||||
HSAuint64 GetSystemTickCountInMicroSec();
|
||||
|
||||
// Copy @size bytes from @src to @dst using GPU @node.
// @useSdma selects the engine: true submits an SDMA copy packet on a
// temporary SDMA queue; false dispatches the CopyWordsIsa shader on a
// temporary PM4 queue, passing size / sizeof(unsigned int) as its dword count.
// Returns false on setup failure (e.g. queue creation, node property query
// or shader assembly), true once the copy has been submitted and waited for.
|
||||
bool GPUMemCopy(
|
||||
void* dst, void* src, size_t size, unsigned int node,
|
||||
bool useSdma);
|
||||
|
||||
class HsaMemoryBuffer {
|
||||
public:
|
||||
static const HsaMemoryBuffer Null;
|
||||
|
||||
@@ -157,6 +157,74 @@ const char *CopyDwordIsa =
|
||||
s_endpgm
|
||||
)";
|
||||
|
||||
/**
 * CopyWordsIsa: single-lane dword copy loop used by the blit path of GPUMemCopy.
 *
 * Inputs (presumably the two Dispatch::SetArgs arguments - confirm against Dispatch):
 *   s[0:1]  address of a table of two 64-bit pointers: entry 0 (offset 0) is
 *           loaded into v[0:1] and used as the load address, entry 1
 *           (offset 8) is loaded into v[6:7] and used as the store address.
 *   s[2:3]  address of a dword holding the number of dwords to copy (ends up in v8).
 *
 * Loop: load one dword from v[0:1], store it to v[6:7], add 4 to v0 and v6,
 * add 1 to the counter v9, and branch back while v9 < v8.
 *
 * Things to keep in mind when calling it:
 *   - The counter is tested after the store, so one dword is copied even
 *     when the requested count is 0.
 *   - Only the low halves of the addresses (v0, v6, and v2 for the +8 table
 *     offset) are incremented; no carry is propagated into the high halves.
 *   - gfx12+ uses scope: modifiers and s_wait_loadcnt; older generations use
 *     glc/slc and s_waitcnt. gfx10+ uses v_add_nc_u32, older ones v_add_u32.
 */
const char *CopyWordsIsa =
|
||||
SHADER_START
|
||||
SHADER_MACROS_FLAT
|
||||
R"(
|
||||
v_mov_b32 v2, s0
|
||||
v_mov_b32 v3, s1
|
||||
|
||||
.if (.amdgcn.gfx_generation_number >= 12)
|
||||
FLAT_LOAD_DWORDX2_NSS v[0:1], v[2:3] scope:SCOPE_DEV
|
||||
.else
|
||||
FLAT_LOAD_DWORDX2_NSS v[0:1], v[2:3] slc
|
||||
.endif
|
||||
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
||||
|
||||
.if (.amdgcn.gfx_generation_number >= 10)
|
||||
v_add_nc_u32 v4, 8, v2
|
||||
.else
|
||||
v_add_u32 v4, 8, v2
|
||||
.endif
|
||||
|
||||
v_mov_b32 v5, v3
|
||||
.if (.amdgcn.gfx_generation_number >= 12)
|
||||
FLAT_LOAD_DWORDX2_NSS v[6:7], v[4:5] scope:SCOPE_DEV
|
||||
.else
|
||||
FLAT_LOAD_DWORDX2_NSS v[6:7], v[4:5] slc
|
||||
.endif
|
||||
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
||||
|
||||
v_mov_b32 v8, s2
|
||||
v_mov_b32 v9, s3
|
||||
.if (.amdgcn.gfx_generation_number >= 12)
|
||||
FLAT_LOAD_DWORD_NSS v10, v[8:9] scope:SCOPE_DEV
|
||||
.else
|
||||
FLAT_LOAD_DWORD_NSS v10, v[8:9] slc
|
||||
.endif
|
||||
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
||||
v_mov_b32 v8, v10
|
||||
|
||||
v_mov_b32 v9, 0
|
||||
|
||||
LOOP:
|
||||
.if (.amdgcn.gfx_generation_number >= 12)
|
||||
FLAT_LOAD_DWORD_NSS v10, v[0:1] scope:SCOPE_SYS
|
||||
s_wait_loadcnt 0
|
||||
FLAT_STORE_DWORD_NSS v[6:7], v10 scope:SCOPE_SYS
|
||||
.else
|
||||
FLAT_LOAD_DWORD_NSS v10, v[0:1] glc slc
|
||||
s_waitcnt vmcnt(0) & lgkmcnt(0)
|
||||
FLAT_STORE_DWORD_NSS v[6:7], v10 glc slc
|
||||
.endif
|
||||
|
||||
.if (.amdgcn.gfx_generation_number >= 10)
|
||||
v_add_nc_u32 v0, 4, v0
|
||||
v_add_nc_u32 v6, 4, v6
|
||||
v_add_nc_u32 v9, 1, v9
|
||||
.else
|
||||
v_add_u32 v0, 4, v0
|
||||
v_add_u32 v6, 4, v6
|
||||
|
||||
v_add_u32 v9, 1, v9
|
||||
.endif
|
||||
|
||||
v_cmp_lt_u32 v9, v8
|
||||
s_cbranch_vccnz LOOP
|
||||
|
||||
s_endpgm
|
||||
)";
|
||||
|
||||
const char *InfiniteLoopIsa =
|
||||
SHADER_START
|
||||
R"(
|
||||
|
||||
@@ -32,6 +32,7 @@ extern const std::vector<const char*> ShaderList;
|
||||
/* Common */
|
||||
extern const char *NoopIsa;
|
||||
extern const char *CopyDwordIsa;
|
||||
extern const char *CopyWordsIsa;
|
||||
extern const char *InfiniteLoopIsa;
|
||||
extern const char *AtomicIncIsa;
|
||||
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user