65ab296840
On GFX10, the wave size is determined by the CS_W32_EN bit of the COMPUTE_DISPATCH_INITIATOR value passed to DISPATCH_DIRECT; the default value of 0 was giving 64-lane waves. Change-Id: Ie8c407a24bd2825757ec481be62247b35047e5ca Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
237 wiersze
10 KiB
C++
237 wiersze
10 KiB
C++
/*
|
|
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
*/
|
|
|
|
#include "Dispatch.hpp"
|
|
|
|
#include "PM4Packet.hpp"
|
|
|
|
#include "asic_reg/gfx_7_2_d.h"
|
|
#include "asic_reg/gfx_7_2_sh_mask.h"
|
|
|
|
#include "KFDBaseComponentTest.hpp"
|
|
|
|
// Sets up a dispatch of the shader held in isaBuf.
//
// isaBuf         - buffer holding the shader ISA; its node is used for the
//                  indirect buffer, for the completion event and to look up
//                  the family id.
// eventAutoReset - its negation is passed as the second argument of
//                  hsaKmtCreateEvent.
//
// The return value of hsaKmtCreateEvent is not checked here; m_pEop keeps its
// NULL initial value unless the call stores an event, and Submit() asserts on
// a NULL m_pEop.
Dispatch::Dispatch(const HsaMemoryBuffer& isaBuf, const bool eventAutoReset)
    : m_IsaBuf(isaBuf),
      m_IndirectBuf(PACKETTYPE_PM4, PAGE_SIZE / sizeof(unsigned int), isaBuf.Node()),
      m_DimX(1),
      m_DimY(1),
      m_DimZ(1),
      m_pArg1(NULL),
      m_pArg2(NULL),
      m_pEop(NULL),
      m_ScratchEn(false),
      m_ComputeTmpringSize(0),
      m_scratch_base(0ll),
      m_SpiPriority(0) {
    // Describe a signal event on the ISA buffer's node, with no user sync variable.
    HsaEventDescriptor signalDesc;
    signalDesc.EventType = HSA_EVENTTYPE_SIGNAL;
    signalDesc.NodeId = isaBuf.Node();
    signalDesc.SyncVar.SyncVar.UserData = NULL;
    signalDesc.SyncVar.SyncVarSize = 0;

    // NOTE(review): presumably the second argument is the manual-reset flag and
    // the third the initial signaled state - confirm against the thunk header.
    const bool manualReset = !eventAutoReset;
    hsaKmtCreateEvent(&signalDesc, manualReset, false, &m_pEop);

    m_FamilyId = g_baseTest->GetFamilyIdFromNodeId(isaBuf.Node());
}
|
|
|
|
// Destroys the completion event, if one was created by the constructor.
Dispatch::~Dispatch() {
    if (m_pEop == NULL) {
        return;
    }

    hsaKmtDestroyEvent(m_pEop);
}
|
|
|
|
// Stores the two argument pointers for the shader. BuildIb() splits each
// pointer into two 32-bit values and writes them to COMPUTE_USER_DATA_0..3.
void Dispatch::SetArgs(void* pArg1, void* pArg2) {
    m_pArg2 = pArg2;
    m_pArg1 = pArg1;
}
|
|
|
|
// Stores the x/y/z dimensions that BuildIb() hands to the direct-dispatch
// packet. All three default to 1 in the constructor.
void Dispatch::SetDim(unsigned int x, unsigned int y, unsigned int z) {
    m_DimZ = z;
    m_DimY = y;
    m_DimX = x;
}
|
|
|
|
// Enables scratch for the dispatch.
//
// numWaves     - placed in the low bits of the COMPUTE_TMPRING_SIZE value.
// waveSize     - placed at bit 12 and above of the COMPUTE_TMPRING_SIZE value.
// scratch_base - scratch base address consumed by BuildIb().
//
// Neither field is masked here, so callers must pass values that fit.
void Dispatch::SetScratch(int numWaves, int waveSize, HSAuint64 scratch_base) {
    const int waveSizeField = waveSize << 12;

    m_scratch_base = scratch_base;
    m_ScratchEn = true;
    m_ComputeTmpringSize = (waveSizeField | numWaves);
}
|
|
|
|
// Stores the priority that BuildIb() inserts into PGM_RSRC1. Only the two
// low bits of the value are used there.
void Dispatch::SetSpiPriority(unsigned int priority) {
    m_SpiPriority = priority;
}
|
|
|
|
// Builds the indirect buffer and submits the dispatch on the given queue.
//
// Packets are placed in this order: the indirect buffer, an optional
// write-data packet targeting the event's SyncVar user data, and a
// release-memory packet carrying the completion event. Unless the queue asks
// to skip it, the call then waits for the queue to consume the packets.
void Dispatch::Submit(BaseQueue& queue) {
    ASSERT_NE(m_pEop, (void*)0);
    EXPECT_EQ(m_FamilyId, queue.GetFamilyId());

    BuildIb();
    queue.PlaceAndSubmitPacket(PM4IndirectBufPacket(&m_IndirectBuf));

    // When the event carries a user SyncVar, write the event id into it for
    // synchronization purposes.
    void* const syncVarData = m_pEop->EventData.EventData.SyncVar.SyncVar.UserData;
    if (syncVarData != NULL) {
        queue.PlaceAndSubmitPacket(
            PM4WriteDataPacket(static_cast<unsigned int*>(syncVarData), m_pEop->EventId));
    }

    queue.PlaceAndSubmitPacket(
        PM4ReleaseMemoryPacket(m_FamilyId,
                               false,
                               m_pEop->EventData.HWData2,
                               m_pEop->EventId));

    if (queue.GetSkipWaitConsump()) {
        return;
    }

    queue.Wait4PacketConsumption();
}
|
|
|
|
// Waits on the completion event and raises a fatal test assertion if the wait
// does not succeed. The timeout is forwarded unchanged to hsaKmtWaitOnEvent.
void Dispatch::Sync(unsigned int timeout) {
    ASSERT_SUCCESS(hsaKmtWaitOnEvent(m_pEop, timeout));
}
|
|
|
|
// Returning with status in order to allow actions to be performed before process termination
|
|
int Dispatch::SyncWithStatus(unsigned int timeout) {
|
|
int stat;
|
|
|
|
return ((stat = hsaKmtWaitOnEvent(m_pEop, timeout)) != HSAKMT_STATUS_SUCCESS);
|
|
}
|
|
|
|
void Dispatch::BuildIb() {
|
|
HSAuint64 shiftedIsaAddr = m_IsaBuf.As<uint64_t>() >> 8;
|
|
unsigned int arg0, arg1, arg2, arg3;
|
|
SplitU64(reinterpret_cast<uint64_t>(m_pArg1), arg0, arg1);
|
|
SplitU64(reinterpret_cast<uint64_t>(m_pArg2), arg2, arg3);
|
|
|
|
// Starts at COMPUTE_START_X
|
|
const unsigned int COMPUTE_DISPATCH_DIMS_VALUES[] = {
|
|
0, // START_X
|
|
0, // START_Y
|
|
0, // START_Z
|
|
1, // NUM_THREADS_X - this is actually the number of threads in a thread group
|
|
1, // NUM_THREADS_Y
|
|
1, // NUM_THREADS_Z
|
|
0, // COMPUTE_PIPELINESTAT_ENABLE
|
|
0, // COMPUTE_PERFCOUNT_ENABLE
|
|
};
|
|
|
|
unsigned int pgmRsrc2 = 0;
|
|
pgmRsrc2 |= (m_ScratchEn << COMPUTE_PGM_RSRC2__SCRATCH_EN__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__SCRATCH_EN_MASK;
|
|
pgmRsrc2 |= ((m_scratch_base ? 6 : 4) << COMPUTE_PGM_RSRC2__USER_SGPR__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__USER_SGPR_MASK;
|
|
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__TRAP_PRESENT_MASK;
|
|
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TGID_X_EN__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__TGID_X_EN_MASK;
|
|
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TIDIG_COMP_CNT__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__TIDIG_COMP_CNT_MASK;
|
|
pgmRsrc2 |= (0 << COMPUTE_PGM_RSRC2__EXCP_EN__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__EXCP_EN_MASK;
|
|
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__EXCP_EN_MSB__SHIFT)
|
|
& COMPUTE_PGM_RSRC2__EXCP_EN_MSB_MASK;
|
|
|
|
const unsigned int COMPUTE_PGM_RSRC[] = {
|
|
// PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0 PRIV: 0
|
|
// DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 }
|
|
0x000c0084 | ((m_SpiPriority & 3) << 10),
|
|
pgmRsrc2
|
|
};
|
|
|
|
// Starts at COMPUTE_PGM_LO
|
|
const unsigned int COMPUTE_PGM_VALUES_GFX8[] = {
|
|
static_cast<uint32_t>(shiftedIsaAddr), // PGM_LO
|
|
static_cast<uint32_t>(shiftedIsaAddr >> 32) // PGM_HI
|
|
| (is_dgpu() ? 0 : (1<<8)) // including PGM_ATC=?
|
|
};
|
|
|
|
// Starts at COMPUTE_PGM_LO
|
|
const unsigned int COMPUTE_PGM_VALUES_GFX9[] = {
|
|
static_cast<uint32_t>(shiftedIsaAddr), // PGM_LO
|
|
static_cast<uint32_t>(shiftedIsaAddr >> 32) // PGM_HI
|
|
| (is_dgpu() ? 0 : (1<<8)), // including PGM_ATC=?
|
|
0,
|
|
0,
|
|
static_cast<uint32_t>(m_scratch_base >> 8), // compute_dispatch_scratch_base
|
|
static_cast<uint32_t>(m_scratch_base >> 40)
|
|
};
|
|
|
|
// Starts at COMPUTE_RESOURCE_LIMITS
|
|
const unsigned int COMPUTE_RESOURCE_LIMITS[] = {
|
|
0, // COMPUTE_RESOURCE_LIMITS
|
|
};
|
|
|
|
// Starts at COMPUTE_TMPRING_SIZE
|
|
const unsigned int COMPUTE_TMPRING_SIZE[] = {
|
|
m_ComputeTmpringSize, // COMPUTE_TMPRING_SIZE
|
|
};
|
|
|
|
// Starts at COMPUTE_RESTART_X
|
|
const unsigned int COMPUTE_RESTART_VALUES[] = {
|
|
0, // COMPUTE_RESTART_X
|
|
0, // COMPUTE_RESTART_Y
|
|
0, // COMPUTE_RESTART_Z
|
|
0 // COMPUTE_THREAD_TRACE_ENABLE
|
|
};
|
|
|
|
// Starts at COMPUTE_USER_DATA_0
|
|
const unsigned int COMPUTE_USER_DATA_VALUES[] = {
|
|
// Reg name - use in KFDtest - use in ABI
|
|
arg0, // COMPUTE_USER_DATA_0 - arg0 - resource descriptor for the scratch buffer - 1st dword
|
|
arg1, // COMPUTE_USER_DATA_1 - arg1 - resource descriptor for the scratch buffer - 2nd dword
|
|
arg2, // COMPUTE_USER_DATA_2 - arg2 - resource descriptor for the scratch buffer - 3rd dword
|
|
arg3, // COMPUTE_USER_DATA_3 - arg3 - resource descriptor for the scratch buffer - 4th dword
|
|
static_cast<uint32_t>(m_scratch_base), // COMPUTE_USER_DATA_4 - flat_scratch_lo
|
|
static_cast<uint32_t>(m_scratch_base >> 32), // COMPUTE_USER_DATA_4 - flat_scratch_hi
|
|
0, // COMPUTE_USER_DATA_6 - - AQL queue address, low part
|
|
0, // COMPUTE_USER_DATA_7 - - AQL queue address, high part
|
|
0, // COMPUTE_USER_DATA_8 - - kernel arguments block, low part
|
|
0, // COMPUTE_USER_DATA_9 - - kernel arguments block, high part
|
|
0, // COMPUTE_USER_DATA_10 - - unused
|
|
0, // COMPUTE_USER_DATA_11 - - unused
|
|
0, // COMPUTE_USER_DATA_12 - - unused
|
|
0, // COMPUTE_USER_DATA_13 - - unused
|
|
0, // COMPUTE_USER_DATA_14 - - unused
|
|
0, // COMPUTE_USER_DATA_15 - - unused
|
|
};
|
|
|
|
const unsigned int DISPATCH_INIT_VALUE = 0x00000021 | (is_dgpu() ? 0 : 0x1000) |
|
|
((m_FamilyId >= FAMILY_NV) ? 0x8000 : 0);
|
|
// {COMPUTE_SHADER_EN=1, PARTIAL_TG_EN=0, FORCE_START_AT_000=0, ORDERED_APPEND_ENBL=0,
|
|
// ORDERED_APPEND_MODE=0, USE_THREAD_DIMENSIONS=1, ORDER_MODE=0, DISPATCH_CACHE_CNTL=0,
|
|
// SCALAR_L1_INV_VOL=0, VECTOR_L1_INV_VOL=0, DATA_ATC=?, RESTORE=0}
|
|
// Set CS_W32_EN for wave32 workloads for gfx10 since all the shaders used in KFDTest is 32 bit .
|
|
|
|
m_IndirectBuf.AddPacket(PM4AcquireMemoryPacket(m_FamilyId));
|
|
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_START_X, COMPUTE_DISPATCH_DIMS_VALUES,
|
|
ARRAY_SIZE(COMPUTE_DISPATCH_DIMS_VALUES)));
|
|
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_LO,
|
|
(m_FamilyId >= FAMILY_AI) ? COMPUTE_PGM_VALUES_GFX9 : COMPUTE_PGM_VALUES_GFX8,
|
|
(m_FamilyId >= FAMILY_AI) ? ARRAY_SIZE(COMPUTE_PGM_VALUES_GFX9) : ARRAY_SIZE(COMPUTE_PGM_VALUES_GFX8)));
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC,
|
|
ARRAY_SIZE(COMPUTE_PGM_RSRC)));
|
|
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESOURCE_LIMITS, COMPUTE_RESOURCE_LIMITS,
|
|
ARRAY_SIZE(COMPUTE_RESOURCE_LIMITS)));
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_TMPRING_SIZE, COMPUTE_TMPRING_SIZE,
|
|
ARRAY_SIZE(COMPUTE_TMPRING_SIZE)));
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESTART_X, COMPUTE_RESTART_VALUES,
|
|
ARRAY_SIZE(COMPUTE_RESTART_VALUES)));
|
|
|
|
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_USER_DATA_0, COMPUTE_USER_DATA_VALUES,
|
|
ARRAY_SIZE(COMPUTE_USER_DATA_VALUES)));
|
|
|
|
m_IndirectBuf.AddPacket(PM4DispatchDirectPacket(m_DimX, m_DimY, m_DimZ, DISPATCH_INIT_VALUE));
|
|
|
|
m_IndirectBuf.AddPacket(PM4PartialFlushPacket());
|
|
|
|
m_IndirectBuf.AddPacket(PM4AcquireMemoryPacket(m_FamilyId));
|
|
}
|