Files
rocm-systems/tests/kfdtest/src/Dispatch.cpp
T
shaoyunl 65ab296840 KFDTest: Set CS_W32_EN bit for wave32 workloads for GFX10
On GFX10, the wave size is determined by the COMPUTE_DISPATCH_INITIATOR value passed to
DISPATCH_DIRECT.CS_W32_EN, default 0 value was giving 64 lane waves

Change-Id: Ie8c407a24bd2825757ec481be62247b35047e5ca
Signed-off-by: shaoyunl <shaoyun.liu@amd.com>
2019-08-06 18:57:14 -04:00

237 lines
10 KiB
C++

/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "Dispatch.hpp"
#include "PM4Packet.hpp"
#include "asic_reg/gfx_7_2_d.h"
#include "asic_reg/gfx_7_2_sh_mask.h"
#include "KFDBaseComponentTest.hpp"
// Constructs a dispatch for the shader stored in isaBuf.
//
// isaBuf         - buffer holding the shader ISA; its node is also used for the
//                  indirect buffer, the end-of-pipe event and the family-id lookup.
// eventAutoReset - its negation is handed to hsaKmtCreateEvent as the second
//                  argument (the manual-reset flag in the libhsakmt API).
//
// The status returned by hsaKmtCreateEvent is not inspected here; m_pEop starts out
// as NULL and Submit() asserts that it is non-null before using it.
Dispatch::Dispatch(const HsaMemoryBuffer& isaBuf, const bool eventAutoReset)
    : m_IsaBuf(isaBuf),
      m_IndirectBuf(PACKETTYPE_PM4, PAGE_SIZE / sizeof(unsigned int), isaBuf.Node()),
      m_DimX(1),
      m_DimY(1),
      m_DimZ(1),
      m_pArg1(NULL),
      m_pArg2(NULL),
      m_pEop(NULL),
      m_ScratchEn(false),
      m_ComputeTmpringSize(0),
      m_scratch_base(0ll),
      m_SpiPriority(0) {
    // Describe a signal event on the ISA buffer's node with no sync variable attached.
    HsaEventDescriptor signalDesc;
    signalDesc.SyncVar.SyncVarSize = 0;
    signalDesc.SyncVar.SyncVar.UserData = NULL;
    signalDesc.NodeId = isaBuf.Node();
    signalDesc.EventType = HSA_EVENTTYPE_SIGNAL;

    const bool manualReset = !eventAutoReset;
    hsaKmtCreateEvent(&signalDesc, manualReset, false, &m_pEop);

    m_FamilyId = g_baseTest->GetFamilyIdFromNodeId(isaBuf.Node());
}
// Destroys the end-of-pipe event if one was created by the constructor.
Dispatch::~Dispatch() {
    if (m_pEop == NULL) {
        return;
    }
    hsaKmtDestroyEvent(m_pEop);
}
// Stores the two shader argument pointers. BuildIb() later splits each one into two
// dwords and writes them to COMPUTE_USER_DATA_0..3.
void Dispatch::SetArgs(void* pArg1, void* pArg2) {
    m_pArg2 = pArg2;
    m_pArg1 = pArg1;
}
// Stores the X/Y/Z dimensions that BuildIb() passes to the dispatch-direct packet.
void Dispatch::SetDim(unsigned int x, unsigned int y, unsigned int z) {
    m_DimZ = z;
    m_DimY = y;
    m_DimX = x;
}
// Enables scratch for this dispatch and records its configuration.
//
// numWaves     - placed in the low bits of the packed COMPUTE_TMPRING_SIZE value.
// waveSize     - placed from bit 12 upwards of the packed value.
// scratch_base - scratch base address, stored as-is for BuildIb().
//
// Neither numWaves nor waveSize is masked, so a numWaves value that does not fit
// below bit 12 overlaps the waveSize bits.
void Dispatch::SetScratch(int numWaves, int waveSize, HSAuint64 scratch_base) {
    const int waveSizeField = waveSize << 12;

    m_scratch_base = scratch_base;
    m_ScratchEn = true;
    m_ComputeTmpringSize = (waveSizeField | numWaves);
}
void Dispatch::SetSpiPriority(unsigned int priority) {
m_SpiPriority = priority;
}
// Builds the indirect buffer and submits the dispatch on the given queue.
//
// Packets are placed in this order:
//   1. an indirect-buffer packet pointing at m_IndirectBuf,
//   2. (only if the event has a sync variable) a write-data packet that stores the
//      event id at the sync variable's user-data address,
//   3. a release-memory packet carrying the event's HWData2 and event id.
// Unless the queue reports that waiting should be skipped, the call then waits for
// the queue to consume the packets.
//
// Fails the current test (and returns early) when the end-of-pipe event is missing;
// a family-id mismatch between dispatch and queue is reported as a non-fatal failure.
void Dispatch::Submit(BaseQueue& queue) {
    ASSERT_NE(m_pEop, (void*)0);
    EXPECT_EQ(m_FamilyId, queue.GetFamilyId());

    BuildIb();
    queue.PlaceAndSubmitPacket(PM4IndirectBufPacket(&m_IndirectBuf));

    // Write data to SyncVar for synchronization purpose
    void* const syncVarAddr = m_pEop->EventData.EventData.SyncVar.SyncVar.UserData;
    if (syncVarAddr != NULL) {
        queue.PlaceAndSubmitPacket(
            PM4WriteDataPacket(static_cast<unsigned int*>(syncVarAddr), m_pEop->EventId));
    }

    queue.PlaceAndSubmitPacket(
        PM4ReleaseMemoryPacket(m_FamilyId, false, m_pEop->EventData.HWData2, m_pEop->EventId));

    if (queue.GetSkipWaitConsump()) {
        return;
    }
    queue.Wait4PacketConsumption();
}
// Waits on the end-of-pipe event with the given timeout and raises a fatal test
// failure if the wait does not succeed. Use SyncWithStatus() when the caller needs
// the outcome as a return value instead of a test assertion.
void Dispatch::Sync(unsigned int timeout) {
ASSERT_SUCCESS(hsaKmtWaitOnEvent(m_pEop, timeout));
}
// Waits on the end-of-pipe event with the given timeout and reports the outcome as a
// return value rather than a test assertion, so the caller can still perform actions
// before process termination.
//
// Returns 0 when the wait succeeded and 1 for any other status.
int Dispatch::SyncWithStatus(unsigned int timeout) {
    return (hsaKmtWaitOnEvent(m_pEop, timeout) != HSAKMT_STATUS_SUCCESS) ? 1 : 0;
}
// Appends to m_IndirectBuf the PM4 packets that make up one compute dispatch:
// an acquire-memory packet, a series of set-shader-register packets (dispatch
// dimensions, program address, program resources, resource limits, tmpring size,
// restart values, user data), a dispatch-direct packet using m_DimX/Y/Z, a
// partial-flush packet and a final acquire-memory packet.
//
// Inputs are the members set by the constructor and by SetArgs(), SetDim(),
// SetScratch() and SetSpiPriority().
//
// NOTE(review): packets are appended on every call and nothing in this function
// clears m_IndirectBuf, so calling Submit() more than once on the same Dispatch
// may accumulate packets - confirm against the indirect buffer implementation.
void Dispatch::BuildIb() {
// The program address registers are written with the ISA address shifted right by 8.
HSAuint64 shiftedIsaAddr = m_IsaBuf.As<uint64_t>() >> 8;
// Each 64-bit argument pointer is split into two dwords for the user-data registers
// (presumably low dword first - verify against SplitU64).
unsigned int arg0, arg1, arg2, arg3;
SplitU64(reinterpret_cast<uint64_t>(m_pArg1), arg0, arg1);
SplitU64(reinterpret_cast<uint64_t>(m_pArg2), arg2, arg3);
// Starts at COMPUTE_START_X
const unsigned int COMPUTE_DISPATCH_DIMS_VALUES[] = {
0, // START_X
0, // START_Y
0, // START_Z
1, // NUM_THREADS_X - this is actually the number of threads in a thread group
1, // NUM_THREADS_Y
1, // NUM_THREADS_Z
0, // COMPUTE_PIPELINESTAT_ENABLE
0, // COMPUTE_PERFCOUNT_ENABLE
};
// Assemble COMPUTE_PGM_RSRC2 field by field; every value is shifted into place and
// then masked with the field's mask.
unsigned int pgmRsrc2 = 0;
// Scratch is enabled only after SetScratch() has been called.
pgmRsrc2 |= (m_ScratchEn << COMPUTE_PGM_RSRC2__SCRATCH_EN__SHIFT)
& COMPUTE_PGM_RSRC2__SCRATCH_EN_MASK;
// 6 user SGPRs when a scratch base is set (USER_DATA_4/5 carry it), otherwise 4.
pgmRsrc2 |= ((m_scratch_base ? 6 : 4) << COMPUTE_PGM_RSRC2__USER_SGPR__SHIFT)
& COMPUTE_PGM_RSRC2__USER_SGPR_MASK;
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT)
& COMPUTE_PGM_RSRC2__TRAP_PRESENT_MASK;
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TGID_X_EN__SHIFT)
& COMPUTE_PGM_RSRC2__TGID_X_EN_MASK;
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TIDIG_COMP_CNT__SHIFT)
& COMPUTE_PGM_RSRC2__TIDIG_COMP_CNT_MASK;
// EXCP_EN is explicitly written as 0; EXCP_EN_MSB is written as 1.
pgmRsrc2 |= (0 << COMPUTE_PGM_RSRC2__EXCP_EN__SHIFT)
& COMPUTE_PGM_RSRC2__EXCP_EN_MASK;
pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__EXCP_EN_MSB__SHIFT)
& COMPUTE_PGM_RSRC2__EXCP_EN_MSB_MASK;
// Starts at COMPUTE_PGM_RSRC1
const unsigned int COMPUTE_PGM_RSRC[] = {
// PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0 PRIV: 0
// DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 }
// Only the low two bits of m_SpiPriority are used; they are placed at bit 10.
0x000c0084 | ((m_SpiPriority & 3) << 10),
pgmRsrc2
};
// Starts at COMPUTE_PGM_LO
// Two-register layout, used when m_FamilyId is below FAMILY_AI.
const unsigned int COMPUTE_PGM_VALUES_GFX8[] = {
static_cast<uint32_t>(shiftedIsaAddr), // PGM_LO
static_cast<uint32_t>(shiftedIsaAddr >> 32) // PGM_HI
| (is_dgpu() ? 0 : (1<<8)) // including PGM_ATC=?
};
// Starts at COMPUTE_PGM_LO
// Six-register layout, used when m_FamilyId is FAMILY_AI or above; the last two
// entries carry the scratch base shifted right by 8.
const unsigned int COMPUTE_PGM_VALUES_GFX9[] = {
static_cast<uint32_t>(shiftedIsaAddr), // PGM_LO
static_cast<uint32_t>(shiftedIsaAddr >> 32) // PGM_HI
| (is_dgpu() ? 0 : (1<<8)), // including PGM_ATC=?
0,
0,
static_cast<uint32_t>(m_scratch_base >> 8), // compute_dispatch_scratch_base
static_cast<uint32_t>(m_scratch_base >> 40)
};
// Starts at COMPUTE_RESOURCE_LIMITS
const unsigned int COMPUTE_RESOURCE_LIMITS[] = {
0, // COMPUTE_RESOURCE_LIMITS
};
// Starts at COMPUTE_TMPRING_SIZE
// The value is packed by SetScratch(); it stays 0 if SetScratch() was never called.
const unsigned int COMPUTE_TMPRING_SIZE[] = {
m_ComputeTmpringSize, // COMPUTE_TMPRING_SIZE
};
// Starts at COMPUTE_RESTART_X
const unsigned int COMPUTE_RESTART_VALUES[] = {
0, // COMPUTE_RESTART_X
0, // COMPUTE_RESTART_Y
0, // COMPUTE_RESTART_Z
0 // COMPUTE_THREAD_TRACE_ENABLE
};
// Starts at COMPUTE_USER_DATA_0
const unsigned int COMPUTE_USER_DATA_VALUES[] = {
// Reg name - use in KFDtest - use in ABI
arg0, // COMPUTE_USER_DATA_0 - arg0 - resource descriptor for the scratch buffer - 1st dword
arg1, // COMPUTE_USER_DATA_1 - arg1 - resource descriptor for the scratch buffer - 2nd dword
arg2, // COMPUTE_USER_DATA_2 - arg2 - resource descriptor for the scratch buffer - 3rd dword
arg3, // COMPUTE_USER_DATA_3 - arg3 - resource descriptor for the scratch buffer - 4th dword
static_cast<uint32_t>(m_scratch_base), // COMPUTE_USER_DATA_4 - flat_scratch_lo
static_cast<uint32_t>(m_scratch_base >> 32), // COMPUTE_USER_DATA_5 - flat_scratch_hi
0, // COMPUTE_USER_DATA_6 - - AQL queue address, low part
0, // COMPUTE_USER_DATA_7 - - AQL queue address, high part
0, // COMPUTE_USER_DATA_8 - - kernel arguments block, low part
0, // COMPUTE_USER_DATA_9 - - kernel arguments block, high part
0, // COMPUTE_USER_DATA_10 - - unused
0, // COMPUTE_USER_DATA_11 - - unused
0, // COMPUTE_USER_DATA_12 - - unused
0, // COMPUTE_USER_DATA_13 - - unused
0, // COMPUTE_USER_DATA_14 - - unused
0, // COMPUTE_USER_DATA_15 - - unused
};
// Dispatch initiator: base value 0x21, plus 0x1000 when not running on a dGPU, plus
// 0x8000 when m_FamilyId is FAMILY_NV or above.
const unsigned int DISPATCH_INIT_VALUE = 0x00000021 | (is_dgpu() ? 0 : 0x1000) |
((m_FamilyId >= FAMILY_NV) ? 0x8000 : 0);
// {COMPUTE_SHADER_EN=1, PARTIAL_TG_EN=0, FORCE_START_AT_000=0, ORDERED_APPEND_ENBL=0,
// ORDERED_APPEND_MODE=0, USE_THREAD_DIMENSIONS=1, ORDER_MODE=0, DISPATCH_CACHE_CNTL=0,
// SCALAR_L1_INV_VOL=0, VECTOR_L1_INV_VOL=0, DATA_ATC=?, RESTORE=0}
// CS_W32_EN is set on gfx10 (FAMILY_NV and above) because all the shaders used in
// KFDTest are wave32 workloads.
// Packet order below is significant: registers are programmed before the dispatch,
// and the dispatch is bracketed by acquire-memory packets.
m_IndirectBuf.AddPacket(PM4AcquireMemoryPacket(m_FamilyId));
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_START_X, COMPUTE_DISPATCH_DIMS_VALUES,
ARRAY_SIZE(COMPUTE_DISPATCH_DIMS_VALUES)));
// Pick the program-address layout (and its length) that matches the ASIC family.
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_LO,
(m_FamilyId >= FAMILY_AI) ? COMPUTE_PGM_VALUES_GFX9 : COMPUTE_PGM_VALUES_GFX8,
(m_FamilyId >= FAMILY_AI) ? ARRAY_SIZE(COMPUTE_PGM_VALUES_GFX9) : ARRAY_SIZE(COMPUTE_PGM_VALUES_GFX8)));
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC,
ARRAY_SIZE(COMPUTE_PGM_RSRC)));
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESOURCE_LIMITS, COMPUTE_RESOURCE_LIMITS,
ARRAY_SIZE(COMPUTE_RESOURCE_LIMITS)));
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_TMPRING_SIZE, COMPUTE_TMPRING_SIZE,
ARRAY_SIZE(COMPUTE_TMPRING_SIZE)));
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESTART_X, COMPUTE_RESTART_VALUES,
ARRAY_SIZE(COMPUTE_RESTART_VALUES)));
m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_USER_DATA_0, COMPUTE_USER_DATA_VALUES,
ARRAY_SIZE(COMPUTE_USER_DATA_VALUES)));
m_IndirectBuf.AddPacket(PM4DispatchDirectPacket(m_DimX, m_DimY, m_DimZ, DISPATCH_INIT_VALUE));
m_IndirectBuf.AddPacket(PM4PartialFlushPacket());
m_IndirectBuf.AddPacket(PM4AcquireMemoryPacket(m_FamilyId));
}