/* * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * */ #include "Dispatch.hpp" #include "PM4Packet.hpp" #include "asic_reg/gfx_7_2_d.h" #include "asic_reg/gfx_7_2_sh_mask.h" #include "KFDBaseComponentTest.hpp" Dispatch::Dispatch(const HsaMemoryBuffer& isaBuf, const bool eventAutoReset) :m_IsaBuf(isaBuf), m_IndirectBuf(PACKETTYPE_PM4, PAGE_SIZE / sizeof(unsigned int), isaBuf.Node()), m_DimX(1), m_DimY(1), m_DimZ(1), m_pArg1(NULL), m_pArg2(NULL), m_pEop(NULL), m_ScratchEn(false), m_ComputeTmpringSize(0), m_scratch_base(0ll), m_SpiPriority(0) { HsaEventDescriptor eventDesc; eventDesc.EventType = HSA_EVENTTYPE_SIGNAL; eventDesc.NodeId = isaBuf.Node(); eventDesc.SyncVar.SyncVar.UserData = NULL; eventDesc.SyncVar.SyncVarSize = 0; hsaKmtCreateEvent(&eventDesc, !eventAutoReset, false, &m_pEop); m_FamilyId = g_baseTest->GetFamilyIdFromNodeId(isaBuf.Node()); } Dispatch::~Dispatch() { if (m_pEop != NULL) hsaKmtDestroyEvent(m_pEop); } void Dispatch::SetArgs(void* pArg1, void* pArg2) { m_pArg1 = pArg1; m_pArg2 = pArg2; } void Dispatch::SetDim(unsigned int x, unsigned int y, unsigned int z) { m_DimX = x; m_DimY = y; m_DimZ = z; } void Dispatch::SetScratch(int numWaves, int waveSize, HSAuint64 scratch_base) { m_ComputeTmpringSize = ((waveSize << 12) | (numWaves)); m_ScratchEn = true; m_scratch_base = scratch_base; } void Dispatch::SetSpiPriority(unsigned int priority) { m_SpiPriority = priority; } void Dispatch::Submit(BaseQueue& queue) { ASSERT_NE(m_pEop, (void*)0); EXPECT_EQ(m_FamilyId, queue.GetFamilyId()); BuildIb(); queue.PlaceAndSubmitPacket(PM4IndirectBufPacket(&m_IndirectBuf)); // Write data to SyncVar for synchronization purpose if (m_pEop->EventData.EventData.SyncVar.SyncVar.UserData != NULL) { queue.PlaceAndSubmitPacket(PM4WriteDataPacket((unsigned int*)m_pEop-> EventData.EventData.SyncVar.SyncVar.UserData, m_pEop->EventId)); } queue.PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(m_FamilyId, false, m_pEop->EventData.HWData2, m_pEop->EventId)); if (!queue.GetSkipWaitConsump()) queue.Wait4PacketConsumption(); } void Dispatch::Sync(unsigned int timeout) { ASSERT_SUCCESS(hsaKmtWaitOnEvent(m_pEop, timeout)); } // Returning with status in order to allow actions to be performed before process termination int Dispatch::SyncWithStatus(unsigned int timeout) { int stat; return ((stat = hsaKmtWaitOnEvent(m_pEop, timeout)) != HSAKMT_STATUS_SUCCESS); } void Dispatch::BuildIb() { HSAuint64 shiftedIsaAddr = m_IsaBuf.As() >> 8; unsigned int arg0, arg1, arg2, arg3; SplitU64(reinterpret_cast(m_pArg1), arg0, arg1); SplitU64(reinterpret_cast(m_pArg2), arg2, arg3); // Starts at COMPUTE_START_X const unsigned int COMPUTE_DISPATCH_DIMS_VALUES[] = { 0, // START_X 0, // START_Y 0, // START_Z 1, // NUM_THREADS_X - this is actually the number of threads in a thread group 1, // NUM_THREADS_Y 1, // NUM_THREADS_Z 0, // COMPUTE_PIPELINESTAT_ENABLE 0, // COMPUTE_PERFCOUNT_ENABLE }; unsigned int pgmRsrc2 = 0; pgmRsrc2 |= (m_ScratchEn << COMPUTE_PGM_RSRC2__SCRATCH_EN__SHIFT) & COMPUTE_PGM_RSRC2__SCRATCH_EN_MASK; pgmRsrc2 |= ((m_scratch_base ? 6 : 4) << COMPUTE_PGM_RSRC2__USER_SGPR__SHIFT) & COMPUTE_PGM_RSRC2__USER_SGPR_MASK; pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT) & COMPUTE_PGM_RSRC2__TRAP_PRESENT_MASK; pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TGID_X_EN__SHIFT) & COMPUTE_PGM_RSRC2__TGID_X_EN_MASK; pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__TIDIG_COMP_CNT__SHIFT) & COMPUTE_PGM_RSRC2__TIDIG_COMP_CNT_MASK; pgmRsrc2 |= (0 << COMPUTE_PGM_RSRC2__EXCP_EN__SHIFT) & COMPUTE_PGM_RSRC2__EXCP_EN_MASK; pgmRsrc2 |= (1 << COMPUTE_PGM_RSRC2__EXCP_EN_MSB__SHIFT) & COMPUTE_PGM_RSRC2__EXCP_EN_MSB_MASK; const unsigned int COMPUTE_PGM_RSRC[] = { // PGM_RSRC1 = { VGPRS: 16 SGPRS: 16 PRIORITY: m_SpiPriority FLOAT_MODE: c0 PRIV: 0 // DX10_CLAMP: 0 DEBUG_MODE: 0 IEEE_MODE: 0 BULKY: 0 CDBG_USER: 0 } 0x000c0084 | ((m_SpiPriority & 3) << 10), pgmRsrc2 }; // Starts at COMPUTE_PGM_LO const unsigned int COMPUTE_PGM_VALUES_GFX8[] = { static_cast(shiftedIsaAddr), // PGM_LO static_cast(shiftedIsaAddr >> 32) // PGM_HI | (is_dgpu() ? 0 : (1<<8)) // including PGM_ATC=? }; // Starts at COMPUTE_PGM_LO const unsigned int COMPUTE_PGM_VALUES_GFX9[] = { static_cast(shiftedIsaAddr), // PGM_LO static_cast(shiftedIsaAddr >> 32) // PGM_HI | (is_dgpu() ? 0 : (1<<8)), // including PGM_ATC=? 0, 0, static_cast(m_scratch_base >> 8), // compute_dispatch_scratch_base static_cast(m_scratch_base >> 40) }; // Starts at COMPUTE_RESOURCE_LIMITS const unsigned int COMPUTE_RESOURCE_LIMITS[] = { 0, // COMPUTE_RESOURCE_LIMITS }; // Starts at COMPUTE_TMPRING_SIZE const unsigned int COMPUTE_TMPRING_SIZE[] = { m_ComputeTmpringSize, // COMPUTE_TMPRING_SIZE }; // Starts at COMPUTE_RESTART_X const unsigned int COMPUTE_RESTART_VALUES[] = { 0, // COMPUTE_RESTART_X 0, // COMPUTE_RESTART_Y 0, // COMPUTE_RESTART_Z 0 // COMPUTE_THREAD_TRACE_ENABLE }; // Starts at COMPUTE_USER_DATA_0 const unsigned int COMPUTE_USER_DATA_VALUES[] = { // Reg name - use in KFDtest - use in ABI arg0, // COMPUTE_USER_DATA_0 - arg0 - resource descriptor for the scratch buffer - 1st dword arg1, // COMPUTE_USER_DATA_1 - arg1 - resource descriptor for the scratch buffer - 2nd dword arg2, // COMPUTE_USER_DATA_2 - arg2 - resource descriptor for the scratch buffer - 3rd dword arg3, // COMPUTE_USER_DATA_3 - arg3 - resource descriptor for the scratch buffer - 4th dword static_cast(m_scratch_base), // COMPUTE_USER_DATA_4 - flat_scratch_lo static_cast(m_scratch_base >> 32), // COMPUTE_USER_DATA_4 - flat_scratch_hi 0, // COMPUTE_USER_DATA_6 - - AQL queue address, low part 0, // COMPUTE_USER_DATA_7 - - AQL queue address, high part 0, // COMPUTE_USER_DATA_8 - - kernel arguments block, low part 0, // COMPUTE_USER_DATA_9 - - kernel arguments block, high part 0, // COMPUTE_USER_DATA_10 - - unused 0, // COMPUTE_USER_DATA_11 - - unused 0, // COMPUTE_USER_DATA_12 - - unused 0, // COMPUTE_USER_DATA_13 - - unused 0, // COMPUTE_USER_DATA_14 - - unused 0, // COMPUTE_USER_DATA_15 - - unused }; const unsigned int DISPATCH_INIT_VALUE = 0x00000021 | (is_dgpu() ? 0 : 0x1000) | ((m_FamilyId >= FAMILY_NV) ? 0x8000 : 0); // {COMPUTE_SHADER_EN=1, PARTIAL_TG_EN=0, FORCE_START_AT_000=0, ORDERED_APPEND_ENBL=0, // ORDERED_APPEND_MODE=0, USE_THREAD_DIMENSIONS=1, ORDER_MODE=0, DISPATCH_CACHE_CNTL=0, // SCALAR_L1_INV_VOL=0, VECTOR_L1_INV_VOL=0, DATA_ATC=?, RESTORE=0} // Set CS_W32_EN for wave32 workloads for gfx10 since all the shaders used in KFDTest is 32 bit . m_IndirectBuf.AddPacket(PM4AcquireMemoryPacket(m_FamilyId)); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_START_X, COMPUTE_DISPATCH_DIMS_VALUES, ARRAY_SIZE(COMPUTE_DISPATCH_DIMS_VALUES))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_LO, (m_FamilyId >= FAMILY_AI) ? COMPUTE_PGM_VALUES_GFX9 : COMPUTE_PGM_VALUES_GFX8, (m_FamilyId >= FAMILY_AI) ? ARRAY_SIZE(COMPUTE_PGM_VALUES_GFX9) : ARRAY_SIZE(COMPUTE_PGM_VALUES_GFX8))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC, ARRAY_SIZE(COMPUTE_PGM_RSRC))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESOURCE_LIMITS, COMPUTE_RESOURCE_LIMITS, ARRAY_SIZE(COMPUTE_RESOURCE_LIMITS))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_TMPRING_SIZE, COMPUTE_TMPRING_SIZE, ARRAY_SIZE(COMPUTE_TMPRING_SIZE))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_RESTART_X, COMPUTE_RESTART_VALUES, ARRAY_SIZE(COMPUTE_RESTART_VALUES))); m_IndirectBuf.AddPacket(PM4SetShaderRegPacket(mmCOMPUTE_USER_DATA_0, COMPUTE_USER_DATA_VALUES, ARRAY_SIZE(COMPUTE_USER_DATA_VALUES))); m_IndirectBuf.AddPacket(PM4DispatchDirectPacket(m_DimX, m_DimY, m_DimZ, DISPATCH_INIT_VALUE)); m_IndirectBuf.AddPacket(PM4PartialFlushPacket()); m_IndirectBuf.AddPacket(PM4AcquireMemoryPacket(m_FamilyId)); }