From d916fe0129380ea6dafd2d09ab69b2920096ea62 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:44:29 -0400 Subject: [PATCH 01/27] kfdtest: Add LLVM AMDGPU assembler components Initial commit for transition from IsaGenerator/SP3 assembler model to the LLVM AMDGPU (AMDGCN) assembler backend: - Add Assembler class, may be instantiated for assembly similar to IsaGenerator. - Add Assembler and LLVM archive dependencies to build process. - CXX bumped to gnu++14 as required for LLVM compilation. - Compatible with LLVM 7.0 and greater (latest Lightning/llvm-git version should be used for up-to-date gfx support). Note that this is just a build dependency and *not* a runtime dependency. LLVM does not need to be installed on the host machine to run kfdtest. - CMake will first look for a Lightning build. Lightning itself does not need to be installed system-wide, just built. If this fails, it will attempt to find a system-wide LLVM install. General Assembler usage and notes: - Similar to IsaGenerator, applicable test classes will contain an Assembler object pointer which may be instantiated in the test constructor. - Instantiation requires the GFXIP version in order to find the appropriate LLVM AMDGPU Target ID. - The RunAssemble() member func takes in a standard const char* shader and fills the TextData member with the output binary; TextSize with the size of TextData. These may be accessed via GetInstrStream() and GetInstrStreamSize(), or the output binary may be copied into an IsaBuffer via CopyInstrStream(). RunAssembleBuf() combines RunAssemble() and CopyInstrStream() and additionally takes an optional BufSize parameter to specify the size of the output buffer (defaults to PAGE_SIZE). - Assembler object deletion is to be done in the base test destructor. Assembler-specific memory allocation is freed in the Assembler destructor. 
- For debug, one can call PrintTextHex() to print out a formatted hex representation of the output binary, or PrintELFHex() to print out the intermediate ELF object. Note that PrintTextHex() is public whereas PrintELFHex() is private. - Prints use the LLVM outs() call as that allows for use of the LLVM format_hex() func in the aforementioned debug prints. This is subject to change if the LOG() call would be preferred. RunAssemble control flow: - Ensure correct Assembler initialization and clear previous run TextData (if necessary). - Initialize LLVM AMDGPU target, required interfaces, and buffers. - Set parser to specified target/subtarget and assemble into ELF code object. - Extract .text section from ELF, allocate space for TextData and store. - On success, returns 0 (HSAKMT_STATUS_SUCCESS). On error, returns -1 (subject to change to be in line with HSAKMT_STATUS enum). Signed-off-by: Graham Sider Change-Id: I1d96230824db651d3ffbaa46eb68fc274e7066b5 [ROCm/ROCR-Runtime commit: 65b1e0c058259b55b33b477860c92ce5a93b9149] --- .../rocr-runtime/tests/kfdtest/CMakeLists.txt | 38 +- .../tests/kfdtest/src/Assemble.cpp | 379 ++++++++++++++++++ .../tests/kfdtest/src/Assemble.hpp | 84 ++++ 3 files changed, 499 insertions(+), 2 deletions(-) create mode 100644 projects/rocr-runtime/tests/kfdtest/src/Assemble.cpp create mode 100644 projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp diff --git a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt index 05557ec16d..39994dbff3 100644 --- a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt +++ b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt @@ -95,6 +95,39 @@ endif() message ( "Find libhsakmt at ${HSAKMT_LIBRARY_DIRS}" ) +if ( POLICY CMP0074 ) + cmake_policy( SET CMP0074 NEW ) +endif() + +find_path( LIGHTNING_CMAKE_DIR NAMES LLVMConfig.cmake + PATHS $ENV{OUT_DIR}/llvm/lib/cmake/llvm NO_CACHE NO_DEFAULT_PATH) + +if ( DEFINED LIGHTNING_CMAKE_DIR AND EXISTS 
${LIGHTNING_CMAKE_DIR} ) + set ( LLVM_DIR ${LIGHTNING_CMAKE_DIR} ) +else() + message( WARNING "Couldn't find Lightning build. " + "Attempting to use system LLVM install..." ) +endif() + +find_package( LLVM REQUIRED CONFIG ) + +if( ${LLVM_PACKAGE_VERSION} VERSION_LESS "7.0" ) + message( FATAL_ERROR "Requires LLVM 7.0 or greater " + "(found ${LLVM_PACKAGE_VERSION})" ) +elseif( ${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0" ) + message( WARNING "Not using latest LLVM version. " + "Some ASIC targets may not work!" ) +endif() + +message( STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}" ) +message( STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}" ) + +include_directories(${LLVM_INCLUDE_DIRS}) +separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) +add_definitions(${LLVM_DEFINITIONS_LIST}) + +llvm_map_components_to_libnames(llvm_libs AMDGPUAsmParser Core Support) + set ( SP3_DIR ${PROJECT_SOURCE_DIR}/sp3 ) include_directories(${PROJECT_SOURCE_DIR}/gtest-1.6.0) @@ -112,6 +145,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp src/Dispatch.cpp src/GoogleTestExtension.cpp src/IndirectBuffer.cpp + src/Assemble.cpp src/IsaGenerator.cpp src/IsaGenerator_Aldebaran.cpp src/IsaGenerator_Gfx10.cpp @@ -163,7 +197,7 @@ message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} ) if ( "${CMAKE_C_COMPILER_VERSION}" STRGREATER "4.8.0") ## Add --enable-new-dtags to generate DT_RUNPATH -set ( CMAKE_CXX_FLAGS "-std=gnu++11 -Wl,--enable-new-dtags" ) +set ( CMAKE_CXX_FLAGS "-std=gnu++14 -Wl,--enable-new-dtags" ) endif() if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2" ) @@ -185,7 +219,7 @@ link_directories(${SP3_DIR}) add_executable(kfdtest ${SRC_FILES}) -target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread m stdc++ rt amdsp3 numa) +target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt amdsp3 numa) configure_file ( 
scripts/kfdtest.exclude kfdtest.exclude COPYONLY ) configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY ) diff --git a/projects/rocr-runtime/tests/kfdtest/src/Assemble.cpp b/projects/rocr-runtime/tests/kfdtest/src/Assemble.cpp new file mode 100644 index 0000000000..cf4b9e7de0 --- /dev/null +++ b/projects/rocr-runtime/tests/kfdtest/src/Assemble.cpp @@ -0,0 +1,379 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +/** + * Self-contained assembler that uses the LLVM MC API to assemble AMDGCN + * instructions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if LLVM_VERSION_MAJOR > 13 +#include +#else +#include +#endif + +#include +#include "OSWrapper.hpp" +#include "Assemble.hpp" + +using namespace llvm; + +Assembler::Assembler(const uint32_t Gfxv) { + SetTargetAsic(Gfxv); + TextData = nullptr; + TextSize = 0; + LLVMInit(); +} + +Assembler::~Assembler() { + FlushText(); + llvm_shutdown(); +} + +const char* Assembler::GetInstrStream() { + return TextData; +} + +const size_t Assembler::GetInstrStreamSize() { + return TextSize; +} + +int Assembler::CopyInstrStream(char* OutBuf, const size_t BufSize) { + if (TextSize > BufSize) + return -2; + + std::copy(TextData, TextData + TextSize, OutBuf); + return 0; +} + +const char* Assembler::GetTargetAsic() { + return MCPU; +} + +/** + * Set MCPU via GFX Version from Thunk + * LLVM Target IDs use decimal for Maj/Min, hex for Step + */ +void Assembler::SetTargetAsic(const uint32_t Gfxv) { + const uint8_t Major = (Gfxv >> 16) & 0xff; + const uint8_t Minor = (Gfxv >> 8) & 0xff; + const uint8_t Step = Gfxv & 0xff; + + snprintf(MCPU, ASM_MCPU_LEN, "gfx%d%d%x", Major, Minor, Step); +} + +/** + * Initialize LLVM targets and assembly printers/parsers + */ +void Assembler::LLVMInit() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmParser(); +} + +/** + * Flush/reset TextData and TextSize 
to initial state + */ +void Assembler::FlushText() { + if (TextData) + delete[] TextData; + TextData = nullptr; + TextSize = 0; +} + +/** + * Print hex of ELF object to stdout (debug) + */ +void Assembler::PrintELFHex(const std::string Data) { + outs() << "ASM Info: assembled ELF hex data (length " << Data.length() << "):\n"; + outs() << "0x00:\t"; + for (size_t i = 0; i < Data.length(); ++i) { + char c = Data[i]; + outs() << format_hex(static_cast(c), 4); + if ((i+1) % 16 == 0) + outs() << "\n" << format_hex(i+1, 4) << ":\t"; + else + outs() << " "; + } + outs() << "\n"; +} + +/** + * Print hex of raw instruction stream to stdout (debug) + */ +void Assembler::PrintTextHex() { + outs() << "ASM Info: assembled .text hex data (length " << TextSize << "):\n"; + outs() << "0x00:\t"; + for (size_t i = 0; i < TextSize; i++) { + outs() << format_hex(static_cast(TextData[i]), 4); + if ((i+1) % 16 == 0) + outs() << "\n" << format_hex(i+1, 4) << ":\t"; + else + outs() << " "; + } + outs() << "\n"; +} + +/** + * Extract raw instruction stream from .text section in ELF object + * + * @param RawData Raw C string of ELF object + * @return 0 on success + */ +int Assembler::ExtractELFText(const char* RawData) { + const Elf64_Ehdr* ElfHeader; + const Elf64_Shdr* SectHeader; + const Elf64_Shdr* SectStrTable; + const char* SectStrAddr; + unsigned NumSects, SectIdx; + + if (!(ElfHeader = reinterpret_cast(RawData))) { + outs() << "ASM Error: elf data is invalid or corrupted\n"; + return -1; + } + if (ElfHeader->e_ident[EI_CLASS] != ELFCLASS64) { + outs() << "ASM Error: elf object must be of 64-bit type\n"; + return -1; + } + + SectHeader = reinterpret_cast(RawData + ElfHeader->e_shoff); + SectStrTable = &SectHeader[ElfHeader->e_shstrndx]; + SectStrAddr = static_cast(RawData + SectStrTable->sh_offset); + + // Loop through sections, break on .text + NumSects = ElfHeader->e_shnum; + for (SectIdx = 0; SectIdx < NumSects; SectIdx++) { + std::string SectName = std::string(SectStrAddr + 
SectHeader[SectIdx].sh_name); + if (SectName == std::string(".text")) { + TextSize = SectHeader[SectIdx].sh_size; + TextData = new char[TextSize]; + memcpy(TextData, RawData + SectHeader[SectIdx].sh_offset, TextSize); + break; + } + } + + if (SectIdx >= NumSects) { + outs() << "ASM Error: couldn't locate .text section\n"; + return -1; + } + + return 0; +} + +/** + * Assemble shader, fill member vars, and copy to output buffer + * + * @param AssemblySource Shader source represented as a raw C string + * @param OutBuf Raw instruction stream output buffer + * @param BufSize Size of OutBuf (defaults to PAGE_SIZE) + * @return Value of RunAssemble() (0 on success) + */ +int Assembler::RunAssembleBuf(const char* const AssemblySource, char* OutBuf, + const size_t BufSize) { + int ret = RunAssemble(AssemblySource); + return ret ? ret : CopyInstrStream(OutBuf, BufSize); +} + +/** + * Assemble shader and fill member vars + * + * @param AssemblySource Shader source represented as a raw C string + * @return 0 on success + */ +int Assembler::RunAssemble(const char* const AssemblySource) { + // Ensure target ASIC has been set + if (!MCPU) { + outs() << "ASM Error: target asic is uninitialized\n"; + return -1; + } + + // Delete TextData for any previous runs + FlushText(); + +#if 0 + outs() << "ASM Info: running assembly for target: " << MCPU << "\n"; + outs() << "ASM Info: source:\n"; + outs() << AssemblySource << "\n"; +#endif + + // Initialize MCOptions and target triple + const MCTargetOptions MCOptions; + Triple TheTriple; + + const Target* TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) { + outs() << Error; + return -1; + } + + TheTriple.setArchName(ArchName); + TheTriple.setVendorName(VendorName); + TheTriple.setOSName(OSName); + + TripleName = TheTriple.getTriple(); + TheTriple.setTriple(Triple::normalize(TripleName)); + + // Create MemoryBuffer for assembly source + StringRef AssemblyRef(AssemblySource); + std::unique_ptr 
BufferPtr = + MemoryBuffer::getMemBuffer(AssemblyRef, "", false); + if (!BufferPtr->getBufferSize()) { + outs() << "ASM Error: assembly source is empty\n"; + return -1; + } + + // Instantiate SrcMgr and transfer BufferPtr ownership + SourceMgr SrcMgr; + SrcMgr.AddNewSourceBuffer(std::move(BufferPtr), SMLoc()); + + // Initialize MC interfaces and base class objects + std::unique_ptr MRI( + TheTarget->createMCRegInfo(TripleName)); + if (!MRI) { + outs() << "ASM Error: no register info for target " << MCPU << "\n"; + return -1; + } +#if LLVM_VERSION_MAJOR > 9 + std::unique_ptr MAI( + TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); +#else + std::unique_ptr MAI( + TheTarget->createMCAsmInfo(*MRI, TripleName)); +#endif + if (!MAI) { + outs() << "ASM Error: no assembly info for target " << MCPU << "\n"; + return -1; + } + std::unique_ptr MCII( + TheTarget->createMCInstrInfo()); + if (!MCII) { + outs() << "ASM Error: no instruction info for target " << MCPU << "\n"; + return -1; + } + std::unique_ptr STI( + TheTarget->createMCSubtargetInfo(TripleName, MCPU, std::string())); + if (!STI || !STI->isCPUStringValid(MCPU)) { + outs() << "ASM Error: no subtarget info for target " << MCPU << "\n"; + return -1; + } + + // Set up the MCContext for creating symbols and MCExpr's +#if LLVM_VERSION_MAJOR > 12 + MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), &SrcMgr, &MCOptions); +#else + MCObjectFileInfo MOFI; + MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr, &MCOptions); + MOFI.InitMCObjectFileInfo(TheTriple, true, Ctx); +#endif + + // Finalize setup for output object code stream + std::string Data; + std::unique_ptr DataStream(std::make_unique(Data)); + std::unique_ptr BOS(std::make_unique(*DataStream)); + raw_pwrite_stream* OS = BOS.get(); + +#if LLVM_VERSION_MAJOR > 14 + MCCodeEmitter* CE = TheTarget->createMCCodeEmitter(*MCII, Ctx); +#else + MCCodeEmitter* CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx); +#endif + MCAsmBackend* MAB = 
TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions); + + std::unique_ptr Streamer(TheTarget->createMCObjectStreamer( + TheTriple, Ctx, + std::unique_ptr(MAB), MAB->createObjectWriter(*OS), + std::unique_ptr(CE), *STI, MCOptions.MCRelaxAll, + MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false)); + + std::unique_ptr Parser( + createMCAsmParser(SrcMgr, Ctx, *Streamer, *MAI)); + + // Set parser to target parser and run + std::unique_ptr TAP( + TheTarget->createMCAsmParser(*STI, *Parser, *MCII, MCOptions)); + if (!TAP) { + outs() << "ASM Error: no assembly parsing support for target " << MCPU << "\n"; + return -1; + } + Parser->setTargetParser(*TAP); + + if (Parser->Run(true)) { + outs() << "ASM Error: assembly parser failed\n"; + return -1; + } + + BOS.reset(); + DataStream->flush(); + + int ret = ExtractELFText(Data.data()); + if (ret < 0 || !TextData) { + outs() << "ASM Error: .text extraction failed\n"; + return ret; + } + +#if 0 + PrintELFHex(Data); + PrintTextHex(); +#endif + + return 0; +} diff --git a/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp b/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp new file mode 100644 index 0000000000..d61229a5a5 --- /dev/null +++ b/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp @@ -0,0 +1,84 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ASSEMBLE_H_ +#define _ASSEMBLE_H_ + +#define ASM_MCPU_LEN 16 + +class Assembler { + private: + const char* ArchName = "amdgcn"; + const char* VendorName = "amd"; + const char* OSName = "amdhsa"; + char MCPU[ASM_MCPU_LEN]; + + std::string TripleName; + std::string Error; + + char* TextData; + size_t TextSize; + + void SetTargetAsic(const uint32_t Gfxv); + + void LLVMInit(); + void FlushText(); + void PrintELFHex(const std::string Data); + int ExtractELFText(const char* RawData); + + public: + Assembler(const uint32_t Gfxv); + ~Assembler(); + + void PrintTextHex(); + const char* GetTargetAsic(); + + const char* GetInstrStream(); + const size_t GetInstrStreamSize(); + int CopyInstrStream(char* OutBuf, const size_t BufSize = PAGE_SIZE); + + int RunAssemble(const char* const AssemblySource); + int RunAssembleBuf(const char* const AssemblySource, char* OutBuf, + const size_t BufSize = PAGE_SIZE); +}; + +#endif // _ASSEMBLE_H_ From 171f1e5a401b416875715f6a9e635adcb7f7e0b6 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:49:02 -0400 Subject: [PATCH 02/27] kfdtest: Add GetGfxVersion to KFDTestUtil Required to derive LLVM AMDGPU target ASIC (MCPU). 
Signed-off-by: Graham Sider Change-Id: If8f139b3858c9bf42feba23ae9210e14625dc08b [ROCm/ROCR-Runtime commit: 2f73db8fb05fe8f96c82964734496c056234f16d] --- projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.cpp | 6 ++++++ projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.hpp | 1 + 2 files changed, 7 insertions(+) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.cpp index 476e0bb1ce..2eddc8857b 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.cpp @@ -231,6 +231,12 @@ bool isTonga(const HsaNodeProperties *props) { return false; } +const uint32_t GetGfxVersion(const HsaNodeProperties *props) { + return ((props->EngineId.ui32.Major << 16) | + (props->EngineId.ui32.Minor << 8) | + (props->EngineId.ui32.Stepping)); +} + HSAuint64 GetSystemTickCountInMicroSec() { struct timeval t; gettimeofday(&t, 0); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.hpp index 7c2f9c61ce..938ff8bf69 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDTestUtil.hpp @@ -52,6 +52,7 @@ bool is_dgpu(); bool isTonga(const HsaNodeProperties *props); bool hasPciAtomicsSupport(int node); unsigned int FamilyIdFromNode(const HsaNodeProperties *props); +const uint32_t GetGfxVersion(const HsaNodeProperties *props); void GetHwQueueInfo(const HsaNodeProperties *props, unsigned int *p_num_cp_queues, From 254ec57f0396136c60b090e9080f3bab0c664fc7 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 8 Oct 2021 13:35:51 -0400 Subject: [PATCH 03/27] kfdtest: Instantiate Assembler in KFDBaseComponentTest Instantiate in KFDBaseComponentTest::SetUp() and destroy in TearDown(). This ensures m_pAsm is available for all tests. 
Signed-off-by: Graham Sider Change-Id: I8b98a5350a9739d71455f14552c9879bdb1c475d [ROCm/ROCR-Runtime commit: 235636d59856e08f9e49fbec0015bc4652760e9c] --- .../rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.cpp | 6 ++++++ .../rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp | 2 ++ 2 files changed, 8 insertions(+) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.cpp index f950a7a1f1..5618945505 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.cpp @@ -68,6 +68,8 @@ void KFDBaseComponentTest::SetUp() { g_baseTest = this; + m_pAsm = new Assembler(GetGfxVersion(nodeProperties)); + ROUTINE_END } @@ -86,6 +88,10 @@ void KFDBaseComponentTest::TearDown() { EXPECT_SUCCESS(hsaKmtCloseKFD()); g_baseTest = NULL; + if (m_pAsm) + delete m_pAsm; + m_pAsm = nullptr; + ROUTINE_END } diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp index e27baf1cc1..ec3053c3d3 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp @@ -34,6 +34,7 @@ #include "hsakmt.h" #include "OSWrapper.hpp" #include "KFDTestUtil.hpp" +#include "Assemble.hpp" // @class KFDBaseComponentTest class KFDBaseComponentTest : public testing::Test { @@ -74,6 +75,7 @@ class KFDBaseComponentTest : public testing::Test { HsaMemFlags m_MemoryFlags; HsaNodeInfo m_NodeInfo; HSAint32 m_xnack; + Assembler* m_pAsm; // @brief Executed before every test that uses KFDBaseComponentTest class and sets all common settings for the tests. 
virtual void SetUp(); From 8b175d62333764b2e0daabf3ad30a8127941be5f Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:45:28 -0400 Subject: [PATCH 04/27] kfdtest: Update KFDMemoryTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes from IsaGen (CompileShader) to Assembler (RunAssembleBuf) - LLVM syntax change on ScratchCopyDwordIsa_gfx10: hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO/HI) -> hwreg(HW_REG_FLAT_SCR_LO/HI) - Fix bug in CopyOnSignalIsa_gfx10 and PollMemoryIsa_gfx10 whereby flat_store_dword used vector reg format v[n,n]. Changed to v[n:n] Signed-off-by: Graham Sider Change-Id: Id182cfb8aeb7372366c59affb5cbdd145909ee96 [ROCm/ROCR-Runtime commit: 039bce94a6a7f3c4133c745c628749dca5013012] --- .../tests/kfdtest/src/KFDMemoryTest.cpp | 614 ++++++++---------- 1 file changed, 284 insertions(+), 330 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp index 8cf24ffb1b..980c9c7bf8 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp @@ -39,166 +39,124 @@ #include "SDMAPacket.hpp" #include "linux/kfd_ioctl.h" -const char* gfx8_ScratchCopyDword = -"\ -shader ScratchCopyDword\n\ -asic(VI)\n\ -type(CS)\n\ -/*copy the parameters from scalar registers to vector registers*/\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v2, s2\n\ - v_mov_b32 v3, s3\n\ -/*set up the scratch parameters. 
This assumes a single 16-reg block.*/\n\ - s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n\ - s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n\ -/*copy a dword between the passed addresses*/\n\ - flat_load_dword v4, v[0:1] slc\n\ - s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ - flat_store_dword v[2:3], v4 slc\n\ - \n\ - s_endpgm\n\ - \n\ -end\n\ -"; - -const char* gfx9_ScratchCopyDword = -"\ -shader ScratchCopyDword\n\ -asic(GFX9)\n\ -type(CS)\n\ -/*copy the parameters from scalar registers to vector registers*/\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v2, s2\n\ - v_mov_b32 v3, s3\n\ -/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ - s_mov_b32 flat_scratch_lo, s4\n\ - s_mov_b32 flat_scratch_hi, s5\n\ -/*copy a dword between the passed addresses*/\n\ - flat_load_dword v4, v[0:1] slc\n\ - s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ - flat_store_dword v[2:3], v4 slc\n\ - \n\ - s_endpgm\n\ - \n\ -end\n\ -"; -const char* gfx10_ScratchCopyDword = -"\ -shader ScratchCopyDword\n\ -asic(GFX10)\n\ -type(CS)\n\ -wave_size(32)\n\ -/*copy the parameters from scalar registers to vector registers*/\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v2, s2\n\ - v_mov_b32 v3, s3\n\ -/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\ - s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s4\n\ - s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s5\n\ -/*copy a dword between the passed addresses*/\n\ - flat_load_dword v4, v[0:1] slc\n\ - s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ - flat_store_dword v[2:3], v4 slc\n\ - \n\ - s_endpgm\n\ - \n\ -end\n\ -"; - -const char* aldbrn_ScratchCopyDword = -"\ -shader ScratchCopyDword\n\ -asic(ALDEBARAN)\n\ -type(CS)\n\ -/*copy the parameters from scalar registers to vector registers*/\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v2, s2\n\ - v_mov_b32 v3, s3\n\ -/*set up the scratch parameters. 
This assumes a single 16-reg block.*/\n\ - s_mov_b32 flat_scratch_lo, s4\n\ - s_mov_b32 flat_scratch_hi, s5\n\ -/*copy a dword between the passed addresses*/\n\ - flat_load_dword v4, v[0:1] slc\n\ - s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ - flat_store_dword v[2:3], v4 slc\n\ - \n\ - s_endpgm\n\ - \n\ -end\n\ -"; - - +static const char* ScratchCopyDwordIsa_gfx8 = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32_e32 v0, s0 + v_mov_b32_e32 v1, s1 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + // Setup the scratch parameters. This assumes a single 16-reg block + s_mov_b32 flat_scratch_lo, 8 + s_mov_b32 flat_scratch_hi, 0 + // Copy a dword between the passed addresses + flat_load_dword v4, v[0:1] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) + flat_store_dword v[2:3], v4 slc + s_endpgm +)"; +static const char* ScratchCopyDwordIsa_gfx9 = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + // Setup the scratch parameters. This assumes a single 16-reg block + s_mov_b32 flat_scratch_lo, s4 + s_mov_b32 flat_scratch_hi, s5 + // Copy a dword between the passed addresses + flat_load_dword v4, v[0:1] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) + flat_store_dword v[2:3], v4 slc + s_endpgm +)"; +static const char* ScratchCopyDwordIsa_gfx10 = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + // Setup the scratch parameters. 
This assumes a single 16-reg block + s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 + s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 + // Copy a dword between the passed addresses + flat_load_dword v4, v[0:1] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) + flat_store_dword v[2:3], v4 slc + s_endpgm +)"; +static const char* ScratchCopyDwordIsa_gfx9aldbrn = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + // Setup the scratch parameters. This assumes a single 16-reg block + s_mov_b32 flat_scratch_lo, s4 + s_mov_b32 flat_scratch_hi, s5 + // Copy a dword between the passed addresses + flat_load_dword v4, v[0:1] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) + flat_store_dword v[2:3], v4 slc + s_endpgm +)"; /* Continuously poll src buffer and check buffer value * After src buffer is filled with specific value (0x5678, * by host program), fill dst buffer with specific * value(0x5678) and quit */ -const char* gfx9_PollMemory = -"\ -shader ReadMemory\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume src address in s0, s1 and dst address in s2, s3*/\n\ - s_movk_i32 s18, 0x5678\n\ - LOOP:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_cmp_eq_i32 s16, s18\n\ - s_cbranch_scc0 LOOP\n\ - s_store_dword s18, s[2:3], 0x0 glc\n\ - s_endpgm\n\ - end\n\ -"; +static const char* PollMemoryIsa_gfx9 = R"( + .text + // Assume src address in s0, s1, and dst address in s2, s3 + s_movk_i32 s18, 0x5678 + LOOP: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 LOOP + s_store_dword s18, s[2:3], 0x0 glc + s_endpgm +)"; -/* Similar to gfx9_PollMemory except that the buffer +/* Similar to PollMemoryIsa_gfx9 except that the buffer * polled can be Non-coherant memory. SCC system-level * cache coherence is not supported in scalar (smem) path. 
* Use vmem operations with scc */ -const char* gfx9_PollNCMemory = -"\ -shader ReadMemory\n\ -asic(ALDEBARAN)\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume src address in s0, s1 and dst address in s2, s3*/\n\ - v_mov_b32 v6, 0x5678\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - LOOP:\n\ - flat_load_dword v4, v[0:1] scc\n\ - v_cmp_eq_u32 vcc, v4, v6\n\ - s_cbranch_vccz LOOP\n\ - v_mov_b32 v0, s2\n\ - v_mov_b32 v1, s3\n\ - flat_store_dword v[0:1], v6 scc\n\ - s_endpgm\n\ - end\n\ -"; +static const char* PollNCMemoryIsa_gfx9 = R"( + .text + // Assume src address in s0, s1, and dst address in s2, s3 + v_mov_b32 v6, 0x5678 + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + LOOP: + flat_load_dword v4, v[0:1] scc + v_cmp_eq_u32 vcc, v4, v6 + s_cbranch_vccz LOOP + v_mov_b32 v0, s2 + v_mov_b32 v1, s3 + flat_store_dword v[0:1], v6 scc + s_endpgm +)"; -const char* gfx10_PollMemory = -"\ -shader ReadMemory\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume src address in s0, s1 and dst address in s2, s3*/\n\ - s_movk_i32 s18, 0x5678\n\ - v_mov_b32 v0, s2\n\ - v_mov_b32 v1, s3\n\ - v_mov_b32 v2, 0x5678\n\ - LOOP:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_cmp_eq_i32 s16, s18\n\ - s_cbranch_scc0 LOOP\n\ - flat_store_dword v[0,1], v2 slc\n\ - s_waitcnt vmcnt(0)&lgkmcnt(0)\n\ - s_endpgm\n\ - end\n\ -"; +static const char* PollMemoryIsa_gfx10 = R"( + .text + // Assume src address in s0, s1, and dst address in s2, s3 + s_movk_i32 s18, 0x5678 + v_mov_b32 v0, s2 + v_mov_b32 v1, s3 + v_mov_b32 v2, 0x5678 + LOOP: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 LOOP + flat_store_dword v[0:1], v2 slc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; /* Input: A buffer of at least 3 dwords. * DW0: used as a signal. 
0xcafe means it is signaled @@ -209,119 +167,99 @@ type(CS)\n\ * Once signal buffer is signaled, it copies input buffer * to output buffer */ -const char* gfx9_CopyOnSignal = -"\ -shader CopyOnSignal\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume input buffer in s0, s1 */\n\ - s_mov_b32 s18, 0xcafe\n\ -POLLSIGNAL:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_cmp_eq_i32 s16, s18\n\ - s_cbranch_scc0 POLLSIGNAL\n\ - s_load_dword s17, s[0:1], 0x4 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_store_dword s17, s[0:1], 0x8 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_endpgm\n\ - end\n\ -"; +static const char* CopyOnSignalIsa_gfx9 = R"( + .text + // Assume input buffer in s0, s1 + s_mov_b32 s18, 0xcafe + POLLSIGNAL: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 POLLSIGNAL + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s17, s[0:1], 0x8 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; -const char* gfx10_CopyOnSignal = -"\ -shader CopyOnSignal\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume input buffer in s0, s1 */\n\ - s_add_u32 s2, s0, 0x8\n\ - s_addc_u32 s3, s1, 0x0\n\ - s_mov_b32 s18, 0xcafe\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v4, s2\n\ - v_mov_b32 v5, s3\n\ -POLLSIGNAL:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_cmp_eq_i32 s16, s18\n\ - s_cbranch_scc0 POLLSIGNAL\n\ - s_load_dword s17, s[0:1], 0x4 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - v_mov_b32 v2, s17\n\ - flat_store_dword v[4,5], v2 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_endpgm\n\ - end\n\ -"; +static const char* CopyOnSignalIsa_gfx10 = R"( + .text + // Assume input buffer in s0, s1 + s_add_u32 s2, s0, 0x8 + s_addc_u32 s3, s1, 0x0 + s_mov_b32 s18, 0xcafe + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v4, s2 + v_mov_b32 v5, s3 + POLLSIGNAL: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 POLLSIGNAL + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + 
v_mov_b32 v2, s17 + flat_store_dword v[4:5], v2 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; /* Input0: A buffer of at least 2 dwords. * DW0: used as a signal. Write 0xcafe to signal * DW1: Write to this buffer for other device to read. * Input1: mmio base address */ -const char* gfx9_WriteAndSignal = -"\ -shader WriteAndSignal\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume input buffer in s0, s1 */\n\ - s_mov_b32 s18, 0xbeef\n\ - s_store_dword s18, s[0:1], 0x4 glc\n\ - s_mov_b32 s18, 0x1\n\ - s_store_dword s18, s[2:3], 0 glc\n\ - s_mov_b32 s18, 0xcafe\n\ - s_store_dword s18, s[0:1], 0x0 glc\n\ - s_endpgm\n\ - end\n\ -"; +static const char* WriteAndSignalIsa_gfx9 = R"( + .text + // Assume input buffer in s0, s1 + s_mov_b32 s18, 0xbeef + s_store_dword s18, s[0:1], 0x4 glc + s_mov_b32 s18, 0x1 + s_store_dword s18, s[2:3], 0 glc + s_mov_b32 s18, 0xcafe + s_store_dword s18, s[0:1], 0x0 glc + s_endpgm +)"; /* Continuously poll the flag at src buffer * After the flag of s[0:1] is 1 filled, * copy the value from s[0:1]+4 to dst buffer */ -const char* gfx9_PollAndCopy = -"\ -shader CopyMemory\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ - s_movk_i32 s18, 0x1\n\ - LOOP:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_cmp_eq_i32 s16, s18\n\ - s_cbranch_scc0 LOOP\n\ - s_load_dword s17, s[0:1], 0x4 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_store_dword s17, s[2:3], 0x0 glc:1\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_endpgm\n\ - end\n\ -"; +static const char* PollAndCopyIsa_gfx9 = R"( + .text + // Assume src buffer in s[0:1] and dst buffer in s[2:3] + s_movk_i32 s18, 0x1 + LOOP: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 LOOP + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s17, s[2:3], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; -const char* gfx9aldbrn_PollAndCopy = -"\ -shader CopyMemory\n\ -wave_size(32)\n\ -type(CS)\n\ -/* 
Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v18, 0x1\n\ - LOOP:\n\ - flat_load_dword v16, v[0:1] glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - v_cmp_eq_i32 vcc, v16, v18\n\ - s_cbranch_vccz LOOP\n\ - buffer_invl2\n\ - s_load_dword s17, s[0:1], 0x4 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_store_dword s17, s[2:3], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - buffer_wbl2\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_endpgm\n\ - end\n\ -"; +static const char* PollAndCopyIsa_gfx9aldbrn = R"( + .text + // Assume src buffer in s[0:1] and dst buffer in s[2:3] + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v18, 0x1 + LOOP: + flat_load_dword v16, v[0:1] glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + v_cmp_eq_i32 vcc, v16, v18 + s_cbranch_vccz LOOP + buffer_invl2 + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s17, s[2:3], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + buffer_wbl2 + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; /* Input0: A buffer of at least 2 dwords. * DW0: used as a signal. Write 0x1 to signal @@ -330,51 +268,45 @@ type(CS)\n\ * Input1: A buffer of at least 2 dwords. * DW0: used as the value to be written. 
*/ -const char* gfx9aldbrn_WriteFlagAndValue = -"\ -shader WriteMemory\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - s_load_dword s18, s[2:3], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - s_store_dword s18, s[0:1], 0x4 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - buffer_wbl2\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0)\n\ - v_mov_b32 v16, 0x1\n\ - flat_store_dword v[0:1], v16 glc\n\ - s_endpgm\n\ - end\n\ -"; +static const char* WriteFlagAndValueIsa_gfx9aldbrn = R"( + .text + // Assume two inputs buffer in s[0:1] and s[2:3] + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + s_load_dword s18, s[2:3], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s18, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + buffer_wbl2 + s_waitcnt vmcnt(0) & lgkmcnt(0) + v_mov_b32 v16, 0x1 + flat_store_dword v[0:1], v16 glc + s_endpgm +)"; -const char* gfx10_WriteAndSignal = -"\ -shader WriteAndSignal\n\ -wave_size(32)\n\ -type(CS)\n\ -/* Assume input buffer in s0, s1 */\n\ - s_add_u32 s4, s0, 0x4\n\ - s_addc_u32 s5, s1, 0x0\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - v_mov_b32 v2, s2\n\ - v_mov_b32 v3, s3\n\ - v_mov_b32 v4, s4\n\ - v_mov_b32 v5, s5\n\ - v_mov_b32 v18, 0xbeef\n\ - flat_store_dword v[4:5], v18 glc\n\ - v_mov_b32 v18, 0x1\n\ - flat_store_dword v[2:3], v18 glc\n\ - v_mov_b32 v18, 0xcafe\n\ - flat_store_dword v[0:1], v18 glc\n\ - s_endpgm\n\ - end\n\ -"; +static const char* WriteAndSignalIsa_gfx10 = R"( + .text + // Assume input buffer in s0, s1 + s_add_u32 s4, s0, 0x4 + s_addc_u32 s5, s1, 0x0 + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + v_mov_b32 v4, s4 + v_mov_b32 v5, s5 + v_mov_b32 v18, 0xbeef + flat_store_dword v[4:5], v18 glc + v_mov_b32 v18, 0x1 + flat_store_dword v[2:3], v18 glc + v_mov_b32 v18, 0xcafe + flat_store_dword v[0:1], v18 glc + s_endpgm +)"; -//These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both 
gfx9 and gfx10 +/* These PollMemoryIsa_gfx9, CopyOnSignalIsa_gfx9, + * WriteAndSignalIsa_gfx9 shaders can be used by both gfx9 and gfx10 + */ void KFDMemoryTest::SetUp() { ROUTINE_START @@ -508,16 +440,15 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) { HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode); const char *pReadMemory; - if (m_FamilyId < FAMILY_NV) - pReadMemory = gfx9_PollMemory; - else - pReadMemory = gfx10_PollMemory; - if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) /* On A+A system memory is mapped as NC */ - m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer); + pReadMemory = PollNCMemoryIsa_gfx9; + else if (m_FamilyId < FAMILY_NV) + pReadMemory = PollMemoryIsa_gfx9; else - m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer); + pReadMemory = PollMemoryIsa_gfx10; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pReadMemory, isaBuffer.As())); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); @@ -855,16 +786,17 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) { // Initialize the srcBuffer to some fixed value srcMemBuffer.Fill(0x01010101); - const char *pScratchCopyDword; + const char *pScratchCopyDwordIsa; if (m_FamilyId < FAMILY_AI) - pScratchCopyDword = gfx8_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx8; else if (m_FamilyId < FAMILY_AL) - pScratchCopyDword = gfx9_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9; else if (m_FamilyId == FAMILY_AL) - pScratchCopyDword = aldbrn_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9aldbrn; else - pScratchCopyDword = gfx10_ScratchCopyDword; - m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer); + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx10; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pScratchCopyDwordIsa, isaBuffer.As())); const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode); @@ -1728,17 +1660,18 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) { // dstBuffer 
is cpu accessible gtt memory HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode); - const char *pScratchCopyDword; + const char *pScratchCopyDwordIsa; if (m_FamilyId < FAMILY_AI) - pScratchCopyDword = gfx8_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx8; else if (m_FamilyId < FAMILY_AL) - pScratchCopyDword = gfx9_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9; else if (m_FamilyId == FAMILY_AL) - pScratchCopyDword = aldbrn_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9aldbrn; else - pScratchCopyDword = gfx10_ScratchCopyDword; + pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx10; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pScratchCopyDwordIsa, isaBuffer.As())); - m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer); Dispatch dispatch0(isaBuffer); dispatch0.SetArgs(mem0, dstBuffer.As()); dispatch0.Submit(queue); @@ -2109,12 +2042,14 @@ TEST_F(KFDMemoryTest, HostHdpFlush) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - const char *pCopyOnSignal; + const char *pCopyOnSignalIsa; if (m_FamilyId < FAMILY_NV) - pCopyOnSignal = gfx9_CopyOnSignal; + pCopyOnSignalIsa = CopyOnSignalIsa_gfx9; else - pCopyOnSignal = gfx10_CopyOnSignal; - m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer); + pCopyOnSignalIsa = CopyOnSignalIsa_gfx10; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pCopyOnSignalIsa, isaBuffer.As())); + Dispatch dispatch0(isaBuffer); dispatch0.SetArgs(buffer, NULL); dispatch0.Submit(queue); @@ -2234,12 +2169,14 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(nodes[0])); HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/); - const char *pCopyOnSignal; + const char *pCopyOnSignalIsa; if (m_FamilyId < FAMILY_NV) - pCopyOnSignal = gfx9_CopyOnSignal; + pCopyOnSignalIsa = 
CopyOnSignalIsa_gfx9; else - pCopyOnSignal = gfx10_CopyOnSignal; - m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer); + pCopyOnSignalIsa = CopyOnSignalIsa_gfx10; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pCopyOnSignalIsa, isaBuffer.As())); + Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer, NULL); dispatch.Submit(queue); @@ -2247,12 +2184,14 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) { PM4Queue queue0; ASSERT_SUCCESS(queue0.Create(nodes[1])); HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/); - const char *pWriteAndSignal; + const char *pWriteAndSignalIsa; if (m_FamilyId < FAMILY_NV) - pWriteAndSignal = gfx9_WriteAndSignal; + pWriteAndSignalIsa = WriteAndSignalIsa_gfx9; else - pWriteAndSignal = gfx10_WriteAndSignal; - m_pIsaGen->CompileShader(pWriteAndSignal, "WriteAndSignal", isaBuffer0); + pWriteAndSignalIsa = WriteAndSignalIsa_gfx10; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pWriteAndSignalIsa, isaBuffer0.As())); + Dispatch dispatch0(isaBuffer0); dispatch0.SetArgs(buffer, mmioBase); dispatch0.Submit(queue0); @@ -2304,7 +2243,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnSdmaWrite) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa_gfx9, isaBuffer.As())); + Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); dispatch.Submit(queue); @@ -2357,7 +2298,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnCPUWrite) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa_gfx9, isaBuffer.As())); + Dispatch dispatch(isaBuffer); 
dispatch.SetArgs(buffer, buffer+100); dispatch.Submit(queue); @@ -2419,7 +2362,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa_gfx9, isaBuffer.As())); + Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); dispatch.Submit(queue); @@ -2500,7 +2445,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa_gfx9aldbrn, isaBuffer.As())); + Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); dispatch.Submit(queue); @@ -2515,7 +2462,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) { PM4Queue queue1; ASSERT_SUCCESS(queue1.Create(nondefaultNode)); HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa_gfx9aldbrn, isaBuffer1.As())); + Dispatch dispatch1(isaBuffer1); dispatch1.SetArgs(buffer.As(), buffer.As()+dwSource); dispatch1.Submit(queue1); @@ -2569,7 +2518,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa_gfx9aldbrn, 
isaBuffer.As())); + Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer, buffer+dwLocation); dispatch.Submit(queue); @@ -2627,10 +2578,13 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); + const char* pPollAndCopyIsa; if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) - m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer); + pPollAndCopyIsa = PollAndCopyIsa_gfx9aldbrn; else - m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer); + pPollAndCopyIsa = PollAndCopyIsa_gfx9; + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pPollAndCopyIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation); From 3bbfce112be05cd546e18fd5d63a08813044c999 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:46:46 -0400 Subject: [PATCH 05/27] kfdtest: Update KFDQMTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes to from IsaGen (CompileShader) to Assembler (RunAssembleBuf) Signed-off-by: Graham Sider Change-Id: I669f076b5c34eb90349865eeca1b29e17c9e80d6 [ROCm/ROCR-Runtime commit: 08d38fb14010fd0bcf24d73ce626d8b5bffc49d8] --- .../tests/kfdtest/src/KFDQMTest.cpp | 200 +++++++++--------- 1 file changed, 97 insertions(+), 103 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp index 9b4003b68e..c28715639f 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp @@ -34,6 +34,100 @@ #include "Dispatch.hpp" +/* A simple isa loop program with dense mathematic operations + * s1 controls the number iterations of the loop + * This shader can be used by GFX8, GFX9 and GFX10 + */ +static const char* LoopIsa = R"( + .text + s_movk_i32 s0, 0x0008 + s_movk_i32 s1, 0x00ff + v_mov_b32 v0, 0 + 
v_mov_b32 v1, 0 + v_mov_b32 v2, 0 + v_mov_b32 v3, 0 + v_mov_b32 v4, 0 + v_mov_b32 v5, 0 + v_mov_b32 v6, 0 + v_mov_b32 v7, 0 + v_mov_b32 v8, 0 + v_mov_b32 v9, 0 + v_mov_b32 v10, 0 + v_mov_b32 v11, 0 + v_mov_b32 v12, 0 + v_mov_b32 v13, 0 + v_mov_b32 v14, 0 + v_mov_b32 v15, 0 + v_mov_b32 v16, 0 + LOOP: + s_mov_b32 s8, s4 + s_mov_b32 s9, s1 + s_mov_b32 s10, s6 + s_mov_b32 s11, s7 + s_cmp_le_i32 s1, s0 + s_cbranch_scc1 END_OF_PGM + s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10 + v_add_f32 v0, 2.0, v0 + v_cvt_f32_i32 v17, s1 + s_waitcnt lgkmcnt(0) + v_add_f32 v18, s8, v17 + v_add_f32 v19, s9, v17 + v_add_f32 v20, s10, v17 + v_add_f32 v21, s11, v17 + v_add_f32 v22, s12, v17 + v_add_f32 v23, s13, v17 + v_add_f32 v24, s14, v17 + v_add_f32 v17, s15, v17 + v_log_f32 v25, v18 + v_mul_f32 v25, v22, v25 + v_exp_f32 v25, v25 + v_log_f32 v26, v19 + v_mul_f32 v26, v23, v26 + v_exp_f32 v26, v26 + v_log_f32 v27, v20 + v_mul_f32 v27, v24, v27 + v_exp_f32 v27, v27 + v_log_f32 v28, v21 + v_mul_f32 v28, v17, v28 + v_exp_f32 v28, v28 + v_add_f32 v5, v5, v25 + v_add_f32 v6, v6, v26 + v_add_f32 v7, v7, v27 + v_add_f32 v8, v8, v28 + v_mul_f32 v18, 0x3fb8aa3b, v18 + v_exp_f32 v18, v18 + v_mul_f32 v19, 0x3fb8aa3b, v19 + v_exp_f32 v19, v19 + v_mul_f32 v20, 0x3fb8aa3b, v20 + v_exp_f32 v20, v20 + v_mul_f32 v21, 0x3fb8aa3b, v21 + v_exp_f32 v21, v21 + v_add_f32 v9, v9, v18 + v_add_f32 v10, v10, v19 + v_add_f32 v11, v11, v20 + v_add_f32 v12, v12, v21 + v_sqrt_f32 v18, v22 + v_sqrt_f32 v19, v23 + v_sqrt_f32 v20, v24 + v_sqrt_f32 v21, v17 + v_add_f32 v13, v13, v18 + v_add_f32 v14, v14, v19 + v_add_f32 v15, v15, v20 + v_add_f32 v16, v16, v21 + v_rsq_f32 v18, v22 + v_rsq_f32 v19, v23 + v_rsq_f32 v20, v24 + v_rsq_f32 v17, v17 + v_add_f32 v1, v1, v18 + v_add_f32 v2, v2, v19 + v_add_f32 v3, v3, v20 + v_add_f32 v4, v4, v17 + s_add_u32 s0, s0, 1 + s_branch LOOP + END_OF_PGM: + s_endpgm +)"; + void KFDQMTest::SetUp() { ROUTINE_START @@ -677,111 +771,12 @@ TEST_F(KFDQMTest, OverSubscribeCpQueues) { TEST_END } 
-/* A simple isa loop program with dense mathematic operations - * s1 controls the number iterations of the loop - * This shader can be used by GFX8, GFX9 and GFX10 - */ -static const char *loop_isa = \ -"\ -shader loop_isa\n\ -wave_size(32)\n\ -type(CS)\n\ - s_movk_i32 s0, 0x0008\n\ - s_movk_i32 s1, 0x00ff\n\ - v_mov_b32 v0, 0\n\ - v_mov_b32 v1, 0\n\ - v_mov_b32 v2, 0\n\ - v_mov_b32 v3, 0\n\ - v_mov_b32 v4, 0\n\ - v_mov_b32 v5, 0\n\ - v_mov_b32 v6, 0\n\ - v_mov_b32 v7, 0\n\ - v_mov_b32 v8, 0\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0\n\ - v_mov_b32 v11, 0\n\ - v_mov_b32 v12, 0\n\ - v_mov_b32 v13, 0\n\ - v_mov_b32 v14, 0\n\ - v_mov_b32 v15, 0\n\ - v_mov_b32 v16, 0\n\ - LOOP:\n\ - s_mov_b32 s8, s4\n\ - s_mov_b32 s9, s1\n\ - s_mov_b32 s10, s6\n\ - s_mov_b32 s11, s7\n\ - s_cmp_le_i32 s1, s0\n\ - s_cbranch_scc1 END_OF_PGM\n\ - s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10\n\ - v_add_f32 v0, 2.0, v0\n\ - v_cvt_f32_i32 v17, s1\n\ -s_waitcnt lgkmcnt(0)\n\ - v_add_f32 v18, s8, v17\n\ - v_add_f32 v19, s9, v17\n\ - v_add_f32 v20, s10, v17\n\ - v_add_f32 v21, s11, v17\n\ - v_add_f32 v22, s12, v17\n\ - v_add_f32 v23, s13, v17\n\ - v_add_f32 v24, s14, v17\n\ - v_add_f32 v17, s15, v17\n\ - v_log_f32 v25, v18\n\ - v_mul_f32 v25, v22, v25\n\ - v_exp_f32 v25, v25\n\ - v_log_f32 v26, v19\n\ - v_mul_f32 v26, v23, v26\n\ - v_exp_f32 v26, v26\n\ - v_log_f32 v27, v20\n\ - v_mul_f32 v27, v24, v27\n\ - v_exp_f32 v27, v27\n\ - v_log_f32 v28, v21\n\ - v_mul_f32 v28, v17, v28\n\ - v_exp_f32 v28, v28\n\ - v_add_f32 v5, v5, v25\n\ - v_add_f32 v6, v6, v26\n\ - v_add_f32 v7, v7, v27\n\ - v_add_f32 v8, v8, v28\n\ - v_mul_f32 v18, 0x3fb8aa3b, v18\n\ - v_exp_f32 v18, v18\n\ - v_mul_f32 v19, 0x3fb8aa3b, v19\n\ - v_exp_f32 v19, v19\n\ - v_mul_f32 v20, 0x3fb8aa3b, v20\n\ - v_exp_f32 v20, v20\n\ - v_mul_f32 v21, 0x3fb8aa3b, v21\n\ - v_exp_f32 v21, v21\n\ - v_add_f32 v9, v9, v18\n\ - v_add_f32 v10, v10, v19\n\ - v_add_f32 v11, v11, v20\n\ - v_add_f32 v12, v12, v21\n\ - v_sqrt_f32 v18, v22\n\ - v_sqrt_f32 
v19, v23\n\ - v_sqrt_f32 v20, v24\n\ - v_sqrt_f32 v21, v17\n\ - v_add_f32 v13, v13, v18\n\ - v_add_f32 v14, v14, v19\n\ - v_add_f32 v15, v15, v20\n\ - v_add_f32 v16, v16, v21\n\ - v_rsq_f32 v18, v22\n\ - v_rsq_f32 v19, v23\n\ - v_rsq_f32 v20, v24\n\ - v_rsq_f32 v17, v17\n\ - v_add_f32 v1, v1, v18\n\ - v_add_f32 v2, v2, v19\n\ - v_add_f32 v3, v3, v20\n\ - v_add_f32 v4, v4, v17\n\ - s_add_u32 s0, s0, 1\n\ - s_branch LOOP\n\ - END_OF_PGM:\n\ - s_endpgm\n\ - end\n\ -"; - HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) { HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false); HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer); + EXPECT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetDim(1024, 16, 16); @@ -838,7 +833,6 @@ TEST_F(KFDQMTest, BasicCuMaskingLinear) { TEST_START(TESTPROFILE_RUNALL); int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; - m_pIsaGen = IsaGenerator::Create(m_FamilyId); if (m_FamilyId >= FAMILY_VI) { const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode); @@ -982,7 +976,7 @@ TEST_F(KFDQMTest, QueuePriorityOnDifferentPipe) { HSAint32 *syncBuffer = syncBuf.As(); HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As())); Dispatch dispatch[2] = { Dispatch(isaBuffer, true), @@ -1047,7 +1041,7 @@ TEST_F(KFDQMTest, QueuePriorityOnSamePipe) { HSAint32 *syncBuffer = syncBuf.As(); HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/); - 
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As())); Dispatch dispatch[2] = { Dispatch(isaBuffer, true), From 5f50a05e61820ac3f5bd8d3e7c53c8fa89c5f0f9 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:47:23 -0400 Subject: [PATCH 06/27] kfdtest: Update KFDCWSRTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes to from IsaGen (CompileShader) to Assembler (RunAssembleBuf) Signed-off-by: Graham Sider Change-Id: I174f1ea5332c499440b30d9bcf06836274428a0f [ROCm/ROCR-Runtime commit: c845b976d0e297c769f5ce8a5e31d9293464e5de] --- .../tests/kfdtest/src/KFDCWSRTest.cpp | 122 ++++++++---------- .../tests/kfdtest/src/KFDCWSRTest.hpp | 4 +- 2 files changed, 54 insertions(+), 72 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp index e12e697566..c7a2e4b312 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp @@ -38,76 +38,63 @@ * v6 - counter */ -static const char* iterate_isa_gfx8 = \ -"\ -shader iterate_isa\n\ -wave_size(32)\n\ -type(CS)\n\ - // copy the parameters from scalar registers to vector registers\n\ - v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\ - v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\ - v_mov_b32 v0, s4 // use workgroup id as index \n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\ - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\ - v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\ - v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\ - v_mov_b32 v6, 0 \n\ -LOOP: \n\ - v_add_u32 v6, vcc, 1, v6 \n\ - // compare the result value (v6) to iteration value (v2), and \n\ - // jump if equal (i.e. 
if VCC is not zero after the comparison) \n\ - v_cmp_lt_u32 vcc, v6, v2 \n\ - s_cbranch_vccnz LOOP \n\ - flat_store_dword v[4:5], v6 \n\ - s_waitcnt vmcnt(0)&lgkmcnt(0) \n\ - s_endpgm \n\ -end \n\ -"; +static const char* IterateIsa_gfx8 = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32 v2, s0 // v[2:3] = s[0:1] + v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 + v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v6, 0 + LOOP: + v_add_u32 v6, vcc, 1, v6 + // Compare the result value (v6) to iteration value (v2), and + // jump if equal (i.e. if VCC is not zero after the comparison) + v_cmp_lt_u32 vcc, v6, v2 + s_cbranch_vccnz LOOP + flat_store_dword v[4:5], v6 + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; -//This shader can be used by gfx9 and gfx10 -static const char* iterate_isa_gfx9 = \ -"\ -shader iterate_isa\n\ -wave_size(32)\n\ -type(CS)\n\ - // copy the parameters from scalar registers to vector registers\n\ - v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\ - v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\ - v_mov_b32 v0, s4 // use workgroup id as index \n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\ - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\ - v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\ - v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\ - v_mov_b32 v6, 0 \n\ -LOOP: \n\ - v_add_co_u32 v6, vcc, 1, v6 \n\ - // compare the result value (v6) to iteration value (v2), and \n\ - // jump if equal (i.e. 
if VCC is not zero after the comparison) \n\ - v_cmp_lt_u32 vcc, v6, v2 \n\ - s_cbranch_vccnz LOOP \n\ - flat_store_dword v[4:5], v6 \n\ - s_waitcnt vmcnt(0)&lgkmcnt(0) \n\ - s_endpgm \n\ -end \n\ -"; +// This shader can be used by gfx9 and gfx10 +static const char* IterateIsa_gfx9 = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32 v2, s0 // v[2:3] = s[0:1] + v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 + v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v6, 0 + LOOP: + v_add_co_u32 v6, vcc, 1, v6 + // Compare the result value (v6) to iteration value (v2), and + // jump if equal (i.e. if VCC is not zero after the comparison) + v_cmp_lt_u32 vcc, v6, v2 + s_cbranch_vccnz LOOP + flat_store_dword v[4:5], v6 + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; -static const char* infinite_isa = \ -"\ -shader infinite_isa \n\ -wave_size(32) \n\ -type(CS) \n\ -LOOP: \n\ - s_branch LOOP \n\ -end \n\ -"; +static const char* InfiniteIsa = R"( + .text + LOOP: + s_branch LOOP + s_endpgm +)"; void KFDCWSRTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - wave_number = 1; ROUTINE_END @@ -115,9 +102,6 @@ void KFDCWSRTest::SetUp() { void KFDCWSRTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; KFDBaseComponentTest::TearDown(); @@ -159,9 +143,9 @@ TEST_F(KFDCWSRTest, BasicTest) { uint64_t count1 = 400000000; if (m_FamilyId < FAMILY_AI) - pIterateIsa = iterate_isa_gfx8; + pIterateIsa = IterateIsa_gfx8; else - pIterateIsa = iterate_isa_gfx9; + pIterateIsa = IterateIsa_gfx9; if (isOnEmulator()) { // Divide the iterator times by 10000 so that the test can @@ -172,7 +156,7 @@ TEST_F(KFDCWSRTest, BasicTest) { unsigned int* result1 = resultBuf1.As(); 
- m_pIsaGen->CompileShader(pIterateIsa, "iterate_isa", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pIterateIsa, isaBuffer.As())); PM4Queue queue1; @@ -236,7 +220,7 @@ TEST_F(KFDCWSRTest, InterruptRestore) { if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) { HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->CompileShader(infinite_isa, "infinite_isa", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteIsa, isaBuffer.As())); PM4Queue queue1, queue2, queue3; diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.hpp index 779180ea3d..53c925b0aa 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.hpp @@ -27,12 +27,11 @@ #include #include "PM4Queue.hpp" -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" class KFDCWSRTest : public KFDBaseComponentTest { public: - KFDCWSRTest() :m_pIsaGen(NULL) {} + KFDCWSRTest() {} ~KFDCWSRTest() {} protected: @@ -41,7 +40,6 @@ class KFDCWSRTest : public KFDBaseComponentTest { protected: // Members unsigned wave_number; - IsaGenerator* m_pIsaGen; }; #endif // __KFD_CWSR_TEST__H__ From 798de4f4467cb4e4e3ff174bcf9a7dfb7f3f6361 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:47:40 -0400 Subject: [PATCH 07/27] kfdtest: Update KFDEvictTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes to from IsaGen (CompileShader) to Assembler (RunAssembleBuf) Signed-off-by: Graham Sider Change-Id: I7333d0e45ccd3f43690a2a01227f89a6e04fcecb [ROCm/ROCR-Runtime commit: b44d6762bd84afbf01420bc243ceb63a065556ca] --- .../tests/kfdtest/src/KFDEvictTest.cpp | 253 ++++++++---------- .../tests/kfdtest/src/KFDEvictTest.hpp | 5 +- 2 files changed, 117 insertions(+), 141 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp 
b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp index 7ec86bc8bd..bf721238c8 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp @@ -36,23 +36,132 @@ #define SDMA_NOP 0x0 +/* Shader to read local buffers using multiple wavefronts in parallel + * until address buffer is filled with specific value 0x5678 by host program, + * then each wavefront fills value 0x5678 at corresponding result buffer and quit + * + * Initial state: + * s[0:1] - address buffer base address + * s[2:3] - result buffer base address + * s4 - workgroup id + * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 + * Registers: + * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X + * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 + * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 + * v[6:7] - local buf address used for read test + * + * This shader can be used by gfx9 and gfx10 + * + */ + +static const char* ReadMemoryIsa_gfx9 = R"( + .text + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_add_co_u32 v5, vcc, v5, vcc_lo + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_add_co_u32 v3, vcc, v3, vcc_lo + // load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + 
// Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_co_u32 v9, vcc, v9, v10 + v_add_co_u32 v12, vcc, v12, v10 + v_add_co_u32 v13, vcc, v13, vcc_lo + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; + +static const char* ReadMemoryIsa_gfx8 = R"( + .text + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_addc_u32 v5, vcc, v5, 0, vcc + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_addc_u32 v3, vcc, v3, 0, vcc + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_u32 v9, vcc, v9, v10 + v_add_u32 v12, vcc, v12, v10 + v_addc_u32 v13, vcc, v13, 0, vcc + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword 
v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; + +std::string KFDEvictTest::CreateShader() { + if (m_FamilyId < FAMILY_AI) + return ReadMemoryIsa_gfx8; + else + return ReadMemoryIsa_gfx9; +} + + void KFDEvictTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDEvictTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -286,136 +395,6 @@ void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handl EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle)); } -/* Shader to read local buffers using multiple wavefronts in parallel - * until address buffer is filled with specific value 0x5678 by host program, - * then each wavefront fills value 0x5678 at corresponding result buffer and quit - * - * Initial state: - * s[0:1] - address buffer base address - * s[2:3] - result buffer base address - * s4 - workgroup id - * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 - * Registers: - * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X - * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 - * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 - * v[6:7] - local buf address used for read test - * - * This shader can be used by gfx9 and gfx10 - * - */ - -static const char* gfx9_ReadMemory = -"\ - shader ReadMemory\n\ - wave_size(32)\n\ - type(CS)\n\ - \n\ - // compute address of corresponding output buffer\n\ - v_mov_b32 v0, s4 // use workgroup id as index\n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ - v_mov_b32 v5, s3\n\ - v_add_co_u32 v5, vcc, v5, vcc_lo\n\ - \n\ - // compute input buffer offset used to store corresponding local buffer address\n\ - v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ - v_add_co_u32 v2, vcc, 
s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ - v_mov_b32 v3, s1\n\ - v_add_co_u32 v3, vcc, v3, vcc_lo\n\ - \n\ - // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ - flat_load_dwordx2 v[6:7], v[2:3] slc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - \n\ - v_mov_b32 v8, 0x5678\n\ - s_movk_i32 s8, 0x5678\n\ -L_REPEAT:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - s_cmp_eq_i32 s16, s8\n\ - s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ - // loop read 64M local buffer starting at v[6:7]\n\ - // every 4k page only read once\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0x1000 // 4k page\n\ - v_mov_b32 v11, 0x4000000 // 64M size\n\ - v_mov_b32 v12, v6\n\ - v_mov_b32 v13, v7\n\ -L_LOOP_READ:\n\ - flat_load_dwordx2 v[14:15], v[12:13] slc\n\ - v_add_co_u32 v9, vcc, v9, v10 \n\ - v_add_co_u32 v12, vcc, v12, v10\n\ - v_add_co_u32 v13, vcc, v13, vcc_lo\n\ - v_cmp_lt_u32 vcc, v9, v11\n\ - s_cbranch_vccnz L_LOOP_READ\n\ - s_branch L_REPEAT\n\ -L_QUIT:\n\ - flat_store_dword v[4:5], v8\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ - s_endpgm\n\ - end\n\ -"; - -static const char* gfx8_ReadMemory = -"\ - shader ReadMemory\n\ - asic(VI)\n\ - type(CS)\n\ - \n\ - // compute address of corresponding output buffer\n\ - v_mov_b32 v0, s4 // use workgroup id as index\n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ - v_mov_b32 v5, s3\n\ - v_addc_u32 v5, vcc, v5, 0, vcc\n\ - \n\ - // compute input buffer offset used to store corresponding local buffer address\n\ - v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ - v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ - v_mov_b32 v3, s1\n\ - v_addc_u32 v3, vcc, v3, 0, vcc\n\ - \n\ - // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ - flat_load_dwordx2 v[6:7], v[2:3] slc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - 
\n\ - v_mov_b32 v8, 0x5678\n\ - s_movk_i32 s8, 0x5678\n\ -L_REPEAT:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - s_cmp_eq_i32 s16, s8\n\ - s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ - // loop read 64M local buffer starting at v[6:7]\n\ - // every 4k page only read once\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0x1000 // 4k page\n\ - v_mov_b32 v11, 0x4000000 // 64M size\n\ - v_mov_b32 v12, v6\n\ - v_mov_b32 v13, v7\n\ -L_LOOP_READ:\n\ - flat_load_dwordx2 v[14:15], v[12:13] slc\n\ - v_add_u32 v9, vcc, v9, v10 \n\ - v_add_u32 v12, vcc, v12, v10\n\ - v_addc_u32 v13, vcc, v13, 0, vcc\n\ - v_cmp_lt_u32 vcc, v9, v11\n\ - s_cbranch_vccnz L_LOOP_READ\n\ - s_branch L_REPEAT\n\ -L_QUIT:\n\ - flat_store_dword v[4:5], v8\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ - s_endpgm\n\ - end\n\ -"; - -std::string KFDEvictTest::CreateShader() { - if (m_FamilyId < FAMILY_AI) - return gfx8_ReadMemory; - else - return gfx9_ReadMemory; -} - /* Evict and restore procedure basic test * * Use N_PROCESSES processes to allocate vram buf size larger than total vram size @@ -567,7 +546,7 @@ TEST_F(KFDEvictTest, QueueTest) { HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode); - m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As())); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp index 2b838a5388..d70aada6b4 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp @@ -27,15 +27,13 @@ #include #include #include "KFDMultiProcessTest.hpp" -#include "IsaGenerator.hpp" #include "PM4Queue.hpp" // @class 
KFDEvictTest // Test eviction and restore procedure using two processes class KFDEvictTest : public KFDMultiProcessTest { public: - KFDEvictTest(void): m_pIsaGen(NULL) {} - + KFDEvictTest(void) {} ~KFDEvictTest(void) {} protected: @@ -52,7 +50,6 @@ class KFDEvictTest : public KFDMultiProcessTest { PM4Queue *computeQueue); protected: // Members - IsaGenerator* m_pIsaGen; HsaMemFlags m_Flags; void* m_pBuf; }; From e917805a3343787e91b260a1bf2bcc76c74c1f7a Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 18:47:58 -0400 Subject: [PATCH 08/27] kfdtest: Update KFDGWSTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes from IsaGen (CompileShader) to Assembler (RunAssembleBuf) - Change gds:1 modifier to gds - Change offset0:0 modifier to offset:0 Signed-off-by: Graham Sider Change-Id: I2a863695bcf7344cf184a809704948ba3a0d230f [ROCm/ROCR-Runtime commit: ba9ccd32a191212b9adfe32018350d092e2f7656] --- .../tests/kfdtest/src/KFDGWSTest.cpp | 124 ++++++++---------- .../tests/kfdtest/src/KFDGWSTest.hpp | 6 +- 2 files changed, 54 insertions(+), 76 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp index 4c8aefc447..a6b3bec17b 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp @@ -26,23 +26,19 @@ #include "PM4Packet.hpp" #include "Dispatch.hpp" -/* Shader to initialize gws counter to 1*/ -const char* gfx9_10_GwsInit = -"\ -shader GwsInit\n\ -type(CS)\n\ -wave_size(32)\n\ - s_mov_b32 m0, 0\n\ - s_nop 0\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt 0\n\ - v_mov_b32 v0, s16\n\ - s_waitcnt 0\n\ - ds_gws_init v0 gds:1 offset0:0\n\ - s_waitcnt 0\n\ - s_endpgm\n\ - end\n\ -"; +/* Shader to initialize gws counter to 1 */ +static const char* GwsInitIsa_gfx9_10 = R"( + .text + s_mov_b32 m0, 0 + s_nop 0 + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt 0 + v_mov_b32 v0, s16 +
s_waitcnt 0 + ds_gws_init v0 offset:0 gds + s_waitcnt 0 + s_endpgm +)"; /* Atomically increase a value in memory * This is expected to be executed from @@ -50,67 +46,53 @@ wave_size(32)\n\ * GWS semaphore is used to guarantee * the operation is atomic. */ -const char* gfx9_AtomicIncrease = -"\ -shader AtomicIncrease\n\ -type(CS)\n\ -/* Assume src address in s0, s1 */\n\ - s_mov_b32 m0, 0\n\ - s_nop 0\n\ - ds_gws_sema_p gds:1 offset0:0\n\ - s_waitcnt 0\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt 0\n\ - s_add_u32 s16, s16, 1\n\ - s_store_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt lgkmcnt(0)\n\ - ds_gws_sema_v gds:1 offset0:0\n\ - s_waitcnt 0\n\ - s_endpgm\n\ - end\n\ -"; +static const char* AtomicIncreaseIsa_gfx9 = R"( + .text + // Assume src address in s0, s1 + s_mov_b32 m0, 0 + s_nop 0 + ds_gws_sema_p offset:0 gds + s_waitcnt 0 + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt 0 + s_add_u32 s16, s16, 1 + s_store_dword s16, s[0:1], 0x0 glc + s_waitcnt lgkmcnt(0) + ds_gws_sema_v offset:0 gds + s_waitcnt 0 + s_endpgm +)"; -const char* gfx10_AtomicIncrease = -"\ -shader AtomicIncrease\n\ -asic(GFX10)\n\ -type(CS)\n\ -wave_size(32)\n\ -/* Assume src address in s0, s1 */\n\ - s_mov_b32 m0, 0\n\ - s_mov_b32 exec_lo, 0x1\n\ - v_mov_b32 v0, s0\n\ - v_mov_b32 v1, s1\n\ - ds_gws_sema_p gds:1 offset0:0\n\ - s_waitcnt 0\n\ - flat_load_dword v2, v[0:1] glc:1 dlc:1\n\ - s_waitcnt 0\n\ - v_add_nc_u32 v2, v2, 1\n\ - flat_store_dword v[0:1], v2\n\ - s_waitcnt_vscnt null, 0\n\ - ds_gws_sema_v gds:1 offset0:0\n\ - s_waitcnt 0\n\ - s_endpgm\n\ - end\n\ -"; +static const char* AtomicIncreaseIsa_gfx10 = R"( + .text + // Assume src address in s0, s1 + s_mov_b32 m0, 0 + s_mov_b32 exec_lo, 0x1 + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + ds_gws_sema_p offset:0 gds + s_waitcnt 0 + flat_load_dword v2, v[0:1] glc dlc + s_waitcnt 0 + v_add_nc_u32 v2, v2, 1 + flat_store_dword v[0:1], v2 + s_waitcnt_vscnt null, 0 + ds_gws_sema_v offset:0 gds + s_waitcnt 0 + s_endpgm +)"; void 
KFDGWSTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDGWSTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -160,21 +142,21 @@ TEST_F(KFDGWSTest, Semaphore) { pNodeProperties->NumGws,&firstGWS)); EXPECT_EQ(0, firstGWS); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - m_pIsaGen->CompileShader(gfx9_10_GwsInit, "GwsInit", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa_gfx9_10, isaBuffer.As())); + Dispatch dispatch0(isaBuffer); buffer.Fill(numResources, 0, 4); dispatch0.SetArgs(buffer.As(), NULL); dispatch0.Submit(queue); dispatch0.Sync(); - const char *pAtomicIncrease; + const char *pAtomicIncreaseIsa; if (m_FamilyId <= FAMILY_AL) - pAtomicIncrease = gfx9_AtomicIncrease; + pAtomicIncreaseIsa = AtomicIncreaseIsa_gfx9; else - pAtomicIncrease = gfx10_AtomicIncrease; + pAtomicIncreaseIsa = AtomicIncreaseIsa_gfx10; - m_pIsaGen->CompileShader(pAtomicIncrease, "AtomicIncrease", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pAtomicIncreaseIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), NULL); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.hpp index 15e61ee235..8413145982 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.hpp @@ -26,20 +26,16 @@ #include -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" class KFDGWSTest : public KFDBaseComponentTest { public: - KFDGWSTest() :m_pIsaGen(NULL) {} + KFDGWSTest() {} ~KFDGWSTest() {} protected: virtual void SetUp(); virtual void TearDown(); - - protected: // Members - IsaGenerator* m_pIsaGen; }; #endif // __KFD_GWS_TEST__H__ From b5eb13f150c9ce4b4991e128f342f52c57fc088f Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 24 Sep 2021 
18:48:15 -0400 Subject: [PATCH 09/27] kfdtest: Update KFDSVMEvictTest to LLVM Asm - Reformat shaders for legibility - Move assembly processes from IsaGen (CompileShader) to Assembler (RunAssembleBuf) Signed-off-by: Graham Sider Change-Id: Id1eb3856bc74bf0da46685c5dc08e91f5df66d4f [ROCm/ROCR-Runtime commit: a7b85fdb08ba6c314259e7f265f8cdfe586e861a] --- .../tests/kfdtest/src/KFDSVMEvictTest.cpp | 237 ++++++++---------- .../tests/kfdtest/src/KFDSVMEvictTest.hpp | 1 - 2 files changed, 111 insertions(+), 127 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp index d41aedac74..e712e7b44a 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp @@ -35,6 +35,109 @@ #define ALLOCATE_BUF_SIZE_MB (64) #define ALLOCATE_RETRY_TIMES (3) +/* Shader to read local buffers using multiple wavefronts in parallel + * until address buffer is filled with specific value 0x5678 by host program, + * then each wavefront fills value 0x5678 at corresponding result buffer and quit + * + * initial state: + * s[0:1] - address buffer base address + * s[2:3] - result buffer base address + * s4 - workgroup id + * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 + * registers: + * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X + * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 + * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 + * v[6:7] - local buf address used for read test + */ +static const char* gfx9_ReadMemory = R"( + .text + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_add_u32 v5, vcc_lo, v5 + // Compute input buffer offset used to store corresponding
local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_add_u32 v3, vcc_lo, v3 + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_u32 v9, v9, v10 + v_add_co_u32 v12, vcc, v12, v10 + v_add_u32 v13, vcc_lo, v13 + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; + +static const char* gfx8_ReadMemory = R"( + .text + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_addc_u32 v5, vcc, v5, 0, vcc + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_addc_u32 v3, vcc, v3, 0, vcc + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to 
finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_u32 v9, vcc, v9, v10 + v_add_u32 v12, vcc, v12, v10 + v_addc_u32 v13, vcc, v13, 0, vcc + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; + void KFDSVMEvictTest::SetUp() { ROUTINE_START @@ -55,6 +158,13 @@ void KFDSVMEvictTest::TearDown() { ROUTINE_END } +std::string KFDSVMEvictTest::CreateShader() { + if (m_FamilyId >= FAMILY_AI) + return gfx9_ReadMemory; + else + return gfx8_ReadMemory; +} + HSAint32 KFDSVMEvictTest::GetBufferCounter(HSAuint64 vramSize, HSAuint64 vramBufSize) { HSAuint64 vramBufSizeInPages = vramBufSize >> PAGE_SHIFT; HSAuint64 sysMemSize = GetSysMemSize(); @@ -234,131 +344,6 @@ TEST_F(KFDSVMEvictTest, BasicTest) { TEST_END } -/* Shader to read local buffers using multiple wavefronts in parallel - * until address buffer is filled with specific value 0x5678 by host program, - * then each wavefront fills value 0x5678 at corresponding result buffer and quit - * - * initial state: - * s[0:1] - address buffer base address - * s[2:3] - result buffer base address - * s4 - workgroup id - * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 - * registers: - * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X - * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 - * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 - * v[6:7] - local buf address used for read test - */ -static const char* gfx9_ReadMemory = -"\ - shader ReadMemory\n\ - type(CS)\n\ - \n\ - // compute 
address of corresponding output buffer\n\ - v_mov_b32 v0, s4 // use workgroup id as index\n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ - v_mov_b32 v5, s3\n\ - v_add_u32 v5, vcc_lo, v5\n\ - \n\ - // compute input buffer offset used to store corresponding local buffer address\n\ - v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ - v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ - v_mov_b32 v3, s1\n\ - v_add_u32 v3, vcc_lo, v3\n\ - \n\ - // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ - flat_load_dwordx2 v[6:7], v[2:3] slc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - \n\ - v_mov_b32 v8, 0x5678\n\ - s_movk_i32 s8, 0x5678\n\ -L_REPEAT:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - s_cmp_eq_i32 s16, s8\n\ - s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ - // loop read 64M local buffer starting at v[6:7]\n\ - // every 4k page only read once\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0x1000 // 4k page\n\ - v_mov_b32 v11, 0x4000000 // 64M size\n\ - v_mov_b32 v12, v6\n\ - v_mov_b32 v13, v7\n\ -L_LOOP_READ:\n\ - flat_load_dwordx2 v[14:15], v[12:13] slc\n\ - v_add_u32 v9, v9, v10 \n\ - v_add_co_u32 v12, vcc, v12, v10\n\ - v_add_u32 v13, vcc_lo, v13\n\ - v_cmp_lt_u32 vcc, v9, v11\n\ - s_cbranch_vccnz L_LOOP_READ\n\ - s_branch L_REPEAT\n\ -L_QUIT:\n\ - flat_store_dword v[4:5], v8\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ - s_endpgm\n\ - end\n\ -"; - -static const char* gfx8_ReadMemory = -"\ - shader ReadMemory\n\ - asic(VI)\n\ - type(CS)\n\ - \n\ - // compute address of corresponding output buffer\n\ - v_mov_b32 v0, s4 // use workgroup id as index\n\ - v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ - v_mov_b32 v5, s3\n\ - v_addc_u32 v5, vcc, v5, 0, vcc\n\ - \n\ - // compute input buffer offset used to store 
corresponding local buffer address\n\ - v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ - v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ - v_mov_b32 v3, s1\n\ - v_addc_u32 v3, vcc, v3, 0, vcc\n\ - \n\ - // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ - flat_load_dwordx2 v[6:7], v[2:3] slc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - \n\ - v_mov_b32 v8, 0x5678\n\ - s_movk_i32 s8, 0x5678\n\ -L_REPEAT:\n\ - s_load_dword s16, s[0:1], 0x0 glc\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ - s_cmp_eq_i32 s16, s8\n\ - s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ - // loop read 64M local buffer starting at v[6:7]\n\ - // every 4k page only read once\n\ - v_mov_b32 v9, 0\n\ - v_mov_b32 v10, 0x1000 // 4k page\n\ - v_mov_b32 v11, 0x4000000 // 64M size\n\ - v_mov_b32 v12, v6\n\ - v_mov_b32 v13, v7\n\ -L_LOOP_READ:\n\ - flat_load_dwordx2 v[14:15], v[12:13] slc\n\ - v_add_u32 v9, vcc, v9, v10 \n\ - v_add_u32 v12, vcc, v12, v10\n\ - v_addc_u32 v13, vcc, v13, 0, vcc\n\ - v_cmp_lt_u32 vcc, v9, v11\n\ - s_cbranch_vccnz L_LOOP_READ\n\ - s_branch L_REPEAT\n\ -L_QUIT:\n\ - flat_store_dword v[4:5], v8\n\ - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ - s_endpgm\n\ - end\n\ -"; - -std::string KFDSVMEvictTest::CreateShader() { - if (m_FamilyId >= FAMILY_AI) - return gfx9_ReadMemory; - else - return gfx8_ReadMemory; -} - /* Evict and restore queue test * * N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored @@ -434,7 +419,7 @@ TEST_F(KFDSVMEvictTest, QueueTest) { for (i = 0; i < wavefront_num; i++) *(localBufAddr + i) = pBuffers[i]; - m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As())); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.hpp 
b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.hpp index 2b8a1de957..3f26287cc0 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.hpp @@ -28,7 +28,6 @@ #include #include "KFDLocalMemoryTest.hpp" #include "KFDBaseComponentTest.hpp" -#include "IsaGenerator.hpp" // @class KFDEvictTest // Test eviction and restore procedure using two processes From e67ec560912cb2673a30793b08b2756557c93596 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 13:45:08 -0400 Subject: [PATCH 10/27] kfdtest: Add ShaderStore.cpp/hpp Initial commit for ShaderStore.cpp/hpp. These files will contain the const char* strings for all shaders used within KFDTest. The LLVM assembler now selects the correct instructions for the various GFX versions by means of directives embedded in the shader assembly. Signed-off-by: Graham Sider Change-Id: I2887a03b33d5c2cc382e4f96c2bc3e067715ab54 [ROCm/ROCR-Runtime commit: 34ca37d9e8a959b48ee336808e2a415ef38eaf17] --- .../rocr-runtime/tests/kfdtest/CMakeLists.txt | 1 + .../kfdtest/src/KFDBaseComponentTest.hpp | 1 + .../tests/kfdtest/src/ShaderStore.cpp | 65 +++++++++++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 33 ++++++++++ 4 files changed, 100 insertions(+) create mode 100644 projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp create mode 100644 projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp diff --git a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt index 39994dbff3..e175253dad 100644 --- a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt +++ b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt @@ -146,6 +146,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp src/GoogleTestExtension.cpp src/IndirectBuffer.cpp src/Assemble.cpp + src/ShaderStore.cpp src/IsaGenerator.cpp src/IsaGenerator_Aldebaran.cpp src/IsaGenerator_Gfx10.cpp diff --git
a/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp index ec3053c3d3..cc87465ad5 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDBaseComponentTest.hpp @@ -35,6 +35,7 @@ #include "OSWrapper.hpp" #include "KFDTestUtil.hpp" #include "Assemble.hpp" +#include "ShaderStore.hpp" // @class KFDBaseComponentTest class KFDBaseComponentTest : public testing::Test { diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp new file mode 100644 index 0000000000..5f46bf844a --- /dev/null +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +/** + * Common + */ + +const char *NoopIsa = R"( + .text + s_endpgm +)"; + +const char *CopyDwordIsa = R"( + .text + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + flat_load_dword v4, v[0:1] glc slc + s_waitcnt 0 + flat_store_dword v[2:3], v4 glc slc + s_endpgm +)"; + +const char *InfiniteLoopIsa = R"( + .text + LOOP: + s_branch LOOP + s_endpgm +)"; + +const char *AtomicIncIsa = R"( + .text + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + .if (.amdgcn.gfx_generation_number >= 8) + v_mov_b32 v2, 1 + flat_atomic_add v3, v[0:1], v2 glc slc + .else + v_mov_b32 v2, -1 + flat_atomic_inc v3, v[0:1], v2 glc slc + .endif + s_waitcnt 0 + s_endpgm +)"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp new file mode 100644 index 0000000000..2344c5bca2 --- /dev/null +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _SHADERSTORE_H_ +#define _SHADERSTORE_H_ + +/* Common */ +extern const char *NoopIsa; +extern const char *CopyDwordIsa; +extern const char *InfiniteLoopIsa; +extern const char *AtomicIncIsa; + +#endif // _SHADERSTORE_H_ From e628983aed14339467889aaf8e789e06fccf1171 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 13:47:01 -0400 Subject: [PATCH 11/27] kfdtest: Move KFDMemoryTest shaders to ShaderStore Signed-off-by: Graham Sider Change-Id: I3335ca1f9dbe849233cf85253e0e92b56a20b8c9 [ROCm/ROCR-Runtime commit: c926d83b5a5ece74324e7e422315e6c5fd6e4afe] --- .../tests/kfdtest/src/KFDGraphicsInterop.cpp | 3 +- .../tests/kfdtest/src/KFDMemoryTest.cpp | 364 ++---------------- .../tests/kfdtest/src/KFDMemoryTest.hpp | 5 +- .../tests/kfdtest/src/ShaderStore.cpp | 225 +++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 9 + 5 files changed, 264 insertions(+), 342 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDGraphicsInterop.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDGraphicsInterop.cpp index bf2a928db1..48224bce0f 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDGraphicsInterop.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDGraphicsInterop.cpp @@ -101,7 +101,8 @@ TEST_F(KFDGraphicsInterop, RegisterGraphicsHandle) { // Copy contents to a system memory buffer for comparison HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp 
b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp index 980c9c7bf8..4e9cb5fd19 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.cpp @@ -39,292 +39,17 @@ #include "SDMAPacket.hpp" #include "linux/kfd_ioctl.h" -static const char* ScratchCopyDwordIsa_gfx8 = R"( - .text - // Copy the parameters from scalar registers to vector registers - v_mov_b32_e32 v0, s0 - v_mov_b32_e32 v1, s1 - v_mov_b32_e32 v2, s2 - v_mov_b32_e32 v3, s3 - // Setup the scratch parameters. This assumes a single 16-reg block - s_mov_b32 flat_scratch_lo, 8 - s_mov_b32 flat_scratch_hi, 0 - // Copy a dword between the passed addresses - flat_load_dword v4, v[0:1] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) - flat_store_dword v[2:3], v4 slc - s_endpgm -)"; -static const char* ScratchCopyDwordIsa_gfx9 = R"( - .text - // Copy the parameters from scalar registers to vector registers - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, s2 - v_mov_b32 v3, s3 - // Setup the scratch parameters. This assumes a single 16-reg block - s_mov_b32 flat_scratch_lo, s4 - s_mov_b32 flat_scratch_hi, s5 - // Copy a dword between the passed addresses - flat_load_dword v4, v[0:1] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) - flat_store_dword v[2:3], v4 slc - s_endpgm -)"; -static const char* ScratchCopyDwordIsa_gfx10 = R"( - .text - // Copy the parameters from scalar registers to vector registers - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, s2 - v_mov_b32 v3, s3 - // Setup the scratch parameters. 
This assumes a single 16-reg block - s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 - s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 - // Copy a dword between the passed addresses - flat_load_dword v4, v[0:1] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) - flat_store_dword v[2:3], v4 slc - s_endpgm -)"; -static const char* ScratchCopyDwordIsa_gfx9aldbrn = R"( - .text - // Copy the parameters from scalar registers to vector registers - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, s2 - v_mov_b32 v3, s3 - // Setup the scratch parameters. This assumes a single 16-reg block - s_mov_b32 flat_scratch_lo, s4 - s_mov_b32 flat_scratch_hi, s5 - // Copy a dword between the passed addresses - flat_load_dword v4, v[0:1] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) - flat_store_dword v[2:3], v4 slc - s_endpgm -)"; - -/* Continuously poll src buffer and check buffer value - * After src buffer is filled with specific value (0x5678, - * by host program), fill dst buffer with specific - * value(0x5678) and quit - */ -static const char* PollMemoryIsa_gfx9 = R"( - .text - // Assume src address in s0, s1, and dst address in s2, s3 - s_movk_i32 s18, 0x5678 - LOOP: - s_load_dword s16, s[0:1], 0x0 glc - s_cmp_eq_i32 s16, s18 - s_cbranch_scc0 LOOP - s_store_dword s18, s[2:3], 0x0 glc - s_endpgm -)"; - -/* Similar to PollMemoryIsa_gfx9 except that the buffer - * polled can be Non-coherant memory. SCC system-level - * cache coherence is not supported in scalar (smem) path. 
- * Use vmem operations with scc - */ -static const char* PollNCMemoryIsa_gfx9 = R"( - .text - // Assume src address in s0, s1, and dst address in s2, s3 - v_mov_b32 v6, 0x5678 - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - LOOP: - flat_load_dword v4, v[0:1] scc - v_cmp_eq_u32 vcc, v4, v6 - s_cbranch_vccz LOOP - v_mov_b32 v0, s2 - v_mov_b32 v1, s3 - flat_store_dword v[0:1], v6 scc - s_endpgm -)"; - -static const char* PollMemoryIsa_gfx10 = R"( - .text - // Assume src address in s0, s1, and dst address in s2, s3 - s_movk_i32 s18, 0x5678 - v_mov_b32 v0, s2 - v_mov_b32 v1, s3 - v_mov_b32 v2, 0x5678 - LOOP: - s_load_dword s16, s[0:1], 0x0 glc - s_cmp_eq_i32 s16, s18 - s_cbranch_scc0 LOOP - flat_store_dword v[0:1], v2 slc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -/* Input: A buffer of at least 3 dwords. - * DW0: used as a signal. 0xcafe means it is signaled - * DW1: Input buffer for device to read. - * DW2: Output buffer for device to write. - * Once receive signal, device will copy DW1 to DW2 - * This shader continously poll the signal buffer, - * Once signal buffer is signaled, it copies input buffer - * to output buffer - */ -static const char* CopyOnSignalIsa_gfx9 = R"( - .text - // Assume input buffer in s0, s1 - s_mov_b32 s18, 0xcafe - POLLSIGNAL: - s_load_dword s16, s[0:1], 0x0 glc - s_cmp_eq_i32 s16, s18 - s_cbranch_scc0 POLLSIGNAL - s_load_dword s17, s[0:1], 0x4 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_store_dword s17, s[0:1], 0x8 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -static const char* CopyOnSignalIsa_gfx10 = R"( - .text - // Assume input buffer in s0, s1 - s_add_u32 s2, s0, 0x8 - s_addc_u32 s3, s1, 0x0 - s_mov_b32 s18, 0xcafe - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v4, s2 - v_mov_b32 v5, s3 - POLLSIGNAL: - s_load_dword s16, s[0:1], 0x0 glc - s_cmp_eq_i32 s16, s18 - s_cbranch_scc0 POLLSIGNAL - s_load_dword s17, s[0:1], 0x4 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - v_mov_b32 v2, s17 - flat_store_dword v[4:5], v2 glc - s_waitcnt 
vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -/* Input0: A buffer of at least 2 dwords. - * DW0: used as a signal. Write 0xcafe to signal - * DW1: Write to this buffer for other device to read. - * Input1: mmio base address - */ -static const char* WriteAndSignalIsa_gfx9 = R"( - .text - // Assume input buffer in s0, s1 - s_mov_b32 s18, 0xbeef - s_store_dword s18, s[0:1], 0x4 glc - s_mov_b32 s18, 0x1 - s_store_dword s18, s[2:3], 0 glc - s_mov_b32 s18, 0xcafe - s_store_dword s18, s[0:1], 0x0 glc - s_endpgm -)"; - -/* Continuously poll the flag at src buffer - * After the flag of s[0:1] is 1 filled, - * copy the value from s[0:1]+4 to dst buffer - */ -static const char* PollAndCopyIsa_gfx9 = R"( - .text - // Assume src buffer in s[0:1] and dst buffer in s[2:3] - s_movk_i32 s18, 0x1 - LOOP: - s_load_dword s16, s[0:1], 0x0 glc - s_cmp_eq_i32 s16, s18 - s_cbranch_scc0 LOOP - s_load_dword s17, s[0:1], 0x4 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_store_dword s17, s[2:3], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -static const char* PollAndCopyIsa_gfx9aldbrn = R"( - .text - // Assume src buffer in s[0:1] and dst buffer in s[2:3] - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v18, 0x1 - LOOP: - flat_load_dword v16, v[0:1] glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - v_cmp_eq_i32 vcc, v16, v18 - s_cbranch_vccz LOOP - buffer_invl2 - s_load_dword s17, s[0:1], 0x4 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_store_dword s17, s[2:3], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - buffer_wbl2 - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -/* Input0: A buffer of at least 2 dwords. - * DW0: used as a signal. Write 0x1 to signal - * DW1: Write the value from 2nd input buffer - * for other device to read. - * Input1: A buffer of at least 2 dwords. - * DW0: used as the value to be written. 
- */ -static const char* WriteFlagAndValueIsa_gfx9aldbrn = R"( - .text - // Assume two inputs buffer in s[0:1] and s[2:3] - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - s_load_dword s18, s[2:3], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_store_dword s18, s[0:1], 0x4 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - buffer_wbl2 - s_waitcnt vmcnt(0) & lgkmcnt(0) - v_mov_b32 v16, 0x1 - flat_store_dword v[0:1], v16 glc - s_endpgm -)"; - -static const char* WriteAndSignalIsa_gfx10 = R"( - .text - // Assume input buffer in s0, s1 - s_add_u32 s4, s0, 0x4 - s_addc_u32 s5, s1, 0x0 - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, s2 - v_mov_b32 v3, s3 - v_mov_b32 v4, s4 - v_mov_b32 v5, s5 - v_mov_b32 v18, 0xbeef - flat_store_dword v[4:5], v18 glc - v_mov_b32 v18, 0x1 - flat_store_dword v[2:3], v18 glc - v_mov_b32 v18, 0xcafe - flat_store_dword v[0:1], v18 glc - s_endpgm -)"; - -/* These PollMemoryIsa_gfx9, CopyOnSignalIsa_gfx9, - * WriteAndSignalIsa_gfx9 shaders can be used by both gfx9 and gfx10 - */ - void KFDMemoryTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDMemoryTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -442,11 +167,9 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) { const char *pReadMemory; if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) /* On A+A system memory is mapped as NC */ - pReadMemory = PollNCMemoryIsa_gfx9; - else if (m_FamilyId < FAMILY_NV) - pReadMemory = PollMemoryIsa_gfx9; + pReadMemory = PollNCMemoryIsa; else - pReadMemory = PollMemoryIsa_gfx10; + pReadMemory = PollMemoryIsa; ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pReadMemory, isaBuffer.As())); @@ -605,7 +328,8 @@ TEST_F(KFDMemoryTest, MemoryRegister) { ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - 
m_pIsaGen->GetCopyDwordIsa(isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); /* First submit just so the queues are not empty, and to get the * TLB populated (in case we need to flush TLBs somewhere after @@ -786,17 +510,7 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) { // Initialize the srcBuffer to some fixed value srcMemBuffer.Fill(0x01010101); - const char *pScratchCopyDwordIsa; - if (m_FamilyId < FAMILY_AI) - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx8; - else if (m_FamilyId < FAMILY_AL) - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9; - else if (m_FamilyId == FAMILY_AL) - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9aldbrn; - else - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx10; - - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pScratchCopyDwordIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As())); const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode); @@ -1660,17 +1374,7 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) { // dstBuffer is cpu accessible gtt memory HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode); - const char *pScratchCopyDwordIsa; - if (m_FamilyId < FAMILY_AI) - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx8; - else if (m_FamilyId < FAMILY_AL) - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9; - else if (m_FamilyId == FAMILY_AL) - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx9aldbrn; - else - pScratchCopyDwordIsa = ScratchCopyDwordIsa_gfx10; - - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pScratchCopyDwordIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As())); Dispatch dispatch0(isaBuffer); dispatch0.SetArgs(mem0, dstBuffer.As()); @@ -2042,13 +1746,8 @@ TEST_F(KFDMemoryTest, HostHdpFlush) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - const char *pCopyOnSignalIsa; 
- if (m_FamilyId < FAMILY_NV) - pCopyOnSignalIsa = CopyOnSignalIsa_gfx9; - else - pCopyOnSignalIsa = CopyOnSignalIsa_gfx10; - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pCopyOnSignalIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As())); Dispatch dispatch0(isaBuffer); dispatch0.SetArgs(buffer, NULL); @@ -2169,13 +1868,8 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(nodes[0])); HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/); - const char *pCopyOnSignalIsa; - if (m_FamilyId < FAMILY_NV) - pCopyOnSignalIsa = CopyOnSignalIsa_gfx9; - else - pCopyOnSignalIsa = CopyOnSignalIsa_gfx10; - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pCopyOnSignalIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer, NULL); @@ -2184,13 +1878,8 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) { PM4Queue queue0; ASSERT_SUCCESS(queue0.Create(nodes[1])); HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/); - const char *pWriteAndSignalIsa; - if (m_FamilyId < FAMILY_NV) - pWriteAndSignalIsa = WriteAndSignalIsa_gfx9; - else - pWriteAndSignalIsa = WriteAndSignalIsa_gfx10; - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pWriteAndSignalIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteAndSignalIsa, isaBuffer0.As())); Dispatch dispatch0(isaBuffer0); dispatch0.SetArgs(buffer, mmioBase); @@ -2244,7 +1933,7 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnSdmaWrite) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa_gfx9, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); @@ -2299,7 +1988,7 @@
TEST_F(KFDMemoryTest, CacheInvalidateOnCPUWrite) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa_gfx9, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer, buffer+100); @@ -2363,7 +2052,7 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa_gfx9, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); @@ -2379,7 +2068,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) { ASSERT_SUCCESS(queue1.Create(nondefaultNode)); buffer.Fill(0x5678, sdmaQueue, dwLocation1*sizeof(int), 4); HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->GetCopyDwordIsa(isaBuffer1); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer1.As())); + Dispatch dispatch1(isaBuffer1); dispatch1.SetArgs(buffer.As()+dwLocation1, buffer.As()); dispatch1.Submit(queue1); @@ -2446,7 +2137,7 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa_gfx9aldbrn, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), buffer.As()+dwLocation); @@ -2463,7 +2154,7 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) { ASSERT_SUCCESS(queue1.Create(nondefaultNode)); HsaMemoryBuffer
isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa_gfx9aldbrn, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa, isaBuffer1.As())); Dispatch dispatch1(isaBuffer1); dispatch1.SetArgs(buffer.As(), buffer.As()+dwSource); @@ -2519,7 +2210,7 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa_gfx9aldbrn, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer, buffer+dwLocation); @@ -2559,12 +2250,17 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) { return; } - unsigned int *fineBuffer = NULL; - unsigned int tmp; - int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); const int dwLocation = 0x80; + if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) { + LOG() << "Skipping test: XGMI link to CPU is required."
<< std::endl; + return; + } + + unsigned int *fineBuffer = NULL; + unsigned int tmp; + ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast(&fineBuffer))); ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL)); @@ -2578,13 +2274,7 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) { ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - const char* pPollAndCopyIsa; - if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) - pPollAndCopyIsa = PollAndCopyIsa_gfx9aldbrn; - else - pPollAndCopyIsa = PollAndCopyIsa_gfx9; - - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pPollAndCopyIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.hpp index ea93395f71..03149e5639 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDMemoryTest.hpp @@ -22,7 +22,6 @@ */ #include "KFDBaseComponentTest.hpp" -#include "IsaGenerator.hpp" #ifndef __KFD_MEMORY_TEST__H__ #define __KFD_MEMORY_TEST__H__ @@ -33,15 +32,13 @@ */ class KFDMemoryTest : public KFDBaseComponentTest { public: - KFDMemoryTest(void) :m_pIsaGen(NULL) {} + KFDMemoryTest(void) {} ~KFDMemoryTest(void) {} protected: virtual void SetUp(); virtual void TearDown(); protected: - IsaGenerator* m_pIsaGen; - void BinarySearchLargestBuffer(int allocNode, const HsaMemFlags &memFlags, HSAuint64 highMB, int nodeToMap, HSAuint64 *lastSizeMB); diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index 5f46bf844a..e7cb07470b 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ 
b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -63,3 +63,228 @@ const char *AtomicIncIsa = R"( s_waitcnt 0 s_endpgm )"; + +/** + * KFDMemoryTest + */ + +const char *ScratchCopyDwordIsa = R"( + .text + // Copy the parameters from scalar registers to vector registers + .if (.amdgcn.gfx_generation_number >= 9) + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + .else + v_mov_b32_e32 v0, s0 + v_mov_b32_e32 v1, s1 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + .endif + // Setup the scratch parameters. This assumes a single 16-reg block + .if (.amdgcn.gfx_generation_number >= 10) + s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 + s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 + .elseif (.amdgcn.gfx_generation_number == 9) + s_mov_b32 flat_scratch_lo, s4 + s_mov_b32 flat_scratch_hi, s5 + .else + s_mov_b32 flat_scratch_lo, 8 + s_mov_b32 flat_scratch_hi, 0 + .endif + // Copy a dword between the passed addresses + flat_load_dword v4, v[0:1] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) + flat_store_dword v[2:3], v4 slc + s_endpgm +)"; + +/* Continuously poll src buffer and check buffer value + * After src buffer is filled with specific value (0x5678, + * by host program), fill dst buffer with specific + * value(0x5678) and quit + */ +const char *PollMemoryIsa = R"( + .text + // Assume src address in s0, s1, and dst address in s2, s3 + s_movk_i32 s18, 0x5678 + .if (.amdgcn.gfx_generation_number >= 10) + v_mov_b32 v0, s2 + v_mov_b32 v1, s3 + v_mov_b32 v2, 0x5678 + .endif + LOOP: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 LOOP + .if (.amdgcn.gfx_generation_number >= 10) + flat_store_dword v[0:1], v2 slc + .else + s_store_dword s18, s[2:3], 0x0 glc + .endif + s_endpgm +)"; + +/* Similar to PollMemoryIsa except that the buffer + * polled can be Non-coherant memory. SCC system-level + * cache coherence is not supported in scalar (smem) path. 
+ * Use vmem operations with scc + * + * Note: Only works on Aldebaran, and even then the scc modifier + * has been defeatured. This shader is more or less + * deprecated. + */ +const char *PollNCMemoryIsa = R"( + .text + // Assume src address in s0, s1, and dst address in s2, s3 + v_mov_b32 v6, 0x5678 + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + LOOP: + flat_load_dword v4, v[0:1] scc + v_cmp_eq_u32 vcc, v4, v6 + s_cbranch_vccz LOOP + v_mov_b32 v0, s2 + v_mov_b32 v1, s3 + flat_store_dword v[0:1], v6 scc + s_endpgm +)"; + +/* Input: A buffer of at least 3 dwords. + * DW0: used as a signal. 0xcafe means it is signaled + * DW1: Input buffer for device to read. + * DW2: Output buffer for device to write. + * Once the signal is received, the device copies DW1 to DW2. + * This shader continuously polls the signal buffer. + * Once the signal buffer is signaled, it copies the input buffer + * to the output buffer. + */ +const char *CopyOnSignalIsa = R"( + .text + // Assume input buffer in s0, s1 + .if (.amdgcn.gfx_generation_number >= 10) + s_add_u32 s2, s0, 0x8 + s_addc_u32 s3, s1, 0x0 + s_mov_b32 s18, 0xcafe + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v4, s2 + v_mov_b32 v5, s3 + .else + s_mov_b32 s18, 0xcafe + .endif + POLLSIGNAL: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 POLLSIGNAL + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + .if (.amdgcn.gfx_generation_number >= 10) + v_mov_b32 v2, s17 + flat_store_dword v[4:5], v2 glc + .else + s_store_dword s17, s[0:1], 0x8 glc + .endif + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; + +/* Continuously poll the flag at src buffer + * After the flag of s[0:1] is 1 filled, + * copy the value from s[0:1]+4 to dst buffer + * + * Note: Only works on GFX9 (only used in + * aldebaran tests) + */ +const char *PollAndCopyIsa = R"( + .text + // Assume src buffer in s[0:1] and dst buffer in s[2:3] + .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10) + // Path for Aldebaran +
v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v18, 0x1 + LOOP_ALDBRN: + flat_load_dword v16, v[0:1] glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + v_cmp_eq_i32 vcc, v16, v18 + s_cbranch_vccz LOOP_ALDBRN + buffer_invl2 + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s17, s[2:3], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + buffer_wbl2 + .else + s_movk_i32 s18, 0x1 + LOOP: + s_load_dword s16, s[0:1], 0x0 glc + s_cmp_eq_i32 s16, s18 + s_cbranch_scc0 LOOP + s_load_dword s17, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s17, s[2:3], 0x0 glc + .endif + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; + +/* Input0: A buffer of at least 2 dwords. + * DW0: used as a signal. Write 0x1 to signal + * DW1: Write the value from 2nd input buffer + * for other device to read. + * Input1: A buffer of at least 2 dwords. + * DW0: used as the value to be written. + * + * Note: Only works on Aldebaran + */ +const char *WriteFlagAndValueIsa = R"( + .text + // Assume two inputs buffer in s[0:1] and s[2:3] + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + s_load_dword s18, s[2:3], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s18, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + buffer_wbl2 + s_waitcnt vmcnt(0) & lgkmcnt(0) + v_mov_b32 v16, 0x1 + flat_store_dword v[0:1], v16 glc + s_endpgm +)"; + +/* Input0: A buffer of at least 2 dwords. + * DW0: used as a signal. Write 0xcafe to signal + * DW1: Write to this buffer for other device to read. 
+ * Input1: mmio base address + */ +const char *WriteAndSignalIsa = R"( + .text + // Assume input buffer in s0, s1 + .if (.amdgcn.gfx_generation_number >= 10) + s_add_u32 s4, s0, 0x4 + s_addc_u32 s5, s1, 0x0 + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + v_mov_b32 v2, s2 + v_mov_b32 v3, s3 + v_mov_b32 v4, s4 + v_mov_b32 v5, s5 + v_mov_b32 v18, 0xbeef + flat_store_dword v[4:5], v18 glc + v_mov_b32 v18, 0x1 + flat_store_dword v[2:3], v18 glc + v_mov_b32 v18, 0xcafe + flat_store_dword v[0:1], v18 glc + .else + s_mov_b32 s18, 0xbeef + s_store_dword s18, s[0:1], 0x4 glc + s_mov_b32 s18, 0x1 + s_store_dword s18, s[2:3], 0 glc + s_mov_b32 s18, 0xcafe + s_store_dword s18, s[0:1], 0x0 glc + .endif + s_endpgm +)"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp index 2344c5bca2..b98088b843 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -30,4 +30,13 @@ extern const char *CopyDwordIsa; extern const char *InfiniteLoopIsa; extern const char *AtomicIncIsa; +/* KFDMemoryTest */ +extern const char *ScratchCopyDwordIsa; +extern const char *PollMemoryIsa; +extern const char *PollNCMemoryIsa; +extern const char *CopyOnSignalIsa; +extern const char *PollAndCopyIsa; +extern const char *WriteFlagAndValueIsa; +extern const char *WriteAndSignalIsa; + #endif // _SHADERSTORE_H_ From ea451d26fd177bf377036f632c5686bad2abb8c5 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 13:51:21 -0400 Subject: [PATCH 12/27] kfdtest: Move KFDQMTest shaders to ShaderStore Signed-off-by: Graham Sider Change-Id: Id50aea16528c4bed4530f95644a02f59efddae3e [ROCm/ROCR-Runtime commit: aced779f1b8b7458ed4d92e3bd540ad57cb9c9ba] --- .../tests/kfdtest/src/KFDQMTest.cpp | 114 ++---------------- .../tests/kfdtest/src/KFDQMTest.hpp | 4 +- .../tests/kfdtest/src/ShaderStore.cpp | 98 +++++++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 3 + 4 files 
changed, 110 insertions(+), 109 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp index c28715639f..ffc568ebdf 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.cpp @@ -34,117 +34,17 @@ #include "Dispatch.hpp" -/* A simple isa loop program with dense mathematic operations - * s1 controls the number iterations of the loop - * This shader can be used by GFX8, GFX9 and GFX10 - */ -static const char* LoopIsa = R"( - .text - s_movk_i32 s0, 0x0008 - s_movk_i32 s1, 0x00ff - v_mov_b32 v0, 0 - v_mov_b32 v1, 0 - v_mov_b32 v2, 0 - v_mov_b32 v3, 0 - v_mov_b32 v4, 0 - v_mov_b32 v5, 0 - v_mov_b32 v6, 0 - v_mov_b32 v7, 0 - v_mov_b32 v8, 0 - v_mov_b32 v9, 0 - v_mov_b32 v10, 0 - v_mov_b32 v11, 0 - v_mov_b32 v12, 0 - v_mov_b32 v13, 0 - v_mov_b32 v14, 0 - v_mov_b32 v15, 0 - v_mov_b32 v16, 0 - LOOP: - s_mov_b32 s8, s4 - s_mov_b32 s9, s1 - s_mov_b32 s10, s6 - s_mov_b32 s11, s7 - s_cmp_le_i32 s1, s0 - s_cbranch_scc1 END_OF_PGM - s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10 - v_add_f32 v0, 2.0, v0 - v_cvt_f32_i32 v17, s1 - s_waitcnt lgkmcnt(0) - v_add_f32 v18, s8, v17 - v_add_f32 v19, s9, v17 - v_add_f32 v20, s10, v17 - v_add_f32 v21, s11, v17 - v_add_f32 v22, s12, v17 - v_add_f32 v23, s13, v17 - v_add_f32 v24, s14, v17 - v_add_f32 v17, s15, v17 - v_log_f32 v25, v18 - v_mul_f32 v25, v22, v25 - v_exp_f32 v25, v25 - v_log_f32 v26, v19 - v_mul_f32 v26, v23, v26 - v_exp_f32 v26, v26 - v_log_f32 v27, v20 - v_mul_f32 v27, v24, v27 - v_exp_f32 v27, v27 - v_log_f32 v28, v21 - v_mul_f32 v28, v17, v28 - v_exp_f32 v28, v28 - v_add_f32 v5, v5, v25 - v_add_f32 v6, v6, v26 - v_add_f32 v7, v7, v27 - v_add_f32 v8, v8, v28 - v_mul_f32 v18, 0x3fb8aa3b, v18 - v_exp_f32 v18, v18 - v_mul_f32 v19, 0x3fb8aa3b, v19 - v_exp_f32 v19, v19 - v_mul_f32 v20, 0x3fb8aa3b, v20 - v_exp_f32 v20, v20 - v_mul_f32 v21, 0x3fb8aa3b, v21 - v_exp_f32 v21, v21 - v_add_f32 
v9, v9, v18 - v_add_f32 v10, v10, v19 - v_add_f32 v11, v11, v20 - v_add_f32 v12, v12, v21 - v_sqrt_f32 v18, v22 - v_sqrt_f32 v19, v23 - v_sqrt_f32 v20, v24 - v_sqrt_f32 v21, v17 - v_add_f32 v13, v13, v18 - v_add_f32 v14, v14, v19 - v_add_f32 v15, v15, v20 - v_add_f32 v16, v16, v21 - v_rsq_f32 v18, v22 - v_rsq_f32 v19, v23 - v_rsq_f32 v20, v24 - v_rsq_f32 v17, v17 - v_add_f32 v1, v1, v18 - v_add_f32 v2, v2, v19 - v_add_f32 v3, v3, v20 - v_add_f32 v4, v4, v17 - s_add_u32 s0, s0, 1 - s_branch LOOP - END_OF_PGM: - s_endpgm -)"; - void KFDQMTest::SetUp() { ROUTINE_START KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDQMTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -1134,7 +1034,7 @@ TEST_F(KFDQMTest, EmptyDispatch) { HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->GetNoopIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As())); SyncDispatch(isaBuffer, NULL, NULL); @@ -1153,7 +1053,7 @@ TEST_F(KFDQMTest, SimpleWriteDispatch) { srcBuffer.Fill(0x01010101); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); SyncDispatch(isaBuffer, srcBuffer.As(), destBuffer.As()); @@ -1188,7 +1088,7 @@ TEST_F(KFDQMTest, MultipleCpQueuesStressDispatch) { destBuffer.Fill(0xFF); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); for (i = 0; i < MAX_CP_QUEUES; ++i) ASSERT_SUCCESS(queues[i].Create(defaultGPUNode)) << " QueueId=" << i; @@ -1527,7 +1427,7 @@ TEST_F(KFDQMTest, Atomics) { PM4Queue queue; - m_pIsaGen->GetAtomicIncIsa(isaBuf); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(AtomicIncIsa, isaBuf.As())); Dispatch dispatch(isaBuf); dispatch.SetArgs(destBuf.As(), NULL); @@ -1592,10 +1492,12 @@ TEST_F(KFDQMTest, mGPUShareBO) {
srcNodeMem.Fill(0x05050505); - m_pIsaGen->GetCopyDwordIsa(isaBufferSrc); + ASSERT_SUCCESS(m_pAsm->RunAssemble(CopyDwordIsa)); + + m_pAsm->CopyInstrStream(isaBufferSrc.As()); SyncDispatch(isaBufferSrc, srcNodeMem.As(), shared_addr.As(), src_node); - m_pIsaGen->GetCopyDwordIsa(isaBufferDst); + m_pAsm->CopyInstrStream(isaBufferDst.As()); SyncDispatch(isaBufferDst, shared_addr.As(), dstNodeMem.As(), dst_node); EXPECT_EQ(dstNodeMem.As()[0], 0x05050505); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.hpp index b0d3f66073..dfc36d17d4 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDQMTest.hpp @@ -27,13 +27,12 @@ #include #include "PM4Queue.hpp" -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" #include "Dispatch.hpp" class KFDQMTest : public KFDBaseComponentTest { public: - KFDQMTest():m_pIsaGen(NULL) {} + KFDQMTest() {} ~KFDQMTest() {} @@ -49,7 +48,6 @@ class KFDQMTest : public KFDBaseComponentTest { const double CuVariance = 0.15; const double CuNegVariance = 1.0 - CuVariance; const double CuPosVariance = 1.0 + CuVariance; - IsaGenerator* m_pIsaGen; }; #endif // __KFD_QCM_TEST__H__ diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index e7cb07470b..28025f77d9 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -288,3 +288,101 @@ const char *WriteAndSignalIsa = R"( .endif s_endpgm )"; + +/** + * KFDQMTest + */ + +/* A simple isa loop program with dense mathematic operations + * s1 controls the number iterations of the loop + * This shader can be used by GFX8, GFX9 and GFX10 + */ +const char *LoopIsa = R"( + .text + s_movk_i32 s0, 0x0008 + s_movk_i32 s1, 0x00ff + v_mov_b32 v0, 0 + v_mov_b32 v1, 0 + v_mov_b32 v2, 0 + v_mov_b32 v3, 0 + v_mov_b32 v4, 0 + v_mov_b32 v5, 
0 + v_mov_b32 v6, 0 + v_mov_b32 v7, 0 + v_mov_b32 v8, 0 + v_mov_b32 v9, 0 + v_mov_b32 v10, 0 + v_mov_b32 v11, 0 + v_mov_b32 v12, 0 + v_mov_b32 v13, 0 + v_mov_b32 v14, 0 + v_mov_b32 v15, 0 + v_mov_b32 v16, 0 + LOOP: + s_mov_b32 s8, s4 + s_mov_b32 s9, s1 + s_mov_b32 s10, s6 + s_mov_b32 s11, s7 + s_cmp_le_i32 s1, s0 + s_cbranch_scc1 END_OF_PGM + s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10 + v_add_f32 v0, 2.0, v0 + v_cvt_f32_i32 v17, s1 + s_waitcnt lgkmcnt(0) + v_add_f32 v18, s8, v17 + v_add_f32 v19, s9, v17 + v_add_f32 v20, s10, v17 + v_add_f32 v21, s11, v17 + v_add_f32 v22, s12, v17 + v_add_f32 v23, s13, v17 + v_add_f32 v24, s14, v17 + v_add_f32 v17, s15, v17 + v_log_f32 v25, v18 + v_mul_f32 v25, v22, v25 + v_exp_f32 v25, v25 + v_log_f32 v26, v19 + v_mul_f32 v26, v23, v26 + v_exp_f32 v26, v26 + v_log_f32 v27, v20 + v_mul_f32 v27, v24, v27 + v_exp_f32 v27, v27 + v_log_f32 v28, v21 + v_mul_f32 v28, v17, v28 + v_exp_f32 v28, v28 + v_add_f32 v5, v5, v25 + v_add_f32 v6, v6, v26 + v_add_f32 v7, v7, v27 + v_add_f32 v8, v8, v28 + v_mul_f32 v18, 0x3fb8aa3b, v18 + v_exp_f32 v18, v18 + v_mul_f32 v19, 0x3fb8aa3b, v19 + v_exp_f32 v19, v19 + v_mul_f32 v20, 0x3fb8aa3b, v20 + v_exp_f32 v20, v20 + v_mul_f32 v21, 0x3fb8aa3b, v21 + v_exp_f32 v21, v21 + v_add_f32 v9, v9, v18 + v_add_f32 v10, v10, v19 + v_add_f32 v11, v11, v20 + v_add_f32 v12, v12, v21 + v_sqrt_f32 v18, v22 + v_sqrt_f32 v19, v23 + v_sqrt_f32 v20, v24 + v_sqrt_f32 v21, v17 + v_add_f32 v13, v13, v18 + v_add_f32 v14, v14, v19 + v_add_f32 v15, v15, v20 + v_add_f32 v16, v16, v21 + v_rsq_f32 v18, v22 + v_rsq_f32 v19, v23 + v_rsq_f32 v20, v24 + v_rsq_f32 v17, v17 + v_add_f32 v1, v1, v18 + v_add_f32 v2, v2, v19 + v_add_f32 v3, v3, v20 + v_add_f32 v4, v4, v17 + s_add_u32 s0, s0, 1 + s_branch LOOP + END_OF_PGM: + s_endpgm +)"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp index b98088b843..3d4df30e94 100644 --- 
a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -39,4 +39,7 @@ extern const char *PollAndCopyIsa; extern const char *WriteFlagAndValueIsa; extern const char *WriteAndSignalIsa; +/* KFDQMTest */ +extern const char *LoopIsa; + #endif // _SHADERSTORE_H_ From a4b42c22136d41826f93341e464cee896e49e125 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 13:54:54 -0400 Subject: [PATCH 13/27] kfdtest: Move KFDCWSRTest shaders to ShaderStore Signed-off-by: Graham Sider Change-Id: I7c89fca94e92145a4115d1089348380807a868ee [ROCm/ROCR-Runtime commit: ad5f98814fc3763b5d15dd2c6360ac3e275bef63] --- .../tests/kfdtest/src/KFDCWSRTest.cpp | 76 +------------------ .../tests/kfdtest/src/ShaderStore.cpp | 52 +++++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 3 + 3 files changed, 57 insertions(+), 74 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp index c7a2e4b312..5a4f853ee2 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDCWSRTest.cpp @@ -24,72 +24,6 @@ #include "KFDCWSRTest.hpp" #include "Dispatch.hpp" - -/* Initial state: - * s[0:1] - 64 bits iteration number; only the lower 32 bits are useful. 
- * s[2:3] - result buffer base address - * s4 - workgroup id - * v0 - workitem id, always 0 because - * NUM_THREADS_X(number of threads) in workgroup set to 1 - * Registers: - * v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4 - * v2 - = s0, 32 bits iteration number - * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 - * v6 - counter - */ - -static const char* IterateIsa_gfx8 = R"( - .text - // Copy the parameters from scalar registers to vector registers - v_mov_b32 v2, s0 // v[2:3] = s[0:1] - v_mov_b32 v3, s1 // v[2:3] = s[0:1] - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 - v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v6, 0 - LOOP: - v_add_u32 v6, vcc, 1, v6 - // Compare the result value (v6) to iteration value (v2), and - // jump if equal (i.e. if VCC is not zero after the comparison) - v_cmp_lt_u32 vcc, v6, v2 - s_cbranch_vccnz LOOP - flat_store_dword v[4:5], v6 - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -// This shader can be used by gfx9 and gfx10 -static const char* IterateIsa_gfx9 = R"( - .text - // Copy the parameters from scalar registers to vector registers - v_mov_b32 v2, s0 // v[2:3] = s[0:1] - v_mov_b32 v3, s1 // v[2:3] = s[0:1] - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 - v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v6, 0 - LOOP: - v_add_co_u32 v6, vcc, 1, v6 - // Compare the result value (v6) to iteration value (v2), and - // jump if equal (i.e. 
if VCC is not zero after the comparison) - v_cmp_lt_u32 vcc, v6, v2 - s_cbranch_vccnz LOOP - flat_store_dword v[4:5], v6 - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_endpgm -)"; - -static const char* InfiniteIsa = R"( - .text - LOOP: - s_branch LOOP - s_endpgm -)"; - void KFDCWSRTest::SetUp() { ROUTINE_START @@ -137,16 +71,10 @@ TEST_F(KFDCWSRTest, BasicTest) { int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) { - const char *pIterateIsa; HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer resultBuf1(PAGE_SIZE, defaultGPUNode, true, false, false); uint64_t count1 = 400000000; - if (m_FamilyId < FAMILY_AI) - pIterateIsa = IterateIsa_gfx8; - else - pIterateIsa = IterateIsa_gfx9; - if (isOnEmulator()) { // Divide the iterator times by 10000 so that the test can // finish in a reasonable time. @@ -156,7 +84,7 @@ TEST_F(KFDCWSRTest, BasicTest) { unsigned int* result1 = resultBuf1.As(); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pIterateIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(IterateIsa, isaBuffer.As())); PM4Queue queue1; @@ -220,7 +148,7 @@ TEST_F(KFDCWSRTest, InterruptRestore) { if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) { HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteLoopIsa, isaBuffer.As())); PM4Queue queue1, queue2, queue3; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index 28025f77d9..93a2d60184 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -386,3 +386,55 @@ const char *LoopIsa = R"( END_OF_PGM: s_endpgm )"; + + +/** + * KFDCWSRTest + */ + +/* Initial state: + * s[0:1] - 64 bits iteration 
number; only the lower 32 bits are useful. + * s[2:3] - result buffer base address + * s4 - workgroup id + * v0 - workitem id, always 0 because + * NUM_THREADS_X(number of threads) in workgroup set to 1 + * Registers: + * v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4 + * v2 - = s0, 32 bits iteration number + * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 + * v6 - counter + */ +const char *IterateIsa = R"( + .text + // Copy the parameters from scalar registers to vector registers + v_mov_b32 v2, s0 // v[2:3] = s[0:1] + v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + .if (.amdgcn.gfx_generation_number >= 9) + v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 + v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v6, 0 + LOOP: + v_add_co_u32 v6, vcc, 1, v6 + // Compare the result value (v6) to iteration value (v2), and + // jump if equal (i.e. if VCC is not zero after the comparison) + v_cmp_lt_u32 vcc, v6, v2 + s_cbranch_vccnz LOOP + .else + v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 + v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v6, 0 + LOOP_GFX8: + v_add_u32 v6, vcc, 1, v6 + // Compare the result value (v6) to iteration value (v2), and + // jump if equal (i.e. 
if VCC is not zero after the comparison) + v_cmp_lt_u32 vcc, v6, v2 + s_cbranch_vccnz LOOP_GFX8 + .endif + flat_store_dword v[4:5], v6 + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_endpgm +)"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp index 3d4df30e94..f3a9d92781 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -42,4 +42,7 @@ extern const char *WriteAndSignalIsa; /* KFDQMTest */ extern const char *LoopIsa; +/* KFDCWSRTest */ +extern const char *IterateIsa; + #endif // _SHADERSTORE_H_ From 519277fd5300368025fa59346d022a3076b1998d Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 13:56:38 -0400 Subject: [PATCH 14/27] kfdtest: Move KFDEvictTest shaders to ShaderStore Signed-off-by: Graham Sider Change-Id: I4d7d349284ea213becdb4680b804dbd202196e1d [ROCm/ROCR-Runtime commit: 91cf11967ec201e056b04ca77f65eede728a8736] --- .../tests/kfdtest/src/KFDEvictTest.cpp | 117 +----------------- .../tests/kfdtest/src/KFDEvictTest.hpp | 1 - .../tests/kfdtest/src/ShaderStore.cpp | 104 ++++++++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 3 + 4 files changed, 108 insertions(+), 117 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp index bf721238c8..1effa639b0 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.cpp @@ -36,121 +36,6 @@ #define SDMA_NOP 0x0 -/* Shader to read local buffers using multiple wavefronts in parallel - * until address buffer is filled with specific value 0x5678 by host program, - * then each wavefront fills value 0x5678 at corresponding result buffer and quit - * - * Initial state: - * s[0:1] - address buffer base address - * s[2:3] - result buffer base address - * s4 - workgroup id - * v0 - workitem id, always 0 
because NUM_THREADS_X(number of threads) in workgroup set to 1 - * Registers: - * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X - * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 - * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 - * v[6:7] - local buf address used for read test - * - * This shader can be used by gfx9 and gfx10 - * - */ - -static const char* ReadMemoryIsa_gfx9 = R"( - .text - // Compute address of corresponding output buffer - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 - v_add_co_u32 v5, vcc, v5, vcc_lo - // Compute input buffer offset used to store corresponding local buffer address - v_lshlrev_b32 v0, 1, v0 // v0 *= 8 - v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 - v_mov_b32 v3, s1 - v_add_co_u32 v3, vcc, v3, vcc_lo - // load 64bit local buffer address stored at v[2:3] to v[6:7] - flat_load_dwordx2 v[6:7], v[2:3] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - v_mov_b32 v8, 0x5678 - s_movk_i32 s8, 0x5678 - L_REPEAT: - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - s_cmp_eq_i32 s16, s8 - s_cbranch_scc1 L_QUIT // if notified to quit by host - // Loop read 64M local buffer starting at v[6:7] - // every 4k page only read once - v_mov_b32 v9, 0 - v_mov_b32 v10, 0x1000 // 4k page - v_mov_b32 v11, 0x4000000 // 64M size - v_mov_b32 v12, v6 - v_mov_b32 v13, v7 - L_LOOP_READ: - flat_load_dwordx2 v[14:15], v[12:13] slc - v_add_co_u32 v9, vcc, v9, v10 - v_add_co_u32 v12, vcc, v12, v10 - v_add_co_u32 v13, vcc, v13, vcc_lo - v_cmp_lt_u32 vcc, v9, v11 - s_cbranch_vccnz L_LOOP_READ - s_branch L_REPEAT - L_QUIT: - flat_store_dword v[4:5], v8 - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish - s_endpgm -)"; - -static const char* ReadMemoryIsa_gfx8 = R"( - .text - // Compute 
address of corresponding output buffer - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 - v_addc_u32 v5, vcc, v5, 0, vcc - // Compute input buffer offset used to store corresponding local buffer address - v_lshlrev_b32 v0, 1, v0 // v0 *= 8 - v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 - v_mov_b32 v3, s1 - v_addc_u32 v3, vcc, v3, 0, vcc - // Load 64bit local buffer address stored at v[2:3] to v[6:7] - flat_load_dwordx2 v[6:7], v[2:3] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - v_mov_b32 v8, 0x5678 - s_movk_i32 s8, 0x5678 - L_REPEAT: - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - s_cmp_eq_i32 s16, s8 - s_cbranch_scc1 L_QUIT // if notified to quit by host - // Loop read 64M local buffer starting at v[6:7] - // every 4k page only read once - v_mov_b32 v9, 0 - v_mov_b32 v10, 0x1000 // 4k page - v_mov_b32 v11, 0x4000000 // 64M size - v_mov_b32 v12, v6 - v_mov_b32 v13, v7 - L_LOOP_READ: - flat_load_dwordx2 v[14:15], v[12:13] slc - v_add_u32 v9, vcc, v9, v10 - v_add_u32 v12, vcc, v12, v10 - v_addc_u32 v13, vcc, v13, 0, vcc - v_cmp_lt_u32 vcc, v9, v11 - s_cbranch_vccnz L_LOOP_READ - s_branch L_REPEAT - L_QUIT: - flat_store_dword v[4:5], v8 - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish - s_endpgm -)"; - -std::string KFDEvictTest::CreateShader() { - if (m_FamilyId < FAMILY_AI) - return ReadMemoryIsa_gfx8; - else - return ReadMemoryIsa_gfx9; -} - - void KFDEvictTest::SetUp() { ROUTINE_START @@ -546,7 +431,7 @@ TEST_F(KFDEvictTest, QueueTest) { HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As())); PM4Queue pm4Queue; 
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp index d70aada6b4..30f0a856be 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDEvictTest.hpp @@ -40,7 +40,6 @@ class KFDEvictTest : public KFDMultiProcessTest { virtual void SetUp(); virtual void TearDown(); - std::string CreateShader(); void AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize, std::vector &pBuffers); void FreeBuffers(std::vector &pBuffers, HSAuint64 vramBufSize); diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index 93a2d60184..fdf536b4db 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -438,3 +438,107 @@ const char *IterateIsa = R"( s_waitcnt vmcnt(0) & lgkmcnt(0) s_endpgm )"; + +/** + * KFDEvictTest + */ + +/* Shader to read local buffers using multiple wavefronts in parallel + * until address buffer is filled with specific value 0x5678 by host program, + * then each wavefront fills value 0x5678 at corresponding result buffer and quit + * + * Initial state: + * s[0:1] - address buffer base address + * s[2:3] - result buffer base address + * s4 - workgroup id + * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 + * Registers: + * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X + * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 + * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 + * v[6:7] - local buf address used for read test + */ +const char *ReadMemoryIsa = R"( + .text + .if (.amdgcn.gfx_generation_number >= 9) + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 
v0, 2, v0 // v0 *= 4 + v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_add_co_u32 v5, vcc, v5, vcc_lo + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_add_co_u32 v3, vcc, v3, vcc_lo + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_co_u32 v9, vcc, v9, v10 + v_add_co_u32 v12, vcc, v12, v10 + v_add_co_u32 v13, vcc, v13, vcc_lo + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + .else + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 + v_addc_u32 v5, vcc, v5, 0, vcc + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 + v_addc_u32 v3, vcc, v3, 0, vcc + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT_GFX8: + 
s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT_8 // if notified to quit by host + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ_GFX8: + flat_load_dwordx2 v[14:15], v[12:13] slc + v_add_u32 v9, vcc, v9, v10 + v_add_u32 v12, vcc, v12, v10 + v_addc_u32 v13, vcc, v13, 0, vcc + v_cmp_lt_u32 vcc, v9, v11 + s_cbranch_vccnz L_LOOP_READ_GFX8 + s_branch L_REPEAT_GFX8 + L_QUIT_8: + flat_store_dword v[4:5], v8 + .endif + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish + s_endpgm +)"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp index f3a9d92781..de73034ee4 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -45,4 +45,7 @@ extern const char *LoopIsa; /* KFDCWSRTest */ extern const char *IterateIsa; +/* KFDEvictTest */ +extern const char *ReadMemoryIsa; + #endif // _SHADERSTORE_H_ From e0e0485579c11aca1f453f2f8c9c11e903989968 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 14:04:48 -0400 Subject: [PATCH 15/27] kfdtest: Move KFDGWSTest shaders to ShaderStore Signed-off-by: Graham Sider Change-Id: I9608b8bea32d64d4d0e1a329191f9a62e3a771e7 [ROCm/ROCR-Runtime commit: b2b54dffe6886cef1c7126f92a7808f0c1a6aac4] --- .../tests/kfdtest/src/KFDGWSTest.cpp | 66 +------------------ .../tests/kfdtest/src/ShaderStore.cpp | 56 ++++++++++++++++ .../tests/kfdtest/src/ShaderStore.hpp | 4 ++ 3 files changed, 62 insertions(+), 64 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp index a6b3bec17b..99e9248d8f 100644 --- 
a/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDGWSTest.cpp @@ -26,62 +26,6 @@ #include "PM4Packet.hpp" #include "Dispatch.hpp" -/* Shader to initialize gws counter to 1 */ -static const char* GwsInitIsa_gfx9_10 = R"( - .text - s_mov_b32 m0, 0 - s_nop 0 - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt 0 - v_mov_b32 v0, s16 - s_waitcnt 0 - ds_gws_init v0 offset:0 gds - s_waitcnt 0 - s_endpgm -)"; - -/* Atomically increase a value in memory - * This is expected to be executed from - * multiple work groups simultaneously. - * GWS semaphore is used to guarantee - * the operation is atomic. - */ -static const char* AtomicIncreaseIsa_gfx9 = R"( - .text - // Assume src address in s0, s1 - s_mov_b32 m0, 0 - s_nop 0 - ds_gws_sema_p offset:0 gds - s_waitcnt 0 - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt 0 - s_add_u32 s16, s16, 1 - s_store_dword s16, s[0:1], 0x0 glc - s_waitcnt lgkmcnt(0) - ds_gws_sema_v offset:0 gds - s_waitcnt 0 - s_endpgm -)"; - -static const char* AtomicIncreaseIsa_gfx10 = R"( - .text - // Assume src address in s0, s1 - s_mov_b32 m0, 0 - s_mov_b32 exec_lo, 0x1 - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - ds_gws_sema_p offset:0 gds - s_waitcnt 0 - flat_load_dword v2, v[0:1] glc dlc - s_waitcnt 0 - v_add_nc_u32 v2, v2, 1 - flat_store_dword v[0:1], v2 - s_waitcnt_vscnt null, 0 - ds_gws_sema_v offset:0 gds - s_waitcnt 0 - s_endpgm -)"; - void KFDGWSTest::SetUp() { ROUTINE_START @@ -142,7 +86,7 @@ TEST_F(KFDGWSTest, Semaphore) { pNodeProperties->NumGws,&firstGWS)); EXPECT_EQ(0, firstGWS); - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa_gfx9_10, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa, isaBuffer.As())); Dispatch dispatch0(isaBuffer); buffer.Fill(numResources, 0, 4); @@ -150,13 +94,7 @@ TEST_F(KFDGWSTest, Semaphore) { dispatch0.Submit(queue); dispatch0.Sync(); - const char *pAtomicIncreaseIsa; - if (m_FamilyId <= FAMILY_AL) - pAtomicIncreaseIsa = AtomicIncreaseIsa_gfx9; - else 
- pAtomicIncreaseIsa = AtomicIncreaseIsa_gfx10; - - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pAtomicIncreaseIsa, isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsAtomicIncreaseIsa, isaBuffer.As())); Dispatch dispatch(isaBuffer); dispatch.SetArgs(buffer.As(), NULL); diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index fdf536b4db..e720d4b8ce 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -542,3 +542,59 @@ const char *ReadMemoryIsa = R"( s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish s_endpgm )"; + +/** + * KFDGWSTest + */ + +/* Shader to initialize gws counter to 1 */ +const char *GwsInitIsa = R"( + .text + s_mov_b32 m0, 0 + s_nop 0 + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt 0 + v_mov_b32 v0, s16 + s_waitcnt 0 + ds_gws_init v0 offset:0 gds + s_waitcnt 0 + s_endpgm +)"; + +/* Atomically increase a value in memory + * This is expected to be executed from + * multiple work groups simultaneously. + * GWS semaphore is used to guarantee + * the operation is atomic. 
+ */ +const char *GwsAtomicIncreaseIsa = R"( + .text + // Assume src address in s0, s1 + .if (.amdgcn.gfx_generation_number >= 10) + s_mov_b32 m0, 0 + s_mov_b32 exec_lo, 0x1 + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + ds_gws_sema_p offset:0 gds + s_waitcnt 0 + flat_load_dword v2, v[0:1] glc dlc + s_waitcnt 0 + v_add_nc_u32 v2, v2, 1 + flat_store_dword v[0:1], v2 + s_waitcnt_vscnt null, 0 + ds_gws_sema_v offset:0 gds + .else + s_mov_b32 m0, 0 + s_nop 0 + ds_gws_sema_p offset:0 gds + s_waitcnt 0 + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt 0 + s_add_u32 s16, s16, 1 + s_store_dword s16, s[0:1], 0x0 glc + s_waitcnt lgkmcnt(0) + ds_gws_sema_v offset:0 gds + .endif + s_waitcnt 0 + s_endpgm +)"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp index de73034ee4..231e7f73d6 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -48,4 +48,8 @@ extern const char *IterateIsa; /* KFDEvictTest */ extern const char *ReadMemoryIsa; +/* KFDGWSTest */ +extern const char *GwsInitIsa; +extern const char *GwsAtomicIncreaseIsa; + #endif // _SHADERSTORE_H_ From d06327982df31a9c39d03453f6123f01f58d1a70 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 4 Nov 2021 17:08:43 -0400 Subject: [PATCH 16/27] kfdtest: Add macros to simplify instr differences Makes use of macros to simplify shader code with instruction-level differences depending on GFX version. These macros are extensible and are prepended to every shader so that they are usable everywhere. This patch introduces three macros used within IterateIsa and ReadMemoryIsa shaders. 
Signed-off-by: Graham Sider Change-Id: If954e1b6d2027e9f55bf7e99bd9df2668d1da524 [ROCm/ROCR-Runtime commit: 5ceb35f4280e3ff3b2d0b2a0b626112276bcf7c2] --- .../tests/kfdtest/src/ShaderStore.cpp | 207 ++++++++---------- 1 file changed, 95 insertions(+), 112 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index e720d4b8ce..63cd68a063 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -21,6 +21,41 @@ * */ +/** + * Macros + */ + +/* Create macro for portable v_add_co_u32, v_add_co_ci_u32, + * and v_cmp_lt_u32 + */ +#define SHADER_MACROS \ + " .text\n"\ + " .macro V_ADD_CO_U32 vdst, src0, vsrc1\n"\ + " .if (.amdgcn.gfx_generation_number >= 10)\n"\ + " v_add_co_u32 \\vdst, vcc_lo, \\src0, \\vsrc1\n"\ + " .elseif (.amdgcn.gfx_generation_number >= 9)\n"\ + " v_add_co_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\ + " .else\n"\ + " v_add_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\ + " .endif\n"\ + " .endm\n"\ + " .macro V_ADD_CO_CI_U32 vdst, src0, vsrc1\n"\ + " .if (.amdgcn.gfx_generation_number >= 10)\n"\ + " v_add_co_ci_u32 \\vdst, vcc_lo, \\src0, \\vsrc1, vcc_lo\n"\ + " .elseif (.amdgcn.gfx_generation_number >= 9)\n"\ + " v_addc_co_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\ + " .else\n"\ + " v_addc_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\ + " .endif\n"\ + " .endm\n"\ + " .macro V_CMP_LT_U32 src0, vsrc1\n"\ + " .if (.amdgcn.gfx_generation_number >= 10)\n"\ + " v_cmp_lt_u32 vcc_lo, \\src0, \\vsrc1\n"\ + " .else\n"\ + " v_cmp_lt_u32 vcc, \\src0, \\vsrc1\n"\ + " .endif\n"\ + " .endm\n" + /** * Common */ @@ -404,37 +439,24 @@ const char *LoopIsa = R"( * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 * v6 - counter */ -const char *IterateIsa = R"( - .text +const char *IterateIsa = SHADER_MACROS R"( // Copy the parameters from scalar registers to vector registers - v_mov_b32 v2, s0 // v[2:3] = s[0:1] - v_mov_b32 
v3, s1 // v[2:3] = s[0:1] - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - .if (.amdgcn.gfx_generation_number >= 9) - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 - v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v6, 0 - LOOP: - v_add_co_u32 v6, vcc, 1, v6 - // Compare the result value (v6) to iteration value (v2), and - // jump if equal (i.e. if VCC is not zero after the comparison) - v_cmp_lt_u32 vcc, v6, v2 - s_cbranch_vccnz LOOP - .else - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 - v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v6, 0 - LOOP_GFX8: - v_add_u32 v6, vcc, 1, v6 - // Compare the result value (v6) to iteration value (v2), and - // jump if equal (i.e. if VCC is not zero after the comparison) - v_cmp_lt_u32 vcc, v6, v2 - s_cbranch_vccnz LOOP_GFX8 - .endif - flat_store_dword v[4:5], v6 + v_mov_b32 v2, s0 // v[2:3] = s[0:1] + v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + V_ADD_CO_U32 v4, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 + V_ADD_CO_CI_U32 v5, v5, 0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v6, 0 + LOOP: + V_ADD_CO_U32 v6, 1, v6 + + // Compare the result value (v6) to iteration value (v2), and + // jump if equal (i.e. 
if VCC is not zero after the comparison) + V_CMP_LT_U32 v6, v2 + s_cbranch_vccnz LOOP + flat_store_dword v[4:5], v6 s_waitcnt vmcnt(0) & lgkmcnt(0) s_endpgm )"; @@ -458,88 +480,49 @@ const char *IterateIsa = R"( * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 * v[6:7] - local buf address used for read test */ -const char *ReadMemoryIsa = R"( - .text - .if (.amdgcn.gfx_generation_number >= 9) - // Compute address of corresponding output buffer - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 - v_add_co_u32 v5, vcc, v5, vcc_lo - // Compute input buffer offset used to store corresponding local buffer address - v_lshlrev_b32 v0, 1, v0 // v0 *= 8 - v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 - v_mov_b32 v3, s1 - v_add_co_u32 v3, vcc, v3, vcc_lo - // Load 64bit local buffer address stored at v[2:3] to v[6:7] - flat_load_dwordx2 v[6:7], v[2:3] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - v_mov_b32 v8, 0x5678 - s_movk_i32 s8, 0x5678 - L_REPEAT: - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - s_cmp_eq_i32 s16, s8 - s_cbranch_scc1 L_QUIT // if notified to quit by host - // Loop read 64M local buffer starting at v[6:7] - // every 4k page only read once - v_mov_b32 v9, 0 - v_mov_b32 v10, 0x1000 // 4k page - v_mov_b32 v11, 0x4000000 // 64M size - v_mov_b32 v12, v6 - v_mov_b32 v13, v7 - L_LOOP_READ: - flat_load_dwordx2 v[14:15], v[12:13] slc - v_add_co_u32 v9, vcc, v9, v10 - v_add_co_u32 v12, vcc, v12, v10 - v_add_co_u32 v13, vcc, v13, vcc_lo - v_cmp_lt_u32 vcc, v9, v11 - s_cbranch_vccnz L_LOOP_READ - s_branch L_REPEAT - L_QUIT: - flat_store_dword v[4:5], v8 - .else - // Compute address of corresponding output buffer - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 
4 - v_mov_b32 v5, s3 - v_addc_u32 v5, vcc, v5, 0, vcc - // Compute input buffer offset used to store corresponding local buffer address - v_lshlrev_b32 v0, 1, v0 // v0 *= 8 - v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 - v_mov_b32 v3, s1 - v_addc_u32 v3, vcc, v3, 0, vcc - // Load 64bit local buffer address stored at v[2:3] to v[6:7] - flat_load_dwordx2 v[6:7], v[2:3] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - v_mov_b32 v8, 0x5678 - s_movk_i32 s8, 0x5678 - L_REPEAT_GFX8: - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - s_cmp_eq_i32 s16, s8 - s_cbranch_scc1 L_QUIT_8 // if notified to quit by host - // Loop read 64M local buffer starting at v[6:7] - // every 4k page only read once - v_mov_b32 v9, 0 - v_mov_b32 v10, 0x1000 // 4k page - v_mov_b32 v11, 0x4000000 // 64M size - v_mov_b32 v12, v6 - v_mov_b32 v13, v7 - L_LOOP_READ_GFX8: - flat_load_dwordx2 v[14:15], v[12:13] slc - v_add_u32 v9, vcc, v9, v10 - v_add_u32 v12, vcc, v12, v10 - v_addc_u32 v13, vcc, v13, 0, vcc - v_cmp_lt_u32 vcc, v9, v11 - s_cbranch_vccnz L_LOOP_READ_GFX8 - s_branch L_REPEAT_GFX8 - L_QUIT_8: - flat_store_dword v[4:5], v8 - .endif - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish +const char *ReadMemoryIsa = SHADER_MACROS R"( + // Compute address of corresponding output buffer + v_mov_b32 v0, s4 // use workgroup id as index + v_lshlrev_b32 v0, 2, v0 // v0 *= 4 + V_ADD_CO_U32 v4, s2, v0 // v[4:5] = s[2:3] + v0 * 4 + v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 + V_ADD_CO_CI_U32 v5, v5, 0 // v[4:5] = s[2:3] + v0 * 4 + + // Compute input buffer offset used to store corresponding local buffer address + v_lshlrev_b32 v0, 1, v0 // v0 *= 8 + V_ADD_CO_U32 v2, s0, v0 // v[2:3] = s[0:1] + v0 * 8 + v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v0 * 8 + V_ADD_CO_CI_U32 v3, v3, 0 // v[2:3] = s[0:1] + v0 * 8 + + // Load 64bit local buffer address stored at v[2:3] to v[6:7] + flat_load_dwordx2 v[6:7], 
v[2:3] slc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + v_mov_b32 v8, 0x5678 + s_movk_i32 s8, 0x5678 + L_REPEAT: + s_load_dword s16, s[0:1], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish + s_cmp_eq_i32 s16, s8 + s_cbranch_scc1 L_QUIT // if notified to quit by host + + // Loop read 64M local buffer starting at v[6:7] + // every 4k page only read once + v_mov_b32 v9, 0 + v_mov_b32 v10, 0x1000 // 4k page + v_mov_b32 v11, 0x4000000 // 64M size + v_mov_b32 v12, v6 + v_mov_b32 v13, v7 + L_LOOP_READ: + flat_load_dwordx2 v[14:15], v[12:13] slc + V_ADD_CO_U32 v9, v9, v10 + V_ADD_CO_U32 v12, v12, v10 + V_ADD_CO_CI_U32 v13, v13, 0 + V_CMP_LT_U32 v9, v11 + s_cbranch_vccnz L_LOOP_READ + s_branch L_REPEAT + L_QUIT: + flat_store_dword v[4:5], v8 + s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish s_endpgm )"; From be229daa44e60bc0bb09468261c319b60e1d5b65 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 2 Nov 2021 14:05:43 -0400 Subject: [PATCH 17/27] kfdtest: Remove KFDSVMEvictTest ReadMemory shader Use ReadMemoryIsa transferred and updated from KFDEvictTest. 
Signed-off-by: Graham Sider Change-Id: I566f9ec36398bc4d08ab90231688600356df4d6a [ROCm/ROCR-Runtime commit: 097b11abad5bc91a973fdc50e804c9542f53e7d7] --- .../tests/kfdtest/src/KFDSVMEvictTest.cpp | 112 +----------------- 1 file changed, 1 insertion(+), 111 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp index e712e7b44a..319b054a64 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMEvictTest.cpp @@ -35,109 +35,6 @@ #define ALLOCATE_BUF_SIZE_MB (64) #define ALLOCATE_RETRY_TIMES (3) -/* Shader to read local buffers using multiple wavefronts in parallel - * until address buffer is filled with specific value 0x5678 by host program, - * then each wavefront fills value 0x5678 at corresponding result buffer and quit - * - * initial state: - * s[0:1] - address buffer base address - * s[2:3] - result buffer base address - * s4 - workgroup id - * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 - * registers: - * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X - * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 - * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 - * v[6:7] - local buf address used for read test - */ -static const char* gfx9_ReadMemory = R"( - .text - // Compute address of corresponding output buffer - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 - v_add_u32 v5, vcc_lo, v5 - // Compute input buffer offset used to store corresponding local buffer address - v_lshlrev_b32 v0, 1, v0 // v0 *= 8 - v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 - v_mov_b32 v3, s1 - v_add_u32 v3, vcc_lo, v3 - // Load 64bit local buffer address stored at v[2:3] to v[6:7] - flat_load_dwordx2 v[6:7], 
v[2:3] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - v_mov_b32 v8, 0x5678 - s_movk_i32 s8, 0x5678 - L_REPEAT: - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - s_cmp_eq_i32 s16, s8 - s_cbranch_scc1 L_QUIT // if notified to quit by host - // Loop read 64M local buffer starting at v[6:7] - // every 4k page only read once - v_mov_b32 v9, 0 - v_mov_b32 v10, 0x1000 // 4k page - v_mov_b32 v11, 0x4000000 // 64M size - v_mov_b32 v12, v6 - v_mov_b32 v13, v7 - L_LOOP_READ: - flat_load_dwordx2 v[14:15], v[12:13] slc - v_add_u32 v9, v9, v10 - v_add_co_u32 v12, vcc, v12, v10 - v_add_u32 v13, vcc_lo, v13 - v_cmp_lt_u32 vcc, v9, v11 - s_cbranch_vccnz L_LOOP_READ - s_branch L_REPEAT - L_QUIT: - flat_store_dword v[4:5], v8 - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish - s_endpgm -)"; - -static const char* gfx8_ReadMemory = R"( - .text - // Compute address of corresponding output buffer - v_mov_b32 v0, s4 // use workgroup id as index - v_lshlrev_b32 v0, 2, v0 // v0 *= 4 - v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 - v_mov_b32 v5, s3 - v_addc_u32 v5, vcc, v5, 0, vcc - // Compute input buffer offset used to store corresponding local buffer address - v_lshlrev_b32 v0, 1, v0 // v0 *= 8 - v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8 - v_mov_b32 v3, s1 - v_addc_u32 v3, vcc, v3, 0, vcc - // Load 64bit local buffer address stored at v[2:3] to v[6:7] - flat_load_dwordx2 v[6:7], v[2:3] slc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - v_mov_b32 v8, 0x5678 - s_movk_i32 s8, 0x5678 - L_REPEAT: - s_load_dword s16, s[0:1], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish - s_cmp_eq_i32 s16, s8 - s_cbranch_scc1 L_QUIT // if notified to quit by host - // Loop read 64M local buffer starting at v[6:7] - // every 4k page only read once - v_mov_b32 v9, 0 - v_mov_b32 v10, 0x1000 // 4k page - v_mov_b32 v11, 0x4000000 // 64M 
size - v_mov_b32 v12, v6 - v_mov_b32 v13, v7 - L_LOOP_READ: - flat_load_dwordx2 v[14:15], v[12:13] slc - v_add_u32 v9, vcc, v9, v10 - v_add_u32 v12, vcc, v12, v10 - v_addc_u32 v13, vcc, v13, 0, vcc - v_cmp_lt_u32 vcc, v9, v11 - s_cbranch_vccnz L_LOOP_READ - s_branch L_REPEAT - L_QUIT: - flat_store_dword v[4:5], v8 - s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish - s_endpgm -)"; - void KFDSVMEvictTest::SetUp() { ROUTINE_START @@ -158,13 +55,6 @@ void KFDSVMEvictTest::TearDown() { ROUTINE_END } -std::string KFDSVMEvictTest::CreateShader() { - if (m_FamilyId >= FAMILY_AI) - return gfx9_ReadMemory; - else - return gfx8_ReadMemory; -} - HSAint32 KFDSVMEvictTest::GetBufferCounter(HSAuint64 vramSize, HSAuint64 vramBufSize) { HSAuint64 vramBufSizeInPages = vramBufSize >> PAGE_SHIFT; HSAuint64 sysMemSize = GetSysMemSize(); @@ -419,7 +309,7 @@ TEST_F(KFDSVMEvictTest, QueueTest) { for (i = 0; i < wavefront_num; i++) *(localBufAddr + i) = pBuffers[i]; - ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CreateShader().c_str(), isaBuffer.As())); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As())); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); From 6b35fc4cf6e3cf08e5c24c3d0cf6f83b088feb9e Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 30 Sep 2021 18:04:51 -0400 Subject: [PATCH 18/27] kfdtest: Remove IsaGen from KFDLocalMemoryTest Replace with LLVM-based Assembler. 
Signed-off-by: Graham Sider Change-Id: Ie83d27b6a93ac1b5169a830a7f274e360a31023e [ROCm/ROCR-Runtime commit: 549f7cdce20156e4ef78ac5b44dae662e21a4a25] --- .../tests/kfdtest/src/KFDLocalMemoryTest.cpp | 13 ++++--------- .../tests/kfdtest/src/KFDLocalMemoryTest.hpp | 6 +----- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.cpp index a27b502f97..b37528c651 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.cpp @@ -33,18 +33,12 @@ void KFDLocalMemoryTest::SetUp() { KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDLocalMemoryTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -107,7 +101,7 @@ TEST_F(KFDLocalMemoryTest, BasicTest) { srcSysBuffer.Fill(0x01010101); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(srcLocalBuffer.As(), srcLocalBuffer.Size(), &AlternateVAGPU, mapFlags, 1, reinterpret_cast(&defaultGPUNode))); @@ -164,7 +158,7 @@ TEST_F(KFDLocalMemoryTest, VerifyContentsAfterUnmapAndMap) { SysBufferA.Fill(0x01010101); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.SetSkipWaitConsump(0); @@ -303,7 +297,8 @@ TEST_F(KFDLocalMemoryTest, Fragmentation) { PM4Queue queue; ASSERT_SUCCESS(queue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); /* Allocate and test memory using the strategy explained at the top */ HSAKMT_STATUS status; diff --git 
a/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.hpp index 519081cfc0..b3a9b2add6 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDLocalMemoryTest.hpp @@ -26,20 +26,16 @@ #include -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" class KFDLocalMemoryTest : public KFDBaseComponentTest { public: - KFDLocalMemoryTest() :m_pIsaGen(NULL) {} + KFDLocalMemoryTest() {} ~KFDLocalMemoryTest() {} protected: virtual void SetUp(); virtual void TearDown(); - - protected: // Members - IsaGenerator* m_pIsaGen; }; #endif // __KFD_LOCALMEMORY_TEST__H__ From ac1ba05c1229e3372c8752d084315a658f072e4f Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 30 Sep 2021 18:16:57 -0400 Subject: [PATCH 19/27] kfdtest: Remove IsaGen from KFDSVMRangeTest Replace with LLVM-based Assembler. Signed-off-by: Graham Sider Change-Id: Id05f8a18a188d0ad354b711c1c196b71dffcc756 [ROCm/ROCR-Runtime commit: 469d5e67d234bcfe98a76c9d2538ce7e1031549a] --- .../tests/kfdtest/src/KFDSVMRangeTest.cpp | 20 +++++++++---------- .../tests/kfdtest/src/KFDSVMRangeTest.hpp | 6 +----- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.cpp index 6aad683f91..283a567ff4 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.cpp @@ -34,8 +34,6 @@ void KFDSVMRangeTest::SetUp() { KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - SVMSetXNACKMode(); ROUTINE_END @@ -44,10 +42,6 @@ void KFDSVMRangeTest::SetUp() { void KFDSVMRangeTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - SVMRestoreXNACKMode(); KFDBaseComponentTest::TearDown(); @@ -80,7 +74,7 @@ TEST_F(KFDSVMRangeTest, 
BasicSystemMemTest) { srcSysBuffer.Fill(0x01010101); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.SetSkipWaitConsump(0); @@ -364,7 +358,8 @@ TEST_F(KFDSVMRangeTest, EvictSystemRangeTest) { ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode)); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); Dispatch dispatch0(isaBuffer); dispatch0.SetArgs(srcBuffer.As(), dstBuffer.As()); @@ -458,7 +453,8 @@ TEST_F(KFDSVMRangeTest, PartialUnmapSysMemTest) { munmap(pBuf2, Buf2Size); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); Dispatch dispatch(isaBuffer); @@ -507,7 +503,7 @@ TEST_F(KFDSVMRangeTest, BasicVramTest) { srcSysBuffer.Fill(0x01010101); - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); ASSERT_SUCCESS(queue.Create(defaultGPUNode)); queue.SetSkipWaitConsump(0); @@ -943,7 +939,9 @@ TEST_F(KFDSVMRangeTest, MigratePolicyTest) { #ifdef USE_PM4_QUEUE_TRIGGER_VM_FAULT HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode); PM4Queue queue; - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) { diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.hpp index 88bddd94a8..03a245dba3 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDSVMRangeTest.hpp @@ -26,21 +26,17 @@ #include -#include "IsaGenerator.hpp" #include 
"KFDBaseComponentTest.hpp" class KFDSVMRangeTest : public KFDBaseComponentTest { public: - KFDSVMRangeTest() :m_pIsaGen(NULL) {} + KFDSVMRangeTest() {} ~KFDSVMRangeTest() {} void SplitRangeTest(int defaultGPUNode, int prefetch_location); protected: virtual void SetUp(); virtual void TearDown(); - - protected: // Members - IsaGenerator* m_pIsaGen; }; #endif // __KFD_LOCALMEMORY_TEST__H__ From 904f0ade4f740b1d1ed9b37036b0f1bce825a390 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 30 Sep 2021 18:17:26 -0400 Subject: [PATCH 20/27] kfdtest: Remove IsaGen from RDMATest Replace with LLVM-based Assembler. Signed-off-by: Graham Sider Change-Id: I5dff1b9402e294af33cec78a24e2e2decfb5b6d3 [ROCm/ROCR-Runtime commit: 8a6743aef4a5b4fe15826ce093811c23cd719787] --- projects/rocr-runtime/tests/kfdtest/src/RDMATest.cpp | 8 ++------ projects/rocr-runtime/tests/kfdtest/src/RDMATest.hpp | 6 +----- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/RDMATest.cpp b/projects/rocr-runtime/tests/kfdtest/src/RDMATest.cpp index 973dbdcdc8..d44d69895e 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/RDMATest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/RDMATest.cpp @@ -34,16 +34,11 @@ void RDMATest::SetUp() { KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void RDMATest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; KFDBaseComponentTest::TearDown(); @@ -77,7 +72,8 @@ TEST_F(RDMATest, GPUDirect) { srcSysBuffer.Fill(0xfe); /* Put 'copy dword' command to ISA buffer */ - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); + ASSERT_SUCCESS(queue.Create(defaultGPUNode)); Dispatch dispatch(isaBuffer); diff --git a/projects/rocr-runtime/tests/kfdtest/src/RDMATest.hpp b/projects/rocr-runtime/tests/kfdtest/src/RDMATest.hpp index 3e4b2331aa..1d16853838 100644 --- 
a/projects/rocr-runtime/tests/kfdtest/src/RDMATest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/RDMATest.hpp @@ -26,20 +26,16 @@ #include -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" class RDMATest : public KFDBaseComponentTest { public: - RDMATest():m_pIsaGen(NULL) {} + RDMATest() {} ~RDMATest() {} protected: virtual void SetUp(); virtual void TearDown(); - - protected: // Members - IsaGenerator* m_pIsaGen; }; #endif // __RDMA_TEST__H__ From 6b7da770ddc0dcdad02c6a77f54b611d11cd03f3 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 30 Sep 2021 18:32:48 -0400 Subject: [PATCH 21/27] kfdtest: Remove IsaGen from KFDHWSTest Replace with LLVM-based Assembler. Signed-off-by: Graham Sider Change-Id: Ibbc4103d9498321b87feadf14a523b0d44d1851c [ROCm/ROCR-Runtime commit: 780f0b618c9751e21308723ac6860ded4d83a3c3] --- projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.cpp | 10 +++------- projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.hpp | 7 +------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.cpp index 66c0b5d8ff..3040b16ac6 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.cpp @@ -28,18 +28,12 @@ void KFDHWSTest::SetUp() { KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDHWSTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); ROUTINE_END @@ -70,7 +64,9 @@ void KFDHWSTest::RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops) // Run work on all queues HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); - m_pIsaGen->GetNoopIsa(isaBuffer); + + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As())); + for (l = 0; l < nLoops; l++) { for (q = 0; q < nQueues; q++) 
{ if (dispatch[q]) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.hpp index e3ea5155c2..cbec52fbed 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDHWSTest.hpp @@ -27,14 +27,12 @@ #include #include "PM4Queue.hpp" -#include "IsaGenerator.hpp" #include "KFDMultiProcessTest.hpp" #include "Dispatch.hpp" class KFDHWSTest : public KFDMultiProcessTest { public: - KFDHWSTest():m_pIsaGen(NULL) {} - + KFDHWSTest() {} ~KFDHWSTest() {} protected: @@ -42,9 +40,6 @@ class KFDHWSTest : public KFDMultiProcessTest { virtual void TearDown(); void RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops); - - protected: // Members - IsaGenerator* m_pIsaGen; }; #endif // __KFD_QCM_TEST__H__ From 537381a123ee35489ac5f3c32703a95a5bd6aeaf Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 30 Sep 2021 18:33:06 -0400 Subject: [PATCH 22/27] kfdtest: Remove IsaGen from KFDExceptionTest Replace with LLVM-based Assembler. 
Signed-off-by: Graham Sider Change-Id: Ia64b8cc77382773b48de576d350bbed3c1efdb74 [ROCm/ROCR-Runtime commit: 4c7cf6e7d22f4439022e44756c6d5c0029179211] --- .../rocr-runtime/tests/kfdtest/src/KFDExceptionTest.cpp | 9 ++------- .../rocr-runtime/tests/kfdtest/src/KFDExceptionTest.hpp | 5 +---- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.cpp index 11df6279c9..9fa15e0969 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.cpp @@ -33,18 +33,12 @@ void KFDExceptionTest::SetUp() { KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDExceptionTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; - KFDBaseComponentTest::TearDown(); // WORKAROUND: This needs to be fixed in the kernel @@ -75,7 +69,8 @@ void KFDExceptionTest::TestMemoryException(int defaultGPUNode, HSAuint64 pSrc, eventDesc.SyncVar.SyncVar.UserData = NULL; eventDesc.SyncVar.SyncVarSize = 0; - m_pIsaGen->GetCopyDwordIsa(isaBuffer); + ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As())); + m_ChildStatus = queue.Create(defaultGPUNode); if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) { WARN() << "Queue create failed" << std::endl; diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.hpp index 00b45fe5db..df57649e2c 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDExceptionTest.hpp @@ -26,12 +26,11 @@ #include -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" class KFDExceptionTest : public KFDBaseComponentTest { public: - KFDExceptionTest() :m_pIsaGen(NULL), m_ChildPid(-1) { + KFDExceptionTest() : m_ChildPid(-1) { /* Because there 
could be early return before m_ChildPid is set * by fork(), we should initialize m_ChildPid to a non-zero value * to avoid possible exit of the main process. @@ -59,8 +58,6 @@ class KFDExceptionTest : public KFDBaseComponentTest { protected: // Members pid_t m_ChildPid; HSAKMT_STATUS m_ChildStatus; - - IsaGenerator* m_pIsaGen; }; #endif // __KFD_EXCEPTION_TEST__H__ From 8753861e5bfa8563e582052f878d560893d5ce17 Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Thu, 30 Sep 2021 18:33:23 -0400 Subject: [PATCH 23/27] kfdtest: Remove IsaGen from KFDDBGTest KFDDBGTest is deprecated, so just removing references to IsaGen. Signed-off-by: Graham Sider Change-Id: I9f094d847a8ae43cb3793253b34a7d7ed2179ac1 [ROCm/ROCR-Runtime commit: ac48163885744426ebd01372ca3dd6ebf23f7f6e] --- projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.cpp | 5 ----- projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.hpp | 6 +----- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.cpp index b7a38bbd46..f256d8a135 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.cpp @@ -176,16 +176,11 @@ void KFDDBGTest::SetUp() { KFDBaseComponentTest::SetUp(); - m_pIsaGen = IsaGenerator::Create(m_FamilyId); - ROUTINE_END } void KFDDBGTest::TearDown() { ROUTINE_START - if (m_pIsaGen) - delete m_pIsaGen; - m_pIsaGen = NULL; /* Reset the user trap handler */ hsaKmtSetTrapHandler(m_NodeInfo.HsaDefaultGPUNode(), 0, 0, 0, 0); diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.hpp index c4b46b296b..dccedc3fd3 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDDBGTest.hpp @@ -26,20 +26,16 @@ #include -#include "IsaGenerator.hpp" #include "KFDBaseComponentTest.hpp" class KFDDBGTest : public 
KFDBaseComponentTest { public: - KFDDBGTest() :m_pIsaGen(NULL) {} + KFDDBGTest() {} ~KFDDBGTest() {} protected: virtual void SetUp(); virtual void TearDown(); - - protected: // Members - IsaGenerator* m_pIsaGen; }; #endif // __KFD_DBG_TEST__H__ From d31d1ca19adb3391b6f55bfe589d23ac3568abbc Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 8 Oct 2021 14:52:45 -0400 Subject: [PATCH 24/27] kfdtest: Remove IsaGen from KFDIPCTest Signed-off-by: Graham Sider Change-Id: I3194e6a6bdab846da9cf346f47a7d7580e2def4d [ROCm/ROCR-Runtime commit: ffaa3d924695a622847cb07b039b109f8dbb8155] --- projects/rocr-runtime/tests/kfdtest/src/KFDIPCTest.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDIPCTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDIPCTest.hpp index 961ecbd9e9..3ce0aa12d8 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/KFDIPCTest.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDIPCTest.hpp @@ -23,7 +23,6 @@ #include "KFDBaseComponentTest.hpp" #include "BaseQueue.hpp" -#include "IsaGenerator.hpp" #ifndef __KFD_MEMORY_TEST__H__ #define __KFD_MEMORY_TEST__H__ From 36ee679ca3752ed398522b600d7878a0e8df489d Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Fri, 8 Oct 2021 14:50:04 -0400 Subject: [PATCH 25/27] kfdtest: Remove SP3/IsaGen - Remove SP3 lib directory - Remove IsaGenerator* files - Update CMakeLists accordingly Signed-off-by: Graham Sider Change-Id: I280161b0b238839ab318c18d6038cdd64fa66208 [ROCm/ROCR-Runtime commit: e6e498abf3f19a03c344955bbf4b7f5c331c6606] --- .../rocr-runtime/tests/kfdtest/CMakeLists.txt | 12 +- .../rocr-runtime/tests/kfdtest/sp3/README.txt | 6 - .../sp3/lib_helper/AMD_opensource_license.txt | 23 - .../kfdtest/sp3/lib_helper/CMakeLists_sp3.txt | 79 --- .../tests/kfdtest/sp3/lib_helper/build_sp3.sh | 57 -- projects/rocr-runtime/tests/kfdtest/sp3/sp3.h | 643 ------------------ .../tests/kfdtest/src/IsaGenerator.cpp | 126 ---- .../tests/kfdtest/src/IsaGenerator.hpp | 52 -- 
.../kfdtest/src/IsaGenerator_Aldebaran.cpp | 113 --- .../kfdtest/src/IsaGenerator_Aldebaran.hpp | 49 -- .../tests/kfdtest/src/IsaGenerator_Gfx10.cpp | 142 ---- .../tests/kfdtest/src/IsaGenerator_Gfx10.hpp | 49 -- .../tests/kfdtest/src/IsaGenerator_Gfx72.cpp | 123 ---- .../tests/kfdtest/src/IsaGenerator_Gfx72.hpp | 49 -- .../tests/kfdtest/src/IsaGenerator_Gfx8.cpp | 128 ---- .../tests/kfdtest/src/IsaGenerator_Gfx8.hpp | 49 -- .../tests/kfdtest/src/IsaGenerator_Gfx9.cpp | 113 --- .../tests/kfdtest/src/IsaGenerator_Gfx9.hpp | 49 -- 18 files changed, 1 insertion(+), 1861 deletions(-) delete mode 100644 projects/rocr-runtime/tests/kfdtest/sp3/README.txt delete mode 100644 projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/AMD_opensource_license.txt delete mode 100644 projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt delete mode 100755 projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/build_sp3.sh delete mode 100644 projects/rocr-runtime/tests/kfdtest/sp3/sp3.h delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.cpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.hpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.cpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.hpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.cpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.hpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.cpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.hpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.cpp delete mode 100644 projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.hpp diff --git 
a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt index e175253dad..b1208f54c2 100644 --- a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt +++ b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt @@ -128,12 +128,9 @@ add_definitions(${LLVM_DEFINITIONS_LIST}) llvm_map_components_to_libnames(llvm_libs AMDGPUAsmParser Core Support) -set ( SP3_DIR ${PROJECT_SOURCE_DIR}/sp3 ) - include_directories(${PROJECT_SOURCE_DIR}/gtest-1.6.0) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/../../include) -include_directories(${SP3_DIR}) include_directories(${DRM_INCLUDE_DIRS}) @@ -147,12 +144,6 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp src/IndirectBuffer.cpp src/Assemble.cpp src/ShaderStore.cpp - src/IsaGenerator.cpp - src/IsaGenerator_Aldebaran.cpp - src/IsaGenerator_Gfx10.cpp - src/IsaGenerator_Gfx72.cpp - src/IsaGenerator_Gfx8.cpp - src/IsaGenerator_Gfx9.cpp src/LinuxOSWrapper.cpp src/PM4Packet.cpp src/PM4Queue.cpp @@ -216,11 +207,10 @@ endif () # The modules found by pkg_check_modules() in the default pkg config # path do not need to use link_directories() here. 
link_directories(${HSAKMT_LIBRARY_DIRS}) -link_directories(${SP3_DIR}) add_executable(kfdtest ${SRC_FILES}) -target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt amdsp3 numa) +target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt numa) configure_file ( scripts/kfdtest.exclude kfdtest.exclude COPYONLY ) configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY ) diff --git a/projects/rocr-runtime/tests/kfdtest/sp3/README.txt b/projects/rocr-runtime/tests/kfdtest/sp3/README.txt deleted file mode 100644 index 7cbe800f50..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/sp3/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -Note: This folder is primarily intended for AMD internal developers. - -The folder lib_helper contains the script to generate SP3 library libamdsp3.a -and the associated header files in the current folder for kfdtest to use. -cmake is required for the script to run. Just run ./build_sp3.sh after setting -up the environment variables (source build/envsetup.sh). diff --git a/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/AMD_opensource_license.txt b/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/AMD_opensource_license.txt deleted file mode 100644 index 673285ddb4..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/AMD_opensource_license.txt +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - diff --git a/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt b/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt deleted file mode 100644 index ce8a3cb33f..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/CMakeLists_sp3.txt +++ /dev/null @@ -1,79 +0,0 @@ -# -# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. 
-# -# - -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) - -project(amdsp3) - -#set ( CMAKE_VERBOSE_MAKEFILE on ) - -find_package(PkgConfig) - -set ( P4_PATH $ENV{WORK_ROOT}/p4/driver/drivers ) - -set ( SCLIB_SRC ${PROJECT_SOURCE_DIR} ) -#if( DEFINED ENV{SCLIB_SRC} ) -# set ( SCLIB_SRC $ENV{SCLIB_SRC} ) -#else() -# set ( SCLIB_SRC ${P4_PATH}/sc/Chip ) -#endif() - -include_directories(${SCLIB_SRC}/sp3) -#include_directories(${SCLIB_SRC}/sp3/release_headers) -include_directories(${SCLIB_SRC}/sp3/gen) - -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-asic.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-dispatch.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-eval.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-gc.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-int.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-lib.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-native.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-cipher.c ) -set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-vm.c ) - -aux_source_directory(${SCLIB_SRC}/sp3/gen SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/si/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/ci/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx8/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/lib SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/arch SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/arch SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/arch SRC_FILES) -aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/arch SRC_FILES) - - -message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} ) -#message( STATUS "SRC_FILES: ") -#foreach(file ${SRC_FILES}) -# message(STATUS 
"${file}") -#endforeach() - -set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX101_BUILD -DALDBRN_BUILD" ) - -add_library(amdsp3 ${SRC_FILES}) - - diff --git a/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/build_sp3.sh b/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/build_sp3.sh deleted file mode 100755 index f93f145da6..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/sp3/lib_helper/build_sp3.sh +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. 
-# -# - -#!/bin/bash - -if [ "$KFDTEST_ROOT" == "" ] || [ "$P4_ROOT" == "" ]; then - echo "Environment variables should be set before running this script" - exit 1 -fi - -cd $KFDTEST_ROOT/sp3/lib_helper - -SP3_PROJECT=$P4_ROOT/driver/drivers/sc/Chip/ -LIB_OUTPUT=$KFDTEST_ROOT/sp3/ - -cp CMakeLists_sp3.txt $SP3_PROJECT/CMakeLists.txt - -mkdir -p build -echo "Building SP3 lib" -pushd build -cmake $SP3_PROJECT/ -make -popd - -rsync --progress -a build/libamdsp3.a $LIB_OUTPUT -# Put the intermediate header files in the current folder for further processing -rsync --progress -a $SP3_PROJECT/sp3/public/lib/sp3.h . - -# Remove the build folder and CMakeLists.txt put into SP source folder -rm -r build -rm $SP3_PROJECT/CMakeLists.txt - -# Replace the license statement in the header files -{ cat AMD_opensource_license.txt; sed -e '1,/#ifndef/ { /#ifndef/b; d }' sp3.h; } > $LIB_OUTPUT/sp3.h - -# Delete the intermediate header files -rm sp3.h diff --git a/projects/rocr-runtime/tests/kfdtest/sp3/sp3.h b/projects/rocr-runtime/tests/kfdtest/sp3/sp3.h deleted file mode 100644 index 513167d595..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/sp3/sp3.h +++ /dev/null @@ -1,643 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#ifndef __SP3_H__ -#define __SP3_H__ - -#ifdef __cplusplus -extern "C" { -#endif - - -/// @file sp3.h -/// @brief sp3 API -#include - -// Export tags -#define SP3_EXPORT - - -/// @defgroup sp3main SP3 Main API -/// -/// Main API to assemble and disassemble SP3 shaders. -/// -/// @{ - - -/// Valid shader stages. -enum sp3_shtype { - SP3_SHTYPE_NONE = -1, - SP3_SHTYPE_PS = 0, - SP3_SHTYPE_VS = 1, - SP3_SHTYPE_GS = 2, - SP3_SHTYPE_ES = 3, - SP3_SHTYPE_HS = 4, - SP3_SHTYPE_LS = 5, - SP3_SHTYPE_CS = 6, -#ifdef NAVI10LITE_BUILD - SP3_SHTYPE_ACV = 7, -#endif -}; - -/// Assorted constants used by sp3 API. -enum sp3_count { - SP3_NUM_MRT = 8, ///< Maximum number of render targets supported. - SP3_NUM_STRM = 4, ///< Maximum number of streams supported. -}; - -/// Disassembly flags. Bitwise-OR flags to set options. -enum sp3_flag { - SP3DIS_NO_STATE = 0x01, ///< Do not include state header at top of shader. - SP3DIS_NO_BINARY = 0x02, ///< Do not include comments with raw binary microcode. - SP3DIS_COMMENTS = 0x04, ///< Do not include comments. - SP3DIS_NO_GPR_COUNT = 0x08, ///< Do not include GPR allocation counts. - SP3DIS_FORCEVALID = 0x10, ///< Force all bytes of microcode to be disassembled. - SP3DIS_NO_ASIC = 0x20, ///< Do not emit the asic header at top of shader. -}; - -/// Shader context. Contains no user-visible fields. -struct sp3_context; - -/// Memory object. Contains no user-visible fields. 
-struct sp3_vma; - -/// VM addresses are 64-bit and the address unit is 32 bits -typedef uint64_t sp3_vmaddr; - -/// Storage entry for register streams. -struct sp3_reg { - uint32_t index; ///< One of the MM aperture register addresses. - uint32_t value; ///< 32-bit register data. -}; - -/// Bits for a single instruction. -struct sp3_inst_bits { - uint32_t val[5]; ///< Largest single instruction in any backend is 5 dwords. -}; - -/// Wrapped shader metadata. -/// -/// After generation, shaders are encapsulated in sp3_shader structures. -/// -/// Those structures contain the shader binary, its register stream, constants and constant -/// buffers and metadata needed for SC compatibility. -/// -struct sp3_shader { - enum sp3_shtype type; ///< One of the SHTYPE_* constants. - uint32_t asic_int; ///< Internal ASIC index. Do not use. - char asic[0x100]; ///< ASIC name as a string ("RV870" etc). - uint32_t size; ///< Size of the compiled shader, in 32-bit words. - uint32_t nsgprs; ///< Number of scalar GPRs used. - uint32_t nvgprs; ///< Number of vector GPRs used. - uint32_t nsvgprs; ///< Number of shared vector GPRs used (only available in certain projects). - uint32_t naccvgprs; ///< Number of accumulator vector GPRs used (only available in certain projects). 
- uint32_t nsgprs_manual_alloc; - uint32_t nvgprs_manual_alloc; - uint32_t nsvgprs_manual_alloc; - uint32_t naccvgprs_manual_alloc; - uint32_t trap_present; - uint32_t user_sgpr_count; - uint32_t scratch_en; - uint32_t dispatch_draw_en; - uint32_t so_en; - uint32_t so_base0_en; - uint32_t so_base1_en; - uint32_t so_base2_en; - uint32_t so_base3_en; - uint32_t oc_lds_en; - uint32_t tg_size_en; - uint32_t tidig_comp_cnt; ///< Number of components(-1) enabled for thread id in group - uint32_t tgid_x_en; - uint32_t tgid_y_en; - uint32_t tgid_z_en; - uint32_t wave_cnt_en; - uint32_t primgen_en; - uint32_t pc_base_en; - uint32_t sgpr_scratch; - uint32_t sgpr_psvs_state; - uint32_t sgpr_gs2vs_offset; - uint32_t sgpr_so_write_index; - uint32_t sgpr_so_base_offset0; - uint32_t sgpr_so_base_offset1; - uint32_t sgpr_so_base_offset2; - uint32_t sgpr_so_base_offset3; - uint32_t sgpr_offchip_lds; - uint32_t sgpr_is_offchip; - uint32_t sgpr_ring_offset; - uint32_t sgpr_gs_wave_id; - uint32_t sgpr_global_wave_id; - uint32_t sgpr_tg_size; - uint32_t sgpr_tgid_x; - uint32_t sgpr_tgid_y; - uint32_t sgpr_tgid_z; - uint32_t sgpr_tf_base; - uint32_t sgpr_pc_base; - uint32_t sgpr_wave_cnt; - uint32_t wave_size; ///< Number of threads in a wavefront (only certain ASICs; 0 = don't care). - uint32_t pc_exports; ///< Range of parameters exported (if VS). - uint32_t pos_export; ///< Shader executes a position export (if VS). - uint32_t cb_exports; ///< Range of MRTs exported (if PS). - uint32_t mrtz_export_format;///< Export format of the mrtz export. - uint32_t z_export; ///< Shader executes a Z export (if PS). - uint32_t pops_en; ///< Shader is POPS (PS) - uint32_t pops_num_samples; ///< (PS) - uint32_t load_collision_waveid; ///< Shader sets load collision waveid (if PS). - uint32_t load_intrawave_collision; ///< Shader is in intrawave mode (if PS). - uint32_t stencil_test_export; ///< Shader exports stencil (if PS). - uint32_t stencil_op_export; ///< Shader exports stencil (if PS). 
- uint32_t kill_used; ///< Shader executes ALU KILL operations. - uint32_t cb_masks[SP3_NUM_MRT]; ///< Component masks for each MRT exported (if PS). - uint32_t emit_used; ///< EMIT opcodes used (if GS). - uint32_t covmask_export; ///< Shader exports coverage mask (if PS). - uint32_t mask_export; ///< Shader exports mask (if PS). - uint32_t strm_used[SP3_NUM_STRM]; ///< Streamout operations used (map). - uint32_t scratch_used; ///< Scratch SMX exports used. - uint32_t scratch_itemsize; ///< Scratch ring item size. - uint32_t reduction_used; ///< Reduction SMX exports used. - uint32_t ring_used; ///< ESGS/GSVS ring SMX exports used. - uint32_t ring_itemsize; ///< ESGS/GSVS ring item size (for ES/GS respectively). - uint32_t vertex_size[4]; ///< GSVS ring vertex size (for GS). - uint32_t mem_used; ///< Raw memory SMX exports used. - uint32_t rats_used; ///< Mask of RATs (UAVs) used - uint32_t group_size[3]; ///< Wavefront group size (for ELF files). - uint32_t alloc_lds; ///< Number of LDS bytes allocated for wave group. (translates to lds_size in CS and LS) - uint32_t *data; ///< Shader binary data. - uint32_t nregs; ///< Number of register writes in the stream. - uint64_t crc64; ///< CRC64 of compiled shader, may be used for identification/fingerprinting. - uint32_t crc32; ///< 32-bit CRC of compiled shader (based on crc64), may be used for identification/fingerprinting. - struct sp3_reg *regs; ///< Register writes (index-value pairs). - struct sp3_shader *merged_2nd_shader; ///< Merged es/gs, ls/hs shader, this points to start of the second shader (only certain ASICs). -}; - -/// Comment callback. -typedef const char *(*sp3_comment_cb)(void *, int); - - -/// Get version of the sp3 library. -/// -/// @return String containing the version number. -/// -SP3_EXPORT const char *sp3_version(void); - -/// Create a new sp3 context. -/// -/// @return A new context for use in assembling and disassembling shaders. Free with sp3_close(). 
-/// -SP3_EXPORT struct sp3_context *sp3_new(void); - -/// Set option for sp3. -/// -/// @param state sp3 context. -/// @param option Option name. Unknown options will raise an error. -/// @param value Option value. NULL is used to represent value-less options. -/// -/// Currently supported options: -/// -/// stdlib (string) -- absolute path to standard library files. May be a colon-separated list -/// of paths that will be used to search for stdlib files. Used by sp3_parse_library(). -/// -/// The following options are deprecated because they take integer arguments; you should use -/// sp3_set_option_int() for these settings going forward. They will continue to be accepted by -/// this API to support legacy users. -/// -/// Werror (boolean) -- indicates whether warnings should be treated as errors. -/// -/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using -/// this shader. Ignored in certain ASICs. You may set this to 32, 64 or the special value 0 -/// to indicate no preference on wave size. The shader will be checked to ensure it is -/// compatible with the size specified here. -/// -/// omit_version (boolean) -- omit generation of the S_VERSION opcode. -/// -/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer. -/// -/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a -/// dangerous option to allow in general so you must explicitly enable this option, otherwise -/// the raw_bits() function will always error out. -/// -SP3_EXPORT void sp3_set_option( - struct sp3_context *state, - const char *option, - const char *value); - -/// Set option for sp3. -/// -/// @param state sp3 context. -/// @param option Option name. Unknown options will raise an error. -/// @param value Option value. -/// -/// Currently supported options: -/// -/// Werror (boolean) -- indicates whether warnings should be treated as errors. 
-/// -/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using -/// this shader. Ignored in certain ASICs. You may set this to 32, 64 or the special value 0 -/// to indicate no preference on wave size. The shader will be checked to ensure it is -/// compatible with the size specified here. -/// -/// omit_version (boolean) -- omit generation of the S_VERSION opcode. -/// -/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer. -/// -/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a -/// dangerous option to allow in general so you must explicitly enable this option, otherwise -/// the raw_bits() function will always error out. -/// -/// secure_mode (boolean) -- run in secure mode. Disables macro language features in assembly -/// path including calls to custom functions. Useful if sp3 is used as a backend to a web-based -/// assembly tool. -/// -/// debug_encoding (boolean) -- if true, debug encoding selection logic for assembly. Only -/// supported in 10.4+ backends. -/// -/// no_vs_export_check (boolean) -- if true, disable VS export sanity check. Only supported in -/// 10.4+ backends. -/// -SP3_EXPORT void sp3_set_option_int( - struct sp3_context *state, - const char *option, - int32_t value); - -/// Parse a file into a context. -/// -/// Use sp3_compile to generate binary microcode after the shader is parsed. -/// -/// @param state Context to use for parsing. -/// @param file File to read. If NULL, parse from stdin. -/// -SP3_EXPORT void sp3_parse_file(struct sp3_context *state, const char *file); - -/// Parse a string into a context. -/// -/// Use sp3_compile to generate binary microcode after the shader is parsed. -/// -/// @param state Context to use for parsing. -/// @param string String to parse. -/// -SP3_EXPORT void sp3_parse_string(struct sp3_context *state, const char *string); - -/// Parse a file from the standard library into a context. 
-/// -/// Use sp3_compile to generate binary microcode after the shader is parsed. -/// -/// @param state Context to use for parsing. -/// @param name Path to the standard library; files in this directory are parsed. -/// -SP3_EXPORT void sp3_parse_library(struct sp3_context *state, const char *name); - -/// Call a sp3 function. -/// -SP3_EXPORT void sp3_call(struct sp3_context *state, const char *func); - -/// Compile a shader program that has been parsed into the context. -/// -/// @param state sp3 context. -/// @param cffunc Name of clause to call. By convention, this is "main". -/// @return A compiled and linked shader. Free memory with sp3_free_shader(). -/// -SP3_EXPORT struct sp3_shader *sp3_compile( - struct sp3_context *state, - const char *cffunc); - -/// Free a sp3_shader. -/// -/// @param sh Shader object to delete. -/// -SP3_EXPORT void sp3_free_shader(struct sp3_shader *sh); - -/// Get current ASIC name set for a context. -/// -/// @param state Context to query. -/// @return Name of ASIC. -/// -SP3_EXPORT const char *sp3_getasic(struct sp3_context *state); - -/// Set current ASIC name for a context. -/// -/// @param state Context to modify. -/// @param chip Case-insensitive string representing the ASIC to compile or disassemble for. -/// -SP3_EXPORT void sp3_setasic(struct sp3_context *state, const char *chip); - -/// Set global variable in context to an integer. -/// -SP3_EXPORT void sp3_set_param_int( - struct sp3_context *state, - const char *name, - int32_t value); - -/// Set global variable in context to an integer vector. -/// -SP3_EXPORT void sp3_set_param_intvec( - struct sp3_context *state, - const char *name, - uint32_t size, - const int32_t *value); - -/// Set global variable in context to a float. -/// -SP3_EXPORT void sp3_set_param_float( - struct sp3_context *state, - const char *name, - float value); - -/// Set global variable in context to a float vector. 
-/// -SP3_EXPORT void sp3_set_param_floatvec( - struct sp3_context *state, - const char *name, - uint32_t size, - const float *value); - -/// Set error message header. -/// -/// @param state Context to modify. -/// @param str Text to include in error message header. -/// -SP3_EXPORT void sp3_set_error_header(struct sp3_context *state, const char *str); - -/// Get ASIC metrics for the ASIC in current state. -/// -/// Used by ELF tools to fill in some CAL fields. -/// -/// @param state Context to query. -/// @param name Name of ASIC metric. -/// @return Value of ASIC metric. -/// -SP3_EXPORT int sp3_asicinfo(struct sp3_context *state, const char *name); - -/// Free a context allocated by sp3_new/open/parse. -/// -/// @param state Context to delete. -/// -SP3_EXPORT void sp3_close(struct sp3_context *state); - -/// Disassemble a shader. -/// -/// This call is likely to change to something that will take a filled sp3_shader structure -/// later on. -/// -/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). -/// @param bin Memory map with the opcodes (see sp3-vm.h). -/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words). -/// @param name Same to give the disassembled shader. -/// @param shader_type One of the SHTYPE_* constants. -/// @param include Literal text to include in the CF clause (NULL includes nothing). -/// @param max_len Maximum length of CF clause. Matters if SP3DIS_FORCEVALID is set. -/// @param flags A bitmask of SP3DIS_* flags. -/// -/// @return Shader disassembly as a string. Free memory with sp3_free(). -/// -SP3_EXPORT char *sp3_disasm( - struct sp3_context *state, - struct sp3_vma *bin, - sp3_vmaddr base, - const char *name, - enum sp3_shtype shader_type, - const char *include, - uint32_t max_len, - uint32_t flags); - -/// Disassemble a single shader instruction. -/// -/// This call is likely to change to something that will take a filled sp3_shader structure -/// later on. 
-/// -/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). -/// @param inst Pointer to dwords containing instruction (exact number of dwords required depends on instruction). -/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words). -/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words). -/// @param shader_type One of the SHTYPE_* constants. -/// @param flags A mask of SP3DIS_* flags. -/// -/// @return Shader disassembly as a string. Free memory with sp3_free(). -/// -SP3_EXPORT char *sp3_disasm_inst( - struct sp3_context *state, - const struct sp3_inst_bits *inst, - sp3_vmaddr base, - sp3_vmaddr addr, - enum sp3_shtype shader_type, - uint32_t flags); - -/// Parse a register stream. -/// -/// Can be called before sp3_disasm to preset things like ALU, boolean and loop constants. -/// -/// This call is likely to merge with sp3_disasm later on. -/// -/// @param state sp3 context to fill with state. -/// @param nregs Number of register entries. -/// @param regs Register stream to parse. -/// @param shader_type One of the SHTYPE_* constants. -/// -SP3_EXPORT void sp3_setregs( - struct sp3_context *state, - uint32_t nregs, - const struct sp3_reg *regs, - enum sp3_shtype shader_type); - - -/// Set shader comments -/// -/// @param state sp3 context. -/// @param map Map of comments (0 for no comment, other values will be passed to the callback). -/// @param f_top Callback returning comment to place above the opcode. -/// @param f_right Callback returning comment to place to the right of the opcode. -/// @param ctx Void pointer to pass to comment callbacks. 
-/// -SP3_EXPORT void sp3_setcomments( - struct sp3_context *state, - struct sp3_vma *map, - sp3_comment_cb f_top, - sp3_comment_cb f_right, - void *ctx); - -/// Set alternate shader entry points -/// -/// Used for disassembly; this marks an additional location in memory -/// (besides the start address) where shader code may be found. Generally -/// required for jump tables and any case where the shader may perform -/// indirect jumps to ensure that disassembly locates all shader -/// instructions. -/// -/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). -/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words). -/// -SP3_EXPORT void sp3_setentrypoint( - struct sp3_context *state, - sp3_vmaddr addr); - -/// Clear alternate shader entry points. -/// -/// Clear all entry points previously set with sp3_setentrypoint. -/// -/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC). -/// -SP3_EXPORT void sp3_clearentrypoints(struct sp3_context *state); - -/// Free memory allocated by sp3. -/// -/// Windows DLLs that allocate memory have to free it. This function -/// should be used to free the result of sp3_disasm, sp3_compile etc. -/// -SP3_EXPORT void sp3_free(void *ptr); - -/// SP3 API to merge two shaders given file names as input. -/// -SP3_EXPORT struct sp3_shader* sp3_merge_shaders( - struct sp3_context *pointer, - const char *first_file, - const char *second_file); - -/// SP3 API to merge two shaders given shader strings as input. -/// -SP3_EXPORT struct sp3_shader* sp3_merge_shader_strings( - struct sp3_context *pointer, - const char *first_string, - const char *second_string); - - -/// @} - - -/// @defgroup sp3vm SP3 Memory Objects -/// -/// The VM API is used to manage virtual memory maps. Those maps are used for binary storage -/// for disassembly, as they can naturally mirror the GPU's memory map (so no register -/// translation is needed). 
-/// -/// @{ - -/// Callback function that will fill a VMA on demand -/// -/// The VMA to be filled will be specified through the request address. -/// The callback should fill the VMA using sp3_vm_write calls. -/// -typedef void (* sp3_vmfill)(struct sp3_vma *vm, sp3_vmaddr addr, void *ctx); - -/// Create a new VM that is empty. -/// -/// Free the object with sp3_vm_free(). -/// -/// @return New VM object. -/// -SP3_EXPORT -struct sp3_vma *sp3_vm_new(void); - -/// Create a new VM that has a sp3_vmfill callback. -/// -/// Free the object with sp3_vm_free(). -/// -/// @param fill Function used to populate data in VM. The function will be pass the new VM object, the address and a context. -/// @param ctx User-specified context. Passed to the fill function and not used by sp3 itself. -/// @return New VM object. -/// -SP3_EXPORT -struct sp3_vma *sp3_vm_new_fill(sp3_vmfill fill, void *ctx); - -/// Create a new VM from an array of words. -/// -/// Free the object with sp3_vm_free(). -/// -/// @param base VM address to load array at. -/// @param len Number of 32-bit words in the array. -/// @param data Pointer to the array. -/// @return New VM object. -/// -SP3_EXPORT -struct sp3_vma *sp3_vm_new_ptr(sp3_vmaddr base, sp3_vmaddr len, const uint32_t *data); - -/// Find a VMA, optionally adding it. -/// -/// @param vm VM to search in. -/// @param addr Address to search for. -/// @param add Flag indicating whether a failure should result in adding a new VMA. -/// @return VM object matching the specified address. -/// -SP3_EXPORT -struct sp3_vma *sp3_vm_find(struct sp3_vma *vm, sp3_vmaddr addr, uint32_t add); - -/// Write a word to a VM. -/// -/// @param vm VM to write. -/// @param addr Address to write. -/// @param val 32-bits of data to write. -/// -SP3_EXPORT -void sp3_vm_write(struct sp3_vma *vm, sp3_vmaddr addr, uint32_t val); - -/// Read a word from a VM. -/// -/// @param vm VM to read. -/// @param addr Address to read. -/// @return 32-bits of data at specified address. 
-/// -SP3_EXPORT -uint32_t sp3_vm_read(struct sp3_vma *vm, sp3_vmaddr addr); - -/// Probe VM for presence. -/// -/// @param vm VM to probe. -/// @param addr Address to search for. -/// @return 1 if the specified address is backed in the VM, 0 otherwise. -/// -SP3_EXPORT -int sp3_vm_present(struct sp3_vma *vm, sp3_vmaddr addr); - -/// Return base address of VM. -/// -/// @param vm VM to query. -/// @return Base address. -/// -SP3_EXPORT -sp3_vmaddr sp3_vm_base(struct sp3_vma *vm); - -/// Return next VM. -/// -/// @param vm VM to query. -/// @return Next VM in list. -/// -SP3_EXPORT -struct sp3_vma *sp3_vm_next(struct sp3_vma *vm); - -/// Free a VM and all its storage. -/// -/// Use this function to free memory allocated by sp3_vm_new, sp3_vm_new_fill and -/// sp3_vm_new_ptr. -/// -/// @param vm VM to free. -/// -SP3_EXPORT -void sp3_vm_free(struct sp3_vma *vm); - - -/// @} - - -#ifdef __cplusplus -} -#endif - - -#endif /* __SP3_H__ */ diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.cpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.cpp deleted file mode 100644 index 3e69b5f9df..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "IsaGenerator.hpp" - -#include -#include - -#include "IsaGenerator_Gfx72.hpp" -#include "IsaGenerator_Gfx8.hpp" -#include "IsaGenerator_Gfx9.hpp" -#include "IsaGenerator_Gfx10.hpp" -#include "IsaGenerator_Aldebaran.hpp" - -#include "GoogleTestExtension.hpp" - -#include "sp3.h" - -const std::string IsaGenerator::ADDRESS_WATCH_SP3( - "var REG_TRAPSTS_EXCP_MASK = 0x000001ff\n" - "var WAVE_COUNT_OFFSET = 12\n" - "var TMA_CYCLE_OFFSET = 16\n" - "\n" - "/*\n" - " * ttmp[0:1] -- The ISA address that triggered this trap handler\n" - " * ttmp[10:11] -- The TMA user provided, used to store the debug info in this shader\n" - " * v[10:14] ttmp[7:8] -- temp use inside this shader\n" - " * s5 -- store the counts that this trap been triggered\n" - " * Each time when the trap is triggered , this shader will write\n" - " * ttmp[0] : ttmp[1] : Trap_Status : [reserved]\n" - " * to TMA + (trap count * TMA_CYCLE_OFFSET)\n" - " * The TMA + WAVE_COUNT_OFFSET(the first [reserved] address)\n" - " * used to store the total triggered trap count.\n" - " */\n" - "shader main\n" - "\n" - " asic(VI)\n" - "\n" - " type(CS)\n" - " v_mov_b32 v10, ttmp10\n" - " v_mov_b32 v11, ttmp11\n" - " s_mov_b32 ttmp7, s5\n" - " s_mulk_i32 ttmp7, TMA_CYCLE_OFFSET\n" - " s_addk_i32 s5, 1\n" - " v_mov_b32 v12, ttmp0\n" - " v_add_u32 v10, vcc, ttmp7, v10\n" - " flat_store_dword v[10,11], v12 slc glc\n" - " v_mov_b32 v12, ttmp1\n" - " v_add_u32 v10, vcc, 4, v10\n" - " flat_store_dword v[10,11], v12 slc glc\n" - 
" s_getreg_b32 ttmp8, hwreg(HW_REG_TRAPSTS)\n" - " s_and_b32 ttmp8, ttmp8, REG_TRAPSTS_EXCP_MASK\n" - " v_mov_b32 v12, ttmp8\n" - " v_add_u32 v10, vcc, 4, v10\n" - " flat_store_dword v[10,11], v12 glc\n" - " v_mov_b32 v10, ttmp10\n" - " v_add_u32 v10, vcc, WAVE_COUNT_OFFSET, v10\n" - " v_mov_b32 v13, 1\n" - " flat_atomic_add v14, v[10:11], v13 slc glc\n" - " s_and_b32 ttmp1, ttmp1, 0xffff\n" - " s_rfe_b64 [ttmp0,ttmp1]\n" - "end\n" -); - -IsaGenerator* IsaGenerator::Create(unsigned int familyId) { - switch (familyId) { - case FAMILY_CI: - case FAMILY_KV: - return new IsaGenerator_Gfx72; - case FAMILY_VI: - case FAMILY_CZ: - return new IsaGenerator_Gfx8; - case FAMILY_AI: - case FAMILY_RV: - case FAMILY_AR: - return new IsaGenerator_Gfx9; - case FAMILY_AL: - return new IsaGenerator_Aldbrn; - case FAMILY_NV: - return new IsaGenerator_Gfx10; - - default: - LOG() << "Error: Invalid ISA" << std::endl; - return NULL; - } -} - -void IsaGenerator::GetAwTrapHandler(HsaMemoryBuffer& rBuf) { - CompileShader(ADDRESS_WATCH_SP3.c_str(), "main", rBuf); -} - -void IsaGenerator::CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf) { - sp3_context* pSp3 = sp3_new(); - sp3_setasic(pSp3, GetAsicName().c_str()); - sp3_parse_string(pSp3, shaderCode); - sp3_shader* pShader = sp3_compile(pSp3, shaderName); - - std::copy(pShader->data, pShader->data + pShader->size, rBuf.As()); - sp3_free_shader(pShader); - - /** Inside this close function, there is an unknown reason of free memory not used by compiler. - * Comment out this as a workaround. System will do the garbage collection after this - * application is closed. 
- */ - // sp3_close(pSp3); -} diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.hpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.hpp deleted file mode 100644 index 4b9c49ad9e..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator.hpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef _ISAGENERATOR_H_ -#define _ISAGENERATOR_H_ - -#include "KFDTestUtil.hpp" - -/* isa generation class - interface */ -class IsaGenerator { - public: - static IsaGenerator* Create(unsigned int familyId); - - virtual ~IsaGenerator() {} - - virtual void GetNoopIsa(HsaMemoryBuffer& rBuf) = 0; - virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf) = 0; - virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) = 0; - virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf) = 0; - virtual void GetCwsrTrapHandler(HsaMemoryBuffer& rBuf) {} - virtual void GetAwTrapHandler(HsaMemoryBuffer& rBuf); - - void CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf); - - protected: - virtual const std::string& GetAsicName() = 0; - - private: - static const std::string ADDRESS_WATCH_SP3; -}; - -#endif // _ISAGENERATOR_H_ diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp deleted file mode 100644 index 2c377f9111..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "IsaGenerator_Aldebaran.hpp" - -#include -#include - -const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN"; - -/* The binaries are generated from following ISA */ -#if 0 -/* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */ -shader atomic_add -asic(ALDEBARAN) -type(CS) - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, 1 - flat_atomic_add v3, v[0:1], v2 slc glc scc - s_waitcnt 0 - s_endpgm -end - -shader copy_dword -asic(ALDEBARAN) -type(CS) -/* copy the parameters from scalar registers to vector registers */ - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, s2 - v_mov_b32 v3, s3 -/* copy a dword between the passed addresses */ - flat_load_dword v4, v[0:1] slc glc - s_waitcnt 0 - flat_store_dword v[2:3], v4 slc glc - s_endpgm -end - -shader main -asic(ALDEBARAN) -type(CS) -loop: - s_branch loop - s_endpgm -end - - -#endif - -const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = { - 0xbf810000 -}; - -const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = { - 0x7e000200, 0x7e020201, - 0x7e040202, 0x7e060203, - 0xdc530000, 0x047f0000, - 0xbf8c0000, 0xdc730000, - 0x007f0402, 0xbf810000 -}; - -const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = { - 0xbf82ffff, 0xbf810000 -}; - -const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = { - 0x7e000200, 0x7e020201, - 0x7e040281, 0xdf0b0000, - 0x037f0200, 0xbf8c0000, - 0xbf810000, 0x00000000 -}; - -void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) { - 
std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { - std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As()); -} - -void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { - std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As()); -} - -const std::string& IsaGenerator_Aldbrn::GetAsicName() { - return ASIC_NAME; -} - diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp deleted file mode 100644 index 5571b91c26..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Aldebaran.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#ifndef _ISAGENERATOR_ALDEBARAN_H_ -#define _ISAGENERATOR_ALDEBARAN_H_ - -#include -#include "IsaGenerator.hpp" - -class IsaGenerator_Aldbrn : public IsaGenerator { - public: - virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); - virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); - - protected: - virtual const std::string& GetAsicName(); - - private: - static const std::string ASIC_NAME; - - static const uint32_t NOOP_ISA[]; - static const uint32_t COPY_DWORD_ISA[]; - static const uint32_t INFINITE_LOOP_ISA[]; - static const uint32_t ATOMIC_ADD_ISA[]; -}; - -#endif // _ISAGENERATOR_ALDEBARAN_H_ diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.cpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.cpp deleted file mode 100644 index d8d33086e5..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "IsaGenerator_Gfx10.hpp" - -#include -#include - -/* The binaries are generated from following ISA */ -const std::string IsaGenerator_Gfx10::ASIC_NAME = "GFX10"; -#if 0 -static const char * atomic_add = \ -"\ -shader atomic_add \n\ -asic(GFX10) \n\ -wave_size(32) \n\ -type(CS) \n\ - v_mov_b32 v0, s0 \n\ - v_mov_b32 v1, s1 \n\ - v_mov_b32 v2, 1 \n\ - flat_atomic_add v3, v[0:1], v2 slc glc \n\ - s_waitcnt 0 \n\ - s_endpgm \n\ -end \n\ -"; - -static const char * copy_dword = \ -"\ -shader copy_dword \n\ -asic(GFX10) \n\ -wave_size(32) \n\ -type(CS) \n\ - v_mov_b32 v0, s0 \n\ - v_mov_b32 v1, s1 \n\ - v_mov_b32 v2, s2 \n\ - v_mov_b32 v3, s3 \n\ - flat_load_dword v4, v[0:1] slc glc \n\ - s_waitcnt 0 \n\ - flat_store_dword v[2:3], v4 slc glc \n\ - s_endpgm \n\ -end \n\ -"; - -static const char * loop= \ -"\ -shader loop \n\ -asic(GFX10) \n\ -type(CS) \n\ -wave_size(32) \n\ -loop: \n\ - s_branch loop \n\ - s_endpgm \n\ -end \n\ -"; - -static const char * noop= \ -"\ -shader noop \n\ -asic(GFX10) \n\ -type(CS) \n\ -wave_size(32) \n\ - s_endpgm \n\ -end \n\ -"; -#endif - -const uint32_t IsaGenerator_Gfx10::NOOP_ISA[] = { -0xb0804004, 0xbf810000, -0xbf9f0000, 0xbf9f0000, -0xbf9f0000, 0xbf9f0000, -0xbf9f0000 -}; - -const uint32_t IsaGenerator_Gfx10::COPY_DWORD_ISA[] = { -0xb0804004, 0x7e000200, -0x7e020201, 0x7e040202, -0x7e060203, 0xdc330000, -0x47d0000, 0xbf8c0000, -0xdc730000, 0x7d0402, -0xbf810000, 0xbf9f0000, -0xbf9f0000, 0xbf9f0000, -0xbf9f0000, 0xbf9f0000 -}; - 
-const uint32_t IsaGenerator_Gfx10::INFINITE_LOOP_ISA[] = { -0xbf82ffff, 0xb0804004, -0xbf810000, 0xbf9f0000, -0xbf9f0000, 0xbf9f0000, -0xbf9f0000, 0xbf9f0000 -}; - -const uint32_t IsaGenerator_Gfx10::ATOMIC_ADD_ISA[] = { -0xb0804004, 0x7e000200, -0x7e020201, 0x7e040281, -0xdccb0000, 0x37d0200, -0xbf8c0000, 0xbf810000, -0xbf9f0000, 0xbf9f0000, -0xbf9f0000, 0xbf9f0000, -0xbf9f0000 -}; - - -void IsaGenerator_Gfx10::GetNoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx10::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { - std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx10::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx10::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { - std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As()); -} - -const std::string& IsaGenerator_Gfx10::GetAsicName() { - return ASIC_NAME; -} - diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.hpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.hpp deleted file mode 100644 index e4a57cda56..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx10.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef _ISAGENERATOR_GFX10_H_ -#define _ISAGENERATOR_GFX10_H_ - -#include -#include "IsaGenerator.hpp" - -class IsaGenerator_Gfx10 : public IsaGenerator { - public: - virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); - virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); - - protected: - virtual const std::string& GetAsicName(); - - private: - static const std::string ASIC_NAME; - - static const uint32_t NOOP_ISA[]; - static const uint32_t COPY_DWORD_ISA[]; - static const uint32_t INFINITE_LOOP_ISA[]; - static const uint32_t ATOMIC_ADD_ISA[]; -}; - -#endif // _ISAGENERATOR_GFX9_H_ diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.cpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.cpp deleted file mode 100644 index e0d98fb5c8..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "IsaGenerator_Gfx72.hpp" - -#include -#include - -const std::string IsaGenerator_Gfx72::ASIC_NAME = "CI"; - -const uint32_t IsaGenerator_Gfx72::NOOP_ISA[] = { - 0xbf810000 // S_ENDPGM -}; - -/* The below arrays are filled with hex values in order not to reference - * proprietary header files, but we still leave the code here for future - * reference. - */ -#if 0 -const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = { - (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1) - - (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) - (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1) - - (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP) - - (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, 
flat_store_dword, slc = 1, glc = 1 (FLAT_0) - (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1) - - 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 -}; - -const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = { - (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4 - 0xBF810000u // S_ENDPGM -}; - -const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = { - (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0xC1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 0xFFFFFFFF, s2 (VOP1) - - (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_ATOMIC_INC << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (0 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0) - (3u << SQ_FLAT_1__VDST__SHIFT) | (2u << SQ_FLAT_1__DATA__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1) - 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 -}; -#endif - -const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = { - 0x7e000200, // v_mov_b32 v0, s0 (VOP1) - 0x7e020201, // v_mov_b32 v1, s1 (VOP1) - 0x7e040202, // v_mov_b32 v2, s2 (VOP1) - 0x7e060203, // v_mov_b32 v3, s3 (VOP1) - - 0xdc330000, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) - 0x04000000, // ADDR = V0:V1, VDST = V4 (FLAT_1) - - 0xbf8c0000, // s_waitcnt 0 (SOPP) - - 0xdc730000, // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0) - 0x00000402, // ADDR = V2:V3, DATA = V4 
(FLAT_1) - - 0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 -}; - -const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = { - 0xbf82ffff, // s_branch -1 (PC <- PC + SIMM*4)+4 - 0xbf810000 // S_ENDPGM -}; - -const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = { - 0x7e000200, // v_mov_b32 v0, s0 (VOP1) - 0x7e020201, // v_mov_b32 v1, s1 (VOP1) - 0x7e0402c1, // v_mov_b32 0xFFFFFFFF, s2 (VOP1) - - 0xdcf20000, // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0) - 0x03000200, // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1) - 0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 -}; - -void IsaGenerator_Gfx72::GetNoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx72::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { - std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx72::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx72::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { - std::copy(ATOMIC_INC_ISA, ATOMIC_INC_ISA+ARRAY_SIZE(ATOMIC_INC_ISA), rBuf.As()); -} - -const std::string& IsaGenerator_Gfx72::GetAsicName() { - return ASIC_NAME; -} diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.hpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.hpp deleted file mode 100644 index 5c39ffa216..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx72.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef _ISAGENERATOR_GFX72_H_ -#define _ISAGENERATOR_GFX72_H_ - -#include -#include "IsaGenerator.hpp" - -class IsaGenerator_Gfx72 : public IsaGenerator { - public: - virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); - virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); - - protected: - virtual const std::string& GetAsicName(); - - private: - static const std::string ASIC_NAME; - - static const uint32_t NOOP_ISA[]; - static const uint32_t COPY_DWORD_ISA[]; - static const uint32_t INFINITE_LOOP_ISA[]; - static const uint32_t ATOMIC_INC_ISA[]; -}; - -#endif // _ISAGENERATOR_GFX72_H_ diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.cpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.cpp deleted file mode 100644 index 65e0df6836..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "IsaGenerator_Gfx8.hpp" - -#include -#include - -const std::string IsaGenerator_Gfx8::ASIC_NAME = "VI"; - -const uint32_t IsaGenerator_Gfx8::NOOP_ISA[] = { - 0xbf810000 // S_ENDPGM -}; - -/** The below arrays are filled with hex values in order not to reference - * proprietary header files, but we still leave the code here for future - * reference. - */ -#if 0 -const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = { - (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1) - (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1) - - (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) - (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1) - - (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP) - - (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, 
flat_store_dword, slc = 1, glc = 1 (FLAT_0) - (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1) - - 0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 -}; - -const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = { - (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4 - 0xBF810000u // S_ENDPGM -}; -#endif - -const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = { - 0x7e000200, // v_mov_b32 v0, s0 (VOP1) - 0x7e020201, // v_mov_b32 v1, s1 (VOP1) - 0x7e040202, // v_mov_b32 v2, s2 (VOP1) - 0x7e060203, // v_mov_b32 v3, s3 (VOP1) - - 0xdc530000, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0) - 0x04000000, // ADDR = V0:V1, VDST = V4 (FLAT_1) - - 0xbf8c0000, // s_waitcnt 0 (SOPP) - - 0xdc730000, // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0) - 0x00000402, // ADDR = V2:V3, DATA = V4 (FLAT_1) - - 0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0 -}; - -const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = { - 0xbf82ffff, // s_branch -1 (PC <- PC + SIMM*4)+4 - 0xbf810000 // S_ENDPGM -}; - -/** - * The atomic_add_isa binary is generated from following ISA - * The original atomic_inc is not support by some PCIE, so use atomic_add instead - * - */ -/* -shader atomic_add -asic(VI) -type(CS) - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, 1 - flat_atomic_add v3, v[0:1], v2 slc glc - s_waitcnt 0 - s_endpgm -end -*/ - -const uint32_t IsaGenerator_Gfx8::ATOMIC_ADD_ISA[] = { - 0x7e000200, 0x7e020201, - 0x7e040281, 0xdd0b0000, - 0x03000200, 0xbf8c0000, - 0xbf810000, 0x00000000 -}; - -void IsaGenerator_Gfx8::GetNoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx8::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { - std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), 
rBuf.As()); -} - -void IsaGenerator_Gfx8::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx8::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { - std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As()); -} - -const std::string& IsaGenerator_Gfx8::GetAsicName() { - return ASIC_NAME; -} diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.hpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.hpp deleted file mode 100644 index 7e5b9e3c89..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx8.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef _ISAGENERATOR_GFX8_H_ -#define _ISAGENERATOR_GFX8_H_ - -#include -#include "IsaGenerator.hpp" - -class IsaGenerator_Gfx8 : public IsaGenerator { - public: - virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); - virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); - - protected: - virtual const std::string& GetAsicName(); - - private: - static const std::string ASIC_NAME; - - static const uint32_t NOOP_ISA[]; - static const uint32_t COPY_DWORD_ISA[]; - static const uint32_t INFINITE_LOOP_ISA[]; - static const uint32_t ATOMIC_ADD_ISA[]; -}; - -#endif // _ISAGENERATOR_GFX72_H_ diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.cpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.cpp deleted file mode 100644 index 8eaab32a5e..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "IsaGenerator_Gfx9.hpp" - -#include -#include - -const std::string IsaGenerator_Gfx9::ASIC_NAME = "GFX9"; - -/* The binaries are generated from following ISA */ -#if 0 -/* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */ -shader atomic_add -asic(GFX9) -type(CS) - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, 1 - flat_atomic_add v3, v[0:1], v2 slc glc - s_waitcnt 0 - s_endpgm -end - -shader copy_dword -asic(GFX9) -type(CS) -/* copy the parameters from scalar registers to vector registers */ - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - v_mov_b32 v2, s2 - v_mov_b32 v3, s3 -/* copy a dword between the passed addresses */ - flat_load_dword v4, v[0:1] slc glc - s_waitcnt 0 - flat_store_dword v[2:3], v4 slc glc - s_endpgm -end - -shader main -asic(GFX9) -type(CS) -loop: - s_branch loop - s_endpgm -end - - -#endif - -const uint32_t IsaGenerator_Gfx9::NOOP_ISA[] = { - 0xbf810000 -}; - -const uint32_t IsaGenerator_Gfx9::COPY_DWORD_ISA[] = { - 0x7e000200, 0x7e020201, - 0x7e040202, 0x7e060203, - 0xdc530000, 0x047f0000, - 0xbf8c0000, 0xdc730000, - 0x007f0402, 0xbf810000 -}; - -const uint32_t IsaGenerator_Gfx9::INFINITE_LOOP_ISA[] = { - 0xbf82ffff, 0xbf810000 -}; - -const uint32_t IsaGenerator_Gfx9::ATOMIC_ADD_ISA[] = { - 0x7e000200, 0x7e020201, - 0x7e040281, 0xdd0b0000, - 0x037f0200, 0xbf8c0000, - 0xbf810000, 0x00000000 -}; - -void IsaGenerator_Gfx9::GetNoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx9::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) { - std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As()); -} - -void 
IsaGenerator_Gfx9::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) { - std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As()); -} - -void IsaGenerator_Gfx9::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) { - std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As()); -} - -const std::string& IsaGenerator_Gfx9::GetAsicName() { - return ASIC_NAME; -} - diff --git a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.hpp b/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.hpp deleted file mode 100644 index 32103c0a15..0000000000 --- a/projects/rocr-runtime/tests/kfdtest/src/IsaGenerator_Gfx9.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- * - */ - -#ifndef _ISAGENERATOR_GFX9_H_ -#define _ISAGENERATOR_GFX9_H_ - -#include -#include "IsaGenerator.hpp" - -class IsaGenerator_Gfx9 : public IsaGenerator { - public: - virtual void GetNoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf); - virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf); - virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf); - - protected: - virtual const std::string& GetAsicName(); - - private: - static const std::string ASIC_NAME; - - static const uint32_t NOOP_ISA[]; - static const uint32_t COPY_DWORD_ISA[]; - static const uint32_t INFINITE_LOOP_ISA[]; - static const uint32_t ATOMIC_ADD_ISA[]; -}; - -#endif // _ISAGENERATOR_GFX9_H_ From bbdd8cdddc3da004dd233d64e96a3e81655378ad Mon Sep 17 00:00:00 2001 From: Graham Sider Date: Tue, 9 Nov 2021 10:59:28 -0500 Subject: [PATCH 26/27] kfdtest: Remove TEMP_GFX10_BLACKLIST With LLVM-based assembly these shaders are now valid for GFX10, with the exception of KFDSVMEvictTest. Signed-off-by: Graham Sider Change-Id: Idc872139176bbc1cc8d7ae61a8e4572360ecb5d5 [ROCm/ROCR-Runtime commit: 025c6146d98c0c1bd184440f778d105dd45e67f5] --- .../tests/kfdtest/scripts/kfdtest.exclude | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/projects/rocr-runtime/tests/kfdtest/scripts/kfdtest.exclude b/projects/rocr-runtime/tests/kfdtest/scripts/kfdtest.exclude index 392dba3cde..885054ba50 100644 --- a/projects/rocr-runtime/tests/kfdtest/scripts/kfdtest.exclude +++ b/projects/rocr-runtime/tests/kfdtest/scripts/kfdtest.exclude @@ -224,26 +224,10 @@ FILTER[aldebaran]=\ "KFDMemoryTest.PtraceAccess:"\ "KFDMemoryTest.DeviceHdpFlush" -# SP3 Compiler needs to be updated for GFX10. 
Temporarily disable all tests -# that require shader compiler -# Adding KFDSVMEvictTest as SVM/HMM was never validated on GFX10 -TEMP_GFX10_BLACKLIST=\ -"KFDMemoryTest.FlatScratchAccess:"\ -"KFDMemoryTest.PtraceAccessInvisibleVram:"\ -"KFDQMTest.QueuePriorityOnDifferentPipe:"\ -"KFDQMTest.QueuePriorityOnSamePipe:"\ -"KFDCWSRTest.BasicTest:"\ -"KFDQMTest.BasicCuMaskingEven:"\ -"KFDEvictTest.QueueTest:"\ -"KFDMemoryTest.MapUnmapToNodes:"\ -"KFDMemoryTest.HostHdpFlush:"\ -"KFDMemoryTest.DeviceHdpFlush:"\ -"KFDSVMEvictTest.*" - FILTER[navi10]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST:"\ -"KFDMemoryTest.MMBench" +"KFDMemoryTest.MMBench:"\ +"KFDSVMEvictTest.*" # Need to verify the following failed tests on another machine: # Exceptions not being received during exception tests @@ -254,42 +238,42 @@ FILTER[navi12]=\ "KFDExceptionTest.*:"\ "KFDPerfCountersTest.*:"\ "KFDPerformanceTest.P2PBandWidthTest:"\ -"$TEMP_GFX10_BLACKLIST" +"KFDSVMEvictTest.*" FILTER[navi14]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST" +"KFDSVMEvictTest.*" FILTER[sienna_cichlid]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST:"\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDDBGTest.*:"\ "KFDPerfCountersTest.*:"\ +"KFDSVMEvictTest.*" FILTER[navy_flounder]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST:"\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDDBGTest.*:"\ "KFDPerfCountersTest.*:"\ +"KFDSVMEvictTest.*" FILTER[dimgrey_cavefish]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST:"\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDDBGTest.*:"\ "KFDPerfCountersTest.*:"\ +"KFDSVMEvictTest.*" FILTER[beige_goby]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST:"\ "KFDQMTest.BasicCuMaskingEven:"\ "KFDDBGTest.*:"\ "KFDPerfCountersTest.*:"\ +"KFDSVMEvictTest.*" FILTER[yellow_carp]=\ "$BLACKLIST_ALL_ASICS:"\ -"$TEMP_GFX10_BLACKLIST:"\ "KFDQMTest.BasicCuMaskingEven:"\ -"KFDIPCTest.CMABasicTest" +"KFDIPCTest.CMABasicTest:"\ +"KFDSVMEvictTest.*" From 0ac0c9527dfb09ef161f39a70d91fa15f311c770 Mon Sep 17 00:00:00 
2001 From: Graham Sider Date: Wed, 9 Mar 2022 10:47:27 -0500 Subject: [PATCH 27/27] kfdtest: Add KFDASMTest Includes a simple AssembleShader test which loops through all shaders for all supported targets, dispatching a RunAssemble call for each shader. Also adds extra safety on a couple shaders that only work on gfx9/gfx90a. Signed-off-by: Graham Sider Change-Id: I3ca1c92136f3871eb62fcb9645694f22287aaeec [ROCm/ROCR-Runtime commit: 7eeba830f88ad9f60fce7a1fbf940764fe54c057] --- .../rocr-runtime/tests/kfdtest/CMakeLists.txt | 1 + .../tests/kfdtest/src/Assemble.hpp | 2 + .../tests/kfdtest/src/KFDASMTest.cpp | 73 +++++++++++++++++++ .../tests/kfdtest/src/KFDASMTest.hpp | 39 ++++++++++ .../tests/kfdtest/src/ShaderStore.cpp | 48 +++++++++--- .../tests/kfdtest/src/ShaderStore.hpp | 5 ++ 6 files changed, 157 insertions(+), 11 deletions(-) create mode 100644 projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.cpp create mode 100644 projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.hpp diff --git a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt index b1208f54c2..a2b122d42b 100644 --- a/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt +++ b/projects/rocr-runtime/tests/kfdtest/CMakeLists.txt @@ -169,6 +169,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp src/KFDDBGTest.cpp src/KFDGWSTest.cpp src/KFDIPCTest.cpp + src/KFDASMTest.cpp src/KFDEvictTest.cpp src/KFDHWSTest.cpp diff --git a/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp b/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp index d61229a5a5..46fb946a84 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/Assemble.hpp @@ -43,6 +43,8 @@ #ifndef _ASSEMBLE_H_ #define _ASSEMBLE_H_ +#include "OSWrapper.hpp" + #define ASM_MCPU_LEN 16 class Assembler { diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.cpp b/projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.cpp new file mode 100644 index 
0000000000..4b9f5d69c8 --- /dev/null +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include "GoogleTestExtension.hpp" +#include "KFDASMTest.hpp" +#include "ShaderStore.hpp" +#include "Assemble.hpp" + +void KFDASMTest::SetUp() {} +void KFDASMTest::TearDown() {} + +static const std::vector TargetList = { + 0x080001, + 0x080002, + 0x080003, + 0x080005, + 0x080100, + 0x090000, + 0x090002, + 0x090004, + 0x090006, + 0x090008, + 0x090009, + 0x09000a, + 0x09000c, + 0x0a0100, + 0x0a0101, + 0x0a0102, + 0x0a0103, + 0x0a0300, + 0x0a0301, + 0x0a0302, + 0x0a0303, + 0x0a0304, + 0x0a0305, + 0x0a0306, +}; + +TEST_F(KFDASMTest, AssembleShaders) { + TEST_START(TESTPROFILE_RUNALL) + + for (auto &t : TargetList) { + Assembler asmblr(t); + + LOG() << "Running ASM test for target " << asmblr.GetTargetAsic() << std::endl; + + for (auto &s : ShaderList) { + EXPECT_SUCCESS(asmblr.RunAssemble(s)); + } + } + + TEST_END +} diff --git a/projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.hpp b/projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.hpp new file mode 100644 index 0000000000..5f601e165a --- /dev/null +++ b/projects/rocr-runtime/tests/kfdtest/src/KFDASMTest.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __KFD_ASM_TEST__H__ +#define __KFD_ASM_TEST__H__ + +#include + +class KFDASMTest : public testing::Test { + public: + KFDASMTest() {} + ~KFDASMTest() {} + + protected: + virtual void SetUp(); + virtual void TearDown(); +}; + +#endif // __KFD_ASM_TEST__H__ diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp index 63cd68a063..8b40351f04 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.cpp @@ -21,6 +21,30 @@ * */ +#include "ShaderStore.hpp" + +/** + * KFDASMTest List + */ + +const std::vector ShaderList = { + NoopIsa, + CopyDwordIsa, + InfiniteLoopIsa, + AtomicIncIsa, + ScratchCopyDwordIsa, + PollMemoryIsa, + CopyOnSignalIsa, + PollAndCopyIsa, + WriteFlagAndValueIsa, + WriteAndSignalIsa, + LoopIsa, + IterateIsa, + ReadMemoryIsa, + GwsInitIsa, + GwsAtomicIncreaseIsa, +}; + /** * Macros */ @@ -251,7 +275,7 @@ const char *PollAndCopyIsa = R"( s_store_dword s17, s[2:3], 0x0 glc s_waitcnt vmcnt(0) & lgkmcnt(0) buffer_wbl2 - .else + .elseif (.amdgcn.gfx_generation_number == 9) s_movk_i32 s18, 0x1 LOOP: s_load_dword s16, s[0:1], 0x0 glc @@ -277,16 +301,18 @@ const char *PollAndCopyIsa = R"( const char *WriteFlagAndValueIsa = R"( .text // Assume two inputs buffer in s[0:1] and s[2:3] - v_mov_b32 v0, s0 - v_mov_b32 v1, s1 - s_load_dword s18, s[2:3], 0x0 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - s_store_dword s18, s[0:1], 0x4 glc - s_waitcnt vmcnt(0) & lgkmcnt(0) - buffer_wbl2 - s_waitcnt vmcnt(0) & lgkmcnt(0) - v_mov_b32 v16, 0x1 - flat_store_dword v[0:1], v16 glc + .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping 
== 10) + v_mov_b32 v0, s0 + v_mov_b32 v1, s1 + s_load_dword s18, s[2:3], 0x0 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + s_store_dword s18, s[0:1], 0x4 glc + s_waitcnt vmcnt(0) & lgkmcnt(0) + buffer_wbl2 + s_waitcnt vmcnt(0) & lgkmcnt(0) + v_mov_b32 v16, 0x1 + flat_store_dword v[0:1], v16 glc + .endif s_endpgm )"; diff --git a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp index 231e7f73d6..e0151a6537 100644 --- a/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp +++ b/projects/rocr-runtime/tests/kfdtest/src/ShaderStore.hpp @@ -24,6 +24,11 @@ #ifndef _SHADERSTORE_H_ #define _SHADERSTORE_H_ +#include + +/* KFDASMTest List */ +extern const std::vector ShaderList; + /* Common */ extern const char *NoopIsa; extern const char *CopyDwordIsa;