Merge branch 'sp3-llvm-transistion' into amd-staging

Transition KFDTest to use the open source LLVM compiler instead of the SP3
compiler

Change-Id: I26fff6a958bc48cb1f5509a11ec194d2ececf0ce


[ROCm/ROCR-Runtime commit: b9651d3118]
Этот коммит содержится в:
Harish Kasiviswanathan
2022-04-26 13:15:59 -04:00
родитель 949f8fc7aa 0ac0c9527d
Коммит cc64271f9f
52 изменённых файлов: 1331 добавлений и 2863 удалений
+37 -11
Просмотреть файл
@@ -95,12 +95,42 @@ endif()
# Report where libhsakmt was located.
message ( "Find libhsakmt at ${HSAKMT_LIBRARY_DIRS}" )
# Directory of the SP3 files; it is added to the include path further down.
set ( SP3_DIR ${PROJECT_SOURCE_DIR}/sp3 )
# Opt in to the NEW behaviour of policy CMP0074 when this CMake version knows it.
if ( POLICY CMP0074 )
cmake_policy( SET CMP0074 NEW )
endif()
# Look for LLVMConfig.cmake only below $OUT_DIR/llvm/lib/cmake/llvm
# (NO_DEFAULT_PATH) and do not store the result in the cache (NO_CACHE).
find_path( LIGHTNING_CMAKE_DIR NAMES LLVMConfig.cmake
PATHS $ENV{OUT_DIR}/llvm/lib/cmake/llvm NO_CACHE NO_DEFAULT_PATH)
# If that directory exists, point find_package() at it through LLVM_DIR;
# otherwise warn and let find_package() do its own search.
if ( DEFINED LIGHTNING_CMAKE_DIR AND EXISTS ${LIGHTNING_CMAKE_DIR} )
set ( LLVM_DIR ${LIGHTNING_CMAKE_DIR} )
else()
message( WARNING "Couldn't find Lightning build. "
"Attempting to use system LLVM install..." )
endif()
# LLVM is mandatory and must provide a package configuration file.
find_package( LLVM REQUIRED CONFIG )
# Versions below 7.0 abort the configuration; versions below 14.0 only warn.
if( ${LLVM_PACKAGE_VERSION} VERSION_LESS "7.0" )
message( FATAL_ERROR "Requires LLVM 7.0 or greater "
"(found ${LLVM_PACKAGE_VERSION})" )
elseif( ${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0" )
message( WARNING "Not using latest LLVM version. "
"Some ASIC targets may not work!" )
endif()
message( STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}" )
message( STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}" )
# Apply LLVM's include directories and compile definitions directory-wide.
include_directories(${LLVM_INCLUDE_DIRS})
# LLVM_DEFINITIONS is split into a list before it is handed to add_definitions().
separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS})
add_definitions(${LLVM_DEFINITIONS_LIST})
# Store the library names for the listed LLVM components in llvm_libs,
# which is passed to target_link_libraries() for kfdtest later on.
llvm_map_components_to_libnames(llvm_libs AMDGPUAsmParser Core Support)
# Include paths for the bundled gtest, the project headers, the headers two
# directories above the project, the SP3 directory and libdrm.
include_directories(${PROJECT_SOURCE_DIR}/gtest-1.6.0)
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/../../include)
include_directories(${SP3_DIR})
include_directories(${DRM_INCLUDE_DIRS})
@@ -112,12 +142,8 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp
src/Dispatch.cpp
src/GoogleTestExtension.cpp
src/IndirectBuffer.cpp
src/IsaGenerator.cpp
src/IsaGenerator_Aldebaran.cpp
src/IsaGenerator_Gfx10.cpp
src/IsaGenerator_Gfx72.cpp
src/IsaGenerator_Gfx8.cpp
src/IsaGenerator_Gfx9.cpp
src/Assemble.cpp
src/ShaderStore.cpp
src/LinuxOSWrapper.cpp
src/PM4Packet.cpp
src/PM4Queue.cpp
@@ -143,6 +169,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp
src/KFDDBGTest.cpp
src/KFDGWSTest.cpp
src/KFDIPCTest.cpp
src/KFDASMTest.cpp
src/KFDEvictTest.cpp
src/KFDHWSTest.cpp
@@ -163,7 +190,7 @@ message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} )
# Set the C++ dialect and linker flags when the C compiler is newer than 4.8.0.
# VERSION_GREATER compares the version components numerically. The previous
# STRGREATER compared the strings character by character, so a version such as
# "10.2.0" was considered smaller than "4.8.0" and the flags were never applied.
# Note that CMAKE_CXX_FLAGS is overwritten, not extended; of the two
# assignments below the second one is the value that remains in effect.
if ( "${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "4.8.0")
## Add --enable-new-dtags to generate DT_RUNPATH
set ( CMAKE_CXX_FLAGS "-std=gnu++11 -Wl,--enable-new-dtags" )
set ( CMAKE_CXX_FLAGS "-std=gnu++14 -Wl,--enable-new-dtags" )
endif()
if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2" )
@@ -181,11 +208,10 @@ endif ()
# The modules found by pkg_check_modules() in the default pkg config
# path do not need to use link_directories() here.
# Library search directories are registered before the executable is created.
link_directories(${HSAKMT_LIBRARY_DIRS})
link_directories(${SP3_DIR})
# The kfdtest executable is built from every file listed in SRC_FILES.
add_executable(kfdtest ${SRC_FILES})
# NOTE(review): two target_link_libraries() calls follow; they differ only in
# amdsp3 versus ${llvm_libs} and look like the before/after lines of a diff -
# confirm which one is meant to stay.
target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread m stdc++ rt amdsp3 numa)
target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt numa)
# Copy the exclusion list and the runner script next to the build output
# unchanged (COPYONLY disables variable substitution).
configure_file ( scripts/kfdtest.exclude kfdtest.exclude COPYONLY )
configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY )
+10 -26
Просмотреть файл
@@ -224,26 +224,10 @@ FILTER[aldebaran]=\
"KFDMemoryTest.PtraceAccess:"\
"KFDMemoryTest.DeviceHdpFlush"
# The SP3 compiler needs to be updated for GFX10. Temporarily disable all tests
# that require the shader compiler.
# KFDSVMEvictTest is added as well because SVM/HMM was never validated on GFX10.
# The value is one string made of adjacent quoted pieces; ':' separates the
# individual test patterns.
TEMP_GFX10_BLACKLIST=\
"KFDMemoryTest.FlatScratchAccess:"\
"KFDMemoryTest.PtraceAccessInvisibleVram:"\
"KFDQMTest.QueuePriorityOnDifferentPipe:"\
"KFDQMTest.QueuePriorityOnSamePipe:"\
"KFDCWSRTest.BasicTest:"\
"KFDQMTest.BasicCuMaskingEven:"\
"KFDEvictTest.QueueTest:"\
"KFDMemoryTest.MapUnmapToNodes:"\
"KFDMemoryTest.HostHdpFlush:"\
"KFDMemoryTest.DeviceHdpFlush:"\
"KFDSVMEvictTest.*"
# Exclusions for navi10: the list shared by all ASICs followed by the
# navi10-specific patterns.
FILTER[navi10]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST:"\
"KFDMemoryTest.MMBench"
"KFDMemoryTest.MMBench:"\
"KFDSVMEvictTest.*"
# Need to verify the following failed tests on another machine:
# Exceptions not being received during exception tests
@@ -254,42 +238,42 @@ FILTER[navi12]=\
"KFDExceptionTest.*:"\
"KFDPerfCountersTest.*:"\
"KFDPerformanceTest.P2PBandWidthTest:"\
"$TEMP_GFX10_BLACKLIST"
"KFDSVMEvictTest.*"
# Per-ASIC exclusion lists. Each FILTER[<asic>] value is assembled from
# adjacent quoted strings, with ':' separating the individual test patterns.
# Every entry below starts from the list shared by all ASICs
# ($BLACKLIST_ALL_ASICS) and then appends the patterns specific to that ASIC.
FILTER[navi14]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST"
"KFDSVMEvictTest.*"
FILTER[sienna_cichlid]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST:"\
"KFDQMTest.BasicCuMaskingEven:"\
"KFDDBGTest.*:"\
"KFDPerfCountersTest.*:"\
"KFDSVMEvictTest.*"
FILTER[navy_flounder]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST:"\
"KFDQMTest.BasicCuMaskingEven:"\
"KFDDBGTest.*:"\
"KFDPerfCountersTest.*:"\
"KFDSVMEvictTest.*"
FILTER[dimgrey_cavefish]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST:"\
"KFDQMTest.BasicCuMaskingEven:"\
"KFDDBGTest.*:"\
"KFDPerfCountersTest.*:"\
"KFDSVMEvictTest.*"
FILTER[beige_goby]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST:"\
"KFDQMTest.BasicCuMaskingEven:"\
"KFDDBGTest.*:"\
"KFDPerfCountersTest.*:"\
"KFDSVMEvictTest.*"
FILTER[yellow_carp]=\
"$BLACKLIST_ALL_ASICS:"\
"$TEMP_GFX10_BLACKLIST:"\
"KFDQMTest.BasicCuMaskingEven:"\
"KFDIPCTest.CMABasicTest"
"KFDIPCTest.CMABasicTest:"\
"KFDSVMEvictTest.*"
-6
Просмотреть файл
@@ -1,6 +0,0 @@
Note: This folder is primarily intended for AMD internal developers.
The folder lib_helper contains the script that generates the SP3 library libamdsp3.a
and the associated header files in the current folder for kfdtest to use.
CMake is required for the script to run. Just run ./build_sp3.sh after setting
up the environment variables (source build/envsetup.sh).
-79
Просмотреть файл
@@ -1,79 +0,0 @@
#
# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
#
# Build description for the SP3 library target "amdsp3".
cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
project(amdsp3)

# Enable for full compiler command lines while debugging the build:
#set ( CMAKE_VERBOSE_MAKEFILE on )

find_package(PkgConfig)

# P4_PATH is derived from the WORK_ROOT environment variable. It is only
# referenced by the disabled SCLIB_SRC fallback described below.
set(P4_PATH $ENV{WORK_ROOT}/p4/driver/drivers)

# The SP3 sources are taken from this project's own source directory.
# A disabled alternative used ENV{SCLIB_SRC} when defined and fell back to
# ${P4_PATH}/sc/Chip otherwise.
set(SCLIB_SRC ${PROJECT_SOURCE_DIR})

# Header search paths (the sp3/release_headers directory is intentionally
# not part of the include path).
include_directories(
  ${SCLIB_SRC}/sp3
  ${SCLIB_SRC}/sp3/gen
)

# Core SP3 sources, listed explicitly.
list(APPEND SRC_FILES
  ${SCLIB_SRC}/sp3/sp3-asic.c
  ${SCLIB_SRC}/sp3/sp3-dispatch.c
  ${SCLIB_SRC}/sp3/sp3-eval.c
  ${SCLIB_SRC}/sp3/sp3-gc.c
  ${SCLIB_SRC}/sp3/sp3-int.c
  ${SCLIB_SRC}/sp3/sp3-lib.c
  ${SCLIB_SRC}/sp3/sp3-native.c
  ${SCLIB_SRC}/sp3/sp3-cipher.c
  ${SCLIB_SRC}/sp3/sp3-vm.c
)

# Generated sources, then every source file of each backend's lib directory,
# then the arch directories of the backends that have one.
aux_source_directory(${SCLIB_SRC}/sp3/gen SRC_FILES)
foreach(sp3_backend si ci gfx8 gfx81 gfx9 gfx10 aldbrn)
  aux_source_directory(${SCLIB_SRC}/sp3/backend/${sp3_backend}/lib SRC_FILES)
endforeach()
foreach(sp3_backend gfx81 gfx9 gfx10 aldbrn)
  aux_source_directory(${SCLIB_SRC}/sp3/backend/${sp3_backend}/arch SRC_FILES)
endforeach()

message(STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR})
# To inspect the collected sources, print each entry of SRC_FILES with
# message(STATUS ...) inside a foreach() loop.

# C flags for the whole project; this replaces any previous CMAKE_C_FLAGS value.
set(CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX101_BUILD -DALDBRN_BUILD")

add_library(amdsp3 ${SRC_FILES})
-57
Просмотреть файл
@@ -1,57 +0,0 @@
#!/bin/bash
#
# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
#
# Builds libamdsp3.a from the SP3 sources below $P4_ROOT and places the
# library plus a relicensed copy of sp3.h in $KFDTEST_ROOT/sp3/.
#
# Required environment: KFDTEST_ROOT and P4_ROOT.
#
# The interpreter line is the first line of the file so that it is honoured
# when the script is executed directly.

if [ -z "$KFDTEST_ROOT" ] || [ -z "$P4_ROOT" ]; then
    echo "Environment variables should be set before running this script"
    exit 1
fi

# All relative paths below assume lib_helper is the working directory. Stop
# if it cannot be entered; otherwise the cleanup steps would remove files
# from whatever directory the script happened to be started in.
cd "$KFDTEST_ROOT/sp3/lib_helper" || exit 1

SP3_PROJECT=$P4_ROOT/driver/drivers/sc/Chip/
LIB_OUTPUT=$KFDTEST_ROOT/sp3/

# Temporarily install the build description into the SP3 source tree.
cp CMakeLists_sp3.txt "$SP3_PROJECT/CMakeLists.txt"

mkdir -p build
echo "Building SP3 lib"
# Without this check a failed pushd would run cmake and make in lib_helper itself.
pushd build || exit 1
cmake "$SP3_PROJECT/"
make
popd

rsync --progress -a build/libamdsp3.a "$LIB_OUTPUT"

# Put the intermediate header files in the current folder for further processing
rsync --progress -a "$SP3_PROJECT/sp3/public/lib/sp3.h" .

# Remove the build folder and CMakeLists.txt put into SP source folder
rm -r build
rm "$SP3_PROJECT/CMakeLists.txt"

# Replace the license statement in the header files: everything before the
# first "#ifndef" line is dropped and the open source license is prepended.
{ cat AMD_opensource_license.txt; sed -e '1,/#ifndef/ { /#ifndef/b; d }' sp3.h; } > "$LIB_OUTPUT/sp3.h"

# Delete the intermediate header files
rm sp3.h
-643
Просмотреть файл
@@ -1,643 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __SP3_H__
#define __SP3_H__
#ifdef __cplusplus
extern "C" {
#endif
/// @file sp3.h
/// @brief sp3 API
#include <stdint.h>
// Export tags
#define SP3_EXPORT
/// @defgroup sp3main SP3 Main API
///
/// Main API to assemble and disassemble SP3 shaders.
///
/// @{
/// Valid shader stages.
enum sp3_shtype {
SP3_SHTYPE_NONE = -1,
SP3_SHTYPE_PS = 0,
SP3_SHTYPE_VS = 1,
SP3_SHTYPE_GS = 2,
SP3_SHTYPE_ES = 3,
SP3_SHTYPE_HS = 4,
SP3_SHTYPE_LS = 5,
SP3_SHTYPE_CS = 6,
#ifdef NAVI10LITE_BUILD
SP3_SHTYPE_ACV = 7,
#endif
};
/// Assorted constants used by sp3 API.
enum sp3_count {
SP3_NUM_MRT = 8, ///< Maximum number of render targets supported.
SP3_NUM_STRM = 4, ///< Maximum number of streams supported.
};
/// Disassembly flags. Bitwise-OR flags to set options.
enum sp3_flag {
SP3DIS_NO_STATE = 0x01, ///< Do not include state header at top of shader.
SP3DIS_NO_BINARY = 0x02, ///< Do not include comments with raw binary microcode.
SP3DIS_COMMENTS = 0x04, ///< Do not include comments. NOTE(review): unlike its siblings this flag has no NO_ prefix; confirm whether it enables or suppresses comments.
SP3DIS_NO_GPR_COUNT = 0x08, ///< Do not include GPR allocation counts.
SP3DIS_FORCEVALID = 0x10, ///< Force all bytes of microcode to be disassembled.
SP3DIS_NO_ASIC = 0x20, ///< Do not emit the asic header at top of shader.
};
/// Shader context. Contains no user-visible fields.
struct sp3_context;
/// Memory object. Contains no user-visible fields.
struct sp3_vma;
/// VM addresses are 64-bit and the address unit is 32 bits
typedef uint64_t sp3_vmaddr;
/// Storage entry for register streams.
struct sp3_reg {
uint32_t index; ///< One of the MM aperture register addresses.
uint32_t value; ///< 32-bit register data.
};
/// Bits for a single instruction.
struct sp3_inst_bits {
uint32_t val[5]; ///< Largest single instruction in any backend is 5 dwords.
};
/// Wrapped shader metadata.
///
/// After generation, shaders are encapsulated in sp3_shader structures.
///
/// Those structures contain the shader binary, its register stream, constants and constant
/// buffers and metadata needed for SC compatibility.
///
struct sp3_shader {
enum sp3_shtype type; ///< One of the SHTYPE_* constants.
uint32_t asic_int; ///< Internal ASIC index. Do not use.
char asic[0x100]; ///< ASIC name as a string ("RV870" etc).
uint32_t size; ///< Size of the compiled shader, in 32-bit words.
uint32_t nsgprs; ///< Number of scalar GPRs used.
uint32_t nvgprs; ///< Number of vector GPRs used.
uint32_t nsvgprs; ///< Number of shared vector GPRs used (only available in certain projects).
uint32_t naccvgprs; ///< Number of accumulator vector GPRs used (only available in certain projects).
uint32_t nsgprs_manual_alloc;
uint32_t nvgprs_manual_alloc;
uint32_t nsvgprs_manual_alloc;
uint32_t naccvgprs_manual_alloc;
uint32_t trap_present;
uint32_t user_sgpr_count;
uint32_t scratch_en;
uint32_t dispatch_draw_en;
uint32_t so_en;
uint32_t so_base0_en;
uint32_t so_base1_en;
uint32_t so_base2_en;
uint32_t so_base3_en;
uint32_t oc_lds_en;
uint32_t tg_size_en;
uint32_t tidig_comp_cnt; ///< Number of components(-1) enabled for thread id in group
uint32_t tgid_x_en;
uint32_t tgid_y_en;
uint32_t tgid_z_en;
uint32_t wave_cnt_en;
uint32_t primgen_en;
uint32_t pc_base_en;
uint32_t sgpr_scratch;
uint32_t sgpr_psvs_state;
uint32_t sgpr_gs2vs_offset;
uint32_t sgpr_so_write_index;
uint32_t sgpr_so_base_offset0;
uint32_t sgpr_so_base_offset1;
uint32_t sgpr_so_base_offset2;
uint32_t sgpr_so_base_offset3;
uint32_t sgpr_offchip_lds;
uint32_t sgpr_is_offchip;
uint32_t sgpr_ring_offset;
uint32_t sgpr_gs_wave_id;
uint32_t sgpr_global_wave_id;
uint32_t sgpr_tg_size;
uint32_t sgpr_tgid_x;
uint32_t sgpr_tgid_y;
uint32_t sgpr_tgid_z;
uint32_t sgpr_tf_base;
uint32_t sgpr_pc_base;
uint32_t sgpr_wave_cnt;
uint32_t wave_size; ///< Number of threads in a wavefront (only certain ASICs; 0 = don't care).
uint32_t pc_exports; ///< Range of parameters exported (if VS).
uint32_t pos_export; ///< Shader executes a position export (if VS).
uint32_t cb_exports; ///< Range of MRTs exported (if PS).
uint32_t mrtz_export_format;///< Export format of the mrtz export.
uint32_t z_export; ///< Shader executes a Z export (if PS).
uint32_t pops_en; ///< Shader is POPS (PS)
uint32_t pops_num_samples; ///< (PS)
uint32_t load_collision_waveid; ///< Shader sets load collision waveid (if PS).
uint32_t load_intrawave_collision; ///< Shader is in intrawave mode (if PS).
uint32_t stencil_test_export; ///< Shader exports stencil (if PS).
uint32_t stencil_op_export; ///< Shader exports stencil (if PS).
uint32_t kill_used; ///< Shader executes ALU KILL operations.
uint32_t cb_masks[SP3_NUM_MRT]; ///< Component masks for each MRT exported (if PS).
uint32_t emit_used; ///< EMIT opcodes used (if GS).
uint32_t covmask_export; ///< Shader exports coverage mask (if PS).
uint32_t mask_export; ///< Shader exports mask (if PS).
uint32_t strm_used[SP3_NUM_STRM]; ///< Streamout operations used (map).
uint32_t scratch_used; ///< Scratch SMX exports used.
uint32_t scratch_itemsize; ///< Scratch ring item size.
uint32_t reduction_used; ///< Reduction SMX exports used.
uint32_t ring_used; ///< ESGS/GSVS ring SMX exports used.
uint32_t ring_itemsize; ///< ESGS/GSVS ring item size (for ES/GS respectively).
uint32_t vertex_size[4]; ///< GSVS ring vertex size (for GS).
uint32_t mem_used; ///< Raw memory SMX exports used.
uint32_t rats_used; ///< Mask of RATs (UAVs) used
uint32_t group_size[3]; ///< Wavefront group size (for ELF files).
uint32_t alloc_lds; ///< Number of LDS bytes allocated for wave group. (translates to lds_size in CS and LS)
uint32_t *data; ///< Shader binary data.
uint32_t nregs; ///< Number of register writes in the stream.
uint64_t crc64; ///< CRC64 of compiled shader, may be used for identification/fingerprinting.
uint32_t crc32; ///< 32-bit CRC of compiled shader (based on crc64), may be used for identification/fingerprinting.
struct sp3_reg *regs; ///< Register writes (index-value pairs).
struct sp3_shader *merged_2nd_shader; ///< Merged es/gs, ls/hs shader, this points to start of the second shader (only certain ASICs).
};
/// Comment callback.
typedef const char *(*sp3_comment_cb)(void *, int);
/// Get version of the sp3 library.
///
/// @return String containing the version number.
///
SP3_EXPORT const char *sp3_version(void);
/// Create a new sp3 context.
///
/// @return A new context for use in assembling and disassembling shaders. Free with sp3_close().
///
SP3_EXPORT struct sp3_context *sp3_new(void);
/// Set option for sp3.
///
/// @param state sp3 context.
/// @param option Option name. Unknown options will raise an error.
/// @param value Option value. NULL is used to represent value-less options.
///
/// Currently supported options:
///
/// stdlib (string) -- absolute path to standard library files. May be a colon-separated list
/// of paths that will be used to search for stdlib files. Used by sp3_parse_library().
///
/// The following options are deprecated because they take integer arguments; you should use
/// sp3_set_option_int() for these settings going forward. They will continue to be accepted by
/// this API to support legacy users.
///
/// Werror (boolean) -- indicates whether warnings should be treated as errors.
///
/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using
/// this shader. Ignored in certain ASICs. You may set this to 32, 64 or the special value 0
/// to indicate no preference on wave size. The shader will be checked to ensure it is
/// compatible with the size specified here.
///
/// omit_version (boolean) -- omit generation of the S_VERSION opcode.
///
/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer.
///
/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a
/// dangerous option to allow in general so you must explicitly enable this option, otherwise
/// the raw_bits() function will always error out.
///
SP3_EXPORT void sp3_set_option(
struct sp3_context *state,
const char *option,
const char *value);
/// Set option for sp3.
///
/// @param state sp3 context.
/// @param option Option name. Unknown options will raise an error.
/// @param value Option value.
///
/// Currently supported options:
///
/// Werror (boolean) -- indicates whether warnings should be treated as errors.
///
/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using
/// this shader. Ignored in certain ASICs. You may set this to 32, 64 or the special value 0
/// to indicate no preference on wave size. The shader will be checked to ensure it is
/// compatible with the size specified here.
///
/// omit_version (boolean) -- omit generation of the S_VERSION opcode.
///
/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer.
///
/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders. This is a
/// dangerous option to allow in general so you must explicitly enable this option, otherwise
/// the raw_bits() function will always error out.
///
/// secure_mode (boolean) -- run in secure mode. Disables macro language features in assembly
/// path including calls to custom functions. Useful if sp3 is used as a backend to a web-based
/// assembly tool.
///
/// debug_encoding (boolean) -- if true, debug encoding selection logic for assembly. Only
/// supported in 10.4+ backends.
///
/// no_vs_export_check (boolean) -- if true, disable VS export sanity check. Only supported in
/// 10.4+ backends.
///
SP3_EXPORT void sp3_set_option_int(
struct sp3_context *state,
const char *option,
int32_t value);
/// Parse a file into a context.
///
/// Use sp3_compile to generate binary microcode after the shader is parsed.
///
/// @param state Context to use for parsing.
/// @param file File to read. If NULL, parse from stdin.
///
SP3_EXPORT void sp3_parse_file(struct sp3_context *state, const char *file);
/// Parse a string into a context.
///
/// Use sp3_compile to generate binary microcode after the shader is parsed.
///
/// @param state Context to use for parsing.
/// @param string String to parse.
///
SP3_EXPORT void sp3_parse_string(struct sp3_context *state, const char *string);
/// Parse a file from the standard library into a context.
///
/// Use sp3_compile to generate binary microcode after the shader is parsed.
///
/// @param state Context to use for parsing.
/// @param name Path to the standard library; files in this directory are parsed.
///
SP3_EXPORT void sp3_parse_library(struct sp3_context *state, const char *name);
/// Call a sp3 function.
///
SP3_EXPORT void sp3_call(struct sp3_context *state, const char *func);
/// Compile a shader program that has been parsed into the context.
///
/// @param state sp3 context.
/// @param cffunc Name of clause to call. By convention, this is "main".
/// @return A compiled and linked shader. Free memory with sp3_free_shader().
///
SP3_EXPORT struct sp3_shader *sp3_compile(
struct sp3_context *state,
const char *cffunc);
/// Free a sp3_shader.
///
/// @param sh Shader object to delete.
///
SP3_EXPORT void sp3_free_shader(struct sp3_shader *sh);
/// Get current ASIC name set for a context.
///
/// @param state Context to query.
/// @return Name of ASIC.
///
SP3_EXPORT const char *sp3_getasic(struct sp3_context *state);
/// Set current ASIC name for a context.
///
/// @param state Context to modify.
/// @param chip Case-insensitive string representing the ASIC to compile or disassemble for.
///
SP3_EXPORT void sp3_setasic(struct sp3_context *state, const char *chip);
/// Set global variable in context to an integer.
///
SP3_EXPORT void sp3_set_param_int(
struct sp3_context *state,
const char *name,
int32_t value);
/// Set global variable in context to an integer vector.
///
SP3_EXPORT void sp3_set_param_intvec(
struct sp3_context *state,
const char *name,
uint32_t size,
const int32_t *value);
/// Set global variable in context to a float.
///
SP3_EXPORT void sp3_set_param_float(
struct sp3_context *state,
const char *name,
float value);
/// Set global variable in context to a float vector.
///
SP3_EXPORT void sp3_set_param_floatvec(
struct sp3_context *state,
const char *name,
uint32_t size,
const float *value);
/// Set error message header.
///
/// @param state Context to modify.
/// @param str Text to include in error message header.
///
SP3_EXPORT void sp3_set_error_header(struct sp3_context *state, const char *str);
/// Get ASIC metrics for the ASIC in current state.
///
/// Used by ELF tools to fill in some CAL fields.
///
/// @param state Context to query.
/// @param name Name of ASIC metric.
/// @return Value of ASIC metric.
///
SP3_EXPORT int sp3_asicinfo(struct sp3_context *state, const char *name);
/// Free a context allocated by sp3_new/open/parse.
///
/// @param state Context to delete.
///
SP3_EXPORT void sp3_close(struct sp3_context *state);
/// Disassemble a shader.
///
/// This call is likely to change to something that will take a filled sp3_shader structure
/// later on.
///
/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
/// @param bin Memory map with the opcodes (see sp3-vm.h).
/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words).
/// @param name Name to give the disassembled shader.
/// @param shader_type One of the SHTYPE_* constants.
/// @param include Literal text to include in the CF clause (NULL includes nothing).
/// @param max_len Maximum length of CF clause. Matters if SP3DIS_FORCEVALID is set.
/// @param flags A bitmask of SP3DIS_* flags.
///
/// @return Shader disassembly as a string. Free memory with sp3_free().
///
SP3_EXPORT char *sp3_disasm(
struct sp3_context *state,
struct sp3_vma *bin,
sp3_vmaddr base,
const char *name,
enum sp3_shtype shader_type,
const char *include,
uint32_t max_len,
uint32_t flags);
/// Disassemble a single shader instruction.
///
/// This call is likely to change to something that will take a filled sp3_shader structure
/// later on.
///
/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
/// @param inst Pointer to dwords containing instruction (exact number of dwords required depends on instruction).
/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words).
/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words).
/// @param shader_type One of the SHTYPE_* constants.
/// @param flags A mask of SP3DIS_* flags.
///
/// @return Shader disassembly as a string. Free memory with sp3_free().
///
SP3_EXPORT char *sp3_disasm_inst(
struct sp3_context *state,
const struct sp3_inst_bits *inst,
sp3_vmaddr base,
sp3_vmaddr addr,
enum sp3_shtype shader_type,
uint32_t flags);
/// Parse a register stream.
///
/// Can be called before sp3_disasm to preset things like ALU, boolean and loop constants.
///
/// This call is likely to merge with sp3_disasm later on.
///
/// @param state sp3 context to fill with state.
/// @param nregs Number of register entries.
/// @param regs Register stream to parse.
/// @param shader_type One of the SHTYPE_* constants.
///
SP3_EXPORT void sp3_setregs(
struct sp3_context *state,
uint32_t nregs,
const struct sp3_reg *regs,
enum sp3_shtype shader_type);
/// Set shader comments
///
/// @param state sp3 context.
/// @param map Map of comments (0 for no comment, other values will be passed to the callback).
/// @param f_top Callback returning comment to place above the opcode.
/// @param f_right Callback returning comment to place to the right of the opcode.
/// @param ctx Void pointer to pass to comment callbacks.
///
SP3_EXPORT void sp3_setcomments(
struct sp3_context *state,
struct sp3_vma *map,
sp3_comment_cb f_top,
sp3_comment_cb f_right,
void *ctx);
/// Set alternate shader entry points
///
/// Used for disassembly; this marks an additional location in memory
/// (besides the start address) where shader code may be found. Generally
/// required for jump tables and any case where the shader may perform
/// indirect jumps to ensure that disassembly locates all shader
/// instructions.
///
/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words).
///
SP3_EXPORT void sp3_setentrypoint(
struct sp3_context *state,
sp3_vmaddr addr);
/// Clear alternate shader entry points.
///
/// Clear all entry points previously set with sp3_setentrypoint.
///
/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
///
SP3_EXPORT void sp3_clearentrypoints(struct sp3_context *state);
/// Free memory allocated by sp3.
///
/// Windows DLLs that allocate memory have to free it. This function
/// should be used to free the result of sp3_disasm, sp3_compile etc.
///
SP3_EXPORT void sp3_free(void *ptr);
/// SP3 API to merge two shaders given file names as input.
///
SP3_EXPORT struct sp3_shader* sp3_merge_shaders(
struct sp3_context *pointer,
const char *first_file,
const char *second_file);
/// SP3 API to merge two shaders given shader strings as input.
///
SP3_EXPORT struct sp3_shader* sp3_merge_shader_strings(
struct sp3_context *pointer,
const char *first_string,
const char *second_string);
/// @}
/// @defgroup sp3vm SP3 Memory Objects
///
/// The VM API is used to manage virtual memory maps. Those maps are used for binary storage
/// for disassembly, as they can naturally mirror the GPU's memory map (so no register
/// translation is needed).
///
/// @{
/// Callback function that will fill a VMA on demand
///
/// The VMA to be filled will be specified through the request address.
/// The callback should fill the VMA using sp3_vm_write calls.
///
typedef void (* sp3_vmfill)(struct sp3_vma *vm, sp3_vmaddr addr, void *ctx);
/// Create a new VM that is empty.
///
/// Free the object with sp3_vm_free().
///
/// @return New VM object.
///
SP3_EXPORT
struct sp3_vma *sp3_vm_new(void);
/// Create a new VM that has a sp3_vmfill callback.
///
/// Free the object with sp3_vm_free().
///
/// @param fill Function used to populate data in VM. The function will be passed the new VM object, the address and a context.
/// @param ctx User-specified context. Passed to the fill function and not used by sp3 itself.
/// @return New VM object.
///
SP3_EXPORT
struct sp3_vma *sp3_vm_new_fill(sp3_vmfill fill, void *ctx);
/// Create a new VM from an array of words.
///
/// Free the object with sp3_vm_free().
///
/// @param base VM address to load array at.
/// @param len Number of 32-bit words in the array.
/// @param data Pointer to the array.
/// @return New VM object.
///
SP3_EXPORT
struct sp3_vma *sp3_vm_new_ptr(sp3_vmaddr base, sp3_vmaddr len, const uint32_t *data);
/// Find a VMA, optionally adding it.
///
/// @param vm VM to search in.
/// @param addr Address to search for.
/// @param add Flag indicating whether a failure should result in adding a new VMA.
/// @return VM object matching the specified address.
///
SP3_EXPORT
struct sp3_vma *sp3_vm_find(struct sp3_vma *vm, sp3_vmaddr addr, uint32_t add);
/// Write a word to a VM.
///
/// @param vm VM to write.
/// @param addr Address to write.
/// @param val 32-bits of data to write.
///
SP3_EXPORT
void sp3_vm_write(struct sp3_vma *vm, sp3_vmaddr addr, uint32_t val);
/// Read a word from a VM.
///
/// @param vm VM to read.
/// @param addr Address to read.
/// @return 32-bits of data at specified address.
///
SP3_EXPORT
uint32_t sp3_vm_read(struct sp3_vma *vm, sp3_vmaddr addr);
/// Probe VM for presence.
///
/// @param vm VM to probe.
/// @param addr Address to search for.
/// @return 1 if the specified address is backed in the VM, 0 otherwise.
///
SP3_EXPORT
int sp3_vm_present(struct sp3_vma *vm, sp3_vmaddr addr);
/// Return base address of VM.
///
/// @param vm VM to query.
/// @return Base address.
///
SP3_EXPORT
sp3_vmaddr sp3_vm_base(struct sp3_vma *vm);
/// Return next VM.
///
/// @param vm VM to query.
/// @return Next VM in list.
///
SP3_EXPORT
struct sp3_vma *sp3_vm_next(struct sp3_vma *vm);
/// Free a VM and all its storage.
///
/// Use this function to free memory allocated by sp3_vm_new, sp3_vm_new_fill and
/// sp3_vm_new_ptr.
///
/// @param vm VM to free.
///
SP3_EXPORT
void sp3_vm_free(struct sp3_vma *vm);
/// @}
#ifdef __cplusplus
}
#endif
#endif /* __SP3_H__ */
+379
Просмотреть файл
@@ -0,0 +1,379 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
/**
* Self-contained assembler that uses the LLVM MC API to assemble AMDGCN
* instructions
*/
#include <llvm/Config/llvm-config.h>
#include <llvm/MC/MCAsmBackend.h>
#include <llvm/MC/MCAsmInfo.h>
#include <llvm/MC/MCCodeEmitter.h>
#include <llvm/MC/MCContext.h>
#include <llvm/MC/MCInstPrinter.h>
#include <llvm/MC/MCInstrInfo.h>
#include <llvm/MC/MCObjectFileInfo.h>
#include <llvm/MC/MCObjectWriter.h>
#include <llvm/MC/MCParser/AsmLexer.h>
#include <llvm/MC/MCParser/MCTargetAsmParser.h>
#include <llvm/MC/MCRegisterInfo.h>
#include <llvm/MC/MCStreamer.h>
#include <llvm/MC/MCSubtargetInfo.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/InitLLVM.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#if LLVM_VERSION_MAJOR > 13
#include <llvm/MC/TargetRegistry.h>
#else
#include <llvm/Support/TargetRegistry.h>
#endif
#include <linux/elf.h>
#include "OSWrapper.hpp"
#include "Assemble.hpp"
using namespace llvm;
/**
 * Construct an assembler for one target ASIC
 *
 * Derives the target name from the GFX version, starts with an empty
 * instruction stream and registers the AMDGPU components with LLVM.
 *
 * @param Gfxv Packed GFX version (see SetTargetAsic())
 */
Assembler::Assembler(const uint32_t Gfxv)
    : TextData(nullptr), TextSize(0) {
    SetTargetAsic(Gfxv);
    LLVMInit();
}
/**
 * Release the instruction stream buffer and shut LLVM down
 *
 * NOTE(review): llvm_shutdown() operates on LLVM's process-wide state, so
 * this presumably assumes no other Assembler is still in use -- confirm.
 */
Assembler::~Assembler() {
    // Free TextData (if any) before LLVM is shut down
    FlushText();

    llvm_shutdown();
}
/**
 * @return Pointer to the raw instruction stream held by this object, or
 *         nullptr when nothing has been assembled (or after a flush)
 */
const char* Assembler::GetInstrStream() {
    const char* Stream = TextData;
    return Stream;
}
/**
 * @return Size in bytes of the raw instruction stream (0 when empty)
 */
const size_t Assembler::GetInstrStreamSize() {
    const size_t StreamSize = TextSize;
    return StreamSize;
}
/**
 * Copy the raw instruction stream into a caller-provided buffer
 *
 * @param OutBuf  Destination buffer
 * @param BufSize Capacity of OutBuf in bytes
 * @return 0 on success, -2 if the stream does not fit in OutBuf
 */
int Assembler::CopyInstrStream(char* OutBuf, const size_t BufSize) {
    const bool Fits = !(TextSize > BufSize);
    if (Fits) {
        std::copy_n(TextData, TextSize, OutBuf);
        return 0;
    }

    // Destination is too small; nothing is written
    return -2;
}
/**
 * @return Target name string built by SetTargetAsic() (e.g. "gfx" followed
 *         by the version digits)
 */
const char* Assembler::GetTargetAsic() {
    const char* Target = MCPU;
    return Target;
}
/**
 * Build the MCPU target name from the GFX version reported by the Thunk
 *
 * The version is unpacked as bits 23:16 = major, 15:8 = minor and
 * 7:0 = stepping. LLVM target IDs print major and minor in decimal and the
 * stepping in hex.
 *
 * @param Gfxv Packed GFX version
 */
void Assembler::SetTargetAsic(const uint32_t Gfxv) {
    const uint8_t Step = Gfxv & 0xff;
    const uint8_t Minor = (Gfxv >> 8) & 0xff;
    const uint8_t Major = (Gfxv >> 16) & 0xff;

    snprintf(MCPU, ASM_MCPU_LEN, "gfx%d%d%x", Major, Minor, Step);
}
/**
 * Register the AMDGPU pieces this assembler needs with LLVM:
 * target info, MC layer and assembly parser
 */
void Assembler::LLVMInit() {
    LLVMInitializeAMDGPUTargetInfo();  // target info
    LLVMInitializeAMDGPUTargetMC();    // MC layer
    LLVMInitializeAMDGPUAsmParser();   // assembly parser
}
/**
 * Drop the current instruction stream and return TextData/TextSize to
 * their initial (empty) state
 */
void Assembler::FlushText() {
    // delete[] on a null pointer is a no-op, so no guard is required
    delete[] TextData;

    TextSize = 0;
    TextData = nullptr;
}
/**
* Print hex of ELF object to stdout (debug)
*/
void Assembler::PrintELFHex(const std::string Data) {
outs() << "ASM Info: assembled ELF hex data (length " << Data.length() << "):\n";
outs() << "0x00:\t";
for (size_t i = 0; i < Data.length(); ++i) {
char c = Data[i];
outs() << format_hex(static_cast<uint8_t>(c), 4);
if ((i+1) % 16 == 0)
outs() << "\n" << format_hex(i+1, 4) << ":\t";
else
outs() << " ";
}
outs() << "\n";
}
/**
* Print hex of raw instruction stream to stdout (debug)
*/
void Assembler::PrintTextHex() {
outs() << "ASM Info: assembled .text hex data (length " << TextSize << "):\n";
outs() << "0x00:\t";
for (size_t i = 0; i < TextSize; i++) {
outs() << format_hex(static_cast<uint8_t>(TextData[i]), 4);
if ((i+1) % 16 == 0)
outs() << "\n" << format_hex(i+1, 4) << ":\t";
else
outs() << " ";
}
outs() << "\n";
}
/**
* Extract raw instruction stream from .text section in ELF object
*
* @param RawData Raw C string of ELF object
* @return 0 on success
*/
int Assembler::ExtractELFText(const char* RawData) {
const Elf64_Ehdr* ElfHeader;
const Elf64_Shdr* SectHeader;
const Elf64_Shdr* SectStrTable;
const char* SectStrAddr;
unsigned NumSects, SectIdx;
if (!(ElfHeader = reinterpret_cast<const Elf64_Ehdr*>(RawData))) {
outs() << "ASM Error: elf data is invalid or corrupted\n";
return -1;
}
if (ElfHeader->e_ident[EI_CLASS] != ELFCLASS64) {
outs() << "ASM Error: elf object must be of 64-bit type\n";
return -1;
}
SectHeader = reinterpret_cast<const Elf64_Shdr*>(RawData + ElfHeader->e_shoff);
SectStrTable = &SectHeader[ElfHeader->e_shstrndx];
SectStrAddr = static_cast<const char*>(RawData + SectStrTable->sh_offset);
// Loop through sections, break on .text
NumSects = ElfHeader->e_shnum;
for (SectIdx = 0; SectIdx < NumSects; SectIdx++) {
std::string SectName = std::string(SectStrAddr + SectHeader[SectIdx].sh_name);
if (SectName == std::string(".text")) {
TextSize = SectHeader[SectIdx].sh_size;
TextData = new char[TextSize];
memcpy(TextData, RawData + SectHeader[SectIdx].sh_offset, TextSize);
break;
}
}
if (SectIdx >= NumSects) {
outs() << "ASM Error: couldn't locate .text section\n";
return -1;
}
return 0;
}
/**
 * Assemble a shader and, on success, copy the resulting instruction stream
 * into the caller's buffer
 *
 * @param AssemblySource Shader source represented as a raw C string
 * @param OutBuf Raw instruction stream output buffer
 * @param BufSize Size of OutBuf (defaults to PAGE_SIZE)
 * @return Non-zero value of RunAssemble() on failure, otherwise the result
 *         of CopyInstrStream() (0 on success)
 */
int Assembler::RunAssembleBuf(const char* const AssemblySource, char* OutBuf,
                              const size_t BufSize) {
    const int AsmStatus = RunAssemble(AssemblySource);
    if (AsmStatus)
        return AsmStatus;

    return CopyInstrStream(OutBuf, BufSize);
}
/**
 * Assemble shader and fill member vars
 *
 * Runs the LLVM MC assembler over AssemblySource for the target held in
 * MCPU, emits an ELF object into memory and extracts its .text section
 * into TextData/TextSize. Any result of a previous run is discarded first.
 * Errors are reported on stdout.
 *
 * @param AssemblySource Shader source represented as a raw C string
 * @return 0 on success, negative value on failure
 */
int Assembler::RunAssemble(const char* const AssemblySource) {
    // Ensure target ASIC has been set. MCPU is an array member, so its
    // address can never be null; test for an empty name instead.
    if (MCPU[0] == '\0') {
        outs() << "ASM Error: target asic is uninitialized\n";
        return -1;
    }

    // Delete TextData for any previous runs
    FlushText();

#if 0
    outs() << "ASM Info: running assembly for target: " << MCPU << "\n";
    outs() << "ASM Info: source:\n";
    outs() << AssemblySource << "\n";
#endif

    // Initialize MCOptions and target triple
    const MCTargetOptions MCOptions;
    Triple TheTriple;

    const Target* TheTarget =
        TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
    if (!TheTarget) {
        outs() << Error;
        return -1;
    }

    TheTriple.setArchName(ArchName);
    TheTriple.setVendorName(VendorName);
    TheTriple.setOSName(OSName);

    TripleName = TheTriple.getTriple();
    TheTriple.setTriple(Triple::normalize(TripleName));

    // Create MemoryBuffer for assembly source
    StringRef AssemblyRef(AssemblySource);
    std::unique_ptr<MemoryBuffer> BufferPtr =
        MemoryBuffer::getMemBuffer(AssemblyRef, "", false);
    if (!BufferPtr->getBufferSize()) {
        outs() << "ASM Error: assembly source is empty\n";
        return -1;
    }

    // Instantiate SrcMgr and transfer BufferPtr ownership
    SourceMgr SrcMgr;
    SrcMgr.AddNewSourceBuffer(std::move(BufferPtr), SMLoc());

    // Initialize MC interfaces and base class objects
    std::unique_ptr<const MCRegisterInfo> MRI(
        TheTarget->createMCRegInfo(TripleName));
    if (!MRI) {
        outs() << "ASM Error: no register info for target " << MCPU << "\n";
        return -1;
    }

#if LLVM_VERSION_MAJOR > 9
    std::unique_ptr<const MCAsmInfo> MAI(
        TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
#else
    std::unique_ptr<const MCAsmInfo> MAI(
        TheTarget->createMCAsmInfo(*MRI, TripleName));
#endif
    if (!MAI) {
        outs() << "ASM Error: no assembly info for target " << MCPU << "\n";
        return -1;
    }

    std::unique_ptr<MCInstrInfo> MCII(
        TheTarget->createMCInstrInfo());
    if (!MCII) {
        outs() << "ASM Error: no instruction info for target " << MCPU << "\n";
        return -1;
    }

    std::unique_ptr<MCSubtargetInfo> STI(
        TheTarget->createMCSubtargetInfo(TripleName, MCPU, std::string()));
    if (!STI || !STI->isCPUStringValid(MCPU)) {
        outs() << "ASM Error: no subtarget info for target " << MCPU << "\n";
        return -1;
    }

    // Set up the MCContext for creating symbols and MCExpr's
#if LLVM_VERSION_MAJOR > 12
    MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), &SrcMgr, &MCOptions);
#else
    MCObjectFileInfo MOFI;
    MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr, &MCOptions);
    MOFI.InitMCObjectFileInfo(TheTriple, true, Ctx);
#endif

    // Finalize setup for output object code stream
    std::string Data;
    std::unique_ptr<raw_string_ostream> DataStream(std::make_unique<raw_string_ostream>(Data));
    std::unique_ptr<buffer_ostream> BOS(std::make_unique<buffer_ostream>(*DataStream));
    raw_pwrite_stream* OS = BOS.get();

    // Hold the emitter and backend in owning pointers from the start so
    // they are released on every early-return path
#if LLVM_VERSION_MAJOR > 14
    std::unique_ptr<MCCodeEmitter> CE(
        TheTarget->createMCCodeEmitter(*MCII, Ctx));
#else
    std::unique_ptr<MCCodeEmitter> CE(
        TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx));
#endif
    if (!CE) {
        outs() << "ASM Error: no code emitter for target " << MCPU << "\n";
        return -1;
    }

    std::unique_ptr<MCAsmBackend> MAB(
        TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
    if (!MAB) {
        outs() << "ASM Error: no assembly backend for target " << MCPU << "\n";
        return -1;
    }

    // Create the object writer before ownership of MAB is handed over
    std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(*OS);

    std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
        TheTriple, Ctx,
        std::move(MAB), std::move(OW),
        std::move(CE), *STI, MCOptions.MCRelaxAll,
        MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false));
    if (!Streamer) {
        outs() << "ASM Error: no object streamer for target " << MCPU << "\n";
        return -1;
    }

    std::unique_ptr<MCAsmParser> Parser(
        createMCAsmParser(SrcMgr, Ctx, *Streamer, *MAI));

    // Set parser to target parser and run
    std::unique_ptr<MCTargetAsmParser> TAP(
        TheTarget->createMCAsmParser(*STI, *Parser, *MCII, MCOptions));
    if (!TAP) {
        outs() << "ASM Error: no assembly parsing support for target " << MCPU << "\n";
        return -1;
    }

    Parser->setTargetParser(*TAP);
    if (Parser->Run(true)) {
        outs() << "ASM Error: assembly parser failed\n";
        return -1;
    }

    // Destroying the buffered stream pushes its contents into DataStream,
    // which is then flushed into Data
    BOS.reset();
    DataStream->flush();

    int ret = ExtractELFText(Data.data());
    if (ret < 0 || !TextData) {
        outs() << "ASM Error: .text extraction failed\n";
        // Never report success when no instruction stream was produced
        return ret < 0 ? ret : -1;
    }

#if 0
    PrintELFHex(Data);
    PrintTextHex();
#endif

    return 0;
}
+86
Просмотреть файл
@@ -0,0 +1,86 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _ASSEMBLE_H_
#define _ASSEMBLE_H_
#include "OSWrapper.hpp"
#define ASM_MCPU_LEN 16
// Self-contained assembler for AMDGCN shaders built on the LLVM MC API.
// An instance is bound to one target ASIC at construction; RunAssemble()
// turns shader source text into a raw instruction stream that is kept in
// TextData/TextSize until the next run or until the object is destroyed.
//
// NOTE(review): the class owns TextData through a raw pointer and frees it
// in the destructor, but copy construction/assignment are not disabled, so
// copying an instance that holds a stream would free it twice -- consider
// deleting the copy operations.
class Assembler {
private:
// Components of the target triple assembled in RunAssemble()
const char* ArchName = "amdgcn";
const char* VendorName = "amd";
const char* OSName = "amdhsa";
// Target name string ("gfx" + version), written by SetTargetAsic()
char MCPU[ASM_MCPU_LEN];
// Triple string built from the three components above
std::string TripleName;
// Receives the error text from the LLVM target lookup
std::string Error;
// Heap copy of the assembled .text section and its size in bytes;
// allocated by ExtractELFText(), released by FlushText()
char* TextData;
size_t TextSize;
// Build MCPU from a packed GFX version
void SetTargetAsic(const uint32_t Gfxv);
// Register the AMDGPU target info, MC layer and asm parser with LLVM
void LLVMInit();
// Free TextData and reset TextSize to 0
void FlushText();
// Debug: hex dump of a whole ELF object to stdout
void PrintELFHex(const std::string Data);
// Copy the .text section of an in-memory ELF object into TextData
int ExtractELFText(const char* RawData);
public:
// Gfxv is the packed GFX version of the ASIC to assemble for
Assembler(const uint32_t Gfxv);
~Assembler();
// Debug: hex dump of the current instruction stream to stdout
void PrintTextHex();
// Accessors for the target name and the current instruction stream
const char* GetTargetAsic();
const char* GetInstrStream();
const size_t GetInstrStreamSize();
// Copy the instruction stream into OutBuf; returns -2 if it does not fit.
// PAGE_SIZE is not defined in this header (presumably it comes from
// OSWrapper.hpp -- TODO confirm)
int CopyInstrStream(char* OutBuf, const size_t BufSize = PAGE_SIZE);
// Assemble AssemblySource into TextData/TextSize; returns 0 on success
int RunAssemble(const char* const AssemblySource);
// RunAssemble() followed by CopyInstrStream() into OutBuf
int RunAssembleBuf(const char* const AssemblySource, char* OutBuf,
const size_t BufSize = PAGE_SIZE);
};
#endif // _ASSEMBLE_H_
-126
Просмотреть файл
@@ -1,126 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "IsaGenerator.hpp"
#include <algorithm>
#include <string>
#include "IsaGenerator_Gfx72.hpp"
#include "IsaGenerator_Gfx8.hpp"
#include "IsaGenerator_Gfx9.hpp"
#include "IsaGenerator_Gfx10.hpp"
#include "IsaGenerator_Aldebaran.hpp"
#include "GoogleTestExtension.hpp"
#include "sp3.h"
// sp3 source text of the address-watch trap handler. GetAwTrapHandler()
// hands this string to CompileShader() with "main" as the shader name.
// The handler's register usage and the record layout it writes to the TMA
// are described in the comment embedded in the source text itself.
const std::string IsaGenerator::ADDRESS_WATCH_SP3(
"var REG_TRAPSTS_EXCP_MASK = 0x000001ff\n"
"var WAVE_COUNT_OFFSET = 12\n"
"var TMA_CYCLE_OFFSET = 16\n"
"\n"
"/*\n"
" * ttmp[0:1] -- The ISA address that triggered this trap handler\n"
" * ttmp[10:11] -- The TMA user provided, used to store the debug info in this shader\n"
" * v[10:14] ttmp[7:8] -- temp use inside this shader\n"
" * s5 -- store the counts that this trap been triggered\n"
" * Each time when the trap is triggered , this shader will write\n"
" * ttmp[0] : ttmp[1] : Trap_Status : [reserved]\n"
" * to TMA + (trap count * TMA_CYCLE_OFFSET)\n"
" * The TMA + WAVE_COUNT_OFFSET(the first [reserved] address)\n"
" * used to store the total triggered trap count.\n"
" */\n"
"shader main\n"
"\n"
" asic(VI)\n"
"\n"
" type(CS)\n"
" v_mov_b32 v10, ttmp10\n"
" v_mov_b32 v11, ttmp11\n"
" s_mov_b32 ttmp7, s5\n"
" s_mulk_i32 ttmp7, TMA_CYCLE_OFFSET\n"
" s_addk_i32 s5, 1\n"
" v_mov_b32 v12, ttmp0\n"
" v_add_u32 v10, vcc, ttmp7, v10\n"
" flat_store_dword v[10,11], v12 slc glc\n"
" v_mov_b32 v12, ttmp1\n"
" v_add_u32 v10, vcc, 4, v10\n"
" flat_store_dword v[10,11], v12 slc glc\n"
" s_getreg_b32 ttmp8, hwreg(HW_REG_TRAPSTS)\n"
" s_and_b32 ttmp8, ttmp8, REG_TRAPSTS_EXCP_MASK\n"
" v_mov_b32 v12, ttmp8\n"
" v_add_u32 v10, vcc, 4, v10\n"
" flat_store_dword v[10,11], v12 glc\n"
" v_mov_b32 v10, ttmp10\n"
" v_add_u32 v10, vcc, WAVE_COUNT_OFFSET, v10\n"
" v_mov_b32 v13, 1\n"
" flat_atomic_add v14, v[10:11], v13 slc glc\n"
" s_and_b32 ttmp1, ttmp1, 0xffff\n"
" s_rfe_b64 [ttmp0,ttmp1]\n"
"end\n"
);
/* Factory: allocate the ISA generator that matches an ASIC family.
 *
 * Returns a new generator object, or NULL (after logging an error) when
 * familyId is not one of the handled families.
 */
IsaGenerator* IsaGenerator::Create(unsigned int familyId) {
    IsaGenerator* pGenerator = NULL;

    switch (familyId) {
    case FAMILY_CI:
    case FAMILY_KV:
        pGenerator = new IsaGenerator_Gfx72;
        break;

    case FAMILY_VI:
    case FAMILY_CZ:
        pGenerator = new IsaGenerator_Gfx8;
        break;

    case FAMILY_AI:
    case FAMILY_RV:
    case FAMILY_AR:
        pGenerator = new IsaGenerator_Gfx9;
        break;

    case FAMILY_AL:
        pGenerator = new IsaGenerator_Aldbrn;
        break;

    case FAMILY_NV:
        pGenerator = new IsaGenerator_Gfx10;
        break;

    default:
        LOG() << "Error: Invalid ISA" << std::endl;
        break;
    }

    return pGenerator;
}
/* Compile the address-watch trap handler (shader "main" of
 * ADDRESS_WATCH_SP3) into rBuf.
 */
void IsaGenerator::GetAwTrapHandler(HsaMemoryBuffer& rBuf) {
    const char* pSource = ADDRESS_WATCH_SP3.c_str();
    CompileShader(pSource, "main", rBuf);
}
void IsaGenerator::CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf) {
sp3_context* pSp3 = sp3_new();
sp3_setasic(pSp3, GetAsicName().c_str());
sp3_parse_string(pSp3, shaderCode);
sp3_shader* pShader = sp3_compile(pSp3, shaderName);
std::copy(pShader->data, pShader->data + pShader->size, rBuf.As<unsigned int*>());
sp3_free_shader(pShader);
/** Inside this close function, there is an unknown reason of free memory not used by compiler.
* Comment out this as a workaround. System will do the garbage collection after this
* application is closed.
*/
// sp3_close(pSp3);
}
-52
Просмотреть файл
@@ -1,52 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _ISAGENERATOR_H_
#define _ISAGENERATOR_H_
#include "KFDTestUtil.hpp"
/* isa generation class - interface */
/* Interface for per-ASIC ISA providers. Each Get*Isa() call fills the
 * given buffer with the machine code of one small test shader.
 */
class IsaGenerator {
public:
// Factory: returns a new generator for the ASIC family, or NULL if the
// family is not handled
static IsaGenerator* Create(unsigned int familyId);
virtual ~IsaGenerator() {}
// Shader providers that every concrete generator must implement
virtual void GetNoopIsa(HsaMemoryBuffer& rBuf) = 0;
virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf) = 0;
virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) = 0;
virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf) = 0;
// Default implementation does nothing and leaves rBuf untouched
virtual void GetCwsrTrapHandler(HsaMemoryBuffer& rBuf) {}
// Default implementation compiles ADDRESS_WATCH_SP3 through CompileShader()
virtual void GetAwTrapHandler(HsaMemoryBuffer& rBuf);
// Compile shader shaderName from sp3 source text shaderCode into rBuf
void CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf);
protected:
// ASIC name string handed to the sp3 compiler by CompileShader()
virtual const std::string& GetAsicName() = 0;
private:
// sp3 source text of the address-watch trap handler
static const std::string ADDRESS_WATCH_SP3;
};
#endif // _ISAGENERATOR_H_
-113
Просмотреть файл
@@ -1,113 +0,0 @@
/*
* Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "IsaGenerator_Aldebaran.hpp"
#include <algorithm>
#include <string>
const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN";
/* The binaries are generated from following ISA */
#if 0
/* flat_atomic_inc is not supported by some PCIe implementations; use flat_atomic_add instead */
shader atomic_add
asic(ALDEBARAN)
type(CS)
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, 1
flat_atomic_add v3, v[0:1], v2 slc glc scc
s_waitcnt 0
s_endpgm
end
shader copy_dword
asic(ALDEBARAN)
type(CS)
/* copy the parameters from scalar registers to vector registers */
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
/* copy a dword between the passed addresses */
flat_load_dword v4, v[0:1] slc glc
s_waitcnt 0
flat_store_dword v[2:3], v4 slc glc
s_endpgm
end
shader main
asic(ALDEBARAN)
type(CS)
loop:
s_branch loop
s_endpgm
end
#endif
/* Pre-assembled machine code, one 32-bit word per entry. According to the
 * note above the disabled listing in this file, the binaries were generated
 * from those sp3 shaders.
 */
/* No listing is given for this one -- presumably a lone s_endpgm (TODO confirm) */
const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = {
0xbf810000
};
/* Binary for the "copy_dword" shader */
const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = {
0x7e000200, 0x7e020201,
0x7e040202, 0x7e060203,
0xdc530000, 0x047f0000,
0xbf8c0000, 0xdc730000,
0x007f0402, 0xbf810000
};
/* Binary for the looping "main" shader */
const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = {
0xbf82ffff, 0xbf810000
};
/* Binary for the "atomic_add" shader */
const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = {
0x7e000200, 0x7e020201,
0x7e040281, 0xdf0b0000,
0x037f0200, 0xbf8c0000,
0xbf810000, 0x00000000
};
/* Copy the no-op shader binary to the start of rBuf */
void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(NOOP_ISA, ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
}

/* Copy the dword-copy shader binary to the start of rBuf */
void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(COPY_DWORD_ISA, ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
}

/* Copy the infinite-loop shader binary to the start of rBuf */
void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(INFINITE_LOOP_ISA, ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
}

/* Copy the atomic shader binary to the start of rBuf (this ASIC uses the
 * atomic-add variant)
 */
void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(ATOMIC_ADD_ISA, ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
}

/* ASIC name string for this generator */
const std::string& IsaGenerator_Aldbrn::GetAsicName() {
    return ASIC_NAME;
}
-142
Просмотреть файл
@@ -1,142 +0,0 @@
/*
* Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "IsaGenerator_Gfx10.hpp"
#include <algorithm>
#include <string>
/* The binaries are generated from following ISA */
const std::string IsaGenerator_Gfx10::ASIC_NAME = "GFX10";
#if 0
static const char * atomic_add = \
"\
shader atomic_add \n\
asic(GFX10) \n\
wave_size(32) \n\
type(CS) \n\
v_mov_b32 v0, s0 \n\
v_mov_b32 v1, s1 \n\
v_mov_b32 v2, 1 \n\
flat_atomic_add v3, v[0:1], v2 slc glc \n\
s_waitcnt 0 \n\
s_endpgm \n\
end \n\
";
static const char * copy_dword = \
"\
shader copy_dword \n\
asic(GFX10) \n\
wave_size(32) \n\
type(CS) \n\
v_mov_b32 v0, s0 \n\
v_mov_b32 v1, s1 \n\
v_mov_b32 v2, s2 \n\
v_mov_b32 v3, s3 \n\
flat_load_dword v4, v[0:1] slc glc \n\
s_waitcnt 0 \n\
flat_store_dword v[2:3], v4 slc glc \n\
s_endpgm \n\
end \n\
";
static const char * loop= \
"\
shader loop \n\
asic(GFX10) \n\
type(CS) \n\
wave_size(32) \n\
loop: \n\
s_branch loop \n\
s_endpgm \n\
end \n\
";
static const char * noop= \
"\
shader noop \n\
asic(GFX10) \n\
type(CS) \n\
wave_size(32) \n\
s_endpgm \n\
end \n\
";
#endif
/* Pre-assembled machine code, one 32-bit word per entry. According to the
 * note near the top of this file, the binaries were generated from the
 * sp3 shader sources kept in the disabled block above.
 */
/* Binary for the "noop" shader */
const uint32_t IsaGenerator_Gfx10::NOOP_ISA[] = {
0xb0804004, 0xbf810000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000
};
/* Binary for the "copy_dword" shader */
const uint32_t IsaGenerator_Gfx10::COPY_DWORD_ISA[] = {
0xb0804004, 0x7e000200,
0x7e020201, 0x7e040202,
0x7e060203, 0xdc330000,
0x47d0000, 0xbf8c0000,
0xdc730000, 0x7d0402,
0xbf810000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000
};
/* Binary for the "loop" shader */
const uint32_t IsaGenerator_Gfx10::INFINITE_LOOP_ISA[] = {
0xbf82ffff, 0xb0804004,
0xbf810000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000
};
/* Binary for the "atomic_add" shader */
const uint32_t IsaGenerator_Gfx10::ATOMIC_ADD_ISA[] = {
0xb0804004, 0x7e000200,
0x7e020201, 0x7e040281,
0xdccb0000, 0x37d0200,
0xbf8c0000, 0xbf810000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000, 0xbf9f0000,
0xbf9f0000
};
/* Copy the no-op shader binary to the start of rBuf */
void IsaGenerator_Gfx10::GetNoopIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(NOOP_ISA, ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
}

/* Copy the dword-copy shader binary to the start of rBuf */
void IsaGenerator_Gfx10::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(COPY_DWORD_ISA, ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
}

/* Copy the infinite-loop shader binary to the start of rBuf */
void IsaGenerator_Gfx10::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(INFINITE_LOOP_ISA, ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
}

/* Copy the atomic shader binary to the start of rBuf (this ASIC uses the
 * atomic-add variant)
 */
void IsaGenerator_Gfx10::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
    std::copy_n(ATOMIC_ADD_ISA, ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
}

/* ASIC name string for this generator */
const std::string& IsaGenerator_Gfx10::GetAsicName() {
    return ASIC_NAME;
}
-123
Просмотреть файл
@@ -1,123 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "IsaGenerator_Gfx72.hpp"
#include <algorithm>
#include <string>
const std::string IsaGenerator_Gfx72::ASIC_NAME = "CI";
const uint32_t IsaGenerator_Gfx72::NOOP_ISA[] = {
0xbf810000 // S_ENDPGM
};
/* The below arrays are filled with hex values in order not to reference
* proprietary header files, but we still leave the code here for future
* reference.
*/
#if 0
const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = {
(63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1)
(55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
(4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1)
(383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP)
(55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
(4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1)
0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};
const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = {
(0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4
0xBF810000u // S_ENDPGM
};
const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = {
(63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0xC1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 0xFFFFFFFF, s2 (VOP1)
(55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_ATOMIC_INC << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (0 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0)
(3u << SQ_FLAT_1__VDST__SHIFT) | (2u << SQ_FLAT_1__DATA__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1)
0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};
#endif
// Kernel that copies one dword: loads from the address held in s[0:1] and
// stores to the address held in s[2:3], staging the value in v4.
const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = {
    // Move the four scalar arguments into vector registers (VOP1).
    0x7E000200u,               // v_mov_b32 v0, s0
    0x7E020201u,               // v_mov_b32 v1, s1
    0x7E040202u,               // v_mov_b32 v2, s2
    0x7E060203u,               // v_mov_b32 v3, s3
    // Load the source dword: FLAT_0 word, then FLAT_1 word.
    0xDC330000u, 0x04000000u,  // flat_load_dword, slc = 1, glc = 1; ADDR = V0:V1, VDST = V4
    0xBF8C0000u,               // s_waitcnt 0 (SOPP)
    // Store it to the destination: FLAT_0 word, then FLAT_1 word.
    0xDC730000u, 0x00000402u,  // flat_store_dword, slc = 1, glc = 1; ADDR = V2:V3, DATA = V4
    0xBF810000u                // s_endpgm; relies on the implicit s_waitcnt 0,0,0
};
// Kernel that never terminates: a branch with offset -1 followed by an
// s_endpgm word.
const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = {
    0xBF82FFFFu,  // s_branch -1 (PC <- PC + SIMM*4)+4
    0xBF810000u   // s_endpgm
};
// Atomic-increment kernel: the address in s[0:1] is moved to v[0:1], v2 is
// loaded with -1, and flat_atomic_inc is issued with v3 as return register.
const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = {
0x7e000200, // v_mov_b32 v0, s0 (VOP1)
0x7e020201, // v_mov_b32 v1, s1 (VOP1)
0x7e0402c1, // v_mov_b32 v2, -1 (SRC0 0xC1 = inline constant -1, i.e. 0xFFFFFFFF) (VOP1)
0xdcf20000, // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0)
0x03000200, // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1)
0xbf810000 // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};
// Copies every word of NOOP_ISA into rBuf, starting at the buffer's first
// 32-bit word. No size check is done here; the caller must supply a buffer
// large enough for the table.
void IsaGenerator_Gfx72::GetNoopIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = NOOP_ISA + ARRAY_SIZE(NOOP_ISA);
    std::copy(NOOP_ISA, pEnd, pDst);
}
// Copies every word of COPY_DWORD_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx72::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = COPY_DWORD_ISA + ARRAY_SIZE(COPY_DWORD_ISA);
    std::copy(COPY_DWORD_ISA, pEnd, pDst);
}
// Copies every word of INFINITE_LOOP_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx72::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = INFINITE_LOOP_ISA + ARRAY_SIZE(INFINITE_LOOP_ISA);
    std::copy(INFINITE_LOOP_ISA, pEnd, pDst);
}
// Copies every word of ATOMIC_INC_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx72::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = ATOMIC_INC_ISA + ARRAY_SIZE(ATOMIC_INC_ISA);
    std::copy(ATOMIC_INC_ISA, pEnd, pDst);
}
// Returns a reference to this generator's ASIC_NAME static member.
const std::string& IsaGenerator_Gfx72::GetAsicName() {
    return IsaGenerator_Gfx72::ASIC_NAME;
}
-49
Просмотреть файл
@@ -1,49 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _ISAGENERATOR_GFX72_H_
#define _ISAGENERATOR_GFX72_H_
#include <string>
#include "IsaGenerator.hpp"
// IsaGenerator implementation that serves its kernels from the static word
// tables declared in the private section.
class IsaGenerator_Gfx72 : public IsaGenerator {
public:
// Each getter copies the matching *_ISA table into rBuf as 32-bit words.
virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
protected:
// Returns ASIC_NAME.
virtual const std::string& GetAsicName();
private:
// Name string handed out by GetAsicName().
static const std::string ASIC_NAME;
// Kernel binaries, one 32-bit word per element; sizes come from the
// initializers in the .cpp file.
static const uint32_t NOOP_ISA[];
static const uint32_t COPY_DWORD_ISA[];
static const uint32_t INFINITE_LOOP_ISA[];
static const uint32_t ATOMIC_INC_ISA[];
};
#endif // _ISAGENERATOR_GFX72_H_
-128
Просмотреть файл
@@ -1,128 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "IsaGenerator_Gfx8.hpp"
#include <algorithm>
#include <string>
// Name string returned by IsaGenerator_Gfx8::GetAsicName().
const std::string IsaGenerator_Gfx8::ASIC_NAME("VI");
// Minimal kernel: a single s_endpgm word.
const uint32_t IsaGenerator_Gfx8::NOOP_ISA[] = { 0xBF810000u };
/** The below arrays are filled with hex values in order not to reference
* proprietary header files, but we still leave the code here for future
* reference.
*/
#if 0
// Symbolic reference form of the copy-dword kernel, built from register-field
// macros. It sits inside the surrounding #if 0 region; the active definition
// further down in this file uses literal hex words.
const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = {
(63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v0, s0 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v1, s1 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v2, s2 (VOP1)
(63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT), // v_mov_b32 v3, s3 (VOP1)
(55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/, // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
(4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V0:V1, VDST = V4 (FLAT_1)
(383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT), // s_waitcnt 0 (SOPP)
(55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT), // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
(4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT), // ADDR = V2:V3, DATA = V4 (FLAT_1)
0xBF810000u // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
};
// Symbolic reference form of the infinite-loop kernel; the active definition
// further down in this file spells the same two words as
// 0xbf82ffff / 0xbf810000.
// NOTE(review): (const uint32_t)-1 is 0xFFFFFFFF; unless it is masked to the
// 16-bit SIMM16 field before shifting, the OR would set bits outside that
// field and not match 0xbf82ffff - confirm before re-enabling this code.
const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = {
(0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT), // s_branch -1 (PC <- PC + SIMM*4)+4
0xBF810000u // S_ENDPGM
};
#endif
// Kernel that copies one dword: loads from the address held in s[0:1] and
// stores to the address held in s[2:3], staging the value in v4.
const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = {
    // Move the four scalar arguments into vector registers (VOP1).
    0x7E000200u,               // v_mov_b32 v0, s0
    0x7E020201u,               // v_mov_b32 v1, s1
    0x7E040202u,               // v_mov_b32 v2, s2
    0x7E060203u,               // v_mov_b32 v3, s3
    // Load the source dword: FLAT_0 word, then FLAT_1 word.
    0xDC530000u, 0x04000000u,  // flat_load_dword, slc = 1, glc = 1; ADDR = V0:V1, VDST = V4
    0xBF8C0000u,               // s_waitcnt 0 (SOPP)
    // Store it to the destination: FLAT_0 word, then FLAT_1 word.
    0xDC730000u, 0x00000402u,  // flat_store_dword, slc = 1, glc = 1; ADDR = V2:V3, DATA = V4
    0xBF810000u                // s_endpgm; relies on the implicit s_waitcnt 0,0,0
};
// Kernel that never terminates: a branch with offset -1 followed by an
// s_endpgm word.
const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = {
    0xBF82FFFFu,  // s_branch -1 (PC <- PC + SIMM*4)+4
    0xBF810000u   // s_endpgm
};
/**
* The atomic_add_isa binary is generated from the following ISA.
* The original atomic_inc is not supported by some PCIe configurations, so atomic_add is used instead.
*
*/
/*
shader atomic_add
asic(VI)
type(CS)
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, 1
flat_atomic_add v3, v[0:1], v2 slc glc
s_waitcnt 0
s_endpgm
end
*/
// Binary of the atomic_add shader: adds 1 to the dword whose address is in
// s[0:1]. One instruction per line; the flat instruction takes two words.
const uint32_t IsaGenerator_Gfx8::ATOMIC_ADD_ISA[] = {
    0x7E000200u,               // v_mov_b32 v0, s0
    0x7E020201u,               // v_mov_b32 v1, s1
    0x7E040281u,               // v_mov_b32 v2, 1
    0xDD0B0000u, 0x03000200u,  // flat_atomic_add v3, v[0:1], v2 slc glc
    0xBF8C0000u,               // s_waitcnt 0
    0xBF810000u,               // s_endpgm
    0x00000000u                // trailing zero word after the program
};
// Copies every word of NOOP_ISA into rBuf, starting at the buffer's first
// 32-bit word. No size check is done here; the caller must supply a buffer
// large enough for the table.
void IsaGenerator_Gfx8::GetNoopIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = NOOP_ISA + ARRAY_SIZE(NOOP_ISA);
    std::copy(NOOP_ISA, pEnd, pDst);
}
// Copies every word of COPY_DWORD_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx8::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = COPY_DWORD_ISA + ARRAY_SIZE(COPY_DWORD_ISA);
    std::copy(COPY_DWORD_ISA, pEnd, pDst);
}
// Copies every word of INFINITE_LOOP_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx8::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = INFINITE_LOOP_ISA + ARRAY_SIZE(INFINITE_LOOP_ISA);
    std::copy(INFINITE_LOOP_ISA, pEnd, pDst);
}
// Despite the "Inc" in its name, this emits the atomic-add kernel: it copies
// every word of ATOMIC_ADD_ISA into rBuf, starting at the buffer's first
// 32-bit word. No size check is done here.
void IsaGenerator_Gfx8::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = ATOMIC_ADD_ISA + ARRAY_SIZE(ATOMIC_ADD_ISA);
    std::copy(ATOMIC_ADD_ISA, pEnd, pDst);
}
// Returns a reference to this generator's ASIC_NAME static member.
const std::string& IsaGenerator_Gfx8::GetAsicName() {
    return IsaGenerator_Gfx8::ASIC_NAME;
}
-49
Просмотреть файл
@@ -1,49 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _ISAGENERATOR_GFX8_H_
#define _ISAGENERATOR_GFX8_H_
#include <string>
#include "IsaGenerator.hpp"
// IsaGenerator implementation that serves its kernels from the static word
// tables declared in the private section.
class IsaGenerator_Gfx8 : public IsaGenerator {
public:
// Each getter copies the matching *_ISA table into rBuf as 32-bit words.
virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
// Backed by ATOMIC_ADD_ISA (there is no atomic-inc table in this class).
virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
protected:
// Returns ASIC_NAME.
virtual const std::string& GetAsicName();
private:
// Name string handed out by GetAsicName().
static const std::string ASIC_NAME;
// Kernel binaries, one 32-bit word per element; sizes come from the
// initializers in the .cpp file.
static const uint32_t NOOP_ISA[];
static const uint32_t COPY_DWORD_ISA[];
static const uint32_t INFINITE_LOOP_ISA[];
static const uint32_t ATOMIC_ADD_ISA[];
};
#endif // _ISAGENERATOR_GFX8_H_
-113
Просмотреть файл
@@ -1,113 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "IsaGenerator_Gfx9.hpp"
#include <algorithm>
#include <string>
// Name string returned by IsaGenerator_Gfx9::GetAsicName().
const std::string IsaGenerator_Gfx9::ASIC_NAME("GFX9");
/* The binaries are generated from following ISA */
#if 0
/* flat_atomic_inc is not supported by some PCIe configurations, so flat_atomic_add is used instead */
shader atomic_add
asic(GFX9)
type(CS)
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, 1
flat_atomic_add v3, v[0:1], v2 slc glc
s_waitcnt 0
s_endpgm
end
shader copy_dword
asic(GFX9)
type(CS)
/* copy the parameters from scalar registers to vector registers */
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
/* copy a dword between the passed addresses */
flat_load_dword v4, v[0:1] slc glc
s_waitcnt 0
flat_store_dword v[2:3], v4 slc glc
s_endpgm
end
shader main
asic(GFX9)
type(CS)
loop:
s_branch loop
s_endpgm
end
#endif
// One-word kernel; 0xbf810000 is the same word that terminates the other
// kernels in this file.
const uint32_t IsaGenerator_Gfx9::NOOP_ISA[] = { 0xBF810000u };
// Binary of the copy_dword shader: copies one dword from the address in
// s[0:1] to the address in s[2:3]. One instruction per line; each flat
// instruction takes two words.
const uint32_t IsaGenerator_Gfx9::COPY_DWORD_ISA[] = {
    0x7E000200u,               // v_mov_b32 v0, s0
    0x7E020201u,               // v_mov_b32 v1, s1
    0x7E040202u,               // v_mov_b32 v2, s2
    0x7E060203u,               // v_mov_b32 v3, s3
    0xDC530000u, 0x047F0000u,  // flat_load_dword v4, v[0:1] slc glc
    0xBF8C0000u,               // s_waitcnt 0
    0xDC730000u, 0x007F0402u,  // flat_store_dword v[2:3], v4 slc glc
    0xBF810000u                // s_endpgm
};
// Binary of the "main" shader: a branch back to its own label, followed by
// s_endpgm.
const uint32_t IsaGenerator_Gfx9::INFINITE_LOOP_ISA[] = {
    0xBF82FFFFu,  // loop: s_branch loop
    0xBF810000u   // s_endpgm
};
// Binary of the atomic_add shader: adds 1 to the dword whose address is in
// s[0:1]. One instruction per line; the flat instruction takes two words.
const uint32_t IsaGenerator_Gfx9::ATOMIC_ADD_ISA[] = {
    0x7E000200u,               // v_mov_b32 v0, s0
    0x7E020201u,               // v_mov_b32 v1, s1
    0x7E040281u,               // v_mov_b32 v2, 1
    0xDD0B0000u, 0x037F0200u,  // flat_atomic_add v3, v[0:1], v2 slc glc
    0xBF8C0000u,               // s_waitcnt 0
    0xBF810000u,               // s_endpgm
    0x00000000u                // trailing zero word after the program
};
// Copies every word of NOOP_ISA into rBuf, starting at the buffer's first
// 32-bit word. No size check is done here; the caller must supply a buffer
// large enough for the table.
void IsaGenerator_Gfx9::GetNoopIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = NOOP_ISA + ARRAY_SIZE(NOOP_ISA);
    std::copy(NOOP_ISA, pEnd, pDst);
}
// Copies every word of COPY_DWORD_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx9::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = COPY_DWORD_ISA + ARRAY_SIZE(COPY_DWORD_ISA);
    std::copy(COPY_DWORD_ISA, pEnd, pDst);
}
// Copies every word of INFINITE_LOOP_ISA into rBuf, starting at the buffer's
// first 32-bit word. No size check is done here.
void IsaGenerator_Gfx9::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = INFINITE_LOOP_ISA + ARRAY_SIZE(INFINITE_LOOP_ISA);
    std::copy(INFINITE_LOOP_ISA, pEnd, pDst);
}
// Despite the "Inc" in its name, this emits the atomic-add kernel: it copies
// every word of ATOMIC_ADD_ISA into rBuf, starting at the buffer's first
// 32-bit word. No size check is done here.
void IsaGenerator_Gfx9::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
    uint32_t* const pDst = rBuf.As<uint32_t*>();
    const uint32_t* const pEnd = ATOMIC_ADD_ISA + ARRAY_SIZE(ATOMIC_ADD_ISA);
    std::copy(ATOMIC_ADD_ISA, pEnd, pDst);
}
// Returns a reference to this generator's ASIC_NAME static member.
const std::string& IsaGenerator_Gfx9::GetAsicName() {
    return IsaGenerator_Gfx9::ASIC_NAME;
}
-49
Просмотреть файл
@@ -1,49 +0,0 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _ISAGENERATOR_GFX9_H_
#define _ISAGENERATOR_GFX9_H_
#include <string>
#include "IsaGenerator.hpp"
// IsaGenerator implementation that serves its kernels from the static word
// tables declared in the private section.
class IsaGenerator_Gfx9 : public IsaGenerator {
public:
// Each getter copies the matching *_ISA table into rBuf as 32-bit words.
virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
// Backed by ATOMIC_ADD_ISA (there is no atomic-inc table in this class).
virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
protected:
// Returns ASIC_NAME.
virtual const std::string& GetAsicName();
private:
// Name string handed out by GetAsicName().
static const std::string ASIC_NAME;
// Kernel binaries, one 32-bit word per element; sizes come from the
// initializers in the .cpp file.
static const uint32_t NOOP_ISA[];
static const uint32_t COPY_DWORD_ISA[];
static const uint32_t INFINITE_LOOP_ISA[];
static const uint32_t ATOMIC_ADD_ISA[];
};
#endif // _ISAGENERATOR_GFX9_H_
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
* Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,29 +21,53 @@
*
*/
#ifndef _ISAGENERATOR_ALDEBARAN_H_
#define _ISAGENERATOR_ALDEBARAN_H_
#include "GoogleTestExtension.hpp"
#include "KFDASMTest.hpp"
#include "ShaderStore.hpp"
#include "Assemble.hpp"
#include <string>
#include "IsaGenerator.hpp"
// Intentionally empty: this fixture performs no per-test setup.
void KFDASMTest::SetUp() {}
// Intentionally empty: this fixture performs no per-test cleanup.
void KFDASMTest::TearDown() {}
class IsaGenerator_Aldbrn : public IsaGenerator {
public:
virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
protected:
virtual const std::string& GetAsicName();
private:
static const std::string ASIC_NAME;
static const uint32_t NOOP_ISA[];
static const uint32_t COPY_DWORD_ISA[];
static const uint32_t INFINITE_LOOP_ISA[];
static const uint32_t ATOMIC_ADD_ISA[];
// Targets the assembler test constructs an Assembler for, in test order.
// NOTE(review): the values look like packed gfx versions
// (major << 16 | minor << 8 | stepping), e.g. 0x090008 - confirm against the
// Assembler constructor before adding entries.
static const std::vector<uint32_t> TargetList = {
    // 0x08xxxx
    0x080001u, 0x080002u, 0x080003u, 0x080005u, 0x080100u,
    // 0x09xxxx
    0x090000u, 0x090002u, 0x090004u, 0x090006u, 0x090008u,
    0x090009u, 0x09000Au, 0x09000Cu,
    // 0x0a01xx
    0x0A0100u, 0x0A0101u, 0x0A0102u, 0x0A0103u,
    // 0x0a03xx
    0x0A0300u, 0x0A0301u, 0x0A0302u, 0x0A0303u,
    0x0A0304u, 0x0A0305u, 0x0A0306u,
};
#endif // _ISAGENERATOR_ALDEBARAN_H_
// Builds an Assembler for every entry of TargetList and expects every shader
// in ShaderList to assemble successfully for that target.
TEST_F(KFDASMTest, AssembleShaders) {
    TEST_START(TESTPROFILE_RUNALL)

    for (size_t targetIdx = 0; targetIdx < TargetList.size(); ++targetIdx) {
        Assembler targetAsm(TargetList[targetIdx]);

        LOG() << "Running ASM test for target " << targetAsm.GetTargetAsic() << std::endl;

        for (auto &shaderSrc : ShaderList) {
            EXPECT_SUCCESS(targetAsm.RunAssemble(shaderSrc));
        }
    }

    TEST_END
}
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
* Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,3 +21,19 @@
*
*/
#ifndef __KFD_ASM_TEST__H__
#define __KFD_ASM_TEST__H__
#include <gtest/gtest.h>
// GoogleTest fixture for the shader-assembly tests. It derives directly from
// testing::Test and declares its own SetUp/TearDown hooks.
class KFDASMTest : public testing::Test {
public:
KFDASMTest() {}
~KFDASMTest() {}
protected:
// Per-test hooks; defined in the corresponding .cpp file.
virtual void SetUp();
virtual void TearDown();
};
#endif // __KFD_ASM_TEST__H__
+6
Просмотреть файл
@@ -68,6 +68,8 @@ void KFDBaseComponentTest::SetUp() {
g_baseTest = this;
m_pAsm = new Assembler(GetGfxVersion(nodeProperties));
ROUTINE_END
}
@@ -86,6 +88,10 @@ void KFDBaseComponentTest::TearDown() {
EXPECT_SUCCESS(hsaKmtCloseKFD());
g_baseTest = NULL;
if (m_pAsm)
delete m_pAsm;
m_pAsm = nullptr;
ROUTINE_END
}
+3
Просмотреть файл
@@ -34,6 +34,8 @@
#include "hsakmt.h"
#include "OSWrapper.hpp"
#include "KFDTestUtil.hpp"
#include "Assemble.hpp"
#include "ShaderStore.hpp"
// @class KFDBaseComponentTest
class KFDBaseComponentTest : public testing::Test {
@@ -74,6 +76,7 @@ class KFDBaseComponentTest : public testing::Test {
HsaMemFlags m_MemoryFlags;
HsaNodeInfo m_NodeInfo;
HSAint32 m_xnack;
Assembler* m_pAsm;
// @brief Executed before every test that uses KFDBaseComponentTest class and sets all common settings for the tests.
virtual void SetUp();
+2 -90
Просмотреть файл
@@ -24,90 +24,11 @@
#include "KFDCWSRTest.hpp"
#include "Dispatch.hpp"
/* Initial state:
* s[0:1] - 64 bits iteration number; only the lower 32 bits are useful.
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because
* NUM_THREADS_X(number of threads) in workgroup set to 1
* Registers:
* v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4
* v2 - = s0, 32 bits iteration number
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v6 - counter
*/
// Shader source text for the iteration kernel, gfx8 flavour (uses v_add_u32
// with an explicit vcc operand). It counts v6 up towards the iteration count
// taken from s0 and then stores v6 to the workgroup's slot in the result
// buffer addressed by s[2:3].
// NOTE(review): the in-shader comment says "jump if equal", but the compare
// is v_cmp_lt_u32, i.e. the branch is taken while v6 < v2.
static const char* iterate_isa_gfx8 = \
"\
shader iterate_isa\n\
wave_size(32)\n\
type(CS)\n\
// copy the parameters from scalar registers to vector registers\n\
v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\
v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\
v_mov_b32 v0, s4 // use workgroup id as index \n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\
v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\
v_add_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\
v_mov_b32 v6, 0 \n\
LOOP: \n\
v_add_u32 v6, vcc, 1, v6 \n\
// compare the result value (v6) to iteration value (v2), and \n\
// jump if equal (i.e. if VCC is not zero after the comparison) \n\
v_cmp_lt_u32 vcc, v6, v2 \n\
s_cbranch_vccnz LOOP \n\
flat_store_dword v[4:5], v6 \n\
s_waitcnt vmcnt(0)&lgkmcnt(0) \n\
s_endpgm \n\
end \n\
";
// Shader source text for the iteration kernel, usable on gfx9 and gfx10. It
// is the same program as the gfx8 variant except that the additions are
// spelled v_add_co_u32. It counts v6 up towards the iteration count taken
// from s0 and then stores v6 to the workgroup's slot in the result buffer
// addressed by s[2:3].
// NOTE(review): the in-shader comment says "jump if equal", but the compare
// is v_cmp_lt_u32, i.e. the branch is taken while v6 < v2.
static const char* iterate_isa_gfx9 = \
"\
shader iterate_isa\n\
wave_size(32)\n\
type(CS)\n\
// copy the parameters from scalar registers to vector registers\n\
v_mov_b32 v2, s0 // v[2:3] = s[0:1] \n\
v_mov_b32 v3, s1 // v[2:3] = s[0:1] \n\
v_mov_b32 v0, s4 // use workgroup id as index \n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4 \n\
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4 \n\
v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4 \n\
v_add_co_u32 v5, vcc, v5, vcc_lo // v[4:5] = s[2:3] + v0 * 4 \n\
v_mov_b32 v6, 0 \n\
LOOP: \n\
v_add_co_u32 v6, vcc, 1, v6 \n\
// compare the result value (v6) to iteration value (v2), and \n\
// jump if equal (i.e. if VCC is not zero after the comparison) \n\
v_cmp_lt_u32 vcc, v6, v2 \n\
s_cbranch_vccnz LOOP \n\
flat_store_dword v[4:5], v6 \n\
s_waitcnt vmcnt(0)&lgkmcnt(0) \n\
s_endpgm \n\
end \n\
";
// Shader source text for a kernel that branches back to its own label forever
// (there is no s_endpgm in the text).
static const char* infinite_isa = \
"\
shader infinite_isa \n\
wave_size(32) \n\
type(CS) \n\
LOOP: \n\
s_branch LOOP \n\
end \n\
";
void KFDCWSRTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
wave_number = 1;
ROUTINE_END
@@ -115,9 +36,6 @@ void KFDCWSRTest::SetUp() {
void KFDCWSRTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
@@ -153,16 +71,10 @@ TEST_F(KFDCWSRTest, BasicTest) {
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) {
const char *pIterateIsa;
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer resultBuf1(PAGE_SIZE, defaultGPUNode, true, false, false);
uint64_t count1 = 400000000;
if (m_FamilyId < FAMILY_AI)
pIterateIsa = iterate_isa_gfx8;
else
pIterateIsa = iterate_isa_gfx9;
if (isOnEmulator()) {
// Divide the iterator times by 10000 so that the test can
// finish in a reasonable time.
@@ -172,7 +84,7 @@ TEST_F(KFDCWSRTest, BasicTest) {
unsigned int* result1 = resultBuf1.As<unsigned int*>();
m_pIsaGen->CompileShader(pIterateIsa, "iterate_isa", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(IterateIsa, isaBuffer.As<char*>()));
PM4Queue queue1;
@@ -236,7 +148,7 @@ TEST_F(KFDCWSRTest, InterruptRestore) {
if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) {
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(infinite_isa, "infinite_isa", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteLoopIsa, isaBuffer.As<char*>()));
PM4Queue queue1, queue2, queue3;
+1 -3
Просмотреть файл
@@ -27,12 +27,11 @@
#include <gtest/gtest.h>
#include "PM4Queue.hpp"
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class KFDCWSRTest : public KFDBaseComponentTest {
public:
KFDCWSRTest() :m_pIsaGen(NULL) {}
KFDCWSRTest() {}
~KFDCWSRTest() {}
protected:
@@ -41,7 +40,6 @@ class KFDCWSRTest : public KFDBaseComponentTest {
protected: // Members
unsigned wave_number;
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_CWSR_TEST__H__
-5
Просмотреть файл
@@ -176,16 +176,11 @@ void KFDDBGTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDDBGTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
/* Reset the user trap handler */
hsaKmtSetTrapHandler(m_NodeInfo.HsaDefaultGPUNode(), 0, 0, 0, 0);
+1 -5
Просмотреть файл
@@ -26,20 +26,16 @@
#include <gtest/gtest.h>
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class KFDDBGTest : public KFDBaseComponentTest {
public:
KFDDBGTest() :m_pIsaGen(NULL) {}
KFDDBGTest() {}
~KFDDBGTest() {}
protected:
virtual void SetUp();
virtual void TearDown();
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_DBG_TEST__H__
+1 -137
Просмотреть файл
@@ -41,18 +41,12 @@ void KFDEvictTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDEvictTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -286,136 +280,6 @@ void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handl
EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
}
/* Shader to read local buffers using multiple wavefronts in parallel
* until address buffer is filled with specific value 0x5678 by host program,
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
*
* Initial state:
* s[0:1] - address buffer base address
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
* Registers:
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v[6:7] - local buf address used for read test
*
* This shader can be used by gfx9 and gfx10
*
*/
// ReadMemory shader source, gfx9/gfx10 flavour: 64-bit address arithmetic is
// spelled with v_add_co_u32. The program polls the dword at s[0:1] until it
// equals 0x5678; between polls it reads one dwordx2 per 4 KiB step across a
// 64 MiB range starting at v[6:7]. On exit it stores 0x5678 to v[4:5].
static const char* gfx9_ReadMemory =
"\
shader ReadMemory\n\
wave_size(32)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_add_co_u32 v5, vcc, v5, vcc_lo\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_add_co_u32 v3, vcc, v3, vcc_lo\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_co_u32 v9, vcc, v9, v10 \n\
v_add_co_u32 v12, vcc, v12, v10\n\
v_add_co_u32 v13, vcc, v13, vcc_lo\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
// ReadMemory shader source, gfx8 flavour (asic(VI)): 64-bit address
// arithmetic is spelled with v_add_u32 / v_addc_u32. The program polls the
// dword at s[0:1] until it equals 0x5678; between polls it reads one dwordx2
// per 4 KiB step across a 64 MiB range starting at v[6:7]. On exit it stores
// 0x5678 to v[4:5].
static const char* gfx8_ReadMemory =
"\
shader ReadMemory\n\
asic(VI)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_addc_u32 v5, vcc, v5, 0, vcc\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_addc_u32 v3, vcc, v3, 0, vcc\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_u32 v9, vcc, v9, v10 \n\
v_add_u32 v12, vcc, v12, v10\n\
v_addc_u32 v13, vcc, v13, 0, vcc\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
// Picks the ReadMemory shader source for the current ASIC family: the gfx8
// text for families below FAMILY_AI, the gfx9 text for everything else.
std::string KFDEvictTest::CreateShader() {
    const bool usesGfx8Source = (m_FamilyId < FAMILY_AI);
    return usesGfx8Source ? gfx8_ReadMemory : gfx9_ReadMemory;
}
/* Evict and restore procedure basic test
*
* Use N_PROCESSES processes to allocate vram buf size larger than total vram size
@@ -567,7 +431,7 @@ TEST_F(KFDEvictTest, QueueTest) {
HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);
m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>()));
PM4Queue pm4Queue;
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
+1 -5
Просмотреть файл
@@ -27,22 +27,19 @@
#include <string>
#include <vector>
#include "KFDMultiProcessTest.hpp"
#include "IsaGenerator.hpp"
#include "PM4Queue.hpp"
// @class KFDEvictTest
// Test eviction and restore procedure using two processes
class KFDEvictTest : public KFDMultiProcessTest {
public:
KFDEvictTest(void): m_pIsaGen(NULL) {}
KFDEvictTest(void) {}
~KFDEvictTest(void) {}
protected:
virtual void SetUp();
virtual void TearDown();
std::string CreateShader();
void AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
std::vector<void *> &pBuffers);
void FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize);
@@ -52,7 +49,6 @@ class KFDEvictTest : public KFDMultiProcessTest {
PM4Queue *computeQueue);
protected: // Members
IsaGenerator* m_pIsaGen;
HsaMemFlags m_Flags;
void* m_pBuf;
};
+2 -7
Просмотреть файл
@@ -33,18 +33,12 @@ void KFDExceptionTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDExceptionTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
// WORKAROUND: This needs to be fixed in the kernel
@@ -75,7 +69,8 @@ void KFDExceptionTest::TestMemoryException(int defaultGPUNode, HSAuint64 pSrc,
eventDesc.SyncVar.SyncVar.UserData = NULL;
eventDesc.SyncVar.SyncVarSize = 0;
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
m_ChildStatus = queue.Create(defaultGPUNode);
if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
WARN() << "Queue create failed" << std::endl;
+1 -4
Просмотреть файл
@@ -26,12 +26,11 @@
#include <gtest/gtest.h>
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class KFDExceptionTest : public KFDBaseComponentTest {
public:
KFDExceptionTest() :m_pIsaGen(NULL), m_ChildPid(-1) {
KFDExceptionTest() : m_ChildPid(-1) {
/* Because there could be early return before m_ChildPid is set
* by fork(), we should initialize m_ChildPid to a non-zero value
* to avoid possible exit of the main process.
@@ -59,8 +58,6 @@ class KFDExceptionTest : public KFDBaseComponentTest {
protected: // Members
pid_t m_ChildPid;
HSAKMT_STATUS m_ChildStatus;
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_EXCEPTION_TEST__H__
+3 -83
Просмотреть файл
@@ -26,91 +26,17 @@
#include "PM4Packet.hpp"
#include "Dispatch.hpp"
/* Shader to initialize the GWS counter.
 * The value passed to ds_gws_init is not hard-coded: the shader loads a
 * dword from the address in s[0:1] into s16, moves it to v0 and issues
 * ds_gws_init with v0.
 * NOTE(review): the earlier wording said "to 1"; the actual value depends on
 * what the caller stores at s[0:1] - confirm against the dispatching test.
 */
const char* gfx9_10_GwsInit =
"\
shader GwsInit\n\
type(CS)\n\
wave_size(32)\n\
s_mov_b32 m0, 0\n\
s_nop 0\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt 0\n\
v_mov_b32 v0, s16\n\
s_waitcnt 0\n\
ds_gws_init v0 gds:1 offset0:0\n\
s_waitcnt 0\n\
s_endpgm\n\
end\n\
";
/* Atomically increase a value in memory
* This is expected to be executed from
* multiple work groups simultaneously.
* GWS semaphore is used to guarantee
* the operation is atomic.
*/
const char* gfx9_AtomicIncrease =
"\
shader AtomicIncrease\n\
type(CS)\n\
/* Assume src address in s0, s1 */\n\
s_mov_b32 m0, 0\n\
s_nop 0\n\
ds_gws_sema_p gds:1 offset0:0\n\
s_waitcnt 0\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt 0\n\
s_add_u32 s16, s16, 1\n\
s_store_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt lgkmcnt(0)\n\
ds_gws_sema_v gds:1 offset0:0\n\
s_waitcnt 0\n\
s_endpgm\n\
end\n\
";
const char* gfx10_AtomicIncrease =
"\
shader AtomicIncrease\n\
asic(GFX10)\n\
type(CS)\n\
wave_size(32)\n\
/* Assume src address in s0, s1 */\n\
s_mov_b32 m0, 0\n\
s_mov_b32 exec_lo, 0x1\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
ds_gws_sema_p gds:1 offset0:0\n\
s_waitcnt 0\n\
flat_load_dword v2, v[0:1] glc:1 dlc:1\n\
s_waitcnt 0\n\
v_add_nc_u32 v2, v2, 1\n\
flat_store_dword v[0:1], v2\n\
s_waitcnt_vscnt null, 0\n\
ds_gws_sema_v gds:1 offset0:0\n\
s_waitcnt 0\n\
s_endpgm\n\
end\n\
";
void KFDGWSTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDGWSTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -160,21 +86,15 @@ TEST_F(KFDGWSTest, Semaphore) {
pNodeProperties->NumGws,&firstGWS));
EXPECT_EQ(0, firstGWS);
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
m_pIsaGen->CompileShader(gfx9_10_GwsInit, "GwsInit", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa, isaBuffer.As<char*>()));
Dispatch dispatch0(isaBuffer);
buffer.Fill(numResources, 0, 4);
dispatch0.SetArgs(buffer.As<void*>(), NULL);
dispatch0.Submit(queue);
dispatch0.Sync();
const char *pAtomicIncrease;
if (m_FamilyId <= FAMILY_AL)
pAtomicIncrease = gfx9_AtomicIncrease;
else
pAtomicIncrease = gfx10_AtomicIncrease;
m_pIsaGen->CompileShader(pAtomicIncrease, "AtomicIncrease", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsAtomicIncreaseIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer.As<void*>(), NULL);
+1 -5
Просмотреть файл
@@ -26,20 +26,16 @@
#include <gtest/gtest.h>
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class KFDGWSTest : public KFDBaseComponentTest {
public:
KFDGWSTest() :m_pIsaGen(NULL) {}
KFDGWSTest() {}
~KFDGWSTest() {}
protected:
virtual void SetUp();
virtual void TearDown();
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_GWS_TEST__H__
+2 -1
Просмотреть файл
@@ -101,7 +101,8 @@ TEST_F(KFDGraphicsInterop, RegisterGraphicsHandle) {
// Copy contents to a system memory buffer for comparison
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/);
+3 -7
Просмотреть файл
@@ -28,18 +28,12 @@ void KFDHWSTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDHWSTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -70,7 +64,9 @@ void KFDHWSTest::RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops)
// Run work on all queues
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetNoopIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>()));
for (l = 0; l < nLoops; l++) {
for (q = 0; q < nQueues; q++) {
if (dispatch[q])
+1 -6
Просмотреть файл
@@ -27,14 +27,12 @@
#include <gtest/gtest.h>
#include "PM4Queue.hpp"
#include "IsaGenerator.hpp"
#include "KFDMultiProcessTest.hpp"
#include "Dispatch.hpp"
class KFDHWSTest : public KFDMultiProcessTest {
public:
KFDHWSTest():m_pIsaGen(NULL) {}
KFDHWSTest() {}
~KFDHWSTest() {}
protected:
@@ -42,9 +40,6 @@ class KFDHWSTest : public KFDMultiProcessTest {
virtual void TearDown();
void RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops);
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_QCM_TEST__H__
-1
Просмотреть файл
@@ -23,7 +23,6 @@
#include "KFDBaseComponentTest.hpp"
#include "BaseQueue.hpp"
#include "IsaGenerator.hpp"
#ifndef __KFD_MEMORY_TEST__H__
#define __KFD_MEMORY_TEST__H__
+4 -9
Просмотреть файл
@@ -33,18 +33,12 @@ void KFDLocalMemoryTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDLocalMemoryTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -107,7 +101,7 @@ TEST_F(KFDLocalMemoryTest, BasicTest) {
srcSysBuffer.Fill(0x01010101);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(srcLocalBuffer.As<void*>(), srcLocalBuffer.Size(), &AlternateVAGPU,
mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)));
@@ -164,7 +158,7 @@ TEST_F(KFDLocalMemoryTest, VerifyContentsAfterUnmapAndMap) {
SysBufferA.Fill(0x01010101);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.SetSkipWaitConsump(0);
@@ -303,7 +297,8 @@ TEST_F(KFDLocalMemoryTest, Fragmentation) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
/* Allocate and test memory using the strategy explained at the top */
HSAKMT_STATUS status;
+1 -5
Просмотреть файл
@@ -26,20 +26,16 @@
#include <gtest/gtest.h>
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class KFDLocalMemoryTest : public KFDBaseComponentTest {
public:
KFDLocalMemoryTest() :m_pIsaGen(NULL) {}
KFDLocalMemoryTest() {}
~KFDLocalMemoryTest() {}
protected:
virtual void SetUp();
virtual void TearDown();
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_LOCALMEMORY_TEST__H__
+47 -403
Просмотреть файл
@@ -39,360 +39,17 @@
#include "SDMAPacket.hpp"
#include "linux/kfd_ioctl.h"
const char* gfx8_ScratchCopyDword =
"\
shader ScratchCopyDword\n\
asic(VI)\n\
type(CS)\n\
/*copy the parameters from scalar registers to vector registers*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v2, s2\n\
v_mov_b32 v3, s3\n\
/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n\
s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n\
/*copy a dword between the passed addresses*/\n\
flat_load_dword v4, v[0:1] slc\n\
s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
flat_store_dword v[2:3], v4 slc\n\
\n\
s_endpgm\n\
\n\
end\n\
";
const char* gfx9_ScratchCopyDword =
"\
shader ScratchCopyDword\n\
asic(GFX9)\n\
type(CS)\n\
/*copy the parameters from scalar registers to vector registers*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v2, s2\n\
v_mov_b32 v3, s3\n\
/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
s_mov_b32 flat_scratch_lo, s4\n\
s_mov_b32 flat_scratch_hi, s5\n\
/*copy a dword between the passed addresses*/\n\
flat_load_dword v4, v[0:1] slc\n\
s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
flat_store_dword v[2:3], v4 slc\n\
\n\
s_endpgm\n\
\n\
end\n\
";
const char* gfx10_ScratchCopyDword =
"\
shader ScratchCopyDword\n\
asic(GFX10)\n\
type(CS)\n\
wave_size(32)\n\
/*copy the parameters from scalar registers to vector registers*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v2, s2\n\
v_mov_b32 v3, s3\n\
/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s4\n\
s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s5\n\
/*copy a dword between the passed addresses*/\n\
flat_load_dword v4, v[0:1] slc\n\
s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
flat_store_dword v[2:3], v4 slc\n\
\n\
s_endpgm\n\
\n\
end\n\
";
const char* aldbrn_ScratchCopyDword =
"\
shader ScratchCopyDword\n\
asic(ALDEBARAN)\n\
type(CS)\n\
/*copy the parameters from scalar registers to vector registers*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v2, s2\n\
v_mov_b32 v3, s3\n\
/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
s_mov_b32 flat_scratch_lo, s4\n\
s_mov_b32 flat_scratch_hi, s5\n\
/*copy a dword between the passed addresses*/\n\
flat_load_dword v4, v[0:1] slc\n\
s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
flat_store_dword v[2:3], v4 slc\n\
\n\
s_endpgm\n\
\n\
end\n\
";
/* Continuously poll src buffer and check buffer value
* After src buffer is filled with specific value (0x5678,
* by host program), fill dst buffer with specific
* value(0x5678) and quit
*/
const char* gfx9_PollMemory =
"\
shader ReadMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
s_movk_i32 s18, 0x5678\n\
LOOP:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_cmp_eq_i32 s16, s18\n\
s_cbranch_scc0 LOOP\n\
s_store_dword s18, s[2:3], 0x0 glc\n\
s_endpgm\n\
end\n\
";
/* Similar to gfx9_PollMemory except that the buffer
* polled can be non-coherent memory. SCC system-level
* cache coherence is not supported in scalar (smem) path.
* Use vmem operations with scc
*/
const char* gfx9_PollNCMemory =
"\
shader ReadMemory\n\
asic(ALDEBARAN)\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
v_mov_b32 v6, 0x5678\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
LOOP:\n\
flat_load_dword v4, v[0:1] scc\n\
v_cmp_eq_u32 vcc, v4, v6\n\
s_cbranch_vccz LOOP\n\
v_mov_b32 v0, s2\n\
v_mov_b32 v1, s3\n\
flat_store_dword v[0:1], v6 scc\n\
s_endpgm\n\
end\n\
";
const char* gfx10_PollMemory =
"\
shader ReadMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
s_movk_i32 s18, 0x5678\n\
v_mov_b32 v0, s2\n\
v_mov_b32 v1, s3\n\
v_mov_b32 v2, 0x5678\n\
LOOP:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_cmp_eq_i32 s16, s18\n\
s_cbranch_scc0 LOOP\n\
flat_store_dword v[0,1], v2 slc\n\
s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
/* Input: A buffer of at least 3 dwords.
* DW0: used as a signal. 0xcafe means it is signaled
* DW1: Input buffer for device to read.
* DW2: Output buffer for device to write.
* Once the signal is received, the device copies DW1 to DW2.
* This shader continuously polls the signal buffer;
* once the signal buffer is signaled, it copies the input buffer
* to the output buffer.
*/
const char* gfx9_CopyOnSignal =
"\
shader CopyOnSignal\n\
wave_size(32)\n\
type(CS)\n\
/* Assume input buffer in s0, s1 */\n\
s_mov_b32 s18, 0xcafe\n\
POLLSIGNAL:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_cmp_eq_i32 s16, s18\n\
s_cbranch_scc0 POLLSIGNAL\n\
s_load_dword s17, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s17, s[0:1], 0x8 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
const char* gfx10_CopyOnSignal =
"\
shader CopyOnSignal\n\
wave_size(32)\n\
type(CS)\n\
/* Assume input buffer in s0, s1 */\n\
s_add_u32 s2, s0, 0x8\n\
s_addc_u32 s3, s1, 0x0\n\
s_mov_b32 s18, 0xcafe\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v4, s2\n\
v_mov_b32 v5, s3\n\
POLLSIGNAL:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_cmp_eq_i32 s16, s18\n\
s_cbranch_scc0 POLLSIGNAL\n\
s_load_dword s17, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
v_mov_b32 v2, s17\n\
flat_store_dword v[4,5], v2 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
/* Input0: A buffer of at least 2 dwords.
* DW0: used as a signal. Write 0xcafe to signal
* DW1: Write to this buffer for other device to read.
* Input1: mmio base address
*/
const char* gfx9_WriteAndSignal =
"\
shader WriteAndSignal\n\
wave_size(32)\n\
type(CS)\n\
/* Assume input buffer in s0, s1 */\n\
s_mov_b32 s18, 0xbeef\n\
s_store_dword s18, s[0:1], 0x4 glc\n\
s_mov_b32 s18, 0x1\n\
s_store_dword s18, s[2:3], 0 glc\n\
s_mov_b32 s18, 0xcafe\n\
s_store_dword s18, s[0:1], 0x0 glc\n\
s_endpgm\n\
end\n\
";
/* Continuously poll the flag at src buffer
* After the flag at s[0:1] is set to 1,
* copy the value from s[0:1]+4 to dst buffer
*/
const char* gfx9_PollAndCopy =
"\
shader CopyMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
s_movk_i32 s18, 0x1\n\
LOOP:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_cmp_eq_i32 s16, s18\n\
s_cbranch_scc0 LOOP\n\
s_load_dword s17, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s17, s[2:3], 0x0 glc:1\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
const char* gfx9aldbrn_PollAndCopy =
"\
shader CopyMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v18, 0x1\n\
LOOP:\n\
flat_load_dword v16, v[0:1] glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
v_cmp_eq_i32 vcc, v16, v18\n\
s_cbranch_vccz LOOP\n\
buffer_invl2\n\
s_load_dword s17, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s17, s[2:3], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
buffer_wbl2\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_endpgm\n\
end\n\
";
/* Input0: A buffer of at least 2 dwords.
* DW0: used as a signal. Write 0x1 to signal
* DW1: Write the value from 2nd input buffer
* for other device to read.
* Input1: A buffer of at least 2 dwords.
* DW0: used as the value to be written.
*/
const char* gfx9aldbrn_WriteFlagAndValue =
"\
shader WriteMemory\n\
wave_size(32)\n\
type(CS)\n\
/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
s_load_dword s18, s[2:3], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
s_store_dword s18, s[0:1], 0x4 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
buffer_wbl2\n\
s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
v_mov_b32 v16, 0x1\n\
flat_store_dword v[0:1], v16 glc\n\
s_endpgm\n\
end\n\
";
const char* gfx10_WriteAndSignal =
"\
shader WriteAndSignal\n\
wave_size(32)\n\
type(CS)\n\
/* Assume input buffer in s0, s1 */\n\
s_add_u32 s4, s0, 0x4\n\
s_addc_u32 s5, s1, 0x0\n\
v_mov_b32 v0, s0\n\
v_mov_b32 v1, s1\n\
v_mov_b32 v2, s2\n\
v_mov_b32 v3, s3\n\
v_mov_b32 v4, s4\n\
v_mov_b32 v5, s5\n\
v_mov_b32 v18, 0xbeef\n\
flat_store_dword v[4:5], v18 glc\n\
v_mov_b32 v18, 0x1\n\
flat_store_dword v[2:3], v18 glc\n\
v_mov_b32 v18, 0xcafe\n\
flat_store_dword v[0:1], v18 glc\n\
s_endpgm\n\
end\n\
";
// These gfx9_PollMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
void KFDMemoryTest::SetUp() {
ROUTINE_START
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDMemoryTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -508,16 +165,13 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) {
HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);
const char *pReadMemory;
if (m_FamilyId < FAMILY_NV)
pReadMemory = gfx9_PollMemory;
else
pReadMemory = gfx10_PollMemory;
if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
/* On A+A system memory is mapped as NC */
m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer);
pReadMemory = PollNCMemoryIsa;
else
m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer);
pReadMemory = PollMemoryIsa;
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pReadMemory, isaBuffer.As<char*>()));
PM4Queue pm4Queue;
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -674,7 +328,8 @@ TEST_F(KFDMemoryTest, MemoryRegister) {
ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
/* First submit just so the queues are not empty, and to get the
* TLB populated (in case we need to flush TLBs somewhere after
@@ -855,16 +510,7 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) {
// Initialize the srcBuffer to some fixed value
srcMemBuffer.Fill(0x01010101);
const char *pScratchCopyDword;
if (m_FamilyId < FAMILY_AI)
pScratchCopyDword = gfx8_ScratchCopyDword;
else if (m_FamilyId < FAMILY_AL)
pScratchCopyDword = gfx9_ScratchCopyDword;
else if (m_FamilyId == FAMILY_AL)
pScratchCopyDword = aldbrn_ScratchCopyDword;
else
pScratchCopyDword = gfx10_ScratchCopyDword;
m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>()));
const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
@@ -1728,17 +1374,8 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) {
// dstBuffer is cpu accessible gtt memory
HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);
const char *pScratchCopyDword;
if (m_FamilyId < FAMILY_AI)
pScratchCopyDword = gfx8_ScratchCopyDword;
else if (m_FamilyId < FAMILY_AL)
pScratchCopyDword = gfx9_ScratchCopyDword;
else if (m_FamilyId == FAMILY_AL)
pScratchCopyDword = aldbrn_ScratchCopyDword;
else
pScratchCopyDword = gfx10_ScratchCopyDword;
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>()));
m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
Dispatch dispatch0(isaBuffer);
dispatch0.SetArgs(mem0, dstBuffer.As<void*>());
dispatch0.Submit(queue);
@@ -2109,12 +1746,9 @@ TEST_F(KFDMemoryTest, HostHdpFlush) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
const char *pCopyOnSignal;
if (m_FamilyId < FAMILY_NV)
pCopyOnSignal = gfx9_CopyOnSignal;
else
pCopyOnSignal = gfx10_CopyOnSignal;
m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>()));
Dispatch dispatch0(isaBuffer);
dispatch0.SetArgs(buffer, NULL);
dispatch0.Submit(queue);
@@ -2234,12 +1868,9 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(nodes[0]));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/);
const char *pCopyOnSignal;
if (m_FamilyId < FAMILY_NV)
pCopyOnSignal = gfx9_CopyOnSignal;
else
pCopyOnSignal = gfx10_CopyOnSignal;
m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer, NULL);
dispatch.Submit(queue);
@@ -2247,12 +1878,9 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
PM4Queue queue0;
ASSERT_SUCCESS(queue0.Create(nodes[1]));
HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/);
const char *pWriteAndSignal;
if (m_FamilyId < FAMILY_NV)
pWriteAndSignal = gfx9_WriteAndSignal;
else
pWriteAndSignal = gfx10_WriteAndSignal;
m_pIsaGen->CompileShader(pWriteAndSignal, "WriteAndSignal", isaBuffer0);
// Assemble into isaBuffer0 (allocated on nodes[1]): dispatch0 executes from it,
// and isaBuffer already holds the CopyOnSignal shader used by the first dispatch.
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteAndSignalIsa, isaBuffer0.As<char*>()));
Dispatch dispatch0(isaBuffer0);
dispatch0.SetArgs(buffer, mmioBase);
dispatch0.Submit(queue0);
@@ -2304,7 +1932,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnSdmaWrite) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation);
dispatch.Submit(queue);
@@ -2357,7 +1987,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnCPUWrite) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer, buffer+100);
dispatch.Submit(queue);
@@ -2419,7 +2051,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation);
dispatch.Submit(queue);
@@ -2434,7 +2068,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
ASSERT_SUCCESS(queue1.Create(nondefaultNode));
buffer.Fill(0x5678, sdmaQueue, dwLocation1*sizeof(int), 4);
HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetCopyDwordIsa(isaBuffer1);
// Assemble into isaBuffer1 (allocated on nondefaultNode): dispatch1 executes from it,
// and isaBuffer still holds the polling shader submitted on the default node.
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer1.As<char*>()));
Dispatch dispatch1(isaBuffer1);
dispatch1.SetArgs(buffer.As<int*>()+dwLocation1, buffer.As<int*>());
dispatch1.Submit(queue1);
@@ -2500,7 +2136,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
dispatch.Submit(queue);
@@ -2515,7 +2153,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
PM4Queue queue1;
ASSERT_SUCCESS(queue1.Create(nondefaultNode));
HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
// Assemble into isaBuffer1 (allocated on nondefaultNode): dispatch1 executes from it,
// and isaBuffer still holds the PollAndCopy shader submitted on the default node.
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa, isaBuffer1.As<char*>()));
Dispatch dispatch1(isaBuffer1);
dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
dispatch1.Submit(queue1);
@@ -2569,7 +2209,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) {
PM4Queue queue;
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(buffer, buffer+dwLocation);
dispatch.Submit(queue);
@@ -2608,12 +2250,17 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
return;
}
unsigned int *fineBuffer = NULL;
unsigned int tmp;
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
const int dwLocation = 0x80;
if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
LOG() << "Skipping test: XGMI link to CPU is required." << std::endl;
return;
}
unsigned int *fineBuffer = NULL;
unsigned int tmp;
ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
reinterpret_cast<void**>(&fineBuffer)));
ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL));
@@ -2627,10 +2274,7 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
else
m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
+1 -4
Просмотреть файл
@@ -22,7 +22,6 @@
*/
#include "KFDBaseComponentTest.hpp"
#include "IsaGenerator.hpp"
#ifndef __KFD_MEMORY_TEST__H__
#define __KFD_MEMORY_TEST__H__
@@ -33,15 +32,13 @@
*/
class KFDMemoryTest : public KFDBaseComponentTest {
public:
KFDMemoryTest(void) :m_pIsaGen(NULL) {}
KFDMemoryTest(void) {}
~KFDMemoryTest(void) {}
protected:
virtual void SetUp();
virtual void TearDown();
protected:
IsaGenerator* m_pIsaGen;
void BinarySearchLargestBuffer(int allocNode, const HsaMemFlags &memFlags,
HSAuint64 highMB, int nodeToMap,
HSAuint64 *lastSizeMB);
+11 -115
Просмотреть файл
@@ -39,18 +39,12 @@ void KFDQMTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void KFDQMTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
ROUTINE_END
@@ -677,111 +671,12 @@ TEST_F(KFDQMTest, OverSubscribeCpQueues) {
TEST_END
}
/* A simple isa loop program with dense mathematic operations
* s1 controls the number of iterations of the loop
* This shader can be used by GFX8, GFX9 and GFX10
*/
static const char *loop_isa = \
"\
shader loop_isa\n\
wave_size(32)\n\
type(CS)\n\
s_movk_i32 s0, 0x0008\n\
s_movk_i32 s1, 0x00ff\n\
v_mov_b32 v0, 0\n\
v_mov_b32 v1, 0\n\
v_mov_b32 v2, 0\n\
v_mov_b32 v3, 0\n\
v_mov_b32 v4, 0\n\
v_mov_b32 v5, 0\n\
v_mov_b32 v6, 0\n\
v_mov_b32 v7, 0\n\
v_mov_b32 v8, 0\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0\n\
v_mov_b32 v11, 0\n\
v_mov_b32 v12, 0\n\
v_mov_b32 v13, 0\n\
v_mov_b32 v14, 0\n\
v_mov_b32 v15, 0\n\
v_mov_b32 v16, 0\n\
LOOP:\n\
s_mov_b32 s8, s4\n\
s_mov_b32 s9, s1\n\
s_mov_b32 s10, s6\n\
s_mov_b32 s11, s7\n\
s_cmp_le_i32 s1, s0\n\
s_cbranch_scc1 END_OF_PGM\n\
s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10\n\
v_add_f32 v0, 2.0, v0\n\
v_cvt_f32_i32 v17, s1\n\
s_waitcnt lgkmcnt(0)\n\
v_add_f32 v18, s8, v17\n\
v_add_f32 v19, s9, v17\n\
v_add_f32 v20, s10, v17\n\
v_add_f32 v21, s11, v17\n\
v_add_f32 v22, s12, v17\n\
v_add_f32 v23, s13, v17\n\
v_add_f32 v24, s14, v17\n\
v_add_f32 v17, s15, v17\n\
v_log_f32 v25, v18\n\
v_mul_f32 v25, v22, v25\n\
v_exp_f32 v25, v25\n\
v_log_f32 v26, v19\n\
v_mul_f32 v26, v23, v26\n\
v_exp_f32 v26, v26\n\
v_log_f32 v27, v20\n\
v_mul_f32 v27, v24, v27\n\
v_exp_f32 v27, v27\n\
v_log_f32 v28, v21\n\
v_mul_f32 v28, v17, v28\n\
v_exp_f32 v28, v28\n\
v_add_f32 v5, v5, v25\n\
v_add_f32 v6, v6, v26\n\
v_add_f32 v7, v7, v27\n\
v_add_f32 v8, v8, v28\n\
v_mul_f32 v18, 0x3fb8aa3b, v18\n\
v_exp_f32 v18, v18\n\
v_mul_f32 v19, 0x3fb8aa3b, v19\n\
v_exp_f32 v19, v19\n\
v_mul_f32 v20, 0x3fb8aa3b, v20\n\
v_exp_f32 v20, v20\n\
v_mul_f32 v21, 0x3fb8aa3b, v21\n\
v_exp_f32 v21, v21\n\
v_add_f32 v9, v9, v18\n\
v_add_f32 v10, v10, v19\n\
v_add_f32 v11, v11, v20\n\
v_add_f32 v12, v12, v21\n\
v_sqrt_f32 v18, v22\n\
v_sqrt_f32 v19, v23\n\
v_sqrt_f32 v20, v24\n\
v_sqrt_f32 v21, v17\n\
v_add_f32 v13, v13, v18\n\
v_add_f32 v14, v14, v19\n\
v_add_f32 v15, v15, v20\n\
v_add_f32 v16, v16, v21\n\
v_rsq_f32 v18, v22\n\
v_rsq_f32 v19, v23\n\
v_rsq_f32 v20, v24\n\
v_rsq_f32 v17, v17\n\
v_add_f32 v1, v1, v18\n\
v_add_f32 v2, v2, v19\n\
v_add_f32 v3, v3, v20\n\
v_add_f32 v4, v4, v17\n\
s_add_u32 s0, s0, 1\n\
s_branch LOOP\n\
END_OF_PGM:\n\
s_endpgm\n\
end\n\
";
HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) {
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false);
HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false);
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
EXPECT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));
Dispatch dispatch(isaBuffer);
dispatch.SetDim(1024, 16, 16);
@@ -838,7 +733,6 @@ TEST_F(KFDQMTest, BasicCuMaskingLinear) {
TEST_START(TESTPROFILE_RUNALL);
int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
if (m_FamilyId >= FAMILY_VI) {
const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
@@ -982,7 +876,7 @@ TEST_F(KFDQMTest, QueuePriorityOnDifferentPipe) {
HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));
Dispatch dispatch[2] = {
Dispatch(isaBuffer, true),
@@ -1047,7 +941,7 @@ TEST_F(KFDQMTest, QueuePriorityOnSamePipe) {
HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));
Dispatch dispatch[2] = {
Dispatch(isaBuffer, true),
@@ -1140,7 +1034,7 @@ TEST_F(KFDQMTest, EmptyDispatch) {
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetNoopIsa(isaBuffer);
// EmptyDispatch runs the no-op shader, matching the GetNoopIsa call this line supersedes;
// LoopIsa is the compute-heavy loop shader used by the CU-masking and priority tests.
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>()));
SyncDispatch(isaBuffer, NULL, NULL);
@@ -1159,7 +1053,7 @@ TEST_F(KFDQMTest, SimpleWriteDispatch) {
srcBuffer.Fill(0x01010101);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>());
@@ -1194,7 +1088,7 @@ TEST_F(KFDQMTest, MultipleCpQueuesStressDispatch) {
destBuffer.Fill(0xFF);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
for (i = 0; i < MAX_CP_QUEUES; ++i)
ASSERT_SUCCESS(queues[i].Create(defaultGPUNode)) << " QueueId=" << i;
@@ -1533,7 +1427,7 @@ TEST_F(KFDQMTest, Atomics) {
PM4Queue queue;
m_pIsaGen->GetAtomicIncIsa(isaBuf);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(AtomicIncIsa, isaBuf.As<char*>()));
Dispatch dispatch(isaBuf);
dispatch.SetArgs(destBuf.As<void*>(), NULL);
@@ -1598,10 +1492,12 @@ TEST_F(KFDQMTest, mGPUShareBO) {
srcNodeMem.Fill(0x05050505);
m_pIsaGen->GetCopyDwordIsa(isaBufferSrc);
ASSERT_SUCCESS(m_pAsm->RunAssemble(CopyDwordIsa));
m_pAsm->CopyInstrStream(isaBufferSrc.As<char*>());
SyncDispatch(isaBufferSrc, srcNodeMem.As<void*>(), shared_addr.As<void *>(), src_node);
m_pIsaGen->GetCopyDwordIsa(isaBufferDst);
m_pAsm->CopyInstrStream(isaBufferDst.As<char*>());
SyncDispatch(isaBufferDst, shared_addr.As<void *>(), dstNodeMem.As<void*>(), dst_node);
EXPECT_EQ(dstNodeMem.As<unsigned int*>()[0], 0x05050505);
+1 -3
Просмотреть файл
@@ -27,13 +27,12 @@
#include <gtest/gtest.h>
#include "PM4Queue.hpp"
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
#include "Dispatch.hpp"
class KFDQMTest : public KFDBaseComponentTest {
public:
KFDQMTest():m_pIsaGen(NULL) {}
KFDQMTest() {}
~KFDQMTest() {}
@@ -49,7 +48,6 @@ class KFDQMTest : public KFDBaseComponentTest {
const double CuVariance = 0.15;
const double CuNegVariance = 1.0 - CuVariance;
const double CuPosVariance = 1.0 + CuVariance;
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_QCM_TEST__H__
+1 -126
Просмотреть файл
@@ -234,131 +234,6 @@ TEST_F(KFDSVMEvictTest, BasicTest) {
TEST_END
}
/* Shader to read local buffers using multiple wavefronts in parallel
* until address buffer is filled with specific value 0x5678 by host program,
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
*
* initial state:
* s[0:1] - address buffer base address
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
* registers:
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v[6:7] - local buf address used for read test
*/
static const char* gfx9_ReadMemory =
"\
shader ReadMemory\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_add_u32 v5, vcc_lo, v5\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_add_u32 v3, vcc_lo, v3\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_u32 v9, v9, v10 \n\
v_add_co_u32 v12, vcc, v12, v10\n\
v_add_u32 v13, vcc_lo, v13\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
static const char* gfx8_ReadMemory =
"\
shader ReadMemory\n\
asic(VI)\n\
type(CS)\n\
\n\
// compute address of corresponding output buffer\n\
v_mov_b32 v0, s4 // use workgroup id as index\n\
v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\
v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\
v_mov_b32 v5, s3\n\
v_addc_u32 v5, vcc, v5, 0, vcc\n\
\n\
// compute input buffer offset used to store corresponding local buffer address\n\
v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\
v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\
v_mov_b32 v3, s1\n\
v_addc_u32 v3, vcc, v3, 0, vcc\n\
\n\
// load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
flat_load_dwordx2 v[6:7], v[2:3] slc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
\n\
v_mov_b32 v8, 0x5678\n\
s_movk_i32 s8, 0x5678\n\
L_REPEAT:\n\
s_load_dword s16, s[0:1], 0x0 glc\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\
s_cmp_eq_i32 s16, s8\n\
s_cbranch_scc1 L_QUIT // if notified to quit by host\n\
// loop read 64M local buffer starting at v[6:7]\n\
// every 4k page only read once\n\
v_mov_b32 v9, 0\n\
v_mov_b32 v10, 0x1000 // 4k page\n\
v_mov_b32 v11, 0x4000000 // 64M size\n\
v_mov_b32 v12, v6\n\
v_mov_b32 v13, v7\n\
L_LOOP_READ:\n\
flat_load_dwordx2 v[14:15], v[12:13] slc\n\
v_add_u32 v9, vcc, v9, v10 \n\
v_add_u32 v12, vcc, v12, v10\n\
v_addc_u32 v13, vcc, v13, 0, vcc\n\
v_cmp_lt_u32 vcc, v9, v11\n\
s_cbranch_vccnz L_LOOP_READ\n\
s_branch L_REPEAT\n\
L_QUIT:\n\
flat_store_dword v[4:5], v8\n\
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\
s_endpgm\n\
end\n\
";
std::string KFDSVMEvictTest::CreateShader() {
if (m_FamilyId >= FAMILY_AI)
return gfx9_ReadMemory;
else
return gfx8_ReadMemory;
}
/* Evict and restore queue test
*
* N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored
@@ -434,7 +309,7 @@ TEST_F(KFDSVMEvictTest, QueueTest) {
for (i = 0; i < wavefront_num; i++)
*(localBufAddr + i) = pBuffers[i];
m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>()));
PM4Queue pm4Queue;
ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
-1
Просмотреть файл
@@ -28,7 +28,6 @@
#include <vector>
#include "KFDLocalMemoryTest.hpp"
#include "KFDBaseComponentTest.hpp"
#include "IsaGenerator.hpp"
// @class KFDEvictTest
// Test eviction and restore procedure using two processes
+9 -11
Просмотреть файл
@@ -34,8 +34,6 @@ void KFDSVMRangeTest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
SVMSetXNACKMode();
ROUTINE_END
@@ -44,10 +42,6 @@ void KFDSVMRangeTest::SetUp() {
void KFDSVMRangeTest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
SVMRestoreXNACKMode();
KFDBaseComponentTest::TearDown();
@@ -80,7 +74,7 @@ TEST_F(KFDSVMRangeTest, BasicSystemMemTest) {
srcSysBuffer.Fill(0x01010101);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.SetSkipWaitConsump(0);
@@ -364,7 +358,8 @@ TEST_F(KFDSVMRangeTest, EvictSystemRangeTest) {
ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
Dispatch dispatch0(isaBuffer);
dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
@@ -458,7 +453,8 @@ TEST_F(KFDSVMRangeTest, PartialUnmapSysMemTest) {
munmap(pBuf2, Buf2Size);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
Dispatch dispatch(isaBuffer);
@@ -507,7 +503,7 @@ TEST_F(KFDSVMRangeTest, BasicVramTest) {
srcSysBuffer.Fill(0x01010101);
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
queue.SetSkipWaitConsump(0);
@@ -943,7 +939,9 @@ TEST_F(KFDSVMRangeTest, MigratePolicyTest) {
#ifdef USE_PM4_QUEUE_TRIGGER_VM_FAULT
HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
PM4Queue queue;
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) {
+1 -5
Просмотреть файл
@@ -26,21 +26,17 @@
#include <gtest/gtest.h>
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class KFDSVMRangeTest : public KFDBaseComponentTest {
public:
KFDSVMRangeTest() :m_pIsaGen(NULL) {}
KFDSVMRangeTest() {}
~KFDSVMRangeTest() {}
void SplitRangeTest(int defaultGPUNode, int prefetch_location);
protected:
virtual void SetUp();
virtual void TearDown();
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __KFD_LOCALMEMORY_TEST__H__
+6
Просмотреть файл
@@ -231,6 +231,12 @@ bool isTonga(const HsaNodeProperties *props) {
return false;
}
/* Pack the GFX engine version of a node into one integer.
 *
 * The result is laid out as (Major << 16) | (Minor << 8) | Stepping, with all
 * three fields read from props->EngineId.ui32.
 */
const uint32_t GetGfxVersion(const HsaNodeProperties *props) {
    const uint32_t majorField = props->EngineId.ui32.Major << 16;
    const uint32_t minorField = props->EngineId.ui32.Minor << 8;
    const uint32_t steppingField = props->EngineId.ui32.Stepping;

    return majorField | minorField | steppingField;
}
HSAuint64 GetSystemTickCountInMicroSec() {
struct timeval t;
gettimeofday(&t, 0);
+1
Просмотреть файл
@@ -52,6 +52,7 @@ bool is_dgpu();
bool isTonga(const HsaNodeProperties *props);
bool hasPciAtomicsSupport(int node);
unsigned int FamilyIdFromNode(const HsaNodeProperties *props);
const uint32_t GetGfxVersion(const HsaNodeProperties *props);
void GetHwQueueInfo(const HsaNodeProperties *props,
unsigned int *p_num_cp_queues,
+2 -6
Просмотреть файл
@@ -34,16 +34,11 @@ void RDMATest::SetUp() {
KFDBaseComponentTest::SetUp();
m_pIsaGen = IsaGenerator::Create(m_FamilyId);
ROUTINE_END
}
void RDMATest::TearDown() {
ROUTINE_START
if (m_pIsaGen)
delete m_pIsaGen;
m_pIsaGen = NULL;
KFDBaseComponentTest::TearDown();
@@ -77,7 +72,8 @@ TEST_F(RDMATest, GPUDirect) {
srcSysBuffer.Fill(0xfe);
/* Put 'copy dword' command to ISA buffer */
m_pIsaGen->GetCopyDwordIsa(isaBuffer);
ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
ASSERT_SUCCESS(queue.Create(defaultGPUNode));
Dispatch dispatch(isaBuffer);
+1 -5
Просмотреть файл
@@ -26,20 +26,16 @@
#include <gtest/gtest.h>
#include "IsaGenerator.hpp"
#include "KFDBaseComponentTest.hpp"
class RDMATest : public KFDBaseComponentTest {
public:
RDMATest():m_pIsaGen(NULL) {}
RDMATest() {}
~RDMATest() {}
protected:
virtual void SetUp();
virtual void TearDown();
protected: // Members
IsaGenerator* m_pIsaGen;
};
#endif // __RDMA_TEST__H__
+609
Просмотреть файл
@@ -0,0 +1,609 @@
/*
* Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "ShaderStore.hpp"
/**
 * KFDASMTest List
 *
 * Collection of the shader source strings defined further down in this
 * file. Presumably KFDASMTest walks this list to check that every shader
 * assembles for the target ASIC - confirm against the test itself.
 *
 * NOTE(review): PollNCMemoryIsa is defined in this file but is not listed
 * here. Its own comment describes it as Aldebaran-only and more or less
 * deprecated, which is presumably why it is left out - confirm before
 * adding it.
 */
const std::vector<const char*> ShaderList = {
NoopIsa,
CopyDwordIsa,
InfiniteLoopIsa,
AtomicIncIsa,
ScratchCopyDwordIsa,
PollMemoryIsa,
CopyOnSignalIsa,
PollAndCopyIsa,
WriteFlagAndValueIsa,
WriteAndSignalIsa,
LoopIsa,
IterateIsa,
ReadMemoryIsa,
GwsInitIsa,
GwsAtomicIncreaseIsa,
};
/**
* Macros
*/
/* Assembler prologue shared by shaders that need portable carry arithmetic.
 *
 * The text expands to a ".text" directive followed by three assembler
 * macros. Each macro picks the instruction spelling from the value of
 * .amdgcn.gfx_generation_number:
 *
 *   V_ADD_CO_U32 vdst, src0, vsrc1     - add with carry-out
 *       >= 10: v_add_co_u32 with vcc_lo
 *       >= 9 : v_add_co_u32 with vcc
 *       else : v_add_u32    with vcc
 *   V_ADD_CO_CI_U32 vdst, src0, vsrc1  - add with carry-in and carry-out
 *       >= 10: v_add_co_ci_u32 with vcc_lo
 *       >= 9 : v_addc_co_u32   with vcc
 *       else : v_addc_u32      with vcc
 *   V_CMP_LT_U32 src0, vsrc1           - unsigned less-than compare
 *       >= 10: result in vcc_lo
 *       else : result in vcc
 *
 * Usage: place SHADER_MACROS directly in front of a shader's raw string
 * literal so the two are concatenated. Because the prologue already emits
 * ".text", such shaders do not repeat it.
 *
 * The backslashes are doubled because this is an ordinary (non-raw) string
 * literal; the assembler sees \vdst, \src0 and \vsrc1, i.e. references to
 * the macro arguments.
 */
#define SHADER_MACROS \
" .text\n"\
" .macro V_ADD_CO_U32 vdst, src0, vsrc1\n"\
" .if (.amdgcn.gfx_generation_number >= 10)\n"\
" v_add_co_u32 \\vdst, vcc_lo, \\src0, \\vsrc1\n"\
" .elseif (.amdgcn.gfx_generation_number >= 9)\n"\
" v_add_co_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\
" .else\n"\
" v_add_u32 \\vdst, vcc, \\src0, \\vsrc1\n"\
" .endif\n"\
" .endm\n"\
" .macro V_ADD_CO_CI_U32 vdst, src0, vsrc1\n"\
" .if (.amdgcn.gfx_generation_number >= 10)\n"\
" v_add_co_ci_u32 \\vdst, vcc_lo, \\src0, \\vsrc1, vcc_lo\n"\
" .elseif (.amdgcn.gfx_generation_number >= 9)\n"\
" v_addc_co_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\
" .else\n"\
" v_addc_u32 \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\
" .endif\n"\
" .endm\n"\
" .macro V_CMP_LT_U32 src0, vsrc1\n"\
" .if (.amdgcn.gfx_generation_number >= 10)\n"\
" v_cmp_lt_u32 vcc_lo, \\src0, \\vsrc1\n"\
" .else\n"\
" v_cmp_lt_u32 vcc, \\src0, \\vsrc1\n"\
" .endif\n"\
" .endm\n"
/**
* Common
*/
/* Shader that does nothing: it consists of a single s_endpgm. */
const char *NoopIsa = R"(
.text
s_endpgm
)";
/* Copy one dword from a source address to a destination address.
 *
 * Input:
 *   s[0:1] - source address
 *   s[2:3] - destination address
 *
 * The addresses are moved into v[0:1] and v[2:3]; the dword is loaded into
 * v4 and, after s_waitcnt 0, stored to the destination. Both memory
 * accesses carry the glc and slc modifiers.
 */
const char *CopyDwordIsa = R"(
.text
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
flat_load_dword v4, v[0:1] glc slc
s_waitcnt 0
flat_store_dword v[2:3], v4 glc slc
s_endpgm
)";
/* Shader that never terminates on its own: it branches back to LOOP
 * unconditionally, so the trailing s_endpgm is never reached.
 */
const char *InfiniteLoopIsa = R"(
.text
LOOP:
s_branch LOOP
s_endpgm
)";
/* Perform one atomic update on the dword addressed by s[0:1].
 *
 * Input:
 *   s[0:1] - address of the dword to update (copied into v[0:1])
 *
 * For gfx generation 8 and newer the shader issues flat_atomic_add with an
 * operand of 1; for older generations it issues flat_atomic_inc with an
 * operand of -1. The value returned in v3 is not used afterwards.
 */
const char *AtomicIncIsa = R"(
.text
v_mov_b32 v0, s0
v_mov_b32 v1, s1
.if (.amdgcn.gfx_generation_number >= 8)
v_mov_b32 v2, 1
flat_atomic_add v3, v[0:1], v2 glc slc
.else
v_mov_b32 v2, -1
flat_atomic_inc v3, v[0:1], v2 glc slc
.endif
s_waitcnt 0
s_endpgm
)";
/**
* KFDMemoryTest
*/
/* Copy one dword between two addresses after programming flat scratch.
 *
 * Input:
 *   s[0:1] - source address
 *   s[2:3] - destination address
 *   s[4:5] - flat scratch lo/hi values (read on gfx9 and gfx10+ only)
 *
 * Flat scratch setup differs per generation:
 *   gfx10+ : written through s_setreg_b32 to HW_REG_FLAT_SCR_LO/HI
 *   gfx9   : moved into flat_scratch_lo / flat_scratch_hi
 *   older  : flat_scratch_lo = 8, flat_scratch_hi = 0 (constants)
 *
 * The copy itself goes through v[0:1] -> v4 -> v[2:3] using flat
 * load/store with the slc modifier.
 */
const char *ScratchCopyDwordIsa = R"(
.text
// Copy the parameters from scalar registers to vector registers
.if (.amdgcn.gfx_generation_number >= 9)
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
.else
v_mov_b32_e32 v0, s0
v_mov_b32_e32 v1, s1
v_mov_b32_e32 v2, s2
v_mov_b32_e32 v3, s3
.endif
// Setup the scratch parameters. This assumes a single 16-reg block
.if (.amdgcn.gfx_generation_number >= 10)
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
.elseif (.amdgcn.gfx_generation_number == 9)
s_mov_b32 flat_scratch_lo, s4
s_mov_b32 flat_scratch_hi, s5
.else
s_mov_b32 flat_scratch_lo, 8
s_mov_b32 flat_scratch_hi, 0
.endif
// Copy a dword between the passed addresses
flat_load_dword v4, v[0:1] slc
s_waitcnt vmcnt(0) & lgkmcnt(0)
flat_store_dword v[2:3], v4 slc
s_endpgm
)";
/* Continuously poll src buffer and check buffer value
* After src buffer is filled with specific value (0x5678,
* by host program), fill dst buffer with specific
* value(0x5678) and quit
*/
const char *PollMemoryIsa = R"(
.text
// Assume src address in s0, s1, and dst address in s2, s3
s_movk_i32 s18, 0x5678
.if (.amdgcn.gfx_generation_number >= 10)
v_mov_b32 v0, s2
v_mov_b32 v1, s3
v_mov_b32 v2, 0x5678
.endif
LOOP:
s_load_dword s16, s[0:1], 0x0 glc
s_cmp_eq_i32 s16, s18
s_cbranch_scc0 LOOP
.if (.amdgcn.gfx_generation_number >= 10)
flat_store_dword v[0:1], v2 slc
.else
s_store_dword s18, s[2:3], 0x0 glc
.endif
s_endpgm
)";
/* Similar to PollMemoryIsa except that the buffer
* polled can be non-coherent memory. SCC system-level
* cache coherence is not supported in the scalar (smem) path,
* so vmem operations with the scc modifier are used instead.
*
* Note: Only works on Aldebaran, and even then the scc modifier
* has been defeatured. This shader is more or less
* deprecated.
*/
const char *PollNCMemoryIsa = R"(
.text
// Assume src address in s0, s1, and dst address in s2, s3
v_mov_b32 v6, 0x5678
v_mov_b32 v0, s0
v_mov_b32 v1, s1
LOOP:
flat_load_dword v4, v[0:1] scc
v_cmp_eq_u32 vcc, v4, v6
s_cbranch_vccz LOOP
v_mov_b32 v0, s2
v_mov_b32 v1, s3
flat_store_dword v[0:1], v6 scc
s_endpgm
)";
/* Input: A buffer of at least 3 dwords.
* DW0: used as a signal. 0xcafe means it is signaled
* DW1: Input buffer for device to read.
* DW2: Output buffer for device to write.
* Once the signal is received, the device copies DW1 to DW2.
* This shader continuously polls the signal buffer;
* once the signal buffer is signaled, it copies the input buffer
* to the output buffer.
*/
const char *CopyOnSignalIsa = R"(
.text
// Assume input buffer in s0, s1
.if (.amdgcn.gfx_generation_number >= 10)
s_add_u32 s2, s0, 0x8
s_addc_u32 s3, s1, 0x0
s_mov_b32 s18, 0xcafe
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v4, s2
v_mov_b32 v5, s3
.else
s_mov_b32 s18, 0xcafe
.endif
POLLSIGNAL:
s_load_dword s16, s[0:1], 0x0 glc
s_cmp_eq_i32 s16, s18
s_cbranch_scc0 POLLSIGNAL
s_load_dword s17, s[0:1], 0x4 glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
.if (.amdgcn.gfx_generation_number >= 10)
v_mov_b32 v2, s17
flat_store_dword v[4:5], v2 glc
.else
s_store_dword s17, s[0:1], 0x8 glc
.endif
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_endpgm
)";
/* Continuously poll the flag at the start of the src buffer.
* Once the flag at s[0:1] has been set to 1,
* copy the value from s[0:1]+4 to the dst buffer.
*
* Note: Only works on GFX9 (only used in
* aldebaran tests)
*/
const char *PollAndCopyIsa = R"(
.text
// Assume src buffer in s[0:1] and dst buffer in s[2:3]
.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10)
// Path for Aldebaran
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v18, 0x1
LOOP_ALDBRN:
flat_load_dword v16, v[0:1] glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
v_cmp_eq_i32 vcc, v16, v18
s_cbranch_vccz LOOP_ALDBRN
buffer_invl2
s_load_dword s17, s[0:1], 0x4 glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_store_dword s17, s[2:3], 0x0 glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
buffer_wbl2
.elseif (.amdgcn.gfx_generation_number == 9)
s_movk_i32 s18, 0x1
LOOP:
s_load_dword s16, s[0:1], 0x0 glc
s_cmp_eq_i32 s16, s18
s_cbranch_scc0 LOOP
s_load_dword s17, s[0:1], 0x4 glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_store_dword s17, s[2:3], 0x0 glc
.endif
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_endpgm
)";
/* Input0: A buffer of at least 2 dwords.
* DW0: used as a signal. Write 0x1 to signal
* DW1: Write the value from 2nd input buffer
* for other device to read.
* Input1: A buffer of at least 2 dwords.
* DW0: used as the value to be written.
*
* Note: Only works on Aldebaran
*/
const char *WriteFlagAndValueIsa = R"(
.text
// Assume two inputs buffer in s[0:1] and s[2:3]
.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10)
v_mov_b32 v0, s0
v_mov_b32 v1, s1
s_load_dword s18, s[2:3], 0x0 glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_store_dword s18, s[0:1], 0x4 glc
s_waitcnt vmcnt(0) & lgkmcnt(0)
buffer_wbl2
s_waitcnt vmcnt(0) & lgkmcnt(0)
v_mov_b32 v16, 0x1
flat_store_dword v[0:1], v16 glc
.endif
s_endpgm
)";
/* Input0: A buffer of at least 2 dwords.
* DW0: used as a signal. Write 0xcafe to signal
* DW1: Write to this buffer for other device to read.
* Input1: mmio base address
*/
const char *WriteAndSignalIsa = R"(
.text
// Assume input buffer in s0, s1
.if (.amdgcn.gfx_generation_number >= 10)
s_add_u32 s4, s0, 0x4
s_addc_u32 s5, s1, 0x0
v_mov_b32 v0, s0
v_mov_b32 v1, s1
v_mov_b32 v2, s2
v_mov_b32 v3, s3
v_mov_b32 v4, s4
v_mov_b32 v5, s5
v_mov_b32 v18, 0xbeef
flat_store_dword v[4:5], v18 glc
v_mov_b32 v18, 0x1
flat_store_dword v[2:3], v18 glc
v_mov_b32 v18, 0xcafe
flat_store_dword v[0:1], v18 glc
.else
s_mov_b32 s18, 0xbeef
s_store_dword s18, s[0:1], 0x4 glc
s_mov_b32 s18, 0x1
s_store_dword s18, s[2:3], 0 glc
s_mov_b32 s18, 0xcafe
s_store_dword s18, s[0:1], 0x0 glc
.endif
s_endpgm
)";
/**
* KFDQMTest
*/
/* A simple ISA loop program with dense mathematical operations.
* s1 controls the number of iterations of the loop.
* This shader can be used by GFX8, GFX9 and GFX10
*/
const char *LoopIsa = R"(
.text
s_movk_i32 s0, 0x0008
s_movk_i32 s1, 0x00ff
v_mov_b32 v0, 0
v_mov_b32 v1, 0
v_mov_b32 v2, 0
v_mov_b32 v3, 0
v_mov_b32 v4, 0
v_mov_b32 v5, 0
v_mov_b32 v6, 0
v_mov_b32 v7, 0
v_mov_b32 v8, 0
v_mov_b32 v9, 0
v_mov_b32 v10, 0
v_mov_b32 v11, 0
v_mov_b32 v12, 0
v_mov_b32 v13, 0
v_mov_b32 v14, 0
v_mov_b32 v15, 0
v_mov_b32 v16, 0
LOOP:
s_mov_b32 s8, s4
s_mov_b32 s9, s1
s_mov_b32 s10, s6
s_mov_b32 s11, s7
s_cmp_le_i32 s1, s0
s_cbranch_scc1 END_OF_PGM
s_buffer_load_dwordx8 s[8:15], s[8:11], 0x10
v_add_f32 v0, 2.0, v0
v_cvt_f32_i32 v17, s1
s_waitcnt lgkmcnt(0)
v_add_f32 v18, s8, v17
v_add_f32 v19, s9, v17
v_add_f32 v20, s10, v17
v_add_f32 v21, s11, v17
v_add_f32 v22, s12, v17
v_add_f32 v23, s13, v17
v_add_f32 v24, s14, v17
v_add_f32 v17, s15, v17
v_log_f32 v25, v18
v_mul_f32 v25, v22, v25
v_exp_f32 v25, v25
v_log_f32 v26, v19
v_mul_f32 v26, v23, v26
v_exp_f32 v26, v26
v_log_f32 v27, v20
v_mul_f32 v27, v24, v27
v_exp_f32 v27, v27
v_log_f32 v28, v21
v_mul_f32 v28, v17, v28
v_exp_f32 v28, v28
v_add_f32 v5, v5, v25
v_add_f32 v6, v6, v26
v_add_f32 v7, v7, v27
v_add_f32 v8, v8, v28
v_mul_f32 v18, 0x3fb8aa3b, v18
v_exp_f32 v18, v18
v_mul_f32 v19, 0x3fb8aa3b, v19
v_exp_f32 v19, v19
v_mul_f32 v20, 0x3fb8aa3b, v20
v_exp_f32 v20, v20
v_mul_f32 v21, 0x3fb8aa3b, v21
v_exp_f32 v21, v21
v_add_f32 v9, v9, v18
v_add_f32 v10, v10, v19
v_add_f32 v11, v11, v20
v_add_f32 v12, v12, v21
v_sqrt_f32 v18, v22
v_sqrt_f32 v19, v23
v_sqrt_f32 v20, v24
v_sqrt_f32 v21, v17
v_add_f32 v13, v13, v18
v_add_f32 v14, v14, v19
v_add_f32 v15, v15, v20
v_add_f32 v16, v16, v21
v_rsq_f32 v18, v22
v_rsq_f32 v19, v23
v_rsq_f32 v20, v24
v_rsq_f32 v17, v17
v_add_f32 v1, v1, v18
v_add_f32 v2, v2, v19
v_add_f32 v3, v3, v20
v_add_f32 v4, v4, v17
s_add_u32 s0, s0, 1
s_branch LOOP
END_OF_PGM:
s_endpgm
)";
/**
* KFDCWSRTest
*/
/* Count up to a host-supplied iteration number and store the final count.
 *
 * Initial state:
 *   s[0:1] - 64 bits iteration number; only the lower 32 bits are useful.
 *   s[2:3] - result buffer base address
 *   s4     - workgroup id
 *   v0     - workitem id, always 0 because
 *            NUM_THREADS_X(number of threads) in workgroup set to 1
 * Registers:
 *   v0     - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4
 *   v2     - = s0, 32 bits iteration number
 *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
 *   v6     - counter
 *
 * SHADER_MACROS is prepended so that V_ADD_CO_U32, V_ADD_CO_CI_U32 and
 * V_CMP_LT_U32 are available; it also supplies the .text directive.
 *
 * Loop condition: v6 is incremented first and the branch back to LOOP is
 * taken while v6 < v2 (V_CMP_LT_U32 followed by s_cbranch_vccnz). The
 * in-shader remark "jump if equal" is therefore inaccurate - the loop
 * repeats while the counter is still below the iteration number. Because
 * the increment precedes the test, the value stored to v[4:5] is at
 * least 1.
 */
const char *IterateIsa = SHADER_MACROS R"(
// Copy the parameters from scalar registers to vector registers
v_mov_b32 v2, s0 // v[2:3] = s[0:1]
v_mov_b32 v3, s1 // v[2:3] = s[0:1]
v_mov_b32 v0, s4 // use workgroup id as index
v_lshlrev_b32 v0, 2, v0 // v0 *= 4
V_ADD_CO_U32 v4, s2, v0 // v[4:5] = s[2:3] + v0 * 4
v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4
V_ADD_CO_CI_U32 v5, v5, 0 // v[4:5] = s[2:3] + v0 * 4
v_mov_b32 v6, 0
LOOP:
V_ADD_CO_U32 v6, 1, v6
// Compare the result value (v6) to iteration value (v2), and
// jump if equal (i.e. if VCC is not zero after the comparison)
V_CMP_LT_U32 v6, v2
s_cbranch_vccnz LOOP
flat_store_dword v[4:5], v6
s_waitcnt vmcnt(0) & lgkmcnt(0)
s_endpgm
)";
/**
* KFDEvictTest
*/
/* Shader to read local buffers using multiple wavefronts in parallel
* until address buffer is filled with specific value 0x5678 by host program,
* then each wavefront fills value 0x5678 at corresponding result buffer and quit
*
* Initial state:
* s[0:1] - address buffer base address
* s[2:3] - result buffer base address
* s4 - workgroup id
* v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
* Registers:
* v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
* v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
* v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
* v[6:7] - local buf address used for read test
*/
const char *ReadMemoryIsa = SHADER_MACROS R"(
// Compute address of corresponding output buffer
v_mov_b32 v0, s4 // use workgroup id as index
v_lshlrev_b32 v0, 2, v0 // v0 *= 4
V_ADD_CO_U32 v4, s2, v0 // v[4:5] = s[2:3] + v0 * 4
v_mov_b32 v5, s3 // v[4:5] = s[2:3] + v0 * 4
V_ADD_CO_CI_U32 v5, v5, 0 // v[4:5] = s[2:3] + v0 * 4
// Compute input buffer offset used to store corresponding local buffer address
v_lshlrev_b32 v0, 1, v0 // v0 *= 8
V_ADD_CO_U32 v2, s0, v0 // v[2:3] = s[0:1] + v0 * 8
v_mov_b32 v3, s1 // v[2:3] = s[0:1] + v0 * 8
V_ADD_CO_CI_U32 v3, v3, 0 // v[2:3] = s[0:1] + v0 * 8
// Load 64bit local buffer address stored at v[2:3] to v[6:7]
flat_load_dwordx2 v[6:7], v[2:3] slc
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
v_mov_b32 v8, 0x5678
s_movk_i32 s8, 0x5678
L_REPEAT:
s_load_dword s16, s[0:1], 0x0 glc
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish
s_cmp_eq_i32 s16, s8
s_cbranch_scc1 L_QUIT // if notified to quit by host
// Loop read 64M local buffer starting at v[6:7]
// every 4k page only read once
v_mov_b32 v9, 0
v_mov_b32 v10, 0x1000 // 4k page
v_mov_b32 v11, 0x4000000 // 64M size
v_mov_b32 v12, v6
v_mov_b32 v13, v7
L_LOOP_READ:
flat_load_dwordx2 v[14:15], v[12:13] slc
V_ADD_CO_U32 v9, v9, v10
V_ADD_CO_U32 v12, v12, v10
V_ADD_CO_CI_U32 v13, v13, 0
V_CMP_LT_U32 v9, v11
s_cbranch_vccnz L_LOOP_READ
s_branch L_REPEAT
L_QUIT:
flat_store_dword v[4:5], v8
s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish
s_endpgm
)";
/**
* KFDGWSTest
*/
/* Shader to initialize gws counter to 1 */
const char *GwsInitIsa = R"(
.text
s_mov_b32 m0, 0
s_nop 0
s_load_dword s16, s[0:1], 0x0 glc
s_waitcnt 0
v_mov_b32 v0, s16
s_waitcnt 0
ds_gws_init v0 offset:0 gds
s_waitcnt 0
s_endpgm
)";
/* Atomically increase a value in memory
* This is expected to be executed from
* multiple work groups simultaneously.
* GWS semaphore is used to guarantee
* the operation is atomic.
*/
const char *GwsAtomicIncreaseIsa = R"(
.text
// Assume src address in s0, s1
.if (.amdgcn.gfx_generation_number >= 10)
s_mov_b32 m0, 0
s_mov_b32 exec_lo, 0x1
v_mov_b32 v0, s0
v_mov_b32 v1, s1
ds_gws_sema_p offset:0 gds
s_waitcnt 0
flat_load_dword v2, v[0:1] glc dlc
s_waitcnt 0
v_add_nc_u32 v2, v2, 1
flat_store_dword v[0:1], v2
s_waitcnt_vscnt null, 0
ds_gws_sema_v offset:0 gds
.else
s_mov_b32 m0, 0
s_nop 0
ds_gws_sema_p offset:0 gds
s_waitcnt 0
s_load_dword s16, s[0:1], 0x0 glc
s_waitcnt 0
s_add_u32 s16, s16, 1
s_store_dword s16, s[0:1], 0x0 glc
s_waitcnt lgkmcnt(0)
ds_gws_sema_v offset:0 gds
.endif
s_waitcnt 0
s_endpgm
)";
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
* Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,29 +21,40 @@
*
*/
#ifndef _ISAGENERATOR_GFX10_H_
#define _ISAGENERATOR_GFX10_H_
#ifndef _SHADERSTORE_H_
#define _SHADERSTORE_H_
#include <string>
#include "IsaGenerator.hpp"
#include <vector>
class IsaGenerator_Gfx10 : public IsaGenerator {
public:
virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
/* KFDASMTest List */
extern const std::vector<const char*> ShaderList;
protected:
virtual const std::string& GetAsicName();
/* Common */
extern const char *NoopIsa;
extern const char *CopyDwordIsa;
extern const char *InfiniteLoopIsa;
extern const char *AtomicIncIsa;
private:
static const std::string ASIC_NAME;
/* KFDMemoryTest */
extern const char *ScratchCopyDwordIsa;
extern const char *PollMemoryIsa;
extern const char *PollNCMemoryIsa;
extern const char *CopyOnSignalIsa;
extern const char *PollAndCopyIsa;
extern const char *WriteFlagAndValueIsa;
extern const char *WriteAndSignalIsa;
static const uint32_t NOOP_ISA[];
static const uint32_t COPY_DWORD_ISA[];
static const uint32_t INFINITE_LOOP_ISA[];
static const uint32_t ATOMIC_ADD_ISA[];
};
/* KFDQMTest */
extern const char *LoopIsa;
#endif // _ISAGENERATOR_GFX9_H_
/* KFDCWSRTest */
extern const char *IterateIsa;
/* KFDEvictTest */
extern const char *ReadMemoryIsa;
/* KFDGWSTest */
extern const char *GwsInitIsa;
extern const char *GwsAtomicIncreaseIsa;
#endif // _SHADERSTORE_H_