Merge branch 'sp3-llvm-transistion' into amd-staging

Transition KFDTest to use open source LLVM compiler instead of SP3 compiler Change-Id: I26fff6a958bc48cb1f5509a11ec194d2ececf0ce [ROCm/ROCR-Runtime commit: b9651d3118]
2022-04-26 13:15:59 -04:00
@@ -95,12 +95,42 @@ endif()

 message ( "Find libhsakmt at ${HSAKMT_LIBRARY_DIRS}" )

-set ( SP3_DIR ${PROJECT_SOURCE_DIR}/sp3 )
+# CMP0074 NEW: find_package() honors <PackageName>_ROOT variables.
+if ( POLICY CMP0074 )
+    cmake_policy( SET CMP0074 NEW )
+endif()
+
+# Probe only $OUT_DIR for a Lightning (LLVM) build; no default search paths.
+find_path( LIGHTNING_CMAKE_DIR NAMES LLVMConfig.cmake
+    PATHS "$ENV{OUT_DIR}/llvm/lib/cmake/llvm" NO_CACHE NO_DEFAULT_PATH )
+if ( LIGHTNING_CMAKE_DIR AND EXISTS "${LIGHTNING_CMAKE_DIR}" )
+    set ( LLVM_DIR "${LIGHTNING_CMAKE_DIR}" )
+else()
+    message( WARNING "Couldn't find Lightning build. "
+        "Attempting to use system LLVM install..." )
+endif()
+
+find_package( LLVM REQUIRED CONFIG )
+
+# Quoted so an empty or unusual version string cannot break the if() syntax.
+if( "${LLVM_PACKAGE_VERSION}" VERSION_LESS "7.0" )
+    message( FATAL_ERROR "Requires LLVM 7.0 or greater "
+        "(found ${LLVM_PACKAGE_VERSION})" )
+elseif( "${LLVM_PACKAGE_VERSION}" VERSION_LESS "14.0" )
+    message( WARNING "Not using latest LLVM version. "
+        "Some ASIC targets may not work!" )
+endif()
+message( STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}" )
+message( STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}" )
+
+include_directories(${LLVM_INCLUDE_DIRS})
+separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS})
+add_definitions(${LLVM_DEFINITIONS_LIST})
+llvm_map_components_to_libnames(llvm_libs AMDGPUAsmParser Core Support)

 include_directories(${PROJECT_SOURCE_DIR}/gtest-1.6.0)
 include_directories(${PROJECT_SOURCE_DIR}/include)
 include_directories(${PROJECT_SOURCE_DIR}/../../include)
-include_directories(${SP3_DIR})

 include_directories(${DRM_INCLUDE_DIRS})

@@ -112,12 +142,8 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp
  src/Dispatch.cpp
  src/GoogleTestExtension.cpp
  src/IndirectBuffer.cpp
-  src/IsaGenerator.cpp
-  src/IsaGenerator_Aldebaran.cpp
-  src/IsaGenerator_Gfx10.cpp
-  src/IsaGenerator_Gfx72.cpp
-  src/IsaGenerator_Gfx8.cpp
-  src/IsaGenerator_Gfx9.cpp
+  src/Assemble.cpp
+  src/ShaderStore.cpp
  src/LinuxOSWrapper.cpp
  src/PM4Packet.cpp
  src/PM4Queue.cpp
@@ -143,6 +169,7 @@ set (SRC_FILES gtest-1.6.0/gtest-all.cpp
  src/KFDDBGTest.cpp
  src/KFDGWSTest.cpp
  src/KFDIPCTest.cpp
+  src/KFDASMTest.cpp

  src/KFDEvictTest.cpp
  src/KFDHWSTest.cpp
@@ -163,7 +190,7 @@ message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} )

 if ( "${CMAKE_C_COMPILER_VERSION}" STRGREATER "4.8.0")
 ## Add --enable-new-dtags to generate DT_RUNPATH
-set ( CMAKE_CXX_FLAGS "-std=gnu++11 -Wl,--enable-new-dtags" )
+set ( CMAKE_CXX_FLAGS "-std=gnu++14 -Wl,--enable-new-dtags" )
 endif()
 if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release )
    set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2" )
@@ -181,11 +208,10 @@ endif ()
 # The modules found by pkg_check_modules() in the default pkg config
 # path do not need to use link_directories() here.
 link_directories(${HSAKMT_LIBRARY_DIRS})
-link_directories(${SP3_DIR})

 add_executable(kfdtest ${SRC_FILES})

-target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread m stdc++ rt amdsp3 numa)
+target_link_libraries(kfdtest ${HSAKMT_LIBRARIES} ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} ${llvm_libs} pthread m stdc++ rt numa)

 configure_file ( scripts/kfdtest.exclude kfdtest.exclude COPYONLY )
 configure_file ( scripts/run_kfdtest.sh run_kfdtest.sh COPYONLY )
@@ -224,26 +224,11 @@ FILTER[aldebaran]=\
 "KFDMemoryTest.PtraceAccess:"\
 "KFDMemoryTest.DeviceHdpFlush"

-# SP3 Compiler needs to be updated for GFX10. Temporarily disable all tests
-# that require shader compiler
-# Adding KFDSVMEvictTest as SVM/HMM was never validated on GFX10
-TEMP_GFX10_BLACKLIST=\
-"KFDMemoryTest.FlatScratchAccess:"\
-"KFDMemoryTest.PtraceAccessInvisibleVram:"\
-"KFDQMTest.QueuePriorityOnDifferentPipe:"\
-"KFDQMTest.QueuePriorityOnSamePipe:"\
-"KFDCWSRTest.BasicTest:"\
-"KFDQMTest.BasicCuMaskingEven:"\
-"KFDEvictTest.QueueTest:"\
-"KFDMemoryTest.MapUnmapToNodes:"\
-"KFDMemoryTest.HostHdpFlush:"\
-"KFDMemoryTest.DeviceHdpFlush:"\
-"KFDSVMEvictTest.*"
-
+# KFDSVMEvictTest.* is filtered on the GFX10 ASICs below: SVM/HMM was never validated on GFX10
 FILTER[navi10]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST:"\
-"KFDMemoryTest.MMBench"
+"KFDMemoryTest.MMBench:"\
+"KFDSVMEvictTest.*"

 # Need to verify the following failed tests on another machine:
 # Exceptions not being received during exception tests
@@ -254,42 +239,42 @@ FILTER[navi12]=\
 "KFDExceptionTest.*:"\
 "KFDPerfCountersTest.*:"\
 "KFDPerformanceTest.P2PBandWidthTest:"\
-"$TEMP_GFX10_BLACKLIST"
+"KFDSVMEvictTest.*"

 FILTER[navi14]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST"
+"KFDSVMEvictTest.*"

 FILTER[sienna_cichlid]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST:"\
 "KFDQMTest.BasicCuMaskingEven:"\
 "KFDDBGTest.*:"\
 "KFDPerfCountersTest.*:"\
+"KFDSVMEvictTest.*"

 FILTER[navy_flounder]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST:"\
 "KFDQMTest.BasicCuMaskingEven:"\
 "KFDDBGTest.*:"\
 "KFDPerfCountersTest.*:"\
+"KFDSVMEvictTest.*"

 FILTER[dimgrey_cavefish]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST:"\
 "KFDQMTest.BasicCuMaskingEven:"\
 "KFDDBGTest.*:"\
 "KFDPerfCountersTest.*:"\
+"KFDSVMEvictTest.*"

 FILTER[beige_goby]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST:"\
 "KFDQMTest.BasicCuMaskingEven:"\
 "KFDDBGTest.*:"\
 "KFDPerfCountersTest.*:"\
+"KFDSVMEvictTest.*"

 FILTER[yellow_carp]=\
 "$BLACKLIST_ALL_ASICS:"\
-"$TEMP_GFX10_BLACKLIST:"\
 "KFDQMTest.BasicCuMaskingEven:"\
-"KFDIPCTest.CMABasicTest"
+"KFDIPCTest.CMABasicTest:"\
+"KFDSVMEvictTest.*"
@@ -1,6 +0,0 @@
-Note: This folder is primarily intended for AMD internal developers.
-
-The folder lib_helper contains the script to generate SP3 library libamdsp3.a
-and the associated header files in the current folder for kfdtest to use. 
-cmake is required for the script to run. Just run ./build_sp3.sh after setting
-up the environment variables (source build/envsetup.sh).
@@ -1,79 +0,0 @@
-#
-# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-#
-#
-
-cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
-
-project(amdsp3)
-
-#set ( CMAKE_VERBOSE_MAKEFILE on )
-
-find_package(PkgConfig)
-
-set ( P4_PATH $ENV{WORK_ROOT}/p4/driver/drivers )
-
-set ( SCLIB_SRC ${PROJECT_SOURCE_DIR} )
-#if( DEFINED ENV{SCLIB_SRC} )
-#    set ( SCLIB_SRC $ENV{SCLIB_SRC} )
-#else()
-#    set ( SCLIB_SRC ${P4_PATH}/sc/Chip )
-#endif()
-
-include_directories(${SCLIB_SRC}/sp3)
-#include_directories(${SCLIB_SRC}/sp3/release_headers)
-include_directories(${SCLIB_SRC}/sp3/gen)
-
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-asic.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-dispatch.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-eval.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-gc.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-int.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-lib.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-native.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-cipher.c )
-set ( SRC_FILES ${SRC_FILES} ${SCLIB_SRC}/sp3/sp3-vm.c )
-
-aux_source_directory(${SCLIB_SRC}/sp3/gen SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/si/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/ci/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx8/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/lib SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx81/arch SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx9/arch SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/gfx10/arch SRC_FILES)
-aux_source_directory(${SCLIB_SRC}/sp3/backend/aldbrn/arch SRC_FILES)
-
-
-message( STATUS "PROJECT_SOURCE_DIR:" ${PROJECT_SOURCE_DIR} )
-#message( STATUS "SRC_FILES: ")
-#foreach(file ${SRC_FILES})
-#  message(STATUS "${file}")
-#endforeach()
-
-set ( CMAKE_C_FLAGS "-DSP3_STATIC_LIB -Wno-error -DPUBLIC_RELEASE -DLITTLEENDIAN_CPU -fPIC -DGFX101_BUILD -DALDBRN_BUILD" )
-
-add_library(amdsp3 ${SRC_FILES})
-
-
@@ -1,57 +0,0 @@
-#
-# Copyright (C) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-#
-#
-
-#!/bin/bash
-
-if [ "$KFDTEST_ROOT" == "" ] || [ "$P4_ROOT" == "" ]; then
-	echo "Environment variables should be set before running this script"
-	exit 1
-fi
-
-cd $KFDTEST_ROOT/sp3/lib_helper
-
-SP3_PROJECT=$P4_ROOT/driver/drivers/sc/Chip/
-LIB_OUTPUT=$KFDTEST_ROOT/sp3/
-
-cp CMakeLists_sp3.txt $SP3_PROJECT/CMakeLists.txt
-
-mkdir -p build
-echo "Building SP3 lib"
-pushd build
-cmake $SP3_PROJECT/
-make
-popd
-
-rsync --progress -a build/libamdsp3.a $LIB_OUTPUT
-# Put the intermediate header files in the current folder for further processing
-rsync --progress -a $SP3_PROJECT/sp3/public/lib/sp3.h .
-
-# Remove the build folder and CMakeLists.txt put into SP source folder
-rm -r build
-rm $SP3_PROJECT/CMakeLists.txt
-
-# Replace the license statement in the header files
-{ cat AMD_opensource_license.txt; sed -e '1,/#ifndef/ { /#ifndef/b; d }' sp3.h; } > $LIB_OUTPUT/sp3.h
-
-# Delete the intermediate header files
-rm sp3.h
@@ -1,643 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef __SP3_H__
-#define __SP3_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/// @file sp3.h
-/// @brief sp3 API
-#include <stdint.h>
-
-// Export tags
-#define SP3_EXPORT
-
-
-/// @defgroup sp3main SP3 Main API
-///
-/// Main API to assemble and disassemble SP3 shaders.
-///
-/// @{
-
-
-/// Valid shader stages.
-enum sp3_shtype {
-    SP3_SHTYPE_NONE = -1,
-    SP3_SHTYPE_PS   = 0,
-    SP3_SHTYPE_VS   = 1,
-    SP3_SHTYPE_GS   = 2,
-    SP3_SHTYPE_ES   = 3,
-    SP3_SHTYPE_HS   = 4,
-    SP3_SHTYPE_LS   = 5,
-    SP3_SHTYPE_CS   = 6,
-#ifdef NAVI10LITE_BUILD
-    SP3_SHTYPE_ACV  = 7,
-#endif
-};
-
-/// Assorted constants used by sp3 API.
-enum sp3_count {
-    SP3_NUM_MRT     = 8,    ///< Maximum number of render targets supported.
-    SP3_NUM_STRM    = 4,    ///< Maximum number of streams supported.
-};
-
-/// Disassembly flags. Bitwise-OR flags to set options.
-enum sp3_flag {
-    SP3DIS_NO_STATE     = 0x01, ///< Do not include state header at top of shader.
-    SP3DIS_NO_BINARY    = 0x02, ///< Do not include comments with raw binary microcode.
-    SP3DIS_COMMENTS     = 0x04, ///< Do not include comments.
-    SP3DIS_NO_GPR_COUNT = 0x08, ///< Do not include GPR allocation counts.
-    SP3DIS_FORCEVALID   = 0x10, ///< Force all bytes of microcode to be disassembled.
-    SP3DIS_NO_ASIC      = 0x20, ///< Do not emit the asic header at top of shader.
-};
-
-/// Shader context. Contains no user-visible fields.
-struct sp3_context;
-
-/// Memory object. Contains no user-visible fields.
-struct sp3_vma;
-
-/// VM addresses are 64-bit and the address unit is 32 bits
-typedef uint64_t sp3_vmaddr;
-
-/// Storage entry for register streams.
-struct sp3_reg {
-    uint32_t index;             ///< One of the MM aperture register addresses.
-    uint32_t value;             ///< 32-bit register data.
-};
-
-/// Bits for a single instruction.
-struct sp3_inst_bits {
-    uint32_t val[5];            ///< Largest single instruction in any backend is 5 dwords.
-};
-
-/// Wrapped shader metadata.
-///
-/// After generation, shaders are encapsulated in sp3_shader structures.
-///
-/// Those structures contain the shader binary, its register stream, constants and constant
-/// buffers and metadata needed for SC compatibility.
-///
-struct sp3_shader {
-    enum sp3_shtype type;       ///< One of the SHTYPE_* constants.
-    uint32_t asic_int;          ///< Internal ASIC index. Do not use.
-    char asic[0x100];           ///< ASIC name as a string ("RV870" etc).
-    uint32_t size;              ///< Size of the compiled shader, in 32-bit words.
-    uint32_t nsgprs;            ///< Number of scalar GPRs used.
-    uint32_t nvgprs;            ///< Number of vector GPRs used.
-    uint32_t nsvgprs;           ///< Number of shared vector GPRs used (only available in certain projects).
-    uint32_t naccvgprs;         ///< Number of accumulator vector GPRs used (only available in certain projects).
-    uint32_t nsgprs_manual_alloc;
-    uint32_t nvgprs_manual_alloc;
-    uint32_t nsvgprs_manual_alloc;
-    uint32_t naccvgprs_manual_alloc;
-    uint32_t trap_present;
-    uint32_t user_sgpr_count;
-    uint32_t scratch_en;
-    uint32_t dispatch_draw_en;
-    uint32_t so_en;
-    uint32_t so_base0_en;
-    uint32_t so_base1_en;
-    uint32_t so_base2_en;
-    uint32_t so_base3_en;
-    uint32_t oc_lds_en;
-    uint32_t tg_size_en;
-    uint32_t tidig_comp_cnt;    ///< Number of components(-1) enabled for thread id in group
-    uint32_t tgid_x_en;
-    uint32_t tgid_y_en;
-    uint32_t tgid_z_en;
-    uint32_t wave_cnt_en;
-    uint32_t primgen_en;
-    uint32_t pc_base_en;
-    uint32_t sgpr_scratch;
-    uint32_t sgpr_psvs_state;
-    uint32_t sgpr_gs2vs_offset;
-    uint32_t sgpr_so_write_index;
-    uint32_t sgpr_so_base_offset0;
-    uint32_t sgpr_so_base_offset1;
-    uint32_t sgpr_so_base_offset2;
-    uint32_t sgpr_so_base_offset3;
-    uint32_t sgpr_offchip_lds;
-    uint32_t sgpr_is_offchip;
-    uint32_t sgpr_ring_offset;
-    uint32_t sgpr_gs_wave_id;
-    uint32_t sgpr_global_wave_id;
-    uint32_t sgpr_tg_size;
-    uint32_t sgpr_tgid_x;
-    uint32_t sgpr_tgid_y;
-    uint32_t sgpr_tgid_z;
-    uint32_t sgpr_tf_base;
-    uint32_t sgpr_pc_base;
-    uint32_t sgpr_wave_cnt;
-    uint32_t wave_size;         ///< Number of threads in a wavefront (only certain ASICs; 0 = don't care).
-    uint32_t pc_exports;        ///< Range of parameters exported (if VS).
-    uint32_t pos_export;        ///< Shader executes a position export (if VS).
-    uint32_t cb_exports;        ///< Range of MRTs exported (if PS).
-    uint32_t mrtz_export_format;///< Export format of the mrtz export.
-    uint32_t z_export;          ///< Shader executes a Z export (if PS).
-    uint32_t pops_en;           ///< Shader is POPS (PS)
-    uint32_t pops_num_samples;  ///<  (PS)
-    uint32_t load_collision_waveid;     ///< Shader sets load collision waveid (if PS).
-    uint32_t load_intrawave_collision;  ///< Shader is in intrawave mode (if PS).
-    uint32_t stencil_test_export;       ///< Shader exports stencil (if PS).
-    uint32_t stencil_op_export; ///< Shader exports stencil (if PS).
-    uint32_t kill_used;         ///< Shader executes ALU KILL operations.
-    uint32_t cb_masks[SP3_NUM_MRT];     ///< Component masks for each MRT exported (if PS).
-    uint32_t emit_used;         ///< EMIT opcodes used (if GS).
-    uint32_t covmask_export;    ///< Shader exports coverage mask (if PS).
-    uint32_t mask_export;       ///< Shader exports mask (if PS).
-    uint32_t strm_used[SP3_NUM_STRM];   ///< Streamout operations used (map).
-    uint32_t scratch_used;      ///< Scratch SMX exports used.
-    uint32_t scratch_itemsize;  ///< Scratch ring item size.
-    uint32_t reduction_used;    ///< Reduction SMX exports used.
-    uint32_t ring_used;         ///< ESGS/GSVS ring SMX exports used.
-    uint32_t ring_itemsize;     ///< ESGS/GSVS ring item size (for ES/GS respectively).
-    uint32_t vertex_size[4];    ///< GSVS ring vertex size (for GS).
-    uint32_t mem_used;          ///< Raw memory SMX exports used.
-    uint32_t rats_used;         ///< Mask of RATs (UAVs) used
-    uint32_t group_size[3];     ///< Wavefront group size (for ELF files).
-    uint32_t alloc_lds;         ///< Number of LDS bytes allocated for wave group. (translates to lds_size in CS and LS)
-    uint32_t *data;             ///< Shader binary data.
-    uint32_t nregs;             ///< Number of register writes in the stream.
-    uint64_t crc64;             ///< CRC64 of compiled shader, may be used for identification/fingerprinting.
-    uint32_t crc32;             ///< 32-bit CRC of compiled shader (based on crc64), may be used for identification/fingerprinting.
-    struct sp3_reg *regs;       ///< Register writes (index-value pairs).
-    struct sp3_shader *merged_2nd_shader;   ///< Merged es/gs, ls/hs shader, this points to start of the second shader (only certain ASICs).
-};
-
-/// Comment callback.
-typedef const char *(*sp3_comment_cb)(void *, int);
-
-
-/// Get version of the sp3 library.
-///
-/// @return String containing the version number.
-///
-SP3_EXPORT const char *sp3_version(void);
-
-/// Create a new sp3 context.
-///
-/// @return A new context for use in assembling and disassembling shaders. Free with sp3_close().
-///
-SP3_EXPORT struct sp3_context *sp3_new(void);
-
-/// Set option for sp3.
-///
-/// @param state sp3 context.
-/// @param option Option name. Unknown options will raise an error.
-/// @param value Option value. NULL is used to represent value-less options.
-///
-/// Currently supported options:
-///
-/// stdlib (string) -- absolute path to standard library files.  May be a colon-separated list
-/// of paths that will be used to search for stdlib files.  Used by sp3_parse_library().
-///
-/// The following options are deprecated because they take integer arguments; you should use
-/// sp3_set_option_int() for these settings going forward.  They will continue to be accepted by
-/// this API to support legacy users.
-///
-/// Werror (boolean) -- indicates whether warnings should be treated as errors.
-///
-/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using
-/// this shader.  Ignored in certain ASICs.  You may set this to 32, 64 or the special value 0
-/// to indicate no preference on wave size.  The shader will be checked to ensure it is
-/// compatible with the size specified here.
-///
-/// omit_version (boolean) -- omit generation of the S_VERSION opcode.
-///
-/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer.
-///
-/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders.  This is a
-/// dangerous option to allow in general so you must explicitly enable this option, otherwise
-/// the raw_bits() function will always error out.
-///
-SP3_EXPORT void sp3_set_option(
-    struct sp3_context *state,
-    const char *option,
-    const char *value);
-
-/// Set option for sp3.
-///
-/// @param state sp3 context.
-/// @param option Option name. Unknown options will raise an error.
-/// @param value Option value.
-///
-/// Currently supported options:
-///
-/// Werror (boolean) -- indicates whether warnings should be treated as errors.
-///
-/// wave_size (integer) -- sets the wave size being used by the draw calls that will be using
-/// this shader.  Ignored in certain ASICs.  You may set this to 32, 64 or the special value 0
-/// to indicate no preference on wave size.  The shader will be checked to ensure it is
-/// compatible with the size specified here.
-///
-/// omit_version (boolean) -- omit generation of the S_VERSION opcode.
-///
-/// omit_code_end (boolean) -- omit generation of the S_CODE_END footer.
-///
-/// allow_raw_bits (boolean) -- allow use of the raw_bits() function in sp3 shaders.  This is a
-/// dangerous option to allow in general so you must explicitly enable this option, otherwise
-/// the raw_bits() function will always error out.
-///
-/// secure_mode (boolean) -- run in secure mode. Disables macro language features in assembly
-/// path including calls to custom functions. Useful if sp3 is used as a backend to a web-based
-/// assembly tool.
-///
-/// debug_encoding (boolean) -- if true, debug encoding selection logic for assembly. Only
-/// supported in 10.4+ backends.
-///
-/// no_vs_export_check (boolean) -- if true, disable VS export sanity check.  Only supported in
-/// 10.4+ backends.
-///
-SP3_EXPORT void sp3_set_option_int(
-    struct sp3_context *state,
-    const char *option,
-    int32_t value);
-
-/// Parse a file into a context.
-///
-/// Use sp3_compile to generate binary microcode after the shader is parsed.
-///
-/// @param state Context to use for parsing.
-/// @param file File to read. If NULL, parse from stdin.
-///
-SP3_EXPORT void sp3_parse_file(struct sp3_context *state, const char *file);
-
-/// Parse a string into a context.
-///
-/// Use sp3_compile to generate binary microcode after the shader is parsed.
-///
-/// @param state Context to use for parsing.
-/// @param string String to parse.
-///
-SP3_EXPORT void sp3_parse_string(struct sp3_context *state, const char *string);
-
-/// Parse a file from the standard library into a context.
-///
-/// Use sp3_compile to generate binary microcode after the shader is parsed.
-///
-/// @param state Context to use for parsing.
-/// @param name Path to the standard library; files in this directory are parsed.
-///
-SP3_EXPORT void sp3_parse_library(struct sp3_context *state, const char *name);
-
-/// Call a sp3 function.
-///
-SP3_EXPORT void sp3_call(struct sp3_context *state, const char *func);
-
-/// Compile a shader program that has been parsed into the context.
-///
-/// @param state sp3 context.
-/// @param cffunc Name of clause to call. By convention, this is "main".
-/// @return A compiled and linked shader.  Free memory with sp3_free_shader().
-///
-SP3_EXPORT struct sp3_shader *sp3_compile(
-    struct sp3_context *state,
-    const char *cffunc);
-
-/// Free a sp3_shader.
-///
-/// @param sh Shader object to delete.
-///
-SP3_EXPORT void sp3_free_shader(struct sp3_shader *sh);
-
-/// Get current ASIC name set for a context.
-///
-/// @param state Context to query.
-/// @return Name of ASIC.
-///
-SP3_EXPORT const char *sp3_getasic(struct sp3_context *state);
-
-/// Set current ASIC name for a context.
-///
-/// @param state Context to modify.
-/// @param chip Case-insensitive string representing the ASIC to compile or disassemble for.
-///
-SP3_EXPORT void sp3_setasic(struct sp3_context *state, const char *chip);
-
-/// Set global variable in context to an integer.
-///
-SP3_EXPORT void sp3_set_param_int(
-    struct sp3_context *state,
-    const char *name,
-    int32_t value);
-
-/// Set global variable in context to an integer vector.
-///
-SP3_EXPORT void sp3_set_param_intvec(
-    struct sp3_context *state,
-    const char *name,
-    uint32_t size,
-    const int32_t *value);
-
-/// Set global variable in context to a float.
-///
-SP3_EXPORT void sp3_set_param_float(
-    struct sp3_context *state,
-    const char *name,
-    float value);
-
-/// Set global variable in context to a float vector.
-///
-SP3_EXPORT void sp3_set_param_floatvec(
-    struct sp3_context *state,
-    const char *name,
-    uint32_t size,
-    const float *value);
-
-/// Set error message header.
-///
-/// @param state Context to modify.
-/// @param str Text to include in error message header.
-///
-SP3_EXPORT void sp3_set_error_header(struct sp3_context *state, const char *str);
-
-/// Get ASIC metrics for the ASIC in current state.
-///
-/// Used by ELF tools to fill in some CAL fields.
-///
-/// @param state Context to query.
-/// @param name Name of ASIC metric.
-/// @return Value of ASIC metric.
-///
-SP3_EXPORT int sp3_asicinfo(struct sp3_context *state, const char *name);
-
-/// Free a context allocated by sp3_new/open/parse.
-///
-/// @param state Context to delete.
-///
-SP3_EXPORT void sp3_close(struct sp3_context *state);
-
-/// Disassemble a shader.
-///
-/// This call is likely to change to something that will take a filled sp3_shader structure
-/// later on.
-///
-/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
-/// @param bin Memory map with the opcodes (see sp3-vm.h).
-/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words).
-/// @param name Same to give the disassembled shader.
-/// @param shader_type One of the SHTYPE_* constants.
-/// @param include Literal text to include in the CF clause (NULL includes nothing).
-/// @param max_len Maximum length of CF clause. Matters if SP3DIS_FORCEVALID is set.
-/// @param flags A bitmask of SP3DIS_* flags.
-///
-/// @return Shader disassembly as a string. Free memory with sp3_free().
-///
-SP3_EXPORT char *sp3_disasm(
-    struct sp3_context *state,
-    struct sp3_vma *bin,
-    sp3_vmaddr base,
-    const char *name,
-    enum sp3_shtype shader_type,
-    const char *include,
-    uint32_t max_len,
-    uint32_t flags);
-
-/// Disassemble a single shader instruction.
-///
-/// This call is likely to change to something that will take a filled sp3_shader structure
-/// later on.
-///
-/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
-/// @param inst Pointer to dwords containing instruction (exact number of dwords required depends on instruction).
-/// @param base Start of the shader in the memory map (in VM entries, i.e. 32-bit words).
-/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words).
-/// @param shader_type One of the SHTYPE_* constants.
-/// @param flags A mask of SP3DIS_* flags.
-///
-/// @return Shader disassembly as a string. Free memory with sp3_free().
-///
-SP3_EXPORT char *sp3_disasm_inst(
-    struct sp3_context *state,
-    const struct sp3_inst_bits *inst,
-    sp3_vmaddr base,
-    sp3_vmaddr addr,
-    enum sp3_shtype shader_type,
-    uint32_t flags);
-
-/// Parse a register stream.
-///
-/// Can be called before sp3_disasm to preset things like ALU, boolean and loop constants.
-///
-/// This call is likely to merge with sp3_disasm later on.
-///
-/// @param state sp3 context to fill with state.
-/// @param nregs Number of register entries.
-/// @param regs Register stream to parse.
-/// @param shader_type One of the SHTYPE_* constants.
-///
-SP3_EXPORT void sp3_setregs(
-    struct sp3_context *state,
-    uint32_t nregs,
-    const struct sp3_reg *regs,
-    enum sp3_shtype shader_type);
-
-
-/// Set shader comments
-///
-/// @param state sp3 context.
-/// @param map Map of comments (0 for no comment, other values will be passed to the callback).
-/// @param f_top Callback returning comment to place above the opcode.
-/// @param f_right Callback returning comment to place to the right of the opcode.
-/// @param ctx Void pointer to pass to comment callbacks.
-///
-SP3_EXPORT void sp3_setcomments(
-    struct sp3_context *state,
-    struct sp3_vma *map,
-    sp3_comment_cb f_top,
-    sp3_comment_cb f_right,
-    void *ctx);
-
-/// Set alternate shader entry points
-///
-/// Used for disassembly; this marks an additional location in memory
-/// (besides the start address) where shader code may be found. Generally
-/// required for jump tables and any case where the shader may perform
-/// indirect jumps to ensure that disassembly locates all shader
-/// instructions.
-///
-/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
-/// @param addr Address of the instruction being disassembled (in VM entries, i.e. 32-bit words).
-///
-SP3_EXPORT void sp3_setentrypoint(
-    struct sp3_context *state,
-    sp3_vmaddr addr);
-
-/// Clear alternate shader entry points.
-///
-/// Clear all entry points previously set with sp3_setentrypoint.
-///
-/// @param state sp3 context (use sp3_new to allocate and sp3_setasic to set ASIC).
-///
-SP3_EXPORT void sp3_clearentrypoints(struct sp3_context *state);
-
-/// Free memory allocated by sp3.
-///
-/// Windows DLLs that allocate memory have to free it. This function
-/// should be used to free the result of sp3_disasm, sp3_compile etc.
-///
-SP3_EXPORT void sp3_free(void *ptr);
-
-/// SP3 API to merge two shaders given file names as input.
-///
-SP3_EXPORT struct sp3_shader* sp3_merge_shaders(
-    struct sp3_context *pointer,
-    const char *first_file,
-    const char *second_file);
-
-/// SP3 API to merge two shaders given shader strings as input.
-///
-SP3_EXPORT struct sp3_shader* sp3_merge_shader_strings(
-    struct sp3_context *pointer,
-    const char *first_string,
-    const char *second_string);
-
-
-/// @}
-
-
-/// @defgroup sp3vm SP3 Memory Objects
-///
-/// The VM API is used to manage virtual memory maps.  Those maps are used for binary storage
-/// for disassembly, as they can naturally mirror the GPU's memory map (so no register
-/// translation is needed).
-///
-/// @{
-
-/// Callback function that will fill a VMA on demand
-///
-/// The VMA to be filled will be specified through the request address.
-/// The callback should fill the VMA using sp3_vm_write calls.
-///
-typedef void (* sp3_vmfill)(struct sp3_vma *vm, sp3_vmaddr addr, void *ctx);
-
-/// Create a new VM that is empty.
-///
-/// Free the object with sp3_vm_free().
-///
-/// @return New VM object.
-///
-SP3_EXPORT
-struct sp3_vma *sp3_vm_new(void);
-
-/// Create a new VM that has a sp3_vmfill callback.
-///
-/// Free the object with sp3_vm_free().
-///
-/// @param fill Function used to populate data in VM. The function will be pass the new VM object, the address and a context.
-/// @param ctx User-specified context. Passed to the fill function and not used by sp3 itself.
-/// @return New VM object.
-///
-SP3_EXPORT
-struct sp3_vma *sp3_vm_new_fill(sp3_vmfill fill, void *ctx);
-
-/// Create a new VM from an array of words.
-///
-/// Free the object with sp3_vm_free().
-///
-/// @param base VM address to load array at.
-/// @param len Number of 32-bit words in the array.
-/// @param data Pointer to the array.
-/// @return New VM object.
-///
-SP3_EXPORT
-struct sp3_vma *sp3_vm_new_ptr(sp3_vmaddr base, sp3_vmaddr len, const uint32_t *data);
-
-/// Find a VMA, optionally adding it.
-///
-/// @param vm VM to search in.
-/// @param addr Address to search for.
-/// @param add Flag indicating whether a failure should result in adding a new VMA.
-/// @return VM object matching the specified address.
-///
-SP3_EXPORT
-struct sp3_vma *sp3_vm_find(struct sp3_vma *vm, sp3_vmaddr addr, uint32_t add);
-
-/// Write a word to a VM.
-///
-/// @param vm VM to write.
-/// @param addr Address to write.
-/// @param val 32-bits of data to write.
-///
-SP3_EXPORT
-void sp3_vm_write(struct sp3_vma *vm, sp3_vmaddr addr, uint32_t val);
-
-/// Read a word from a VM.
-///
-/// @param vm VM to read.
-/// @param addr Address to read.
-/// @return 32-bits of data at specified address.
-///
-SP3_EXPORT
-uint32_t sp3_vm_read(struct sp3_vma *vm, sp3_vmaddr addr);
-
-/// Probe VM for presence.
-///
-/// @param vm VM to probe.
-/// @param addr Address to search for.
-/// @return 1 if the specified address is backed in the VM, 0 otherwise.
-///
-SP3_EXPORT
-int sp3_vm_present(struct sp3_vma *vm, sp3_vmaddr addr);
-
-/// Return base address of VM.
-///
-/// @param vm VM to query.
-/// @return Base address.
-///
-SP3_EXPORT
-sp3_vmaddr sp3_vm_base(struct sp3_vma *vm);
-
-/// Return next VM.
-///
-/// @param vm VM to query.
-/// @return Next VM in list.
-///
-SP3_EXPORT
-struct sp3_vma *sp3_vm_next(struct sp3_vma *vm);
-
-/// Free a VM and all its storage.
-///
-/// Use this function to free memory allocated by sp3_vm_new, sp3_vm_new_fill and
-/// sp3_vm_new_ptr.
-///
-/// @param vm VM to free.
-///
-SP3_EXPORT
-void sp3_vm_free(struct sp3_vma *vm);
-
-
-/// @}
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif /* __SP3_H__ */
@@ -0,0 +1,379 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Self-contained assembler that uses the LLVM MC API to assemble AMDGCN
+ * instructions
+ */
+
+#include <llvm/Config/llvm-config.h>
+#include <llvm/MC/MCAsmBackend.h>
+#include <llvm/MC/MCAsmInfo.h>
+#include <llvm/MC/MCCodeEmitter.h>
+#include <llvm/MC/MCContext.h>
+#include <llvm/MC/MCInstPrinter.h>
+#include <llvm/MC/MCInstrInfo.h>
+#include <llvm/MC/MCObjectFileInfo.h>
+#include <llvm/MC/MCObjectWriter.h>
+#include <llvm/MC/MCParser/AsmLexer.h>
+#include <llvm/MC/MCParser/MCTargetAsmParser.h>
+#include <llvm/MC/MCRegisterInfo.h>
+#include <llvm/MC/MCStreamer.h>
+#include <llvm/MC/MCSubtargetInfo.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/InitLLVM.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/TargetSelect.h>
+#if LLVM_VERSION_MAJOR > 13
+#include <llvm/MC/TargetRegistry.h>
+#else
+#include <llvm/Support/TargetRegistry.h>
+#endif
+
+#include <linux/elf.h>
+#include "OSWrapper.hpp"
+#include "Assemble.hpp"
+
+using namespace llvm;
+
+/**
+ * Construct an assembler for the ASIC identified by a packed GFX version
+ *
+ * Starts with no stored instruction stream, derives MCPU from Gfxv and
+ * initializes the LLVM AMDGPU components.
+ *
+ * @param Gfxv Packed GFX version, decoded by SetTargetAsic()
+ */
+Assembler::Assembler(const uint32_t Gfxv)
+    : TextData(nullptr),
+      TextSize(0) {
+    SetTargetAsic(Gfxv);
+    LLVMInit();
+}
+
+/**
+ * Release the stored instruction stream, then shut LLVM down
+ *
+ * NOTE(review): llvm_shutdown() is not scoped to this object -- presumably it
+ * tears down LLVM state shared with any other live Assembler. Confirm that
+ * only one instance exists at a time.
+ */
+Assembler::~Assembler() {
+    FlushText();
+    llvm_shutdown();
+}
+
+/**
+ * @return Pointer to the stored instruction stream (TextData); nullptr when
+ *         no stream is stored. The buffer is owned by this object and is
+ *         released by FlushText().
+ */
+const char* Assembler::GetInstrStream() {
+    return TextData;
+}
+
+/**
+ * @return Size in bytes of the stored instruction stream (TextSize); 0 when
+ *         no stream is stored
+ */
+const size_t Assembler::GetInstrStreamSize() {
+    return TextSize;
+}
+
+/**
+ * Copy the stored instruction stream into a caller-provided buffer
+ *
+ * @param OutBuf Destination buffer
+ * @param BufSize Capacity of OutBuf in bytes
+ * @return 0 on success, -2 when the stream is larger than BufSize (nothing
+ *         is copied in that case)
+ */
+int Assembler::CopyInstrStream(char* OutBuf, const size_t BufSize) {
+    const bool Fits = TextSize <= BufSize;
+    if (Fits)
+        std::copy_n(TextData, TextSize, OutBuf);
+
+    return Fits ? 0 : -2;
+}
+
+/**
+ * @return Target processor name (MCPU) as written by SetTargetAsic()
+ */
+const char* Assembler::GetTargetAsic() {
+    return MCPU;
+}
+
+/**
+ * Set MCPU via GFX Version from Thunk
+ *
+ * Gfxv carries the major version in bits 23:16, the minor version in bits
+ * 15:8 and the stepping in bits 7:0. LLVM Target IDs use decimal for
+ * major/minor and hex for stepping, e.g. 0x09000a becomes "gfx90a".
+ *
+ * @param Gfxv Packed GFX version
+ */
+void Assembler::SetTargetAsic(const uint32_t Gfxv) {
+    const int GfxMajor = (Gfxv >> 16) & 0xff;
+    const int GfxMinor = (Gfxv >> 8) & 0xff;
+    const int GfxStep = Gfxv & 0xff;
+
+    snprintf(MCPU, ASM_MCPU_LEN, "gfx%d%d%x", GfxMajor, GfxMinor, GfxStep);
+}
+
+/**
+ * Initialize the LLVM AMDGPU components used by the assembler: target info,
+ * target MC layer and assembly parser
+ */
+void Assembler::LLVMInit() {
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmParser();
+}
+
+/**
+ * Free the stored instruction stream and return TextData/TextSize to their
+ * initial (empty) state
+ */
+void Assembler::FlushText() {
+    // delete[] on a null pointer is a no-op, so no guard is required
+    delete[] TextData;
+    TextSize = 0;
+    TextData = nullptr;
+}
+
+/**
+ * Dump an assembled ELF object to stdout as hex, 16 bytes per row, each row
+ * prefixed with its starting offset (debug)
+ *
+ * @param Data Bytes of the ELF object
+ */
+void Assembler::PrintELFHex(const std::string Data) {
+    const size_t Len = Data.length();
+    raw_ostream& Out = outs();
+
+    Out << "ASM Info: assembled ELF hex data (length " << Len << "):\n";
+    Out << "0x00:\t";
+    for (size_t Off = 0; Off < Len; ++Off) {
+        Out << format_hex(static_cast<uint8_t>(Data[Off]), 4);
+
+        const size_t Next = Off + 1;
+        if (Next % 16 != 0) {
+            Out << " ";
+            continue;
+        }
+        // Row is full: start a new one labelled with the next offset
+        Out << "\n" << format_hex(Next, 4) << ":\t";
+    }
+    Out << "\n";
+}
+
+/**
+ * Dump the stored raw instruction stream to stdout as hex, 16 bytes per row,
+ * each row prefixed with its starting offset (debug)
+ */
+void Assembler::PrintTextHex() {
+    raw_ostream& Out = outs();
+
+    Out << "ASM Info: assembled .text hex data (length " << TextSize << "):\n";
+    Out << "0x00:\t";
+
+    size_t Offset = 0;
+    while (Offset < TextSize) {
+        Out << format_hex(static_cast<uint8_t>(TextData[Offset]), 4);
+        ++Offset;
+        // Offset now names the next byte; a multiple of 16 closes the row
+        if (Offset % 16 == 0)
+            Out << "\n" << format_hex(Offset, 4) << ":\t";
+        else
+            Out << " ";
+    }
+    Out << "\n";
+}
+
+/**
+ * Extract raw instruction stream from .text section in ELF object
+ *
+ * The ELF header is validated (magic, 64-bit class, section header table
+ * present, section-name string table index in range) before any section is
+ * read. The total length of RawData is not known here, so offsets stored in
+ * the object cannot be bounds-checked; the caller must pass a complete
+ * object. On success TextData/TextSize hold a private copy of .text.
+ *
+ * @param RawData Pointer to the bytes of an in-memory ELF64 object (binary
+ *                data, not a NUL-terminated string)
+ * @return 0 on success, -1 on failure
+ */
+int Assembler::ExtractELFText(const char* RawData) {
+    if (!RawData) {
+        outs() << "ASM Error: elf data is invalid or corrupted\n";
+        return -1;
+    }
+
+    const Elf64_Ehdr* ElfHeader = reinterpret_cast<const Elf64_Ehdr*>(RawData);
+    if (memcmp(ElfHeader->e_ident, ELFMAG, SELFMAG) != 0) {
+        outs() << "ASM Error: elf data is invalid or corrupted\n";
+        return -1;
+    }
+    if (ElfHeader->e_ident[EI_CLASS] != ELFCLASS64) {
+        outs() << "ASM Error: elf object must be of 64-bit type\n";
+        return -1;
+    }
+
+    // The string table index is used to index the section header table, so
+    // it must name an existing entry
+    const unsigned NumSects = ElfHeader->e_shnum;
+    if (ElfHeader->e_shoff == 0 || ElfHeader->e_shstrndx >= NumSects) {
+        outs() << "ASM Error: elf section header table is invalid\n";
+        return -1;
+    }
+
+    const Elf64_Shdr* SectHeader =
+        reinterpret_cast<const Elf64_Shdr*>(RawData + ElfHeader->e_shoff);
+    const Elf64_Shdr* SectStrTable = &SectHeader[ElfHeader->e_shstrndx];
+    const char* SectStrAddr = RawData + SectStrTable->sh_offset;
+
+    // Loop through sections, stop at .text
+    for (unsigned SectIdx = 0; SectIdx < NumSects; SectIdx++) {
+        const Elf64_Shdr& Sect = SectHeader[SectIdx];
+        const std::string SectName(SectStrAddr + Sect.sh_name);
+        if (SectName != ".text")
+            continue;
+
+        // Drop any stream left over from an earlier extraction
+        delete[] TextData;
+        TextSize = Sect.sh_size;
+        TextData = new char[TextSize];
+        memcpy(TextData, RawData + Sect.sh_offset, TextSize);
+        return 0;
+    }
+
+    outs() << "ASM Error: couldn't locate .text section\n";
+    return -1;
+}
+
+/**
+ * Assemble shader, fill member vars, and copy to output buffer
+ *
+ * @param AssemblySource Shader source represented as a raw C string
+ * @param OutBuf Raw instruction stream output buffer
+ * @param BufSize Size of OutBuf (defaults to PAGE_SIZE)
+ * @return Non-zero value of RunAssemble() when assembly fails, otherwise the
+ *         value of CopyInstrStream() (0 on success, -2 if OutBuf is too
+ *         small)
+ */
+int Assembler::RunAssembleBuf(const char* const AssemblySource, char* OutBuf,
+                              const size_t BufSize) {
+    const int AsmStatus = RunAssemble(AssemblySource);
+    if (AsmStatus != 0)
+        return AsmStatus;
+
+    return CopyInstrStream(OutBuf, BufSize);
+}
+
+/**
+ * Assemble shader and fill member vars
+ *
+ * Builds the LLVM MC pipeline (register, asm, instruction and subtarget
+ * info, context, object streamer, target asm parser) for the target named by
+ * MCPU, assembles AssemblySource into an in-memory object and stores a copy
+ * of its .text section in TextData/TextSize. Any stream kept from a previous
+ * run is discarded first. Diagnostics are written to stdout.
+ *
+ * @param AssemblySource Shader source represented as a raw C string
+ * @return 0 on success, negative value on failure
+ */
+int Assembler::RunAssemble(const char* const AssemblySource) {
+    // Ensure target ASIC has been set. MCPU is an array, so its contents are
+    // tested; the array itself never converts to a null pointer.
+    if (MCPU[0] == '\0') {
+        outs() << "ASM Error: target asic is uninitialized\n";
+        return -1;
+    }
+
+    // Delete TextData for any previous runs
+    FlushText();
+
+#if 0
+    outs() << "ASM Info: running assembly for target: " << MCPU << "\n";
+    outs() << "ASM Info: source:\n";
+    outs() << AssemblySource << "\n";
+#endif
+
+    // Initialize MCOptions and target triple
+    const MCTargetOptions MCOptions;
+    Triple TheTriple;
+
+    const Target* TheTarget =
+        TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
+    if (!TheTarget) {
+        outs() << Error;
+        return -1;
+    }
+
+    TheTriple.setArchName(ArchName);
+    TheTriple.setVendorName(VendorName);
+    TheTriple.setOSName(OSName);
+
+    TripleName = TheTriple.getTriple();
+    TheTriple.setTriple(Triple::normalize(TripleName));
+
+    // Create MemoryBuffer for assembly source
+    StringRef AssemblyRef(AssemblySource);
+    std::unique_ptr<MemoryBuffer> BufferPtr =
+        MemoryBuffer::getMemBuffer(AssemblyRef, "", false);
+    if (!BufferPtr->getBufferSize()) {
+        outs() << "ASM Error: assembly source is empty\n";
+        return -1;
+    }
+
+    // Instantiate SrcMgr and transfer BufferPtr ownership
+    SourceMgr SrcMgr;
+    SrcMgr.AddNewSourceBuffer(std::move(BufferPtr), SMLoc());
+
+    // Initialize MC interfaces and base class objects
+    std::unique_ptr<const MCRegisterInfo> MRI(
+            TheTarget->createMCRegInfo(TripleName));
+    if (!MRI) {
+        outs() << "ASM Error: no register info for target " << MCPU << "\n";
+        return -1;
+    }
+#if LLVM_VERSION_MAJOR > 9
+    std::unique_ptr<const MCAsmInfo> MAI(
+            TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
+#else
+    std::unique_ptr<const MCAsmInfo> MAI(
+            TheTarget->createMCAsmInfo(*MRI, TripleName));
+#endif
+    if (!MAI) {
+        outs() << "ASM Error: no assembly info for target " << MCPU << "\n";
+        return -1;
+    }
+    std::unique_ptr<MCInstrInfo> MCII(
+            TheTarget->createMCInstrInfo());
+    if (!MCII) {
+        outs() << "ASM Error: no instruction info for target " << MCPU << "\n";
+        return -1;
+    }
+    std::unique_ptr<MCSubtargetInfo> STI(
+            TheTarget->createMCSubtargetInfo(TripleName, MCPU, std::string()));
+    if (!STI || !STI->isCPUStringValid(MCPU)) {
+        outs() << "ASM Error: no subtarget info for target " << MCPU << "\n";
+        return -1;
+    }
+
+    // Set up the MCContext for creating symbols and MCExpr's
+#if LLVM_VERSION_MAJOR > 12
+    MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), &SrcMgr, &MCOptions);
+#else
+    MCObjectFileInfo MOFI;
+    MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr, &MCOptions);
+    MOFI.InitMCObjectFileInfo(TheTriple, true, Ctx);
+#endif
+
+    // Finalize setup for output object code stream
+    std::string Data;
+    std::unique_ptr<raw_string_ostream> DataStream(std::make_unique<raw_string_ostream>(Data));
+    std::unique_ptr<buffer_ostream> BOS(std::make_unique<buffer_ostream>(*DataStream));
+    raw_pwrite_stream* OS = BOS.get();
+
+    // Own the emitter and backend from the moment they are created so they
+    // are released on every early-return path, and verify both exist before
+    // the backend is dereferenced.
+#if LLVM_VERSION_MAJOR > 14
+    std::unique_ptr<MCCodeEmitter> CE(
+            TheTarget->createMCCodeEmitter(*MCII, Ctx));
+#else
+    std::unique_ptr<MCCodeEmitter> CE(
+            TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx));
+#endif
+    std::unique_ptr<MCAsmBackend> MAB(
+            TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions));
+    if (!CE || !MAB) {
+        outs() << "ASM Error: no code emitter or asm backend for target " << MCPU << "\n";
+        return -1;
+    }
+
+    // The object writer must be created before MAB is handed to the streamer
+    std::unique_ptr<MCObjectWriter> OW(MAB->createObjectWriter(*OS));
+
+    std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
+        TheTriple, Ctx,
+        std::move(MAB), std::move(OW),
+        std::move(CE), *STI, MCOptions.MCRelaxAll,
+        MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false));
+    if (!Streamer) {
+        outs() << "ASM Error: no object streamer for target " << MCPU << "\n";
+        return -1;
+    }
+
+    std::unique_ptr<MCAsmParser> Parser(
+            createMCAsmParser(SrcMgr, Ctx, *Streamer, *MAI));
+
+    // Set parser to target parser and run
+    std::unique_ptr<MCTargetAsmParser> TAP(
+            TheTarget->createMCAsmParser(*STI, *Parser, *MCII, MCOptions));
+    if (!TAP) {
+        outs() << "ASM Error: no assembly parsing support for target " << MCPU << "\n";
+        return -1;
+    }
+    Parser->setTargetParser(*TAP);
+
+    if (Parser->Run(true)) {
+        outs() << "ASM Error: assembly parser failed\n";
+        return -1;
+    }
+
+    // Destroying the buffered stream pushes its contents into DataStream,
+    // which is then flushed into Data
+    BOS.reset();
+    DataStream->flush();
+
+    // ExtractELFText() reads the ELF header unconditionally, so make sure
+    // the object is at least that large
+    if (Data.size() < sizeof(Elf64_Ehdr)) {
+        outs() << "ASM Error: assembled object is truncated\n";
+        return -1;
+    }
+
+    int ret = ExtractELFText(Data.data());
+    if (ret < 0 || !TextData) {
+        outs() << "ASM Error: .text extraction failed\n";
+        // Never report success when no instruction stream was stored
+        return ret < 0 ? ret : -1;
+    }
+
+#if 0
+    PrintELFHex(Data);
+    PrintTextHex();
+#endif
+
+    return 0;
+}
@@ -0,0 +1,86 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2022, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _ASSEMBLE_H_
+#define _ASSEMBLE_H_
+
+#include "OSWrapper.hpp"
+
+#define ASM_MCPU_LEN 16
+
+/**
+ * Self-contained assembler for AMDGCN shader source
+ *
+ * An instance is bound to one target ASIC at construction. RunAssemble()
+ * stores the resulting raw instruction stream inside the object, where it
+ * stays until the next run or until the object is destroyed.
+ */
+class Assembler {
+  private:
+      // Components of the target triple
+      const char* ArchName = "amdgcn";
+      const char* VendorName = "amd";
+      const char* OSName = "amdhsa";
+      // Target processor name, filled in by SetTargetAsic()
+      char MCPU[ASM_MCPU_LEN];
+
+      std::string TripleName;
+      std::string Error;
+
+      // Raw instruction stream owned by this object and its size in bytes
+      char* TextData;
+      size_t TextSize;
+
+      void SetTargetAsic(const uint32_t Gfxv);
+
+      void LLVMInit();
+      void FlushText();
+      void PrintELFHex(const std::string Data);
+      int ExtractELFText(const char* RawData);
+
+  public:
+      Assembler(const uint32_t Gfxv);
+      ~Assembler();
+
+      // TextData is owned through a raw pointer and freed by the destructor,
+      // so a member-wise copy would free the same buffer twice; copying is
+      // therefore disabled.
+      Assembler(const Assembler&) = delete;
+      Assembler& operator=(const Assembler&) = delete;
+
+      void PrintTextHex();
+      const char* GetTargetAsic();
+
+      // Access to the instruction stream produced by the last assembly run
+      const char* GetInstrStream();
+      const size_t GetInstrStreamSize();
+      int CopyInstrStream(char* OutBuf, const size_t BufSize = PAGE_SIZE);
+
+      // Both return 0 on success
+      int RunAssemble(const char* const AssemblySource);
+      int RunAssembleBuf(const char* const AssemblySource, char* OutBuf,
+                         const size_t BufSize = PAGE_SIZE);
+};
+
+#endif  // _ASSEMBLE_H_
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "IsaGenerator.hpp"
-
-#include <algorithm>
-#include <string>
-
-#include "IsaGenerator_Gfx72.hpp"
-#include "IsaGenerator_Gfx8.hpp"
-#include "IsaGenerator_Gfx9.hpp"
-#include "IsaGenerator_Gfx10.hpp"
-#include "IsaGenerator_Aldebaran.hpp"
-
-#include "GoogleTestExtension.hpp"
-
-#include "sp3.h"
-
-const std::string IsaGenerator::ADDRESS_WATCH_SP3(
-    "var REG_TRAPSTS_EXCP_MASK = 0x000001ff\n"
-    "var WAVE_COUNT_OFFSET = 12\n"
-    "var TMA_CYCLE_OFFSET  = 16\n"
-    "\n"
-    "/*\n"
-    " * ttmp[0:1]   -- The ISA address that triggered this trap handler\n"
-    " * ttmp[10:11] -- The TMA user provided, used to store the debug info in this shader\n"
-    " * v[10:14] ttmp[7:8] -- temp use inside this shader\n"
-    " * s5 -- store the counts that this trap been triggered\n"
-    " * Each time when the trap is triggered , this shader will write\n"
-    " * ttmp[0] : ttmp[1] : Trap_Status : [reserved]\n"
-    " * to TMA + (trap count * TMA_CYCLE_OFFSET)\n"
-    " * The TMA + WAVE_COUNT_OFFSET(the first [reserved] address)\n"
-    " * used to store the total triggered trap count.\n"
-    " */\n"
-    "shader main\n"
-    "\n"
-    "    asic(VI)\n"
-    "\n"
-    "    type(CS)\n"
-    "    v_mov_b32      v10, ttmp10\n"
-    "    v_mov_b32      v11, ttmp11\n"
-    "    s_mov_b32      ttmp7, s5\n"
-    "    s_mulk_i32     ttmp7, TMA_CYCLE_OFFSET\n"
-    "    s_addk_i32     s5, 1\n"
-    "    v_mov_b32      v12, ttmp0\n"
-    "    v_add_u32      v10, vcc, ttmp7, v10\n"
-    "    flat_store_dword   v[10,11], v12 slc glc\n"
-    "    v_mov_b32      v12, ttmp1\n"
-    "    v_add_u32      v10, vcc, 4, v10\n"
-    "    flat_store_dword   v[10,11], v12 slc  glc\n"
-    "    s_getreg_b32   ttmp8, hwreg(HW_REG_TRAPSTS)\n"
-    "    s_and_b32      ttmp8, ttmp8, REG_TRAPSTS_EXCP_MASK\n"
-    "    v_mov_b32      v12, ttmp8\n"
-    "    v_add_u32      v10, vcc, 4, v10\n"
-    "    flat_store_dword   v[10,11], v12  glc\n"
-    "    v_mov_b32      v10, ttmp10\n"
-    "    v_add_u32      v10, vcc, WAVE_COUNT_OFFSET, v10\n"
-    "    v_mov_b32      v13, 1\n"
-    "    flat_atomic_add    v14, v[10:11], v13 slc glc\n"
-    "    s_and_b32      ttmp1, ttmp1, 0xffff\n"
-    "    s_rfe_b64      [ttmp0,ttmp1]\n"
-    "end\n"
-);
-
-IsaGenerator* IsaGenerator::Create(unsigned int familyId) {
-    switch (familyId) {
-    case FAMILY_CI:
-    case FAMILY_KV:
-        return new IsaGenerator_Gfx72;
-    case FAMILY_VI:
-    case FAMILY_CZ:
-        return new IsaGenerator_Gfx8;
-    case FAMILY_AI:
-    case FAMILY_RV:
-    case FAMILY_AR:
-        return new IsaGenerator_Gfx9;
-    case FAMILY_AL:
-        return new IsaGenerator_Aldbrn;
-    case FAMILY_NV:
-        return new IsaGenerator_Gfx10;
-
-    default:
-        LOG() << "Error: Invalid ISA" << std::endl;
-        return NULL;
-    }
-}
-
-void IsaGenerator::GetAwTrapHandler(HsaMemoryBuffer& rBuf) {
-    CompileShader(ADDRESS_WATCH_SP3.c_str(), "main", rBuf);
-}
-
-void IsaGenerator::CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf) {
-    sp3_context* pSp3 = sp3_new();
-    sp3_setasic(pSp3, GetAsicName().c_str());
-    sp3_parse_string(pSp3, shaderCode);
-    sp3_shader* pShader = sp3_compile(pSp3, shaderName);
-
-    std::copy(pShader->data, pShader->data + pShader->size, rBuf.As<unsigned int*>());
-    sp3_free_shader(pShader);
-
-    /** Inside this close function, there is an unknown reason of free memory not used by compiler.
-     *  Comment out this as a workaround. System will do the garbage collection after this
-     *  application is closed.
-     */
-    // sp3_close(pSp3);
-}
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef _ISAGENERATOR_H_
-#define _ISAGENERATOR_H_
-
-#include "KFDTestUtil.hpp"
-
-/* isa generation class - interface */
-class IsaGenerator {
- public:
-    static IsaGenerator* Create(unsigned int familyId);
-
-    virtual ~IsaGenerator() {}
-
-    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf) = 0;
-    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf) = 0;
-    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) = 0;
-    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf) = 0;
-    virtual void GetCwsrTrapHandler(HsaMemoryBuffer& rBuf) {}
-    virtual void GetAwTrapHandler(HsaMemoryBuffer& rBuf);
-
-    void CompileShader(const char* shaderCode, const char* shaderName, HsaMemoryBuffer& rBuf);
-
- protected:
-    virtual const std::string& GetAsicName() = 0;
-
- private:
-    static const std::string ADDRESS_WATCH_SP3;
-};
-
-#endif  // _ISAGENERATOR_H_
@@ -1,113 +0,0 @@
-/*
- * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "IsaGenerator_Aldebaran.hpp"
-
-#include <algorithm>
-#include <string>
-
-const std::string IsaGenerator_Aldbrn::ASIC_NAME = "ALDEBARAN";
-
-/* The binaries are generated from following ISA */
-#if 0
-/* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */
-shader atomic_add
-asic(ALDEBARAN)
-type(CS)
-    v_mov_b32 v0, s0
-    v_mov_b32 v1, s1
-    v_mov_b32 v2, 1
-    flat_atomic_add v3, v[0:1], v2 slc glc scc
-    s_waitcnt 0
-    s_endpgm
-end
-
-shader copy_dword
-asic(ALDEBARAN)
-type(CS)
-/* copy the parameters from scalar registers to vector registers */
-    v_mov_b32 v0, s0
-    v_mov_b32 v1, s1
-    v_mov_b32 v2, s2
-    v_mov_b32 v3, s3
-/* copy a dword between the passed addresses */
-    flat_load_dword v4, v[0:1] slc glc
-    s_waitcnt 0
-    flat_store_dword v[2:3], v4 slc glc
-    s_endpgm
-end
-
-shader main
-asic(ALDEBARAN)
-type(CS)
-loop:
-    s_branch loop
-    s_endpgm
-end
-
-
-#endif
-
-const uint32_t IsaGenerator_Aldbrn::NOOP_ISA[] = {
-    0xbf810000
-};
-
-const uint32_t IsaGenerator_Aldbrn::COPY_DWORD_ISA[] = {
-    0x7e000200, 0x7e020201,
-    0x7e040202, 0x7e060203,
-    0xdc530000, 0x047f0000,
-    0xbf8c0000, 0xdc730000,
-    0x007f0402, 0xbf810000
-};
-
-const uint32_t IsaGenerator_Aldbrn::INFINITE_LOOP_ISA[] = {
-    0xbf82ffff, 0xbf810000
-};
-
-const uint32_t IsaGenerator_Aldbrn::ATOMIC_ADD_ISA[] = {
-    0x7e000200, 0x7e020201,
-    0x7e040281, 0xdf0b0000,
-    0x037f0200, 0xbf8c0000,
-    0xbf810000, 0x00000000
-};
-
-void IsaGenerator_Aldbrn::GetNoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Aldbrn::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Aldbrn::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Aldbrn::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
-}
-
-const std::string& IsaGenerator_Aldbrn::GetAsicName() {
-    return ASIC_NAME;
-}
-
@@ -1,142 +0,0 @@
-/*
- * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "IsaGenerator_Gfx10.hpp"
-
-#include <algorithm>
-#include <string>
-
-/* The binaries are generated from following ISA */
-const std::string IsaGenerator_Gfx10::ASIC_NAME = "GFX10";
-#if 0
-static const char * atomic_add = \
-"\
-shader atomic_add \n\
-asic(GFX10) \n\
-wave_size(32) \n\
-type(CS) \n\
-    v_mov_b32 v0, s0 \n\
-    v_mov_b32 v1, s1 \n\
-    v_mov_b32 v2, 1 \n\
-    flat_atomic_add v3, v[0:1], v2 slc glc \n\
-    s_waitcnt 0 \n\
-    s_endpgm \n\
-end \n\
-";
-
-static const char * copy_dword = \
-"\
-shader copy_dword \n\
-asic(GFX10) \n\
-wave_size(32) \n\
-type(CS) \n\
-    v_mov_b32 v0, s0 \n\
-    v_mov_b32 v1, s1 \n\
-    v_mov_b32 v2, s2 \n\
-    v_mov_b32 v3, s3 \n\
-    flat_load_dword v4, v[0:1] slc glc \n\
-    s_waitcnt 0 \n\
-    flat_store_dword v[2:3], v4 slc glc \n\
-    s_endpgm \n\
-end \n\
-";
-
-static const char * loop= \
-"\
-shader loop \n\
-asic(GFX10) \n\
-type(CS) \n\
-wave_size(32) \n\
-loop: \n\
-    s_branch loop \n\
-    s_endpgm \n\
-end \n\
-";
-
-static const char * noop= \
-"\
-shader noop \n\
-asic(GFX10) \n\
-type(CS) \n\
-wave_size(32) \n\
-    s_endpgm \n\
-end \n\
-";
-#endif
-
-const uint32_t IsaGenerator_Gfx10::NOOP_ISA[] = {
-0xb0804004, 0xbf810000,
-0xbf9f0000, 0xbf9f0000,
-0xbf9f0000, 0xbf9f0000,
-0xbf9f0000
-};
-
-const uint32_t IsaGenerator_Gfx10::COPY_DWORD_ISA[] = {
-0xb0804004, 0x7e000200,
-0x7e020201, 0x7e040202,
-0x7e060203, 0xdc330000,
-0x47d0000, 0xbf8c0000,
-0xdc730000, 0x7d0402,
-0xbf810000, 0xbf9f0000,
-0xbf9f0000, 0xbf9f0000,
-0xbf9f0000, 0xbf9f0000
-};
-
-const uint32_t IsaGenerator_Gfx10::INFINITE_LOOP_ISA[] = {
-0xbf82ffff, 0xb0804004,
-0xbf810000, 0xbf9f0000,
-0xbf9f0000, 0xbf9f0000,
-0xbf9f0000, 0xbf9f0000
-};
-
-const uint32_t IsaGenerator_Gfx10::ATOMIC_ADD_ISA[] = {
-0xb0804004, 0x7e000200,
-0x7e020201, 0x7e040281,
-0xdccb0000, 0x37d0200,
-0xbf8c0000, 0xbf810000,
-0xbf9f0000, 0xbf9f0000,
-0xbf9f0000, 0xbf9f0000,
-0xbf9f0000
-};
-
-
-void IsaGenerator_Gfx10::GetNoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx10::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx10::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx10::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
-}
-
-const std::string& IsaGenerator_Gfx10::GetAsicName() {
-    return ASIC_NAME;
-}
-
@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "IsaGenerator_Gfx72.hpp"
-
-#include <algorithm>
-#include <string>
-
-const std::string IsaGenerator_Gfx72::ASIC_NAME = "CI";
-
-const uint32_t IsaGenerator_Gfx72::NOOP_ISA[] = {
-    0xbf810000  // S_ENDPGM
-};
-
-/* The below arrays are filled with hex values in order not to reference
- * proprietary header files, but we still leave the code here for future
- * reference.
- */
-#if 0
-const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = {
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v0, s0 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v1, s1 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v2, s2 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v3, s3 (VOP1)
-
-    (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/,    // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
-    (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT),       // ADDR = V0:V1, VDST = V4 (FLAT_1)
-
-    (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT),  // s_waitcnt 0 (SOPP)
-
-    (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT),    // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
-    (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT),        // ADDR = V2:V3, DATA = V4 (FLAT_1)
-
-    0xBF810000u  // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
-};
-
-const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = {
-    (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT),  // s_branch -1 (PC <- PC + SIMM*4)+4
-    0xBF810000u  // S_ENDPGM
-};
-
-const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = {
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v0, s0 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v1, s1 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0xC1 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 0xFFFFFFFF, s2 (VOP1)
-
-    (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_ATOMIC_INC << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (0 << SQ_FLAT_0__GLC__SHIFT),    // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0)
-    (3u << SQ_FLAT_1__VDST__SHIFT) | (2u << SQ_FLAT_1__DATA__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT),        // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1)
-    0xBF810000u  // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
-};
-#endif
-
-const uint32_t IsaGenerator_Gfx72::COPY_DWORD_ISA[] = {
-    0x7e000200,  // v_mov_b32 v0, s0 (VOP1)
-    0x7e020201,  // v_mov_b32 v1, s1 (VOP1)
-    0x7e040202,  // v_mov_b32 v2, s2 (VOP1)
-    0x7e060203,  // v_mov_b32 v3, s3 (VOP1)
-
-    0xdc330000,  // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
-    0x04000000,  // ADDR = V0:V1, VDST = V4 (FLAT_1)
-
-    0xbf8c0000,  // s_waitcnt 0 (SOPP)
-
-    0xdc730000,  // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
-    0x00000402,  // ADDR = V2:V3, DATA = V4 (FLAT_1)
-
-    0xbf810000   // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
-};
-
-const uint32_t IsaGenerator_Gfx72::INFINITE_LOOP_ISA[] = {
-    0xbf82ffff,  // s_branch -1 (PC <- PC + SIMM*4)+4
-    0xbf810000   // S_ENDPGM
-};
-
-const uint32_t IsaGenerator_Gfx72::ATOMIC_INC_ISA[] = {
-    0x7e000200,  // v_mov_b32 v0, s0 (VOP1)
-    0x7e020201,  // v_mov_b32 v1, s1 (VOP1)
-    0x7e0402c1,  // v_mov_b32 0xFFFFFFFF, s2 (VOP1)
-
-    0xdcf20000,  // SQ_FLAT_0, flat_atomic_inc, slc = 1, glc = 0 (FLAT_0)
-    0x03000200,  // ADDR/dst = V0:V1, VDST/ret = V3, DATA/src=V2 (FLAT_1)
-    0xbf810000  // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
-};
-
-void IsaGenerator_Gfx72::GetNoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx72::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx72::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx72::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(ATOMIC_INC_ISA, ATOMIC_INC_ISA+ARRAY_SIZE(ATOMIC_INC_ISA), rBuf.As<uint32_t*>());
-}
-
-const std::string& IsaGenerator_Gfx72::GetAsicName() {
-    return ASIC_NAME;
-}
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef _ISAGENERATOR_GFX72_H_
-#define _ISAGENERATOR_GFX72_H_
-
-#include <string>
-#include "IsaGenerator.hpp"
-
-class IsaGenerator_Gfx72 : public IsaGenerator {
- public:
-    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
-
- protected:
-    virtual const std::string& GetAsicName();
-
- private:
-    static const std::string ASIC_NAME;
-
-    static const uint32_t NOOP_ISA[];
-    static const uint32_t COPY_DWORD_ISA[];
-    static const uint32_t INFINITE_LOOP_ISA[];
-    static const uint32_t ATOMIC_INC_ISA[];
-};
-
-#endif  // _ISAGENERATOR_GFX72_H_
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "IsaGenerator_Gfx8.hpp"
-
-#include <algorithm>
-#include <string>
-
-const std::string IsaGenerator_Gfx8::ASIC_NAME = "VI";
-
-const uint32_t IsaGenerator_Gfx8::NOOP_ISA[] = {
-    0xbf810000  // S_ENDPGM
-};
-
-/** The below arrays are filled with hex values in order not to reference
- *  proprietary header files, but we still leave the code here for future
- *  reference.
- */
-#if 0
-const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = {
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (0 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (0 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v0, s0 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (1 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (1 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v1, s1 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (2 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (2 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v2, s2 (VOP1)
-    (63u << SQ_VOP1__ENCODING__SHIFT) | (3 << SQ_VOP1__VDST__SHIFT) | (SQ_V_MOV_B32 << SQ_VOP1__OP__SHIFT) | (3 << SQ_VOP1__SRC0__SHIFT),  // v_mov_b32 v3, s3 (VOP1)
-
-    (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_LOAD_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT)/*(3 << 16)*/,    // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
-    (4u << SQ_FLAT_1__VDST__SHIFT) | (0 << SQ_FLAT_1__ADDR__SHIFT),       // ADDR = V0:V1, VDST = V4 (FLAT_1)
-
-    (383u << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_WAITCNT << SQ_SOPP__OP__SHIFT) | (0 << SQ_SOPP__SIMM16__SHIFT),  // s_waitcnt 0 (SOPP)
-
-    (55u << SQ_FLAT_0__ENCODING__SHIFT) | (SQ_FLAT_STORE_DWORD << SQ_FLAT_0__OP__SHIFT) | (1 << SQ_FLAT_0__SLC__SHIFT) | (1 << SQ_FLAT_0__GLC__SHIFT),    // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
-    (4u << SQ_FLAT_1__DATA__SHIFT) | (2 << SQ_FLAT_1__ADDR__SHIFT),        // ADDR = V2:V3, DATA = V4 (FLAT_1)
-
-    0xBF810000u  // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
-};
-
-const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = {
-    (0x17F << SQ_SOPP__ENCODING__SHIFT) | (SQ_S_BRANCH << SQ_SOPP__OP__SHIFT) | ( (const uint32_t)-1 << SQ_SOPP__SIMM16__SHIFT),  // s_branch -1 (PC <- PC + SIMM*4)+4
-    0xBF810000u  // S_ENDPGM
-};
-#endif
-
-const uint32_t IsaGenerator_Gfx8::COPY_DWORD_ISA[] = {
-    0x7e000200,  // v_mov_b32 v0, s0 (VOP1)
-    0x7e020201,  // v_mov_b32 v1, s1 (VOP1)
-    0x7e040202,  // v_mov_b32 v2, s2 (VOP1)
-    0x7e060203,  // v_mov_b32 v3, s3 (VOP1)
-
-    0xdc530000,  // SQ_FLAT_0, flat_load_dword, slc = 1, glc = 1 (FLAT_0)
-    0x04000000,  // ADDR = V0:V1, VDST = V4 (FLAT_1)
-
-    0xbf8c0000,  // s_waitcnt 0 (SOPP)
-
-    0xdc730000,  // SQ_FLAT_0, flat_store_dword, slc = 1, glc = 1 (FLAT_0)
-    0x00000402,  // ADDR = V2:V3, DATA = V4 (FLAT_1)
-
-    0xbf810000   // s_endpgm, note that we rely on the implicit s_waitcnt 0,0,0
-};
-
-const uint32_t IsaGenerator_Gfx8::INFINITE_LOOP_ISA[] = {
-    0xbf82ffff,  // s_branch -1 (PC <- PC + SIMM*4)+4
-    0xbf810000   // S_ENDPGM
-};
-
-/**
- * The atomic_add_isa binary is generated from following ISA
- * The original atomic_inc is not support by some PCIE, so use atomic_add instead
- *
- */
-/*
-shader atomic_add
-asic(VI)
-type(CS)
-    v_mov_b32 v0, s0
-    v_mov_b32 v1, s1
-    v_mov_b32 v2, 1
-    flat_atomic_add v3, v[0:1], v2 slc glc
-    s_waitcnt  0
-    s_endpgm
-end
-*/
-
-const uint32_t IsaGenerator_Gfx8::ATOMIC_ADD_ISA[] = {
-    0x7e000200, 0x7e020201,
-    0x7e040281, 0xdd0b0000,
-    0x03000200, 0xbf8c0000,
-    0xbf810000, 0x00000000
-};
-
-void IsaGenerator_Gfx8::GetNoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx8::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx8::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx8::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
-}
-
-const std::string& IsaGenerator_Gfx8::GetAsicName() {
-    return ASIC_NAME;
-}
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef _ISAGENERATOR_GFX8_H_
-#define _ISAGENERATOR_GFX8_H_
-
-#include <string>
-#include "IsaGenerator.hpp"
-
-class IsaGenerator_Gfx8 : public IsaGenerator {
- public:
-    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
-
- protected:
-    virtual const std::string& GetAsicName();
-
- private:
-    static const std::string ASIC_NAME;
-
-    static const uint32_t NOOP_ISA[];
-    static const uint32_t COPY_DWORD_ISA[];
-    static const uint32_t INFINITE_LOOP_ISA[];
-    static const uint32_t ATOMIC_ADD_ISA[];
-};
-
-#endif  // _ISAGENERATOR_GFX72_H_
@@ -1,113 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "IsaGenerator_Gfx9.hpp"
-
-#include <algorithm>
-#include <string>
-
-const std::string IsaGenerator_Gfx9::ASIC_NAME = "GFX9";
-
-/* The binaries are generated from following ISA */
-#if 0
-/* flat_atomic_inc will not support by some PCIE, use flat_atomic_add instead */
-shader atomic_add
-asic(GFX9)
-type(CS)
-    v_mov_b32 v0, s0
-    v_mov_b32 v1, s1
-    v_mov_b32 v2, 1
-    flat_atomic_add v3, v[0:1], v2 slc glc
-    s_waitcnt 0
-    s_endpgm
-end
-
-shader copy_dword
-asic(GFX9)
-type(CS)
-/* copy the parameters from scalar registers to vector registers */
-    v_mov_b32 v0, s0
-    v_mov_b32 v1, s1
-    v_mov_b32 v2, s2
-    v_mov_b32 v3, s3
-/* copy a dword between the passed addresses */
-    flat_load_dword v4, v[0:1] slc glc
-    s_waitcnt 0
-    flat_store_dword v[2:3], v4 slc glc
-    s_endpgm
-end
-
-shader main
-asic(GFX9)
-type(CS)
-loop:
-    s_branch loop
-    s_endpgm
-end
-
-
-#endif
-
-const uint32_t IsaGenerator_Gfx9::NOOP_ISA[] = {
-    0xbf810000
-};
-
-const uint32_t IsaGenerator_Gfx9::COPY_DWORD_ISA[] = {
-    0x7e000200, 0x7e020201,
-    0x7e040202, 0x7e060203,
-    0xdc530000, 0x047f0000,
-    0xbf8c0000, 0xdc730000,
-    0x007f0402, 0xbf810000
-};
-
-const uint32_t IsaGenerator_Gfx9::INFINITE_LOOP_ISA[] = {
-    0xbf82ffff, 0xbf810000
-};
-
-const uint32_t IsaGenerator_Gfx9::ATOMIC_ADD_ISA[] = {
-    0x7e000200, 0x7e020201,
-    0x7e040281, 0xdd0b0000,
-    0x037f0200, 0xbf8c0000,
-    0xbf810000, 0x00000000
-};
-
-void IsaGenerator_Gfx9::GetNoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(NOOP_ISA, NOOP_ISA+ARRAY_SIZE(NOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx9::GetCopyDwordIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(COPY_DWORD_ISA, COPY_DWORD_ISA+ARRAY_SIZE(COPY_DWORD_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx9::GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(INFINITE_LOOP_ISA, INFINITE_LOOP_ISA+ARRAY_SIZE(INFINITE_LOOP_ISA), rBuf.As<uint32_t*>());
-}
-
-void IsaGenerator_Gfx9::GetAtomicIncIsa(HsaMemoryBuffer& rBuf) {
-    std::copy(ATOMIC_ADD_ISA, ATOMIC_ADD_ISA+ARRAY_SIZE(ATOMIC_ADD_ISA), rBuf.As<uint32_t*>());
-}
-
-const std::string& IsaGenerator_Gfx9::GetAsicName() {
-    return ASIC_NAME;
-}
-
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef _ISAGENERATOR_GFX9_H_
-#define _ISAGENERATOR_GFX9_H_
-
-#include <string>
-#include "IsaGenerator.hpp"
-
-class IsaGenerator_Gfx9 : public IsaGenerator {
- public:
-    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
-
- protected:
-    virtual const std::string& GetAsicName();
-
- private:
-    static const std::string ASIC_NAME;
-
-    static const uint32_t NOOP_ISA[];
-    static const uint32_t COPY_DWORD_ISA[];
-    static const uint32_t INFINITE_LOOP_ISA[];
-    static const uint32_t ATOMIC_ADD_ISA[];
-};
-
-#endif  // _ISAGENERATOR_GFX9_H_
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -21,29 +21,53 @@
 *
 */

-#ifndef _ISAGENERATOR_ALDEBARAN_H_
-#define _ISAGENERATOR_ALDEBARAN_H_
+#include "GoogleTestExtension.hpp"
+#include "KFDASMTest.hpp"
+#include "ShaderStore.hpp"
+#include "Assemble.hpp"

-#include <string>
-#include "IsaGenerator.hpp"
+void KFDASMTest::SetUp() {}
+void KFDASMTest::TearDown() {}

-class IsaGenerator_Aldbrn : public IsaGenerator {
- public:
-    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
-
- protected:
-    virtual const std::string& GetAsicName();
-
- private:
-    static const std::string ASIC_NAME;
-
-    static const uint32_t NOOP_ISA[];
-    static const uint32_t COPY_DWORD_ISA[];
-    static const uint32_t INFINITE_LOOP_ISA[];
-    static const uint32_t ATOMIC_ADD_ISA[];
+static const std::vector<uint32_t> TargetList = {
+    0x080001,
+    0x080002,
+    0x080003,
+    0x080005,
+    0x080100,
+    0x090000,
+    0x090002,
+    0x090004,
+    0x090006,
+    0x090008,
+    0x090009,
+    0x09000a,
+    0x09000c,
+    0x0a0100,
+    0x0a0101,
+    0x0a0102,
+    0x0a0103,
+    0x0a0300,
+    0x0a0301,
+    0x0a0302,
+    0x0a0303,
+    0x0a0304,
+    0x0a0305,
+    0x0a0306,
 };

-#endif  // _ISAGENERATOR_ALDEBARAN_H_
+TEST_F(KFDASMTest, AssembleShaders) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    // Assemble every entry of ShaderList once for each target in TargetList.
+    for (const uint32_t target : TargetList) {
+        Assembler assembler(target);
+        LOG() << "Running ASM test for target " << assembler.GetTargetAsic() << std::endl;
+
+        for (auto &shader : ShaderList) {
+            EXPECT_SUCCESS(assembler.RunAssemble(shader));
+        }
+    }
+
+    TEST_END
+}
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -21,3 +21,19 @@
 *
 */

+#ifndef __KFD_ASM_TEST__H__
+#define __KFD_ASM_TEST__H__
+
+#include <gtest/gtest.h>
+
+class KFDASMTest : public testing::Test {
+ public:
+    KFDASMTest() {}
+    ~KFDASMTest() {}
+
+ protected:
+    virtual void SetUp();
+    virtual void TearDown();
+};
+
+#endif  // __KFD_ASM_TEST__H__
@@ -68,6 +68,8 @@ void KFDBaseComponentTest::SetUp() {

    g_baseTest = this;

+    m_pAsm = new Assembler(GetGfxVersion(nodeProperties));
+
    ROUTINE_END
 }

@@ -86,6 +88,10 @@ void KFDBaseComponentTest::TearDown() {
    EXPECT_SUCCESS(hsaKmtCloseKFD());
    g_baseTest = NULL;

+    // Release the assembler created in SetUp(); delete on a null pointer is a no-op.
+    delete m_pAsm;
+    m_pAsm = nullptr;
+
    ROUTINE_END
 }

@@ -34,6 +34,8 @@
 #include "hsakmt.h"
 #include "OSWrapper.hpp"
 #include "KFDTestUtil.hpp"
+#include "Assemble.hpp"
+#include "ShaderStore.hpp"

 //  @class KFDBaseComponentTest
 class KFDBaseComponentTest : public testing::Test {
@@ -74,6 +76,7 @@ class KFDBaseComponentTest : public testing::Test {
    HsaMemFlags m_MemoryFlags;
    HsaNodeInfo m_NodeInfo;
    HSAint32 m_xnack;
+    Assembler* m_pAsm = nullptr;  // owned; allocated in SetUp(), freed in TearDown()

    // @brief Executed before every test that uses KFDBaseComponentTest class and sets all common settings for the tests.
    virtual void SetUp();
@@ -24,90 +24,11 @@
 #include "KFDCWSRTest.hpp"
 #include "Dispatch.hpp"

-
-/* Initial state:
- *   s[0:1] - 64 bits iteration number; only the lower 32 bits are useful.
- *   s[2:3] - result buffer base address
- *   s4 - workgroup id
- *   v0 - workitem id, always 0 because
- *        NUM_THREADS_X(number of threads) in workgroup set to 1
- * Registers:
- *   v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4
- *   v2 - = s0, 32 bits iteration number
- *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
- *   v6 - counter
- */
-
-static const char* iterate_isa_gfx8 = \
-"\
-shader iterate_isa\n\
-wave_size(32)\n\
-type(CS)\n\
-    // copy the parameters from scalar registers to vector registers\n\
-    v_mov_b32       v2, s0   // v[2:3] = s[0:1] \n\
-    v_mov_b32       v3, s1   // v[2:3] = s[0:1] \n\
-    v_mov_b32       v0, s4   // use workgroup id as index \n\
-    v_lshlrev_b32   v0, 2, v0   // v0 *= 4 \n\
-    v_add_u32       v4, vcc, s2, v0   // v[4:5] = s[2:3] + v0 * 4 \n\
-    v_mov_b32       v5, s3   // v[4:5] = s[2:3] + v0 * 4 \n\
-    v_add_u32       v5, vcc, v5, vcc_lo   // v[4:5] = s[2:3] + v0 * 4 \n\
-    v_mov_b32       v6, 0 \n\
-LOOP: \n\
-    v_add_u32       v6, vcc, 1, v6 \n\
-    // compare the result value (v6) to iteration value (v2), and \n\
-    // jump if equal (i.e. if VCC is not zero after the comparison) \n\
-    v_cmp_lt_u32 vcc, v6, v2 \n\
-    s_cbranch_vccnz LOOP \n\
-    flat_store_dword v[4:5], v6 \n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0) \n\
-    s_endpgm \n\
-end \n\
-";
-
-//This shader can be used by gfx9 and gfx10
-static const char* iterate_isa_gfx9 = \
-"\
-shader iterate_isa\n\
-wave_size(32)\n\
-type(CS)\n\
-    // copy the parameters from scalar registers to vector registers\n\
-    v_mov_b32       v2, s0   // v[2:3] = s[0:1] \n\
-    v_mov_b32       v3, s1   // v[2:3] = s[0:1] \n\
-    v_mov_b32       v0, s4   // use workgroup id as index \n\
-    v_lshlrev_b32   v0, 2, v0   // v0 *= 4 \n\
-    v_add_co_u32    v4, vcc, s2, v0   // v[4:5] = s[2:3] + v0 * 4 \n\
-    v_mov_b32       v5, s3   // v[4:5] = s[2:3] + v0 * 4 \n\
-    v_add_co_u32    v5, vcc, v5, vcc_lo   // v[4:5] = s[2:3] + v0 * 4 \n\
-    v_mov_b32       v6, 0 \n\
-LOOP: \n\
-    v_add_co_u32    v6, vcc, 1, v6 \n\
-    // compare the result value (v6) to iteration value (v2), and \n\
-    // jump if equal (i.e. if VCC is not zero after the comparison) \n\
-    v_cmp_lt_u32 vcc, v6, v2 \n\
-    s_cbranch_vccnz LOOP \n\
-    flat_store_dword v[4:5], v6 \n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0) \n\
-    s_endpgm \n\
-end \n\
-";
-
-static const char* infinite_isa = \
-"\
-shader infinite_isa \n\
-wave_size(32) \n\
-type(CS) \n\
-LOOP: \n\
-    s_branch LOOP \n\
-end \n\
-";
-
 void KFDCWSRTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    wave_number = 1;

    ROUTINE_END
@@ -115,9 +36,6 @@ void KFDCWSRTest::SetUp() {

 void KFDCWSRTest::TearDown() {
    ROUTINE_START
-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;

    KFDBaseComponentTest::TearDown();

@@ -153,16 +71,10 @@ TEST_F(KFDCWSRTest, BasicTest) {
    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();

    if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) {
-        const char *pIterateIsa;
        HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
        HsaMemoryBuffer resultBuf1(PAGE_SIZE, defaultGPUNode, true, false, false);
        uint64_t count1 = 400000000;

-        if (m_FamilyId < FAMILY_AI)
-            pIterateIsa = iterate_isa_gfx8;
-        else
-            pIterateIsa = iterate_isa_gfx9;
-
        if (isOnEmulator()) {
            // Divide the iterator times by 10000 so that the test can
            // finish in a reasonable time.
@@ -172,7 +84,7 @@ TEST_F(KFDCWSRTest, BasicTest) {

        unsigned int* result1 = resultBuf1.As<unsigned int*>();

-        m_pIsaGen->CompileShader(pIterateIsa, "iterate_isa", isaBuffer);
+        ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(IterateIsa, isaBuffer.As<char*>()));

        PM4Queue queue1;

@@ -236,7 +148,7 @@ TEST_F(KFDCWSRTest, InterruptRestore) {
    if ((m_FamilyId >= FAMILY_VI) && (checkCWSREnabled())) {
        HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);

-        m_pIsaGen->CompileShader(infinite_isa, "infinite_isa", isaBuffer);
+        ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(InfiniteLoopIsa, isaBuffer.As<char*>()));

        PM4Queue queue1, queue2, queue3;

@@ -27,12 +27,11 @@
 #include <gtest/gtest.h>

 #include "PM4Queue.hpp"
-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class KFDCWSRTest : public KFDBaseComponentTest {
 public:
-    KFDCWSRTest() :m_pIsaGen(NULL) {}
+    KFDCWSRTest() {}
    ~KFDCWSRTest() {}

 protected:
@@ -41,7 +40,6 @@ class KFDCWSRTest : public KFDBaseComponentTest {

 protected:  // Members
    unsigned wave_number;
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_CWSR_TEST__H__
@@ -176,16 +176,11 @@ void KFDDBGTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDDBGTest::TearDown() {
    ROUTINE_START
-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;

    /* Reset the user trap handler */
    hsaKmtSetTrapHandler(m_NodeInfo.HsaDefaultGPUNode(), 0, 0, 0, 0);
@@ -26,20 +26,16 @@

 #include <gtest/gtest.h>

-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class KFDDBGTest : public KFDBaseComponentTest {
 public:
-    KFDDBGTest() :m_pIsaGen(NULL) {}
+    KFDDBGTest() {}
    ~KFDDBGTest() {}

 protected:
    virtual void SetUp();
    virtual void TearDown();
-
- protected:  // Members
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_DBG_TEST__H__
@@ -41,18 +41,12 @@ void KFDEvictTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDEvictTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -286,136 +280,6 @@ void KFDEvictTest::AmdgpuCommandSubmissionSdmaNop(int rn, amdgpu_bo_handle handl
    EXPECT_EQ(0, amdgpu_cs_ctx_free(contextHandle));
 }

-/* Shader to read local buffers using multiple wavefronts in parallel
- * until address buffer is filled with specific value 0x5678 by host program,
- * then each wavefront fills value 0x5678 at corresponding result buffer and quit
- *
- * Initial state:
- *   s[0:1] - address buffer base address
- *   s[2:3] - result buffer base address
- *   s4 - workgroup id
- *   v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
- * Registers:
- *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
- *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
- *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
- *   v[6:7] - local buf address used for read test
- *
- *    This shader can be used by gfx9 and gfx10
- *
- */
-
-static const char* gfx9_ReadMemory =
-"\
-    shader ReadMemory\n\
-    wave_size(32)\n\
-    type(CS)\n\
-    \n\
-    // compute address of corresponding output buffer\n\
-    v_mov_b32       v0, s4                  // use workgroup id as index\n\
-    v_lshlrev_b32   v0, 2, v0               // v0 *= 4\n\
-    v_add_co_u32    v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4\n\
-    v_mov_b32       v5, s3\n\
-    v_add_co_u32    v5, vcc, v5, vcc_lo\n\
-    \n\
-    // compute input buffer offset used to store corresponding local buffer address\n\
-    v_lshlrev_b32   v0, 1, v0               // v0 *= 8\n\
-    v_add_co_u32    v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8\n\
-    v_mov_b32       v3, s1\n\
-    v_add_co_u32    v3, vcc, v3, vcc_lo\n\
-    \n\
-    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
-    flat_load_dwordx2   v[6:7], v[2:3] slc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    \n\
-    v_mov_b32       v8, 0x5678\n\
-    s_movk_i32      s8, 0x5678\n\
-L_REPEAT:\n\
-    s_load_dword    s16, s[0:1], 0x0 glc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    s_cmp_eq_i32    s16, s8\n\
-    s_cbranch_scc1  L_QUIT                  // if notified to quit by host\n\
-    // loop read 64M local buffer starting at v[6:7]\n\
-    // every 4k page only read once\n\
-    v_mov_b32       v9, 0\n\
-    v_mov_b32       v10, 0x1000             // 4k page\n\
-    v_mov_b32       v11, 0x4000000          // 64M size\n\
-    v_mov_b32       v12, v6\n\
-    v_mov_b32       v13, v7\n\
-L_LOOP_READ:\n\
-    flat_load_dwordx2   v[14:15], v[12:13] slc\n\
-    v_add_co_u32    v9, vcc, v9, v10 \n\
-    v_add_co_u32    v12, vcc, v12, v10\n\
-    v_add_co_u32    v13, vcc, v13, vcc_lo\n\
-    v_cmp_lt_u32    vcc, v9, v11\n\
-    s_cbranch_vccnz L_LOOP_READ\n\
-    s_branch        L_REPEAT\n\
-L_QUIT:\n\
-    flat_store_dword v[4:5], v8\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish\n\
-    s_endpgm\n\
-    end\n\
-";
-
-static const char* gfx8_ReadMemory =
-"\
-    shader ReadMemory\n\
-    asic(VI)\n\
-    type(CS)\n\
-    \n\
-    // compute address of corresponding output buffer\n\
-    v_mov_b32       v0, s4                  // use workgroup id as index\n\
-    v_lshlrev_b32   v0, 2, v0               // v0 *= 4\n\
-    v_add_u32       v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4\n\
-    v_mov_b32       v5, s3\n\
-    v_addc_u32      v5, vcc, v5, 0, vcc\n\
-    \n\
-    // compute input buffer offset used to store corresponding local buffer address\n\
-    v_lshlrev_b32   v0, 1, v0               // v0 *= 8\n\
-    v_add_u32       v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8\n\
-    v_mov_b32       v3, s1\n\
-    v_addc_u32      v3, vcc, v3, 0, vcc\n\
-    \n\
-    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
-    flat_load_dwordx2   v[6:7], v[2:3] slc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    \n\
-    v_mov_b32       v8, 0x5678\n\
-    s_movk_i32      s8, 0x5678\n\
-L_REPEAT:\n\
-    s_load_dword    s16, s[0:1], 0x0 glc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    s_cmp_eq_i32    s16, s8\n\
-    s_cbranch_scc1  L_QUIT                  // if notified to quit by host\n\
-    // loop read 64M local buffer starting at v[6:7]\n\
-    // every 4k page only read once\n\
-    v_mov_b32       v9, 0\n\
-    v_mov_b32       v10, 0x1000             // 4k page\n\
-    v_mov_b32       v11, 0x4000000          // 64M size\n\
-    v_mov_b32       v12, v6\n\
-    v_mov_b32       v13, v7\n\
-L_LOOP_READ:\n\
-    flat_load_dwordx2   v[14:15], v[12:13] slc\n\
-    v_add_u32       v9, vcc, v9, v10 \n\
-    v_add_u32       v12, vcc, v12, v10\n\
-    v_addc_u32      v13, vcc, v13, 0, vcc\n\
-    v_cmp_lt_u32    vcc, v9, v11\n\
-    s_cbranch_vccnz L_LOOP_READ\n\
-    s_branch        L_REPEAT\n\
-L_QUIT:\n\
-    flat_store_dword v[4:5], v8\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish\n\
-    s_endpgm\n\
-    end\n\
-";
-
-std::string KFDEvictTest::CreateShader() {
-    if (m_FamilyId < FAMILY_AI)
-        return gfx8_ReadMemory;
-    else
-        return gfx9_ReadMemory;
-}
-
 /* Evict and restore procedure basic test
 *
 * Use N_PROCESSES processes to allocate vram buf size larger than total vram size
@@ -567,7 +431,7 @@ TEST_F(KFDEvictTest, QueueTest) {
    HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode);
    HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode);

-    m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>()));

    PM4Queue pm4Queue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -27,22 +27,19 @@
 #include <string>
 #include <vector>
 #include "KFDMultiProcessTest.hpp"
-#include "IsaGenerator.hpp"
 #include "PM4Queue.hpp"

 // @class KFDEvictTest
 // Test eviction and restore procedure using two processes
 class KFDEvictTest :  public KFDMultiProcessTest {
 public:
-    KFDEvictTest(void): m_pIsaGen(NULL) {}
-
+    KFDEvictTest(void) {}
    ~KFDEvictTest(void) {}

 protected:
    virtual void SetUp();
    virtual void TearDown();

-    std::string CreateShader();
    void AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize,
                      std::vector<void *> &pBuffers);
    void FreeBuffers(std::vector<void *> &pBuffers, HSAuint64 vramBufSize);
@@ -52,7 +49,6 @@ class KFDEvictTest :  public KFDMultiProcessTest {
                                           PM4Queue *computeQueue);

 protected:  // Members
-    IsaGenerator*   m_pIsaGen;
    HsaMemFlags     m_Flags;
    void*           m_pBuf;
 };
@@ -33,18 +33,12 @@ void KFDExceptionTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDExceptionTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    // WORKAROUND: This needs to be fixed in the kernel
@@ -75,7 +69,8 @@ void KFDExceptionTest::TestMemoryException(int defaultGPUNode, HSAuint64 pSrc,
    eventDesc.SyncVar.SyncVar.UserData = NULL;
    eventDesc.SyncVar.SyncVarSize = 0;

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
+
    m_ChildStatus = queue.Create(defaultGPUNode);
    if (m_ChildStatus != HSAKMT_STATUS_SUCCESS) {
        WARN() << "Queue create failed" << std::endl;
@@ -26,12 +26,11 @@

 #include <gtest/gtest.h>

-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class KFDExceptionTest : public KFDBaseComponentTest {
 public:
-    KFDExceptionTest() :m_pIsaGen(NULL), m_ChildPid(-1) {
+    KFDExceptionTest() : m_ChildPid(-1) {
        /* Because there could be early return before m_ChildPid is set
         * by fork(), we should initialize m_ChildPid to a non-zero value
         * to avoid possible exit of the main process.
@@ -59,8 +58,6 @@ class KFDExceptionTest : public KFDBaseComponentTest {
 protected:  // Members
    pid_t m_ChildPid;
    HSAKMT_STATUS m_ChildStatus;
-
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_EXCEPTION_TEST__H__
@@ -26,91 +26,17 @@
 #include "PM4Packet.hpp"
 #include "Dispatch.hpp"

-/* Shader to initialize gws counter to 1*/
-const char* gfx9_10_GwsInit =
-"\
-shader GwsInit\n\
-type(CS)\n\
-wave_size(32)\n\
-    s_mov_b32 m0, 0\n\
-    s_nop 0\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_waitcnt 0\n\
-    v_mov_b32 v0, s16\n\
-    s_waitcnt 0\n\
-    ds_gws_init v0 gds:1 offset0:0\n\
-    s_waitcnt 0\n\
-    s_endpgm\n\
-    end\n\
-";
-
-/* Atomically increase a value in memory
- * This is expected to be executed from
- * multiple work groups simultaneously.
- * GWS semaphore is used to guarantee
- * the operation is atomic.
- */
-const char* gfx9_AtomicIncrease =
-"\
-shader AtomicIncrease\n\
-type(CS)\n\
-/* Assume src address in s0, s1 */\n\
-    s_mov_b32 m0, 0\n\
-    s_nop 0\n\
-    ds_gws_sema_p gds:1 offset0:0\n\
-    s_waitcnt 0\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_waitcnt 0\n\
-    s_add_u32 s16, s16, 1\n\
-    s_store_dword s16, s[0:1], 0x0 glc\n\
-    s_waitcnt lgkmcnt(0)\n\
-    ds_gws_sema_v gds:1 offset0:0\n\
-    s_waitcnt 0\n\
-    s_endpgm\n\
-    end\n\
-";
-
-const char* gfx10_AtomicIncrease =
-"\
-shader AtomicIncrease\n\
-asic(GFX10)\n\
-type(CS)\n\
-wave_size(32)\n\
-/* Assume src address in s0, s1 */\n\
-    s_mov_b32 m0, 0\n\
-    s_mov_b32 exec_lo, 0x1\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    ds_gws_sema_p gds:1 offset0:0\n\
-    s_waitcnt 0\n\
-    flat_load_dword v2, v[0:1] glc:1 dlc:1\n\
-    s_waitcnt 0\n\
-    v_add_nc_u32 v2, v2, 1\n\
-    flat_store_dword v[0:1], v2\n\
-    s_waitcnt_vscnt null, 0\n\
-    ds_gws_sema_v gds:1 offset0:0\n\
-    s_waitcnt 0\n\
-    s_endpgm\n\
-    end\n\
-";
-
 void KFDGWSTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDGWSTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -160,21 +86,15 @@ TEST_F(KFDGWSTest, Semaphore) {
 			    pNodeProperties->NumGws,&firstGWS));
    EXPECT_EQ(0, firstGWS);

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-    m_pIsaGen->CompileShader(gfx9_10_GwsInit, "GwsInit", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsInitIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch0(isaBuffer);
    buffer.Fill(numResources, 0, 4);
    dispatch0.SetArgs(buffer.As<void*>(), NULL);
    dispatch0.Submit(queue);
    dispatch0.Sync();

-    const char *pAtomicIncrease;
-    if (m_FamilyId <= FAMILY_AL)
-        pAtomicIncrease = gfx9_AtomicIncrease;
-    else
-        pAtomicIncrease = gfx10_AtomicIncrease;
-
-    m_pIsaGen->CompileShader(pAtomicIncrease, "AtomicIncrease", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(GwsAtomicIncreaseIsa, isaBuffer.As<char*>()));

    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer.As<void*>(), NULL);
@@ -26,20 +26,16 @@

 #include <gtest/gtest.h>

-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class KFDGWSTest : public KFDBaseComponentTest {
 public:
-    KFDGWSTest() :m_pIsaGen(NULL) {}
+    KFDGWSTest() {}
    ~KFDGWSTest() {}

 protected:
    virtual void SetUp();
    virtual void TearDown();
-
- protected:  // Members
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_GWS_TEST__H__
@@ -101,7 +101,8 @@ TEST_F(KFDGraphicsInterop, RegisterGraphicsHandle) {

    // Copy contents to a system memory buffer for comparison
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/);

@@ -28,18 +28,12 @@ void KFDHWSTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDHWSTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -70,7 +64,9 @@ void KFDHWSTest::RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops)

    // Run work on all queues
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->GetNoopIsa(isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>()));
+
    for (l = 0; l < nLoops; l++) {
        for (q = 0; q < nQueues; q++) {
            if (dispatch[q])
@@ -27,14 +27,12 @@
 #include <gtest/gtest.h>

 #include "PM4Queue.hpp"
-#include "IsaGenerator.hpp"
 #include "KFDMultiProcessTest.hpp"
 #include "Dispatch.hpp"

 class KFDHWSTest : public KFDMultiProcessTest {
 public:
-    KFDHWSTest():m_pIsaGen(NULL) {}
-
+    KFDHWSTest() {}
    ~KFDHWSTest() {}

 protected:
@@ -42,9 +40,6 @@ class KFDHWSTest : public KFDMultiProcessTest {
    virtual void TearDown();

    void RunTest(unsigned nProcesses, unsigned nQueues, unsigned nLoops);
-
- protected:  // Members
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_QCM_TEST__H__
@@ -23,7 +23,6 @@

 #include "KFDBaseComponentTest.hpp"
 #include "BaseQueue.hpp"
-#include "IsaGenerator.hpp"

 #ifndef __KFD_MEMORY_TEST__H__
 #define __KFD_MEMORY_TEST__H__
@@ -33,18 +33,12 @@ void KFDLocalMemoryTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDLocalMemoryTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -107,7 +101,7 @@ TEST_F(KFDLocalMemoryTest, BasicTest) {

    srcSysBuffer.Fill(0x01010101);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    ASSERT_SUCCESS(hsaKmtMapMemoryToGPUNodes(srcLocalBuffer.As<void*>(), srcLocalBuffer.Size(), &AlternateVAGPU,
                        mapFlags, 1, reinterpret_cast<HSAuint32 *>(&defaultGPUNode)));
@@ -164,7 +158,7 @@ TEST_F(KFDLocalMemoryTest, VerifyContentsAfterUnmapAndMap) {

    SysBufferA.Fill(0x01010101);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.SetSkipWaitConsump(0);
@@ -303,7 +297,8 @@ TEST_F(KFDLocalMemoryTest, Fragmentation) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    /* Allocate and test memory using the strategy explained at the top */
    HSAKMT_STATUS status;
@@ -26,20 +26,16 @@

 #include <gtest/gtest.h>

-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class KFDLocalMemoryTest : public KFDBaseComponentTest {
 public:
-    KFDLocalMemoryTest() :m_pIsaGen(NULL) {}
+    KFDLocalMemoryTest() {}
    ~KFDLocalMemoryTest() {}

 protected:
    virtual void SetUp();
    virtual void TearDown();
-
- protected:  // Members
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_LOCALMEMORY_TEST__H__
@@ -39,360 +39,17 @@
 #include "SDMAPacket.hpp"
 #include "linux/kfd_ioctl.h"

-const char* gfx8_ScratchCopyDword =
-"\
-shader ScratchCopyDword\n\
-asic(VI)\n\
-type(CS)\n\
-/*copy the parameters from scalar registers to vector registers*/\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v2, s2\n\
-    v_mov_b32 v3, s3\n\
-/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
-    s_mov_b32 flat_scratch_lo, 8/*2 dwords of scratch per thread*/\n\
-    s_mov_b32 flat_scratch_hi, 0/*offset in units of 256bytes*/\n\
-/*copy a dword between the passed addresses*/\n\
-    flat_load_dword v4, v[0:1] slc\n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
-    flat_store_dword v[2:3], v4 slc\n\
-    \n\
-    s_endpgm\n\
-    \n\
-end\n\
-";
-
-const char* gfx9_ScratchCopyDword =
-"\
-shader ScratchCopyDword\n\
-asic(GFX9)\n\
-type(CS)\n\
-/*copy the parameters from scalar registers to vector registers*/\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v2, s2\n\
-    v_mov_b32 v3, s3\n\
-/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
-    s_mov_b32 flat_scratch_lo, s4\n\
-    s_mov_b32 flat_scratch_hi, s5\n\
-/*copy a dword between the passed addresses*/\n\
-    flat_load_dword v4, v[0:1] slc\n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
-    flat_store_dword v[2:3], v4 slc\n\
-    \n\
-    s_endpgm\n\
-    \n\
-end\n\
-";
-const char* gfx10_ScratchCopyDword =
-"\
-shader ScratchCopyDword\n\
-asic(GFX10)\n\
-type(CS)\n\
-wave_size(32)\n\
-/*copy the parameters from scalar registers to vector registers*/\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v2, s2\n\
-    v_mov_b32 v3, s3\n\
-/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
-    s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s4\n\
-    s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s5\n\
-/*copy a dword between the passed addresses*/\n\
-    flat_load_dword v4, v[0:1] slc\n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
-    flat_store_dword v[2:3], v4 slc\n\
-    \n\
-    s_endpgm\n\
-    \n\
-end\n\
-";
-
-const char* aldbrn_ScratchCopyDword =
-"\
-shader ScratchCopyDword\n\
-asic(ALDEBARAN)\n\
-type(CS)\n\
-/*copy the parameters from scalar registers to vector registers*/\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v2, s2\n\
-    v_mov_b32 v3, s3\n\
-/*set up the scratch parameters. This assumes a single 16-reg block.*/\n\
-    s_mov_b32 flat_scratch_lo, s4\n\
-    s_mov_b32 flat_scratch_hi, s5\n\
-/*copy a dword between the passed addresses*/\n\
-    flat_load_dword v4, v[0:1] slc\n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
-    flat_store_dword v[2:3], v4 slc\n\
-    \n\
-    s_endpgm\n\
-    \n\
-end\n\
-";
-
-
-
-/* Continuously poll src buffer and check buffer value
- * After src buffer is filled with specific value (0x5678,
- * by host program), fill dst buffer with specific
- * value(0x5678) and quit
- */
-const char* gfx9_PollMemory =
-"\
-shader ReadMemory\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
-    s_movk_i32 s18, 0x5678\n\
-    LOOP:\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_cmp_eq_i32 s16, s18\n\
-    s_cbranch_scc0   LOOP\n\
-    s_store_dword s18, s[2:3], 0x0 glc\n\
-    s_endpgm\n\
-    end\n\
-";
-
-/* Similar to gfx9_PollMemory except that the buffer
- * polled can be Non-coherant memory. SCC system-level
- * cache coherence is not supported in scalar (smem) path.
- * Use vmem operations with scc
- */
-const char* gfx9_PollNCMemory =
-"\
-shader ReadMemory\n\
-asic(ALDEBARAN)\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
-    v_mov_b32 v6, 0x5678\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    LOOP:\n\
-    flat_load_dword v4, v[0:1] scc\n\
-    v_cmp_eq_u32 vcc, v4, v6\n\
-    s_cbranch_vccz   LOOP\n\
-    v_mov_b32 v0, s2\n\
-    v_mov_b32 v1, s3\n\
-    flat_store_dword v[0:1], v6 scc\n\
-    s_endpgm\n\
-    end\n\
-";
-
-const char* gfx10_PollMemory =
-"\
-shader ReadMemory\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume src address in s0, s1 and dst address in s2, s3*/\n\
-    s_movk_i32 s18, 0x5678\n\
-    v_mov_b32 v0, s2\n\
-    v_mov_b32 v1, s3\n\
-    v_mov_b32 v2, 0x5678\n\
-    LOOP:\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_cmp_eq_i32 s16, s18\n\
-    s_cbranch_scc0   LOOP\n\
-    flat_store_dword v[0,1], v2 slc\n\
-    s_waitcnt vmcnt(0)&lgkmcnt(0)\n\
-    s_endpgm\n\
-    end\n\
-";
-
-/* Input: A buffer of at least 3 dwords.
- * DW0: used as a signal. 0xcafe means it is signaled
- * DW1: Input buffer for device to read.
- * DW2: Output buffer for device to write.
- * Once receive signal, device will copy DW1 to DW2
- * This shader continously poll the signal buffer,
- * Once signal buffer is signaled, it copies input buffer
- * to output buffer
- */
-const char* gfx9_CopyOnSignal =
-"\
-shader CopyOnSignal\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume input buffer in s0, s1 */\n\
-    s_mov_b32 s18, 0xcafe\n\
-POLLSIGNAL:\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_cmp_eq_i32 s16, s18\n\
-    s_cbranch_scc0   POLLSIGNAL\n\
-    s_load_dword s17, s[0:1], 0x4 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_store_dword s17, s[0:1], 0x8 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_endpgm\n\
-    end\n\
-";
-
-const char* gfx10_CopyOnSignal =
-"\
-shader CopyOnSignal\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume input buffer in s0, s1 */\n\
-    s_add_u32 s2, s0, 0x8\n\
-    s_addc_u32 s3, s1, 0x0\n\
-    s_mov_b32 s18, 0xcafe\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v4, s2\n\
-    v_mov_b32 v5, s3\n\
-POLLSIGNAL:\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_cmp_eq_i32 s16, s18\n\
-    s_cbranch_scc0   POLLSIGNAL\n\
-    s_load_dword s17, s[0:1], 0x4 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    v_mov_b32 v2, s17\n\
-    flat_store_dword v[4,5], v2 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_endpgm\n\
-    end\n\
-";
-
-/* Input0: A buffer of at least 2 dwords.
- * DW0: used as a signal. Write 0xcafe to signal
- * DW1: Write to this buffer for other device to read.
- * Input1: mmio base address
- */
-const char* gfx9_WriteAndSignal =
-"\
-shader WriteAndSignal\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume input buffer in s0, s1 */\n\
-    s_mov_b32 s18, 0xbeef\n\
-    s_store_dword s18, s[0:1], 0x4 glc\n\
-    s_mov_b32 s18, 0x1\n\
-    s_store_dword s18, s[2:3], 0 glc\n\
-    s_mov_b32 s18, 0xcafe\n\
-    s_store_dword s18, s[0:1], 0x0 glc\n\
-    s_endpgm\n\
-    end\n\
-";
-
-/* Continuously poll the flag at src buffer
- * After the flag of s[0:1] is 1 filled,
- * copy the value from s[0:1]+4 to dst buffer
- */
-const char* gfx9_PollAndCopy =
-"\
-shader CopyMemory\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
-    s_movk_i32 s18, 0x1\n\
-    LOOP:\n\
-    s_load_dword s16, s[0:1], 0x0 glc\n\
-    s_cmp_eq_i32 s16, s18\n\
-    s_cbranch_scc0   LOOP\n\
-    s_load_dword s17, s[0:1], 0x4 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_store_dword s17, s[2:3], 0x0 glc:1\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_endpgm\n\
-    end\n\
-";
-
-const char* gfx9aldbrn_PollAndCopy =
-"\
-shader CopyMemory\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume src buffer in s[0:1] and dst buffer in s[2:3]*/\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v18, 0x1\n\
-    LOOP:\n\
-    flat_load_dword v16, v[0:1] glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    v_cmp_eq_i32 vcc, v16, v18\n\
-    s_cbranch_vccz   LOOP\n\
-    buffer_invl2\n\
-    s_load_dword s17, s[0:1], 0x4 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_store_dword s17, s[2:3], 0x0 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    buffer_wbl2\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_endpgm\n\
-    end\n\
-";
-
-/* Input0: A buffer of at least 2 dwords.
- * DW0: used as a signal. Write 0x1 to signal
- * DW1: Write the value from 2nd input buffer
- *      for other device to read.
- * Input1: A buffer of at least 2 dwords.
- * DW0: used as the value to be written.
- */
-const char* gfx9aldbrn_WriteFlagAndValue =
-"\
-shader WriteMemory\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume two inputs buffer in s[0:1] and s[2:3]*/\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    s_load_dword s18, s[2:3], 0x0 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    s_store_dword s18, s[0:1], 0x4 glc\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    buffer_wbl2\n\
-    s_waitcnt vmcnt(0) & lgkmcnt(0)\n\
-    v_mov_b32 v16, 0x1\n\
-    flat_store_dword v[0:1], v16 glc\n\
-    s_endpgm\n\
-    end\n\
-";
-
-const char* gfx10_WriteAndSignal =
-"\
-shader WriteAndSignal\n\
-wave_size(32)\n\
-type(CS)\n\
-/* Assume input buffer in s0, s1 */\n\
-    s_add_u32 s4, s0, 0x4\n\
-    s_addc_u32 s5, s1, 0x0\n\
-    v_mov_b32 v0, s0\n\
-    v_mov_b32 v1, s1\n\
-    v_mov_b32 v2, s2\n\
-    v_mov_b32 v3, s3\n\
-    v_mov_b32 v4, s4\n\
-    v_mov_b32 v5, s5\n\
-    v_mov_b32 v18, 0xbeef\n\
-    flat_store_dword v[4:5], v18 glc\n\
-    v_mov_b32 v18, 0x1\n\
-    flat_store_dword v[2:3], v18 glc\n\
-    v_mov_b32 v18, 0xcafe\n\
-    flat_store_dword v[0:1], v18 glc\n\
-    s_endpgm\n\
-    end\n\
-";
-
-//These gfx9_PullMemory, gfx9_CopyOnSignal, gfx9_WriteAndSignal shaders can be used by both gfx9 and gfx10
-
 void KFDMemoryTest::SetUp() {
    ROUTINE_START

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDMemoryTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -508,16 +165,13 @@ TEST_F(KFDMemoryTest, MapUnmapToNodes) {
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);

    const char *pReadMemory;
-    if (m_FamilyId < FAMILY_NV)
-        pReadMemory = gfx9_PollMemory;
-    else
-        pReadMemory = gfx10_PollMemory;
-
    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
        /* On A+A system memory is mapped as NC */
-        m_pIsaGen->CompileShader(gfx9_PollNCMemory, "ReadMemory", isaBuffer);
+        pReadMemory = PollNCMemoryIsa;
    else
-        m_pIsaGen->CompileShader(pReadMemory, "ReadMemory", isaBuffer);
+        pReadMemory = PollMemoryIsa;
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(pReadMemory, isaBuffer.As<char*>()));

    PM4Queue pm4Queue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -674,7 +328,8 @@ TEST_F(KFDMemoryTest, MemoryRegister) {
    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    /* First submit just so the queues are not empty, and to get the
     * TLB populated (in case we need to flush TLBs somewhere after
@@ -855,16 +510,7 @@ TEST_F(KFDMemoryTest, FlatScratchAccess) {
    // Initialize the srcBuffer to some fixed value
    srcMemBuffer.Fill(0x01010101);

-    const char *pScratchCopyDword;
-    if (m_FamilyId < FAMILY_AI)
-        pScratchCopyDword = gfx8_ScratchCopyDword;
-    else if (m_FamilyId < FAMILY_AL)
-        pScratchCopyDword = gfx9_ScratchCopyDword;
-    else if (m_FamilyId == FAMILY_AL)
-        pScratchCopyDword = aldbrn_ScratchCopyDword;
-    else
-        pScratchCopyDword = gfx10_ScratchCopyDword;
-    m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>()));

    const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);

@@ -1728,17 +1374,8 @@ TEST_F(KFDMemoryTest, PtraceAccessInvisibleVram) {
    // dstBuffer is cpu accessible gtt memory
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, defaultGPUNode);

-    const char *pScratchCopyDword;
-    if (m_FamilyId < FAMILY_AI)
-        pScratchCopyDword = gfx8_ScratchCopyDword;
-    else if (m_FamilyId < FAMILY_AL)
-        pScratchCopyDword = gfx9_ScratchCopyDword;
-    else if (m_FamilyId == FAMILY_AL)
-        pScratchCopyDword = aldbrn_ScratchCopyDword;
-    else
-        pScratchCopyDword = gfx10_ScratchCopyDword;
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ScratchCopyDwordIsa, isaBuffer.As<char*>()));

-    m_pIsaGen->CompileShader(pScratchCopyDword, "ScratchCopyDword", isaBuffer);
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(mem0, dstBuffer.As<void*>());
    dispatch0.Submit(queue);
@@ -2109,12 +1746,9 @@ TEST_F(KFDMemoryTest, HostHdpFlush) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    const char *pCopyOnSignal;
-    if (m_FamilyId < FAMILY_NV)
-        pCopyOnSignal = gfx9_CopyOnSignal;
-    else
-        pCopyOnSignal = gfx10_CopyOnSignal;
-    m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(buffer, NULL);
    dispatch0.Submit(queue);
@@ -2234,12 +1868,9 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(nodes[0]));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, nodes[0], true/*zero*/, false/*local*/, true/*exec*/);
-    const char *pCopyOnSignal;
-    if (m_FamilyId < FAMILY_NV)
-        pCopyOnSignal = gfx9_CopyOnSignal;
-    else
-        pCopyOnSignal = gfx10_CopyOnSignal;
-    m_pIsaGen->CompileShader(pCopyOnSignal, "CopyOnSignal", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyOnSignalIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer, NULL);
    dispatch.Submit(queue);
@@ -2247,12 +1878,9 @@ TEST_F(KFDMemoryTest, DeviceHdpFlush) {
    PM4Queue queue0;
    ASSERT_SUCCESS(queue0.Create(nodes[1]));
    HsaMemoryBuffer isaBuffer0(PAGE_SIZE, nodes[1], true/*zero*/, false/*local*/, true/*exec*/);
-    const char *pWriteAndSignal;
-    if (m_FamilyId < FAMILY_NV)
-        pWriteAndSignal = gfx9_WriteAndSignal;
-    else
-        pWriteAndSignal = gfx10_WriteAndSignal;
-    m_pIsaGen->CompileShader(pWriteAndSignal, "WriteAndSignal", isaBuffer0);
+    // Assemble into isaBuffer0: it is the exec buffer on nodes[1] that dispatch0 runs.
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteAndSignalIsa, isaBuffer0.As<char*>()));
+
    Dispatch dispatch0(isaBuffer0);
    dispatch0.SetArgs(buffer, mmioBase);
    dispatch0.Submit(queue0);
@@ -2304,7 +1932,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnSdmaWrite) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation);
    dispatch.Submit(queue);
@@ -2357,7 +1987,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnCPUWrite) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer, buffer+100);
    dispatch.Submit(queue);
@@ -2419,7 +2051,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9_PollMemory, "ReadMemory", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollMemoryIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer.As<int*>(), buffer.As<int*>()+dwLocation);
    dispatch.Submit(queue);
@@ -2434,7 +2068,9 @@ TEST_F(KFDMemoryTest, CacheInvalidateOnRemoteWrite) {
    ASSERT_SUCCESS(queue1.Create(nondefaultNode));
    buffer.Fill(0x5678, sdmaQueue, dwLocation1*sizeof(int), 4);
    HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer1);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer1.As<char*>()));
+
    Dispatch dispatch1(isaBuffer1);
    dispatch1.SetArgs(buffer.As<int*>()+dwLocation1, buffer.As<int*>());
    dispatch1.Submit(queue1);
@@ -2500,7 +2136,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwLocation);
    dispatch.Submit(queue);
@@ -2515,7 +2153,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithRemoteGPU) {
    PM4Queue queue1;
    ASSERT_SUCCESS(queue1.Create(nondefaultNode));
    HsaMemoryBuffer isaBuffer1(PAGE_SIZE, nondefaultNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9aldbrn_WriteFlagAndValue, "WriteMemory", isaBuffer1);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(WriteFlagAndValueIsa, isaBuffer1.As<char*>()));
+
    Dispatch dispatch1(isaBuffer1);
    dispatch1.SetArgs(buffer.As<char *>(), buffer.As<char *>()+dwSource);
    dispatch1.Submit(queue1);
@@ -2569,7 +2209,9 @@ TEST_F(KFDMemoryTest, VramCacheCoherenceWithCPU) {
    PM4Queue queue;
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));
+
    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(buffer, buffer+dwLocation);
    dispatch.Submit(queue);
@@ -2608,12 +2250,17 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
        return;
    }

-    unsigned int *fineBuffer = NULL;
-    unsigned int tmp;
-
    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    const int dwLocation = 0x80;

+    if (!m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode)) {
+        LOG() << "Skipping test: XGMI link to CPU is required." << std::endl;
+        return;
+    }
+
+    unsigned int *fineBuffer = NULL;
+    unsigned int tmp;
+
    ASSERT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode /* system */, PAGE_SIZE, m_MemoryFlags,
                       reinterpret_cast<void**>(&fineBuffer)));
    ASSERT_SUCCESS(hsaKmtMapMemoryToGPU(fineBuffer, PAGE_SIZE, NULL));
@@ -2627,10 +2274,7 @@ TEST_F(KFDMemoryTest, SramCacheCoherenceWithGPU) {
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);

-    if (m_NodeInfo.IsNodeXGMItoCPU(defaultGPUNode))
-        m_pIsaGen->CompileShader(gfx9aldbrn_PollAndCopy, "CopyMemory", isaBuffer);
-    else
-        m_pIsaGen->CompileShader(gfx9_PollAndCopy, "CopyMemory", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(PollAndCopyIsa, isaBuffer.As<char*>()));

    Dispatch dispatch(isaBuffer);
    dispatch.SetArgs(fineBuffer, fineBuffer+dwLocation);
@@ -22,7 +22,6 @@
 */

 #include "KFDBaseComponentTest.hpp"
-#include "IsaGenerator.hpp"

 #ifndef __KFD_MEMORY_TEST__H__
 #define __KFD_MEMORY_TEST__H__
@@ -33,15 +32,13 @@
 */
 class KFDMemoryTest :  public KFDBaseComponentTest {
 public:
-    KFDMemoryTest(void) :m_pIsaGen(NULL) {}
+    KFDMemoryTest(void) {}
    ~KFDMemoryTest(void) {}
 protected:
    virtual void SetUp();
    virtual void TearDown();

 protected:
-    IsaGenerator* m_pIsaGen;
-
    void BinarySearchLargestBuffer(int allocNode, const HsaMemFlags &memFlags,
                                            HSAuint64 highMB, int nodeToMap,
                                            HSAuint64 *lastSizeMB);
@@ -39,18 +39,12 @@ void KFDQMTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void KFDQMTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    KFDBaseComponentTest::TearDown();

    ROUTINE_END
@@ -677,111 +671,12 @@ TEST_F(KFDQMTest, OverSubscribeCpQueues) {
    TEST_END
 }

-/* A simple isa loop program with dense mathematic operations
- * s1 controls the number iterations of the loop
- * This shader can be used by GFX8, GFX9 and GFX10
- */
-static const char *loop_isa = \
-"\
-shader loop_isa\n\
-wave_size(32)\n\
-type(CS)\n\
-    s_movk_i32    s0, 0x0008\n\
-    s_movk_i32    s1, 0x00ff\n\
-    v_mov_b32     v0, 0\n\
-    v_mov_b32     v1, 0\n\
-    v_mov_b32     v2, 0\n\
-    v_mov_b32     v3, 0\n\
-    v_mov_b32     v4, 0\n\
-    v_mov_b32     v5, 0\n\
-    v_mov_b32     v6, 0\n\
-    v_mov_b32     v7, 0\n\
-    v_mov_b32     v8, 0\n\
-    v_mov_b32     v9, 0\n\
-    v_mov_b32     v10, 0\n\
-    v_mov_b32     v11, 0\n\
-    v_mov_b32     v12, 0\n\
-    v_mov_b32     v13, 0\n\
-    v_mov_b32     v14, 0\n\
-    v_mov_b32     v15, 0\n\
-    v_mov_b32     v16, 0\n\
-    LOOP:\n\
-    s_mov_b32     s8, s4\n\
-    s_mov_b32     s9, s1\n\
-    s_mov_b32     s10, s6\n\
-    s_mov_b32     s11, s7\n\
-    s_cmp_le_i32  s1, s0\n\
-    s_cbranch_scc1  END_OF_PGM\n\
-    s_buffer_load_dwordx8  s[8:15], s[8:11], 0x10\n\
-    v_add_f32     v0, 2.0, v0\n\
-    v_cvt_f32_i32  v17, s1\n\
-s_waitcnt     lgkmcnt(0)\n\
-    v_add_f32     v18, s8, v17\n\
-    v_add_f32     v19, s9, v17\n\
-    v_add_f32     v20, s10, v17\n\
-    v_add_f32     v21, s11, v17\n\
-    v_add_f32     v22, s12, v17\n\
-    v_add_f32     v23, s13, v17\n\
-    v_add_f32     v24, s14, v17\n\
-    v_add_f32     v17, s15, v17\n\
-    v_log_f32     v25, v18\n\
-    v_mul_f32  v25, v22, v25\n\
-    v_exp_f32     v25, v25\n\
-    v_log_f32     v26, v19\n\
-    v_mul_f32  v26, v23, v26\n\
-    v_exp_f32     v26, v26\n\
-    v_log_f32     v27, v20\n\
-    v_mul_f32  v27, v24, v27\n\
-    v_exp_f32     v27, v27\n\
-    v_log_f32     v28, v21\n\
-    v_mul_f32  v28, v17, v28\n\
-    v_exp_f32     v28, v28\n\
-    v_add_f32     v5, v5, v25\n\
-    v_add_f32     v6, v6, v26\n\
-    v_add_f32     v7, v7, v27\n\
-    v_add_f32     v8, v8, v28\n\
-    v_mul_f32  v18, 0x3fb8aa3b, v18\n\
-    v_exp_f32     v18, v18\n\
-    v_mul_f32  v19, 0x3fb8aa3b, v19\n\
-    v_exp_f32     v19, v19\n\
-    v_mul_f32  v20, 0x3fb8aa3b, v20\n\
-    v_exp_f32     v20, v20\n\
-    v_mul_f32  v21, 0x3fb8aa3b, v21\n\
-    v_exp_f32     v21, v21\n\
-    v_add_f32     v9, v9, v18\n\
-    v_add_f32     v10, v10, v19\n\
-    v_add_f32     v11, v11, v20\n\
-    v_add_f32     v12, v12, v21\n\
-    v_sqrt_f32    v18, v22\n\
-    v_sqrt_f32    v19, v23\n\
-    v_sqrt_f32    v20, v24\n\
-    v_sqrt_f32    v21, v17\n\
-    v_add_f32     v13, v13, v18\n\
-    v_add_f32     v14, v14, v19\n\
-    v_add_f32     v15, v15, v20\n\
-    v_add_f32     v16, v16, v21\n\
-    v_rsq_f32     v18, v22\n\
-    v_rsq_f32     v19, v23\n\
-    v_rsq_f32     v20, v24\n\
-    v_rsq_f32     v17, v17\n\
-    v_add_f32     v1, v1, v18\n\
-    v_add_f32     v2, v2, v19\n\
-    v_add_f32     v3, v3, v20\n\
-    v_add_f32     v4, v4, v17\n\
-    s_add_u32     s0, s0, 1\n\
-    s_branch      LOOP\n\
-    END_OF_PGM:\n\
-    s_endpgm\n\
-    end\n\
-";
-
 HSAint64 KFDQMTest::TimeConsumedwithCUMask(int node, uint32_t* mask, uint32_t mask_count) {
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);
    HsaMemoryBuffer dstBuffer(PAGE_SIZE, node, true, false, false);
    HsaMemoryBuffer ctlBuffer(PAGE_SIZE, node, true, false, false);

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-    m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
+    EXPECT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));

    Dispatch dispatch(isaBuffer);
    dispatch.SetDim(1024, 16, 16);
@@ -838,7 +733,6 @@ TEST_F(KFDQMTest, BasicCuMaskingLinear) {
    TEST_START(TESTPROFILE_RUNALL);
    int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
    ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node";
-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);

    if (m_FamilyId >= FAMILY_VI) {
        const HsaNodeProperties *pNodeProperties = m_NodeInfo.GetNodeProperties(defaultGPUNode);
@@ -982,7 +876,7 @@ TEST_F(KFDQMTest, QueuePriorityOnDifferentPipe) {
    HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);

-    m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));

    Dispatch dispatch[2] = {
        Dispatch(isaBuffer, true),
@@ -1047,7 +941,7 @@ TEST_F(KFDQMTest, QueuePriorityOnSamePipe) {
    HSAint32 *syncBuffer = syncBuf.As<HSAint32*>();
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, node, true/*zero*/, false/*local*/, true/*exec*/);

-    m_pIsaGen->CompileShader(loop_isa, "loop_isa", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(LoopIsa, isaBuffer.As<char*>()));

    Dispatch dispatch[2] = {
        Dispatch(isaBuffer, true),
@@ -1140,7 +1034,7 @@ TEST_F(KFDQMTest, EmptyDispatch) {

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);

-    m_pIsaGen->GetNoopIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(NoopIsa, isaBuffer.As<char*>()));

    SyncDispatch(isaBuffer, NULL, NULL);

@@ -1159,7 +1053,7 @@ TEST_F(KFDQMTest, SimpleWriteDispatch) {

    srcBuffer.Fill(0x01010101);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    SyncDispatch(isaBuffer, srcBuffer.As<void*>(), destBuffer.As<void*>());

@@ -1194,7 +1088,7 @@ TEST_F(KFDQMTest, MultipleCpQueuesStressDispatch) {

    destBuffer.Fill(0xFF);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    for (i = 0; i < MAX_CP_QUEUES; ++i)
        ASSERT_SUCCESS(queues[i].Create(defaultGPUNode)) << " QueueId=" << i;
@@ -1533,7 +1427,7 @@ TEST_F(KFDQMTest, Atomics) {

    PM4Queue queue;

-    m_pIsaGen->GetAtomicIncIsa(isaBuf);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(AtomicIncIsa, isaBuf.As<char*>()));

    Dispatch dispatch(isaBuf);
    dispatch.SetArgs(destBuf.As<void*>(), NULL);
@@ -1598,10 +1492,12 @@ TEST_F(KFDQMTest, mGPUShareBO) {

    srcNodeMem.Fill(0x05050505);

-    m_pIsaGen->GetCopyDwordIsa(isaBufferSrc);
+    ASSERT_SUCCESS(m_pAsm->RunAssemble(CopyDwordIsa));
+
+    m_pAsm->CopyInstrStream(isaBufferSrc.As<char*>());
    SyncDispatch(isaBufferSrc, srcNodeMem.As<void*>(), shared_addr.As<void *>(), src_node);

-    m_pIsaGen->GetCopyDwordIsa(isaBufferDst);
+    m_pAsm->CopyInstrStream(isaBufferDst.As<char*>());
    SyncDispatch(isaBufferDst, shared_addr.As<void *>(), dstNodeMem.As<void*>(), dst_node);

    EXPECT_EQ(dstNodeMem.As<unsigned int*>()[0], 0x05050505);
@@ -27,13 +27,12 @@
 #include <gtest/gtest.h>

 #include "PM4Queue.hpp"
-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"
 #include "Dispatch.hpp"

 class KFDQMTest : public KFDBaseComponentTest {
 public:
-    KFDQMTest():m_pIsaGen(NULL) {}
+    KFDQMTest() {}

    ~KFDQMTest() {}

@@ -49,7 +48,6 @@ class KFDQMTest : public KFDBaseComponentTest {
    const double CuVariance = 0.15;
    const double CuNegVariance = 1.0 - CuVariance;
    const double CuPosVariance = 1.0 + CuVariance;
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_QCM_TEST__H__
@@ -234,131 +234,6 @@ TEST_F(KFDSVMEvictTest, BasicTest) {
    TEST_END
 }

-/* Shader to read local buffers using multiple wavefronts in parallel
- * until address buffer is filled with specific value 0x5678 by host program,
- * then each wavefront fills value 0x5678 at corresponding result buffer and quit
- *
- * initial state:
- *   s[0:1] - address buffer base address
- *   s[2:3] - result buffer base address
- *   s4 - workgroup id
- *   v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
- * registers:
- *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
- *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
- *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
- *   v[6:7] - local buf address used for read test
- */
-static const char* gfx9_ReadMemory =
-"\
-    shader ReadMemory\n\
-    type(CS)\n\
-    \n\
-    // compute address of corresponding output buffer\n\
-    v_mov_b32       v0, s4                  // use workgroup id as index\n\
-    v_lshlrev_b32   v0, 2, v0               // v0 *= 4\n\
-    v_add_co_u32    v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4\n\
-    v_mov_b32       v5, s3\n\
-    v_add_u32       v5, vcc_lo, v5\n\
-    \n\
-    // compute input buffer offset used to store corresponding local buffer address\n\
-    v_lshlrev_b32   v0, 1, v0               // v0 *= 8\n\
-    v_add_co_u32    v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8\n\
-    v_mov_b32       v3, s1\n\
-    v_add_u32       v3, vcc_lo, v3\n\
-    \n\
-    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
-    flat_load_dwordx2   v[6:7], v[2:3] slc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    \n\
-    v_mov_b32       v8, 0x5678\n\
-    s_movk_i32      s8, 0x5678\n\
-L_REPEAT:\n\
-    s_load_dword    s16, s[0:1], 0x0 glc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    s_cmp_eq_i32    s16, s8\n\
-    s_cbranch_scc1  L_QUIT                  // if notified to quit by host\n\
-    // loop read 64M local buffer starting at v[6:7]\n\
-    // every 4k page only read once\n\
-    v_mov_b32       v9, 0\n\
-    v_mov_b32       v10, 0x1000             // 4k page\n\
-    v_mov_b32       v11, 0x4000000          // 64M size\n\
-    v_mov_b32       v12, v6\n\
-    v_mov_b32       v13, v7\n\
-L_LOOP_READ:\n\
-    flat_load_dwordx2   v[14:15], v[12:13] slc\n\
-    v_add_u32       v9, v9, v10 \n\
-    v_add_co_u32    v12, vcc, v12, v10\n\
-    v_add_u32       v13, vcc_lo, v13\n\
-    v_cmp_lt_u32    vcc, v9, v11\n\
-    s_cbranch_vccnz L_LOOP_READ\n\
-    s_branch        L_REPEAT\n\
-L_QUIT:\n\
-    flat_store_dword v[4:5], v8\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish\n\
-    s_endpgm\n\
-    end\n\
-";
-
-static const char* gfx8_ReadMemory =
-"\
-    shader ReadMemory\n\
-    asic(VI)\n\
-    type(CS)\n\
-    \n\
-    // compute address of corresponding output buffer\n\
-    v_mov_b32       v0, s4                  // use workgroup id as index\n\
-    v_lshlrev_b32   v0, 2, v0               // v0 *= 4\n\
-    v_add_u32       v4, vcc, s2, v0         // v[4:5] = s[2:3] + v0 * 4\n\
-    v_mov_b32       v5, s3\n\
-    v_addc_u32      v5, vcc, v5, 0, vcc\n\
-    \n\
-    // compute input buffer offset used to store corresponding local buffer address\n\
-    v_lshlrev_b32   v0, 1, v0               // v0 *= 8\n\
-    v_add_u32       v2, vcc, s0, v0         // v[2:3] = s[0:1] + v0 * 8\n\
-    v_mov_b32       v3, s1\n\
-    v_addc_u32      v3, vcc, v3, 0, vcc\n\
-    \n\
-    // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\
-    flat_load_dwordx2   v[6:7], v[2:3] slc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    \n\
-    v_mov_b32       v8, 0x5678\n\
-    s_movk_i32      s8, 0x5678\n\
-L_REPEAT:\n\
-    s_load_dword    s16, s[0:1], 0x0 glc\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory reads to finish\n\
-    s_cmp_eq_i32    s16, s8\n\
-    s_cbranch_scc1  L_QUIT                  // if notified to quit by host\n\
-    // loop read 64M local buffer starting at v[6:7]\n\
-    // every 4k page only read once\n\
-    v_mov_b32       v9, 0\n\
-    v_mov_b32       v10, 0x1000             // 4k page\n\
-    v_mov_b32       v11, 0x4000000          // 64M size\n\
-    v_mov_b32       v12, v6\n\
-    v_mov_b32       v13, v7\n\
-L_LOOP_READ:\n\
-    flat_load_dwordx2   v[14:15], v[12:13] slc\n\
-    v_add_u32       v9, vcc, v9, v10 \n\
-    v_add_u32       v12, vcc, v12, v10\n\
-    v_addc_u32      v13, vcc, v13, 0, vcc\n\
-    v_cmp_lt_u32    vcc, v9, v11\n\
-    s_cbranch_vccnz L_LOOP_READ\n\
-    s_branch        L_REPEAT\n\
-L_QUIT:\n\
-    flat_store_dword v[4:5], v8\n\
-    s_waitcnt       vmcnt(0) & lgkmcnt(0)   // wait for memory writes to finish\n\
-    s_endpgm\n\
-    end\n\
-";
-
-std::string KFDSVMEvictTest::CreateShader() {
-    if (m_FamilyId >= FAMILY_AI)
-        return gfx9_ReadMemory;
-    else
-        return gfx8_ReadMemory;
-}
-
 /* Evict and restore queue test
 *
 * N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored
@@ -434,7 +309,7 @@ TEST_F(KFDSVMEvictTest, QueueTest) {
    for (i = 0; i < wavefront_num; i++)
        *(localBufAddr + i) = pBuffers[i];

-    m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(ReadMemoryIsa, isaBuffer.As<char*>()));

    PM4Queue pm4Queue;
    ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode));
@@ -28,7 +28,6 @@
 #include <vector>
 #include "KFDLocalMemoryTest.hpp"
 #include "KFDBaseComponentTest.hpp"
-#include "IsaGenerator.hpp"

 // @class KFDEvictTest
 // Test eviction and restore procedure using two processes
@@ -34,8 +34,6 @@ void KFDSVMRangeTest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    SVMSetXNACKMode();

    ROUTINE_END
@@ -44,10 +42,6 @@ void KFDSVMRangeTest::SetUp() {
 void KFDSVMRangeTest::TearDown() {
    ROUTINE_START

-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;
-
    SVMRestoreXNACKMode();

    KFDBaseComponentTest::TearDown();
@@ -80,7 +74,7 @@ TEST_F(KFDSVMRangeTest, BasicSystemMemTest) {

    srcSysBuffer.Fill(0x01010101);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.SetSkipWaitConsump(0);
@@ -364,7 +358,8 @@ TEST_F(KFDSVMRangeTest, EvictSystemRangeTest) {
    ASSERT_SUCCESS(sdmaQueue.Create(defaultGPUNode));

    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/);
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    Dispatch dispatch0(isaBuffer);
    dispatch0.SetArgs(srcBuffer.As<void*>(), dstBuffer.As<void*>());
@@ -458,7 +453,8 @@ TEST_F(KFDSVMRangeTest, PartialUnmapSysMemTest) {

    munmap(pBuf2, Buf2Size);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
+
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));

    Dispatch dispatch(isaBuffer);
@@ -507,7 +503,7 @@ TEST_F(KFDSVMRangeTest, BasicVramTest) {

    srcSysBuffer.Fill(0x01010101);

-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    queue.SetSkipWaitConsump(0);
@@ -943,7 +939,9 @@ TEST_F(KFDSVMRangeTest, MigratePolicyTest) {
 #ifdef USE_PM4_QUEUE_TRIGGER_VM_FAULT
    HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode);
    PM4Queue queue;
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
+
    ASSERT_SUCCESS(queue.Create(defaultGPUNode));

    for (HSAuint64 i = 0; i < BufferSize / 8; i += 512) {
@@ -26,21 +26,17 @@

 #include <gtest/gtest.h>

-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class KFDSVMRangeTest : public KFDBaseComponentTest {
 public:
-    KFDSVMRangeTest() :m_pIsaGen(NULL) {}
+    KFDSVMRangeTest() {}
    ~KFDSVMRangeTest() {}
    void SplitRangeTest(int defaultGPUNode, int prefetch_location);

 protected:
    virtual void SetUp();
    virtual void TearDown();
-
- protected:  // Members
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __KFD_LOCALMEMORY_TEST__H__
@@ -231,6 +231,12 @@ bool isTonga(const HsaNodeProperties *props) {
    return false;
 }

+/* Pack the node's engine id as (Major << 16) | (Minor << 8) | Stepping. */
+const uint32_t GetGfxVersion(const HsaNodeProperties *props) {
+    const auto &engineId = props->EngineId.ui32;
+    return (engineId.Major << 16) | (engineId.Minor << 8) | engineId.Stepping;
+}
+
 HSAuint64 GetSystemTickCountInMicroSec() {
    struct timeval t;
    gettimeofday(&t, 0);
@@ -52,6 +52,7 @@ bool is_dgpu();
 bool isTonga(const HsaNodeProperties *props);
 bool hasPciAtomicsSupport(int node);
 unsigned int FamilyIdFromNode(const HsaNodeProperties *props);
+const uint32_t GetGfxVersion(const HsaNodeProperties *props);

 void GetHwQueueInfo(const HsaNodeProperties *props,
                 unsigned int *p_num_cp_queues,
@@ -34,16 +34,11 @@ void RDMATest::SetUp() {

    KFDBaseComponentTest::SetUp();

-    m_pIsaGen = IsaGenerator::Create(m_FamilyId);
-
    ROUTINE_END
 }

 void RDMATest::TearDown() {
    ROUTINE_START
-    if (m_pIsaGen)
-        delete m_pIsaGen;
-    m_pIsaGen = NULL;

    KFDBaseComponentTest::TearDown();

@@ -77,7 +72,8 @@ TEST_F(RDMATest, GPUDirect) {
    srcSysBuffer.Fill(0xfe);

    /* Put 'copy dword' command to ISA buffer */
-    m_pIsaGen->GetCopyDwordIsa(isaBuffer);
+    ASSERT_SUCCESS(m_pAsm->RunAssembleBuf(CopyDwordIsa, isaBuffer.As<char*>()));
+

    ASSERT_SUCCESS(queue.Create(defaultGPUNode));
    Dispatch dispatch(isaBuffer);
@@ -26,20 +26,16 @@

 #include <gtest/gtest.h>

-#include "IsaGenerator.hpp"
 #include "KFDBaseComponentTest.hpp"

 class RDMATest : public KFDBaseComponentTest {
 public:
-    RDMATest():m_pIsaGen(NULL) {}
+    RDMATest() {}
    ~RDMATest() {}

 protected:
    virtual void SetUp();
    virtual void TearDown();
-
- protected:  // Members
-    IsaGenerator* m_pIsaGen;
 };

 #endif  // __RDMA_TEST__H__
@@ -0,0 +1,609 @@
+/*
+ * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ShaderStore.hpp"
+
+/**
+ * KFDASMTest List
+ */
+
+const std::vector<const char*> ShaderList = {
+    NoopIsa,
+    CopyDwordIsa,
+    InfiniteLoopIsa,
+    AtomicIncIsa,
+    ScratchCopyDwordIsa,
+    PollMemoryIsa,
+    CopyOnSignalIsa,
+    PollAndCopyIsa,
+    WriteFlagAndValueIsa,
+    WriteAndSignalIsa,
+    LoopIsa,
+    IterateIsa,
+    ReadMemoryIsa,
+    GwsInitIsa,
+    GwsAtomicIncreaseIsa,
+};
+
+/**
+ * Macros
+ */
+
+/* Create macros for portable v_add_co_u32, v_add_co_ci_u32,
+ * and v_cmp_lt_u32
+ */
+#define SHADER_MACROS \
+    "   .text\n"\
+    "   .macro V_ADD_CO_U32 vdst, src0, vsrc1\n"\
+    "       .if (.amdgcn.gfx_generation_number >= 10)\n"\
+    "           v_add_co_u32        \\vdst, vcc_lo, \\src0, \\vsrc1\n"\
+    "       .elseif (.amdgcn.gfx_generation_number >= 9)\n"\
+    "           v_add_co_u32        \\vdst, vcc, \\src0, \\vsrc1\n"\
+    "       .else\n"\
+    "           v_add_u32           \\vdst, vcc, \\src0, \\vsrc1\n"\
+    "       .endif\n"\
+    "   .endm\n"\
+    "   .macro V_ADD_CO_CI_U32 vdst, src0, vsrc1\n"\
+    "       .if (.amdgcn.gfx_generation_number >= 10)\n"\
+    "           v_add_co_ci_u32     \\vdst, vcc_lo, \\src0, \\vsrc1, vcc_lo\n"\
+    "       .elseif (.amdgcn.gfx_generation_number >= 9)\n"\
+    "           v_addc_co_u32       \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\
+    "       .else\n"\
+    "           v_addc_u32          \\vdst, vcc, \\src0, \\vsrc1, vcc\n"\
+    "       .endif\n"\
+    "   .endm\n"\
+    "   .macro V_CMP_LT_U32 src0, vsrc1\n"\
+    "       .if (.amdgcn.gfx_generation_number >= 10)\n"\
+    "           v_cmp_lt_u32        vcc_lo, \\src0, \\vsrc1\n"\
+    "       .else\n"\
+    "           v_cmp_lt_u32        vcc, \\src0, \\vsrc1\n"\
+    "       .endif\n"\
+    "   .endm\n"
+
+/**
+ * Common
+ */
+
+const char *NoopIsa = R"(
+        .text
+        s_endpgm
+)";
+
+const char *CopyDwordIsa = R"(
+        .text
+        v_mov_b32 v0, s0
+        v_mov_b32 v1, s1
+        v_mov_b32 v2, s2
+        v_mov_b32 v3, s3
+        flat_load_dword v4, v[0:1] glc slc
+        s_waitcnt 0
+        flat_store_dword v[2:3], v4 glc slc
+        s_endpgm
+)";
+
+const char *InfiniteLoopIsa = R"(
+        .text
+        LOOP:
+        s_branch LOOP
+        s_endpgm
+)";
+
+const char *AtomicIncIsa = R"(
+        .text
+        v_mov_b32 v0, s0
+        v_mov_b32 v1, s1
+        .if (.amdgcn.gfx_generation_number >= 8)
+            v_mov_b32 v2, 1
+            flat_atomic_add v3, v[0:1], v2 glc slc
+        .else
+            v_mov_b32 v2, -1
+            flat_atomic_inc v3, v[0:1], v2 glc slc
+        .endif
+        s_waitcnt 0
+        s_endpgm
+)";
+
+/**
+ * KFDMemoryTest
+ */
+
+const char *ScratchCopyDwordIsa = R"(
+        .text
+        // Copy the parameters from scalar registers to vector registers
+        .if (.amdgcn.gfx_generation_number >= 9)
+            v_mov_b32 v0, s0
+            v_mov_b32 v1, s1
+            v_mov_b32 v2, s2
+            v_mov_b32 v3, s3
+        .else
+            v_mov_b32_e32 v0, s0
+            v_mov_b32_e32 v1, s1
+            v_mov_b32_e32 v2, s2
+            v_mov_b32_e32 v3, s3
+        .endif
+        // Setup the scratch parameters. This assumes a single 16-reg block
+        .if (.amdgcn.gfx_generation_number >= 10)
+            s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+            s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+        .elseif (.amdgcn.gfx_generation_number == 9)
+            s_mov_b32 flat_scratch_lo, s4
+            s_mov_b32 flat_scratch_hi, s5
+        .else
+            s_mov_b32 flat_scratch_lo, 8
+            s_mov_b32 flat_scratch_hi, 0
+        .endif
+        // Copy a dword between the passed addresses
+        flat_load_dword v4, v[0:1] slc
+        s_waitcnt vmcnt(0) & lgkmcnt(0)
+        flat_store_dword v[2:3], v4 slc
+        s_endpgm
+)";
+
+/* Continuously poll the src buffer and check its value.
+ * After the src buffer is filled with a specific value (0x5678,
+ * by the host program), fill the dst buffer with the same
+ * value (0x5678) and quit.
+ */
+const char *PollMemoryIsa = R"(
+        .text
+        // Assume src address in s0, s1, and dst address in s2, s3
+        s_movk_i32 s18, 0x5678
+        .if (.amdgcn.gfx_generation_number >= 10)
+            v_mov_b32 v0, s2
+            v_mov_b32 v1, s3
+            v_mov_b32 v2, 0x5678
+        .endif
+        LOOP:
+        s_load_dword s16, s[0:1], 0x0 glc
+        s_cmp_eq_i32 s16, s18
+        s_cbranch_scc0   LOOP
+        .if (.amdgcn.gfx_generation_number >= 10)
+            flat_store_dword v[0:1], v2 slc
+        .else
+            s_store_dword s18, s[2:3], 0x0 glc
+        .endif
+        s_endpgm
+)";
+
+/* Similar to PollMemoryIsa except that the buffer
+ * polled can be non-coherent memory. SCC system-level
+ * cache coherence is not supported in the scalar (smem) path,
+ * so use vmem operations with the scc modifier.
+ *
+ * Note: Only works on Aldebaran, and even then the scc modifier
+ *       has been defeatured. This shader is more or less
+ *       deprecated.
+ */
+const char *PollNCMemoryIsa = R"(
+        .text
+        // Assume src address in s0, s1, and dst address in s2, s3
+        v_mov_b32 v6, 0x5678
+        v_mov_b32 v0, s0
+        v_mov_b32 v1, s1
+        LOOP:
+        flat_load_dword v4, v[0:1] scc
+        v_cmp_eq_u32 vcc, v4, v6
+        s_cbranch_vccz   LOOP
+        v_mov_b32 v0, s2
+        v_mov_b32 v1, s3
+        flat_store_dword v[0:1], v6 scc
+        s_endpgm
+)";
+
+/* Input: A buffer of at least 3 dwords.
+ * DW0: used as a signal. 0xcafe means it is signaled.
+ * DW1: Input buffer for the device to read.
+ * DW2: Output buffer for the device to write.
+ * Once the signal is received, the device copies DW1 to DW2.
+ * This shader continuously polls the signal buffer;
+ * once the signal buffer is signaled, it copies the input buffer
+ * to the output buffer.
+ */
+const char *CopyOnSignalIsa = R"(
+        .text
+        // Assume input buffer in s0, s1
+        .if (.amdgcn.gfx_generation_number >= 10)
+            s_add_u32 s2, s0, 0x8
+            s_addc_u32 s3, s1, 0x0
+            s_mov_b32 s18, 0xcafe
+            v_mov_b32 v0, s0
+            v_mov_b32 v1, s1
+            v_mov_b32 v4, s2
+            v_mov_b32 v5, s3
+        .else
+            s_mov_b32 s18, 0xcafe
+        .endif
+        POLLSIGNAL:
+        s_load_dword s16, s[0:1], 0x0 glc
+        s_cmp_eq_i32 s16, s18
+        s_cbranch_scc0   POLLSIGNAL
+        s_load_dword s17, s[0:1], 0x4 glc
+        s_waitcnt vmcnt(0) & lgkmcnt(0)
+        .if (.amdgcn.gfx_generation_number >= 10)
+            v_mov_b32 v2, s17
+            flat_store_dword v[4:5], v2 glc
+        .else
+            s_store_dword s17, s[0:1], 0x8 glc
+        .endif
+        s_waitcnt vmcnt(0) & lgkmcnt(0)
+        s_endpgm
+)";
+
+/* Continuously poll the flag at the src buffer.
+ * Once the flag at s[0:1] is set to 1,
+ * copy the value from s[0:1]+4 to the dst buffer.
+ *
+ * Note: Only works on GFX9 (only used in
+ *       Aldebaran tests)
+ */
+const char *PollAndCopyIsa = R"(
+        .text
+        // Assume src buffer in s[0:1] and dst buffer in s[2:3]
+        .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10)
+            // Path for Aldebaran
+            v_mov_b32 v0, s0
+            v_mov_b32 v1, s1
+            v_mov_b32 v18, 0x1
+            LOOP_ALDBRN:
+            flat_load_dword v16, v[0:1] glc
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            v_cmp_eq_i32 vcc, v16, v18
+            s_cbranch_vccz   LOOP_ALDBRN
+            buffer_invl2
+            s_load_dword s17, s[0:1], 0x4 glc
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            s_store_dword s17, s[2:3], 0x0 glc
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            buffer_wbl2
+        .elseif (.amdgcn.gfx_generation_number == 9)
+            s_movk_i32 s18, 0x1
+            LOOP:
+            s_load_dword s16, s[0:1], 0x0 glc
+            s_cmp_eq_i32 s16, s18
+            s_cbranch_scc0   LOOP
+            s_load_dword s17, s[0:1], 0x4 glc
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            s_store_dword s17, s[2:3], 0x0 glc
+        .endif
+        s_waitcnt vmcnt(0) & lgkmcnt(0)
+        s_endpgm
+)";
+
+/* Publish a value in a shared buffer and then raise a flag next to it.
+ *
+ * Input0 (s[0:1]): A buffer of at least 2 dwords.
+ * DW0: used as a signal. 0x1 is written here last to signal.
+ * DW1: receives the value read from the 2nd input buffer,
+ *      for other device to read. It is written first.
+ * Input1 (s[2:3]): A buffer of at least 2 dwords.
+ * DW0: used as the value to be written.
+ *
+ * Order of operations: load the value from Input1, store it to
+ * Input0 DW1, issue buffer_wbl2, then store the 0x1 flag to Input0 DW0.
+ *
+ * Note: Only works on Aldebaran. The whole body is inside an .if for
+ *       GFX9 stepping 10; any other target assembles to s_endpgm only.
+ */
+const char *WriteFlagAndValueIsa = R"(
+        .text
+        // Assume two inputs buffer in s[0:1] and s[2:3]
+        .if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_stepping == 10)
+            v_mov_b32 v0, s0
+            v_mov_b32 v1, s1
+            s_load_dword s18, s[2:3], 0x0 glc
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            s_store_dword s18, s[0:1], 0x4 glc
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            buffer_wbl2
+            s_waitcnt vmcnt(0) & lgkmcnt(0)
+            v_mov_b32 v16, 0x1
+            flat_store_dword v[0:1], v16 glc
+        .endif
+        s_endpgm
+)";
+
+/* Write a payload, write to the second input address, then raise a signal.
+ *
+ * Input0 (s[0:1]): A buffer of at least 2 dwords.
+ * DW0: used as a signal. 0xcafe is written here last to signal.
+ * DW1: 0xbeef is written to this dword first, for other device to read.
+ * Input1 (s[2:3]): mmio base address. 0x1 is written to this address
+ *                  after the payload and before the signal.
+ *
+ * GFX10 and later copy the addresses into VGPRs and use
+ * flat_store_dword; earlier generations use s_store_dword directly
+ * on the SGPR addresses. Both paths issue the three stores in the
+ * same order: payload, mmio write, signal.
+ */
+const char *WriteAndSignalIsa = R"(
+        .text
+        // Assume input buffer in s0, s1
+        .if (.amdgcn.gfx_generation_number >= 10)
+            s_add_u32 s4, s0, 0x4
+            s_addc_u32 s5, s1, 0x0
+            v_mov_b32 v0, s0
+            v_mov_b32 v1, s1
+            v_mov_b32 v2, s2
+            v_mov_b32 v3, s3
+            v_mov_b32 v4, s4
+            v_mov_b32 v5, s5
+            v_mov_b32 v18, 0xbeef
+            flat_store_dword v[4:5], v18 glc
+            v_mov_b32 v18, 0x1
+            flat_store_dword v[2:3], v18 glc
+            v_mov_b32 v18, 0xcafe
+            flat_store_dword v[0:1], v18 glc
+        .else
+            s_mov_b32 s18, 0xbeef
+            s_store_dword s18, s[0:1], 0x4 glc
+            s_mov_b32 s18, 0x1
+            s_store_dword s18, s[2:3], 0 glc
+            s_mov_b32 s18, 0xcafe
+            s_store_dword s18, s[0:1], 0x0 glc
+        .endif
+        s_endpgm
+)";
+
+/**
+ * KFDQMTest
+ */
+
+/* A simple isa loop program with dense mathematic operations
+ * s1 controls the number iterations of the loop
+ * This shader can be used by GFX8, GFX9 and GFX10
+ *
+ * Loop control: s0 starts at 0x8 and is incremented by 1 at the end of
+ * every iteration; the loop exits once s1 <= s0 (signed compare). With
+ * s1 = 0xff as set below, the body runs 0xff - 0x8 = 247 times.
+ *
+ * Each iteration:
+ *   - builds s[8:11] from s4, s1, s6 and s7 and uses it as the resource
+ *     operand of s_buffer_load_dwordx8, which loads into s[8:15]
+ *     (NOTE(review): s4, s6 and s7 are not initialized by this shader;
+ *     presumably whatever the dispatch left in them - confirm)
+ *   - runs a chain of v_add/v_mul/v_log/v_exp/v_sqrt/v_rsq operations on
+ *     the loaded values and accumulates into v0..v16
+ *
+ * Nothing is stored to memory: the shader contains no store
+ * instruction, so the accumulated VGPR values are discarded at s_endpgm.
+ */
+const char *LoopIsa = R"(
+        .text
+        s_movk_i32    s0, 0x0008
+        s_movk_i32    s1, 0x00ff
+        v_mov_b32     v0, 0
+        v_mov_b32     v1, 0
+        v_mov_b32     v2, 0
+        v_mov_b32     v3, 0
+        v_mov_b32     v4, 0
+        v_mov_b32     v5, 0
+        v_mov_b32     v6, 0
+        v_mov_b32     v7, 0
+        v_mov_b32     v8, 0
+        v_mov_b32     v9, 0
+        v_mov_b32     v10, 0
+        v_mov_b32     v11, 0
+        v_mov_b32     v12, 0
+        v_mov_b32     v13, 0
+        v_mov_b32     v14, 0
+        v_mov_b32     v15, 0
+        v_mov_b32     v16, 0
+        LOOP:
+        s_mov_b32     s8, s4
+        s_mov_b32     s9, s1
+        s_mov_b32     s10, s6
+        s_mov_b32     s11, s7
+        s_cmp_le_i32  s1, s0
+        s_cbranch_scc1  END_OF_PGM
+        s_buffer_load_dwordx8  s[8:15], s[8:11], 0x10
+        v_add_f32     v0, 2.0, v0
+        v_cvt_f32_i32 v17, s1
+        s_waitcnt     lgkmcnt(0)
+        v_add_f32     v18, s8, v17
+        v_add_f32     v19, s9, v17
+        v_add_f32     v20, s10, v17
+        v_add_f32     v21, s11, v17
+        v_add_f32     v22, s12, v17
+        v_add_f32     v23, s13, v17
+        v_add_f32     v24, s14, v17
+        v_add_f32     v17, s15, v17
+        v_log_f32     v25, v18
+        v_mul_f32     v25, v22, v25
+        v_exp_f32     v25, v25
+        v_log_f32     v26, v19
+        v_mul_f32     v26, v23, v26
+        v_exp_f32     v26, v26
+        v_log_f32     v27, v20
+        v_mul_f32     v27, v24, v27
+        v_exp_f32     v27, v27
+        v_log_f32     v28, v21
+        v_mul_f32     v28, v17, v28
+        v_exp_f32     v28, v28
+        v_add_f32     v5, v5, v25
+        v_add_f32     v6, v6, v26
+        v_add_f32     v7, v7, v27
+        v_add_f32     v8, v8, v28
+        v_mul_f32     v18, 0x3fb8aa3b, v18
+        v_exp_f32     v18, v18
+        v_mul_f32     v19, 0x3fb8aa3b, v19
+        v_exp_f32     v19, v19
+        v_mul_f32     v20, 0x3fb8aa3b, v20
+        v_exp_f32     v20, v20
+        v_mul_f32     v21, 0x3fb8aa3b, v21
+        v_exp_f32     v21, v21
+        v_add_f32     v9, v9, v18
+        v_add_f32     v10, v10, v19
+        v_add_f32     v11, v11, v20
+        v_add_f32     v12, v12, v21
+        v_sqrt_f32    v18, v22
+        v_sqrt_f32    v19, v23
+        v_sqrt_f32    v20, v24
+        v_sqrt_f32    v21, v17
+        v_add_f32     v13, v13, v18
+        v_add_f32     v14, v14, v19
+        v_add_f32     v15, v15, v20
+        v_add_f32     v16, v16, v21
+        v_rsq_f32     v18, v22
+        v_rsq_f32     v19, v23
+        v_rsq_f32     v20, v24
+        v_rsq_f32     v17, v17
+        v_add_f32     v1, v1, v18
+        v_add_f32     v2, v2, v19
+        v_add_f32     v3, v3, v20
+        v_add_f32     v4, v4, v17
+        s_add_u32     s0, s0, 1
+        s_branch      LOOP
+        END_OF_PGM:
+        s_endpgm
+)";
+
+
+/**
+ * KFDCWSRTest
+ */
+
+/* Count up to the requested iteration number and store the final
+ * counter into this workgroup's slot of the result buffer.
+ *
+ * Initial state:
+ *   s[0:1] - 64 bits iteration number; only the lower 32 bits are useful.
+ *   s[2:3] - result buffer base address
+ *   s4 - workgroup id
+ *   v0 - workitem id, always 0 because
+ *        NUM_THREADS_X(number of threads) in workgroup set to 1
+ * Registers:
+ *   v0 - calculated workitem = v0 + s4 * NUM_THREADS_X, which is s4;
+ *        shifted left by 2 to become the byte offset into the result buffer
+ *   v2 - = s0, 32 bits iteration number
+ *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
+ *   v6 - counter
+ *
+ * The counter is incremented before it is compared, so the loop body
+ * runs at least once: the stored value is the iteration number, or 1
+ * when the iteration number is 0.
+ *
+ * The upper-case mnemonics (V_ADD_CO_U32, V_ADD_CO_CI_U32, V_CMP_LT_U32)
+ * are assembler macros supplied by SHADER_MACROS, which is prepended to
+ * the shader text.
+ */
+const char *IterateIsa = SHADER_MACROS R"(
+        // Copy the parameters from scalar registers to vector registers
+        v_mov_b32               v2, s0          // v[2:3] = s[0:1]
+        v_mov_b32               v3, s1          // v[2:3] = s[0:1]
+        v_mov_b32               v0, s4          // use workgroup id as index
+        v_lshlrev_b32           v0, 2, v0       // v0 *= 4
+        V_ADD_CO_U32            v4, s2, v0      // v[4:5] = s[2:3] + v0 * 4
+        v_mov_b32               v5, s3          // v[4:5] = s[2:3] + v0 * 4
+        V_ADD_CO_CI_U32         v5, v5, 0       // v[4:5] = s[2:3] + v0 * 4
+        v_mov_b32               v6, 0
+        LOOP:
+        V_ADD_CO_U32            v6, 1, v6
+
+        // Compare the counter (v6) to the iteration value (v2), and loop
+        // again while v6 < v2 (i.e. if VCC is not zero after the comparison)
+        V_CMP_LT_U32            v6, v2
+        s_cbranch_vccnz LOOP
+        flat_store_dword        v[4:5], v6
+        s_waitcnt vmcnt(0) & lgkmcnt(0)
+        s_endpgm
+)";
+
+/**
+ * KFDEvictTest
+ */
+
+/* Shader to read local buffers using multiple wavefronts in parallel
+ * until address buffer is filled with specific value 0x5678 by host program,
+ * then each wavefront fills value 0x5678 at corresponding result buffer and quit
+ *
+ * Initial state:
+ *   s[0:1] - address buffer base address
+ *   s[2:3] - result buffer base address
+ *   s4 - workgroup id
+ *   v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1
+ * Registers:
+ *   v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X
+ *   v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8
+ *   v[4:5] - corresponding output buf address: s[2:3] + v0 * 4
+ *   v[6:7] - local buf address used for read test
+ *   v8 / s8 - the constant 0x5678 (value to store / value to compare against)
+ *   v9 - byte offset already read in the current pass
+ *   v10 - read stride, 0x1000 (one 4k page)
+ *   v11 - size of one pass, 0x4000000 (64M)
+ *   v[12:13] - running read address, restarted from v[6:7] on every pass
+ *
+ * Flow:
+ *   1. The 64-bit local buffer address for this workgroup is loaded once
+ *      from the address buffer into v[6:7].
+ *   2. L_REPEAT: dword 0 of the address buffer (s[0:1] + 0) is re-read;
+ *      if it equals 0x5678 the shader jumps to L_QUIT.
+ *   3. L_LOOP_READ: one dwordx2 is loaded from every 4k page of the 64M
+ *      range. The loaded data in v[14:15] is never used, and there is no
+ *      s_waitcnt inside this inner loop.
+ *   4. L_QUIT: 0x5678 is stored to this workgroup's result slot.
+ *
+ * The upper-case mnemonics (V_ADD_CO_U32, V_ADD_CO_CI_U32, V_CMP_LT_U32)
+ * are assembler macros supplied by SHADER_MACROS, which is prepended to
+ * the shader text.
+ */
+const char *ReadMemoryIsa = SHADER_MACROS R"(
+        // Compute address of corresponding output buffer
+        v_mov_b32               v0, s4          // use workgroup id as index
+        v_lshlrev_b32           v0, 2, v0       // v0 *= 4
+        V_ADD_CO_U32            v4, s2, v0      // v[4:5] = s[2:3] + v0 * 4
+        v_mov_b32               v5, s3          // v[4:5] = s[2:3] + v0 * 4
+        V_ADD_CO_CI_U32         v5, v5, 0       // v[4:5] = s[2:3] + v0 * 4
+
+        // Compute input buffer offset used to store corresponding local buffer address
+        v_lshlrev_b32           v0, 1, v0       // v0 *= 8
+        V_ADD_CO_U32            v2, s0, v0      // v[2:3] = s[0:1] + v0 * 8
+        v_mov_b32               v3, s1          // v[2:3] = s[0:1] + v0 * 8
+        V_ADD_CO_CI_U32         v3, v3, 0       // v[2:3] = s[0:1] + v0 * 8
+
+        // Load 64bit local buffer address stored at v[2:3] to v[6:7]
+        flat_load_dwordx2       v[6:7], v[2:3] slc
+        s_waitcnt vmcnt(0) & lgkmcnt(0)         // wait for memory reads to finish
+        v_mov_b32               v8, 0x5678
+        s_movk_i32              s8, 0x5678
+        L_REPEAT:
+        s_load_dword            s16, s[0:1], 0x0 glc
+        s_waitcnt vmcnt(0) & lgkmcnt(0)         // wait for memory reads to finish
+        s_cmp_eq_i32            s16, s8
+        s_cbranch_scc1          L_QUIT          // if notified to quit by host
+
+        // Loop read 64M local buffer starting at v[6:7]
+        // every 4k page only read once
+        v_mov_b32               v9, 0
+        v_mov_b32               v10, 0x1000     // 4k page
+        v_mov_b32               v11, 0x4000000  // 64M size
+        v_mov_b32               v12, v6
+        v_mov_b32               v13, v7
+        L_LOOP_READ:
+        flat_load_dwordx2       v[14:15], v[12:13] slc
+        V_ADD_CO_U32            v9, v9, v10
+        V_ADD_CO_U32            v12, v12, v10
+        V_ADD_CO_CI_U32         v13, v13, 0
+        V_CMP_LT_U32            v9, v11
+        s_cbranch_vccnz         L_LOOP_READ
+        s_branch                L_REPEAT
+        L_QUIT:
+        flat_store_dword        v[4:5], v8
+        s_waitcnt vmcnt(0) & lgkmcnt(0)         // wait for memory writes to finish
+        s_endpgm
+)";
+
+/**
+ * KFDGWSTest
+ */
+
+/* Shader to initialize gws counter to 1
+ *
+ * Input: s[0:1] - address of a dword holding the value passed to
+ *                 ds_gws_init.
+ *
+ * The shader sets m0 to 0, loads the dword at s[0:1], copies it to v0
+ * and issues ds_gws_init on GWS offset 0. The init value therefore
+ * comes from memory rather than from a constant in the shader.
+ * NOTE(review): the "to 1" above depends on what the caller stores at
+ * s[0:1] - confirm against the test that dispatches this shader.
+ */
+const char *GwsInitIsa = R"(
+        .text
+        s_mov_b32 m0, 0
+        s_nop 0
+        s_load_dword s16, s[0:1], 0x0 glc
+        s_waitcnt 0
+        v_mov_b32 v0, s16
+        s_waitcnt 0
+        ds_gws_init v0 offset:0 gds
+        s_waitcnt 0
+        s_endpgm
+)";
+
+/* Atomically increase a value in memory
+ * This is expected to be executed from
+ * multiple work groups simultaneously.
+ * GWS semaphore is used to guarantee
+ * the operation is atomic.
+ *
+ * Input: s[0:1] - address of the dword to increment.
+ *
+ * In both paths m0 is set to 0 and the load/add/store sequence sits
+ * between ds_gws_sema_p and ds_gws_sema_v on GWS offset 0.
+ *   - GFX10 and later: exec_lo is set to 0x1, the address is copied
+ *     to v[0:1], and the increment is done with flat_load_dword /
+ *     v_add_nc_u32 / flat_store_dword. s_waitcnt_vscnt null, 0 waits
+ *     for the store before ds_gws_sema_v is issued.
+ *   - Earlier generations: the increment is done with s_load_dword /
+ *     s_add_u32 / s_store_dword, and s_waitcnt lgkmcnt(0) precedes
+ *     ds_gws_sema_v.
+ */
+const char *GwsAtomicIncreaseIsa = R"(
+        .text
+        // Assume src address in s0, s1
+        .if (.amdgcn.gfx_generation_number >= 10)
+            s_mov_b32 m0, 0
+            s_mov_b32 exec_lo, 0x1
+            v_mov_b32 v0, s0
+            v_mov_b32 v1, s1
+            ds_gws_sema_p offset:0 gds
+            s_waitcnt 0
+            flat_load_dword v2, v[0:1] glc dlc
+            s_waitcnt 0
+            v_add_nc_u32 v2, v2, 1
+            flat_store_dword v[0:1], v2
+            s_waitcnt_vscnt null, 0
+            ds_gws_sema_v offset:0 gds
+        .else
+            s_mov_b32 m0, 0
+            s_nop 0
+            ds_gws_sema_p offset:0 gds
+            s_waitcnt 0
+            s_load_dword s16, s[0:1], 0x0 glc
+            s_waitcnt 0
+            s_add_u32 s16, s16, 1
+            s_store_dword s16, s[0:1], 0x0 glc
+            s_waitcnt lgkmcnt(0)
+            ds_gws_sema_v offset:0 gds
+        .endif
+        s_waitcnt 0
+        s_endpgm
+)";
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Advanced Micro Devices, Inc. All Rights Reserved.
+ * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -21,29 +21,40 @@
 *
 */

-#ifndef _ISAGENERATOR_GFX10_H_
-#define _ISAGENERATOR_GFX10_H_
+#ifndef _SHADERSTORE_H_
+#define _SHADERSTORE_H_

-#include <string>
-#include "IsaGenerator.hpp"
+#include <vector>

-class IsaGenerator_Gfx10 : public IsaGenerator {
- public:
-    virtual void GetNoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetCopyDwordIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetInfiniteLoopIsa(HsaMemoryBuffer& rBuf);
-    virtual void GetAtomicIncIsa(HsaMemoryBuffer& rBuf);
+/* KFDASMTest: list of shader source strings (its definition is not in this header) */
+extern const std::vector<const char*> ShaderList;

- protected:
-    virtual const std::string& GetAsicName();
+/* Common shaders; every symbol declared below is a C string, presumably shader assembly text */
+extern const char *NoopIsa;
+extern const char *CopyDwordIsa;
+extern const char *InfiniteLoopIsa;
+extern const char *AtomicIncIsa;

- private:
-    static const std::string ASIC_NAME;
+/* KFDMemoryTest */
+extern const char *ScratchCopyDwordIsa;
+extern const char *PollMemoryIsa;
+extern const char *PollNCMemoryIsa;
+extern const char *CopyOnSignalIsa;
+extern const char *PollAndCopyIsa;
+extern const char *WriteFlagAndValueIsa;
+extern const char *WriteAndSignalIsa;

-    static const uint32_t NOOP_ISA[];
-    static const uint32_t COPY_DWORD_ISA[];
-    static const uint32_t INFINITE_LOOP_ISA[];
-    static const uint32_t ATOMIC_ADD_ISA[];
-};
+/* KFDQMTest */
+extern const char *LoopIsa;

-#endif  // _ISAGENERATOR_GFX9_H_
+/* KFDCWSRTest */
+extern const char *IterateIsa;
+
+/* KFDEvictTest */
+extern const char *ReadMemoryIsa;
+
+/* KFDGWSTest */
+extern const char *GwsInitIsa;
+extern const char *GwsAtomicIncreaseIsa;
+
+#endif  // _SHADERSTORE_H_