Adding HSA extension AMD AQL profile library, see Readme.txt

Change-Id: Icbc1e0fb0185642eabbab411a2138ea030d22be8
2017-06-06 14:59:08 -05:00
@@ -0,0 +1,28 @@
+#
+# Minimum version of cmake required
+#
+cmake_minimum_required ( VERSION 3.5.0 )
+
+#
+# Setup flag to be verbose or not
+#
+set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
+
+set ( ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
+set ( PROJ_DIR ${ROOT_DIR}/src )
+set ( TEST_DIR ${ROOT_DIR}/test )
+
+#
+# Build sources
+#
+include ( ${PROJ_DIR}/CMakeLists.txt )
+
+#
+# Build tests
+#
+add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test )
+
+#
+# Style format: the -name tests are grouped so that -exec applies to .cpp, .hpp and .h files alike
+#
+execute_process ( COMMAND sh -xc "/usr/bin/find ${ROOT_DIR} '(' -name '*.cpp' -o -name '*.hpp' -o -name '*.h' ')' -exec /usr/bin/clang-format -i -style=file \{\} \;" )
@@ -0,0 +1,40 @@
+HSA extension AMD AQL profile library.
+Provides AQL packets helper methods for
+perfcounters (PMC) and SQ threadtraces (SQTT).
+
+Current library implementation supports only GFX9.
+The library source tree:
+ - doc  - Documentation, the API specification and the presentation
+ - inc  - Public API
+   - hsa_ext_amd_aql_profile.h - AMD AQL profile library public API
+   - amd_aql_pm4_ib_packet.h   - AQL PM4 IB packet type
+ - src  - AMD AQL profile library sources
+   - aqlprofile - AMD AQL profile library
+   - commandwriter - PM4 command writer originated from 'hsa-runtime/tools'
+   - perfcounter - PM4 perfcounter manager originated from 'hsa-runtime/tools'
+   - threadtrace - PM4 threadtrace manager originated from 'hsa-runtime/tools'
+   - util - core/utils library build based on 'hsa-runtime/core/util'
+ - test - the library test suite
+   - ctrl - Test control
+   - common - Test common utils
+   - SimpleConvolution - Simple convolution test
+
+To build the library:
+
+$ cd ..../hsa-ext-aql-profile
+$ mkdir build
+$ cd build
+$ cmake ..
+$ make
+
+To run the test:
+
+$ cd ..../hsa-ext-aql-profile/build
+$ cp ../test/SimpleConvolution/gfx9_SimpleConvolution.hsaco .
+$ test/SimpleConvolution 
+
+to enable PMC profiling:
+export ROCR_ENABLE_PMC=1
+
+to enable SQTT profiling:
+export ROCR_ENABLE_SQTT=1
@@ -0,0 +1,66 @@
+#
+# Compiler Preprocessor definitions (directory-scoped, passed to every compile command).
+#
+add_definitions ( -D__linux__ )
+add_definitions ( -DUNIX_OS )
+add_definitions ( -DLINUX )
+add_definitions ( -D__AMD64__ )
+add_definitions ( -D__x86_64__ )
+add_definitions ( -DAMD_INTERNAL_BUILD )
+add_definitions ( -DLITTLEENDIAN_CPU=1 )
+add_definitions ( -DHSA_LARGE_MODEL= )  # defined with an empty value
+add_definitions ( -DHSA_DEPRECATED= )   # defined with an empty value
+
+#
+# Linux Compiler options: the flag list is started from scratch, then extended one option at a time
+#
+set ( CMAKE_CXX_FLAGS "-std=c++11")
+string ( APPEND CMAKE_CXX_FLAGS " -Werror" )
+string ( APPEND CMAKE_CXX_FLAGS " -Werror=return-type" )
+string ( APPEND CMAKE_CXX_FLAGS " -fexceptions" )
+string ( APPEND CMAKE_CXX_FLAGS " -fvisibility=hidden" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-error=sign-compare" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-error=enum-compare" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-error=comment " )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-error=pointer-arith" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-comment" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-sign-compare" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-pointer-arith" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-write-strings" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-conversion-null" )
+string ( APPEND CMAKE_CXX_FLAGS " -Wno-deprecated-declarations" )
+string ( APPEND CMAKE_CXX_FLAGS " -fno-rtti" )
+string ( APPEND CMAKE_CXX_FLAGS " -fno-math-errno" )
+string ( APPEND CMAKE_CXX_FLAGS " -fno-threadsafe-statics" )
+string ( APPEND CMAKE_CXX_FLAGS " -fms-extensions" )
+string ( APPEND CMAKE_CXX_FLAGS " -fmerge-all-constants" )
+string ( APPEND CMAKE_CXX_FLAGS " -fPIC" )
+
+#
+# Build type comes from BUILD_TYPE; Debug builds additionally get gdb debug information
+#
+set ( CMAKE_BUILD_TYPE ${BUILD_TYPE} )
+if ( "${CMAKE_BUILD_TYPE}" STREQUAL Debug )
+  string ( APPEND CMAKE_CXX_FLAGS " -ggdb" )
+endif ()
+
+#
+# Processor architecture specific code generation flags
+#
+if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" )
+  string ( APPEND CMAKE_CXX_FLAGS " -m64  -msse -msse2" )
+elseif ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86" )
+  string ( APPEND CMAKE_CXX_FLAGS " -m32" )
+endif ()
+
+#
+# Basic Tool Chain Information (TestDir is empty unless TEST_DIR was set by the including script)
+#
+message ( "-------------IS64BIT: " ${IS64BIT} )
+message ( "-----------BuildType: " ${BUILD_TYPE} )
+message ( " -----------Compiler: " ${CMAKE_CXX_COMPILER} )
+message ( " ------------Version: " ${CMAKE_CXX_COMPILER_VERSION} )
+message ( " ------------ProjDir: " ${PROJ_DIR} )
+message ( " ------------TestDir: " ${TEST_DIR} )
+message ( "------HSA-RuntimeDir: " ${HSA_RUNTIME_DIR} )
+message ( " -----------CoreUtil: " ${CORE_UTIL_DIR} )
@@ -0,0 +1,52 @@
+#
+# Build is not supported on Windows platform
+#
+if ( WIN32 )
+  message ( FATAL_ERROR "Windows build is not supported." )
+endif ()
+
+#
+# External dependencies for Rocr Header files (ROCR_INC_DIR must be set in the environment)
+#
+if ( NOT DEFINED ENV{ROCR_INC_DIR} )
+  message ( FATAL_ERROR "ERROR: Environment variable ROCR_INC_DIR is not set" )
+  return ()  # not reached: FATAL_ERROR already stops processing
+endif ()
+
+#
+# External dependencies for Rocr Library files (ROCR_LIB_DIR must be set in the environment)
+#
+if ( NOT DEFINED ENV{ROCR_LIB_DIR} )
+  message ( FATAL_ERROR "ERROR: Environment variable ROCR_LIB_DIR is not set" )
+  return ()  # not reached: FATAL_ERROR already stops processing
+endif ()
+
+#
+# Process Env to determine build type
+#
+string ( TOLOWER "$ENV{ROCR_BLD_TYPE}" type )
+if ( "${type}" STREQUAL debug )
+  set ( ISDEBUG 1 )
+  set ( BUILD_TYPE "Debug" )
+else ()
+  set ( ISDEBUG 0 )
+  set ( BUILD_TYPE "Release" )
+endif ()
+
+#
+# Determine build is 32-bit or 64-bit
+# @note: By default it is not set
+#
+if ( "$ENV{ROCR_BLD_BITS}" STREQUAL 32 )
+    set ( ONLY64STR "" )
+    set ( IS64BIT 0 )
+else ()
+    set ( ONLY64STR "64" )
+    set ( IS64BIT 1 )
+endif ()
+
+#
+# Build information
+#
+message ( "---------ROCR-HdrDir: " $ENV{ROCR_INC_DIR} )
+message ( "---------ROCR-LibDir: " $ENV{ROCR_LIB_DIR} )
@@ -0,0 +1,67 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2017 ADVANCED MICRO DEVICES, INC.
+//
+// AMD is granting you permission to use this software and documentation(if any)
+// (collectively, the "Materials") pursuant to the terms and conditions of the
+// Software License Agreement included with the Materials.If you do not have a
+// copy of the Software License Agreement, contact your AMD representative for a
+// copy.
+//
+// You agree that you will not reverse engineer or decompile the Materials, in
+// whole or in part, except as allowed by applicable law.
+//
+// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY,
+// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE
+// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM
+// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF
+// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
+// of implied warranties, so the above exclusion may not apply to You.
+//
+// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT,
+// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT,
+// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF
+// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total
+// liability to You for all damages, losses, and causes of action (whether in
+// contract, tort (including negligence) or otherwise) exceed the amount of $100
+// USD.  You agree to defend, indemnify and hold harmless AMD and its licensors,
+// and any of their directors, officers, employees, affiliates or agents from
+// and against any and all loss, damage, liability and other expenses (including
+// reasonable attorneys' fees), resulting from Your use of the Software or
+// violation of the terms and conditions of this Agreement.
+//
+// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
+// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
+// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
+// 7013, et seq., or its successor.Use of the Materials by the Government
+// constitutes acknowledgement of AMD's proprietary rights in them.
+//
+// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
+//                      stated in the Software License Agreement.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _AMD_AQL_PM4_IB_PACKET_H_
+#define _AMD_AQL_PM4_IB_PACKET_H_
+
+#include <stdint.h>  // uint16_t, uint32_t
+#include <hsa.h>     // hsa_signal_t
+
+const static uint32_t AMD_AQL_PM4_IB_FORMAT = 1;            // 'pm4_ib_format' field value
+const static uint32_t AMD_AQL_PM4_IB_DW_COUNT_REMAIN = 10;  // 'dw_count_remain' field value
+const static uint32_t AMD_AQL_PM4_IB_RESERVED_COUNT = 8;    // size of 'reserved' array
+
+// AQL Vendor Specific Packet which carries a PM4 IB command
+typedef struct {
+  uint16_t header;
+  uint16_t pm4_ib_format;
+  uint32_t pm4_ib_command[4];
+  uint32_t dw_count_remain;
+  uint32_t reserved[AMD_AQL_PM4_IB_RESERVED_COUNT];
+  hsa_signal_t completion_signal;
+} amd_aql_pm4_ib_packet_t;
+
+#endif  // _AMD_AQL_PM4_IB_PACKET_H_
@@ -0,0 +1,262 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2017 ADVANCED MICRO DEVICES, INC.
+//
+// AMD is granting you permission to use this software and documentation(if any)
+// (collectively, the "Materials") pursuant to the terms and conditions of the
+// Software License Agreement included with the Materials.If you do not have a
+// copy of the Software License Agreement, contact your AMD representative for a
+// copy.
+//
+// You agree that you will not reverse engineer or decompile the Materials, in
+// whole or in part, except as allowed by applicable law.
+//
+// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY,
+// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE
+// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM
+// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF
+// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
+// of implied warranties, so the above exclusion may not apply to You.
+//
+// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT,
+// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT,
+// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF
+// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total
+// liability to You for all damages, losses, and causes of action (whether in
+// contract, tort (including negligence) or otherwise) exceed the amount of $100
+// USD.  You agree to defend, indemnify and hold harmless AMD and its licensors,
+// and any of their directors, officers, employees, affiliates or agents from
+// and against any and all loss, damage, liability and other expenses (including
+// reasonable attorneys' fees), resulting from Your use of the Software or
+// violation of the terms and conditions of this Agreement.
+//
+// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
+// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
+// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
+// 7013, et seq., or its successor.Use of the Materials by the Government
+// constitutes acknowledgement of AMD's proprietary rights in them.
+//
+// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
+//                      stated in the Software License Agreement.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _HSA_EXT_AMD_AQL_PROFILE_H_
+#define _HSA_EXT_AMD_AQL_PROFILE_H_
+
+#include <stdint.h>
+#include <hsa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+///////////////////////////////////////////////////////////////////////
+// Library API:
+// The library provides helper methods for instantiation of
+// the profile context object and for populating of the start
+// and stop AQL packets. The profile object contains a profiling
+// events list and needed for profiling buffers descriptors,
+// a command buffer and an output data buffer. To check if there
+// was an error the library methods return a status code. Also
+// the library provides methods for querying required buffers
+// attributes, to validate the event attributes and to get profiling
+// output data.
+//
+// Returned status:
+//     hsa_status_t – HSA status codes are used from hsa.h header
+//
+// Supported profiling features:
+//
+// Supported profiling events
+typedef enum {
+  HSA_EXT_AQL_PROFILE_EVENT_PMC,
+  HSA_EXT_AQL_PROFILE_EVENT_SQTT
+} hsa_ext_amd_aql_profile_event_type_t;
+
+// Supported performance counters (PMC) blocks
+// The block ID is the same for a block instances set, for example
+// each block instance from the TCC block set, TCC0, TCC1, …, TCCN
+// will have the same block ID HSA_EXT_AQL_PROFILE_BLOCK_TCC.
+typedef enum {
+  HSA_EXT_AQL_PROFILE_BLOCK_CB,
+  HSA_EXT_AQL_PROFILE_BLOCK_CPF,
+  HSA_EXT_AQL_PROFILE_BLOCK_DB,
+  HSA_EXT_AQL_PROFILE_BLOCK_GRBM,
+  HSA_EXT_AQL_PROFILE_BLOCK_GRBMSE,
+  HSA_EXT_AQL_PROFILE_BLOCK_PASU,
+  HSA_EXT_AQL_PROFILE_BLOCK_PASC,
+  HSA_EXT_AQL_PROFILE_BLOCK_SPI,
+  HSA_EXT_AQL_PROFILE_BLOCK_SQ,
+  HSA_EXT_AQL_PROFILE_BLOCK_SQGS,
+  HSA_EXT_AQL_PROFILE_BLOCK_SQVS,
+  HSA_EXT_AQL_PROFILE_BLOCK_SQPS,
+  HSA_EXT_AQL_PROFILE_BLOCK_SQHS,
+  HSA_EXT_AQL_PROFILE_BLOCK_SQCS,
+  HSA_EXT_AQL_PROFILE_BLOCK_SX,
+  HSA_EXT_AQL_PROFILE_BLOCK_TA,
+  HSA_EXT_AQL_PROFILE_BLOCK_TCA,
+  HSA_EXT_AQL_PROFILE_BLOCK_TCC,
+  HSA_EXT_AQL_PROFILE_BLOCK_TD,
+  HSA_EXT_AQL_PROFILE_BLOCK_TCP,
+  HSA_EXT_AQL_PROFILE_BLOCK_GDS,
+  HSA_EXT_AQL_PROFILE_BLOCK_VGT,
+  HSA_EXT_AQL_PROFILE_BLOCK_IA,
+  HSA_EXT_AQL_PROFILE_BLOCK_MC,
+  HSA_EXT_AQL_PROFILE_BLOCK_TCS,
+  HSA_EXT_AQL_PROFILE_BLOCK_WD,
+  HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER
+} hsa_ext_amd_aql_profile_block_name_t;
+
+// PMC event object structure
+// ‘counter_id’ value is specified in GFXIPs perfcounter user guides
+// which is the counters select value, “Performance Counters Selection”
+// chapter.
+typedef struct {
+  hsa_ext_amd_aql_profile_block_name_t block_name;
+  uint32_t block_index;
+  uint32_t counter_id;
+} hsa_ext_amd_aql_profile_event_t;
+
+// Check if event is valid for the specific GPU
+hsa_status_t hsa_ext_amd_aql_profile_validate_event(
+    hsa_agent_t agent,                             // HSA handle for the profiling GPU
+    const hsa_ext_amd_aql_profile_event_t* event,  // Pointer on validated event
+    bool* result);                                 // True if the event valid, False otherwise
+
+// Profiling parameters
+// All parameters are generic and if not applicable for a specific
+// profile configuration then error status will be returned.
+typedef enum {
+  // SQTT applicable parameters
+  HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET,
+  HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK,
+  HSA_EXT_AQL_PROFILE_PARAM_MASK,
+  HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK,
+  HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2
+} hsa_ext_amd_aql_profile_parameter_name_t;
+
+// Profile parameter object
+typedef struct {
+  hsa_ext_amd_aql_profile_parameter_name_t parameter_name;
+  uint32_t value;
+} hsa_ext_amd_aql_profile_parameters_t;
+
+//
+// Profile context object:
+// The library provides a profile object structure which contains
+// the events array, a buffer for the profiling start/stop commands
+// and a buffer for the output data.
+// The buffers are specified by the buffer descriptors and allocated
+// by the application. The buffers allocation attributes, the command
+// buffer size, the PMC output buffer size as well as profiling output
+// data can be get using the generic get profile info helper _get_info.
+//
+// Buffer descriptor
+typedef struct {
+  void* ptr;
+  uint32_t size;
+} hsa_ext_amd_aql_profile_descriptor_t;
+
+// Profile context object structure, contains profiling events list and
+// needed for profiling buffers descriptors, a command buffer and
+// an output data buffer
+typedef struct {
+  hsa_agent_t agent;                                       // GFXIP handle
+  hsa_ext_amd_aql_profile_event_type_t type;               // Events type
+  const hsa_ext_amd_aql_profile_event_t* events;           // Events array
+  uint32_t event_count;                                    // Events count
+  const hsa_ext_amd_aql_profile_parameters_t* parameters;  // Parameters array
+  uint32_t parameter_count;                                // Parameters count
+  hsa_ext_amd_aql_profile_descriptor_t output_buffer;      // Output buffer
+  hsa_ext_amd_aql_profile_descriptor_t command_buffer;     // PM4 commands
+} hsa_ext_amd_aql_profile_profile_t;
+
+//
+// AQL packets populating methods:
+// The helper methods to populate provided by the application START and
+// STOP AQL packets which the application is required to submit before and
+// after profiled GPU task packets respectively.
+//
+// AQL Vendor Specific packet which carries a PM4 command
+typedef struct {
+  uint16_t header;
+  uint16_t pm4_command[27];
+  hsa_signal_t completion_signal;
+} hsa_ext_amd_aql_pm4_packet_t;
+
+// Method to populate the provided AQL packet with profiling start commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ext_amd_aql_profile_start(
+    const hsa_ext_amd_aql_profile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_start_packet);   // [out] profile start AQL packet
+
+// Method to populate the provided AQL packet with profiling stop commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible to set Vendor Specific header type and a completion signal
+hsa_status_t hsa_ext_amd_aql_profile_stop(
+    const hsa_ext_amd_aql_profile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet);    // [out] profile stop AQL packet
+
+// Legacy PM4 profiling packet size; 'static' avoids duplicate symbols when several C files include it
+static const unsigned HSA_EXT_AQL_PROFILE_LEGACY_PM4_PACKET_SIZE = 64;
+// Converts the profiling AQL packet to a PM4 packet written to 'pm4', GFX8 support
+hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4(
+    const hsa_ext_amd_aql_pm4_packet_t* aql_packet,  // AQL packet
+    void* pm4);                                      // PM4 packet blob
+
+//
+// Get profile info:
+// Generic method for getting various profile info including profile buffers
+// attributes like the command buffer size and the profiling PMC results.
+// It’s implied that all counters are 64bit values.
+//
+// Profile generic output data, one entry as passed to the data iteration callback:
+typedef struct {
+  uint32_t sample_id;  // PMC sample index or SQTT buffer index
+  union {
+    struct {
+      hsa_ext_amd_aql_profile_event_t event;  // PMC event
+      uint64_t result;                        // PMC result
+    } pmc_data;
+    hsa_ext_amd_aql_profile_descriptor_t sqtt_data;  // SQTT output data descriptor
+  };
+} hsa_ext_amd_aql_profile_info_data_t;
+
+// Profile attributes
+typedef enum {
+  HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE,  // get_info returns uint32_t value
+  HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE,        // get_info returns uint32_t value
+  HSA_EXT_AQL_PROFILE_INFO_PMC_DATA,             // get_info returns PMC uint64_t value
+                                                 // in info_data object
+  HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA             // get_info returns SQTT buffer ptr/size
+                                                 // in info_data object
+} hsa_ext_amd_aql_profile_info_type_t;
+
+// Definition of output data iterator callback
+typedef hsa_status_t (*hsa_ext_amd_aql_profile_data_callback_t)(
+    hsa_ext_amd_aql_profile_info_type_t info_type,   // [in] data type, PMC or SQTT data
+    hsa_ext_amd_aql_profile_info_data_t* info_data,  // [in] info_data object
+    void* callback_data);                            // [in/out] data passed to the callback
+
+// Method for getting the profile info
+hsa_status_t hsa_ext_amd_aql_profile_get_info(
+    const hsa_ext_amd_aql_profile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_profile_info_type_t attribute,     // [in] requested profile attribute
+    void* value);                                      // [in/out] returned value
+
+// Method for iterating the events output data
+hsa_status_t hsa_ext_amd_aql_profile_iterate_data(
+    const hsa_ext_amd_aql_profile_profile_t* profile,  // [in] profile context object
+    hsa_ext_amd_aql_profile_data_callback_t callback,  // [in] callback to iterate the output data
+    void* data);                                       // [in/out] data passed to the callback
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // _HSA_EXT_AMD_AQL_PROFILE_H_
@@ -0,0 +1,72 @@
+#
+# Minimum version of cmake required
+#
+cmake_minimum_required ( VERSION 3.5.0 )
+
+#
+# Setup flag to be verbose or not
+#
+set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
+
+#
+# Set name for the project
+# @note: Must come before adding any sub-directories
+#
+set ( TARGET_NAME "aqlprofile" )
+project ( ${TARGET_NAME} )
+
+if ( NOT DEFINED PROJ_DIR )
+  set ( PROJ_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
+  set ( ROOT_DIR ${PROJ_DIR}/.. )
+endif ()
+
+set ( API_DIR ${ROOT_DIR}/inc )
+set ( HSA_RUNTIME_DIR ${PROJ_DIR}/../../.. )
+set ( HSA_RUNTIME_OSC_DIR ${HSA_RUNTIME_DIR}/opensrc/hsa-runtime )
+set ( CORE_UTIL_DIR ${HSA_RUNTIME_OSC_DIR}/core/util )
+include_directories ( ${ROOT_DIR} )
+
+#
+# Validate required build environment is setup correctly
+#
+include ( ${ROOT_DIR}/cmake_modules/validateBldEnv.cmake )
+
+#
+# Setup tool chain flags - preprocessor, compiler and linker
+#
+include ( ${ROOT_DIR}/cmake_modules/exportToolFlags.cmake )
+
+#
+# Set Name for Utils library and build it as a
+# static library to be linked with others
+#
+set ( UTIL_LIB "util${ONLY64STR}" )
+add_subdirectory ( ${PROJ_DIR}/util "${PROJECT_BINARY_DIR}/util" )
+
+#
+# Set Name for Cmdwriter library and build it as a
+# static library to be linked with others
+#
+set ( CMDWRITER_LIB "commandwriter${ONLY64STR}" )
+add_subdirectory ( ${PROJ_DIR}/commandwriter "${PROJECT_BINARY_DIR}/commandwriter" )
+
+#
+# Set Name for ThreadTrace library and build it as a
+# static library to be linked with others
+#
+set ( SQTT_LIB "sqtt${ONLY64STR}" )
+add_subdirectory ( ${PROJ_DIR}/threadtrace "${PROJECT_BINARY_DIR}/threadtrace" )
+
+#
+# Set Name for Profiler library and build it as a
+# static library to be linked with others
+#
+set ( PMC_LIB "pmc${ONLY64STR}" )
+add_subdirectory ( ${PROJ_DIR}/perfcounter "${PROJECT_BINARY_DIR}/perfcounter" )
+
+#
+# Build the library and link it with other static
+# libraries that have been built in this regard
+#
+set ( TARGET_LIB "${TARGET_NAME}${ONLY64STR}" )
+add_subdirectory ( ${PROJ_DIR}/${TARGET_NAME} "${PROJECT_BINARY_DIR}/${TARGET_NAME}" )
@@ -0,0 +1,20 @@
+#
+# Source files for the AQL profile library
+#
+set ( LIB_SRC aql_profile.cpp populate_aql.cpp gfx8_factory.cpp gfx9_factory.cpp )
+
+#
+# Header files include path(s).
+#
+include_directories ( $ENV{ROCR_INC_DIR} )
+include_directories ( ${PROJ_DIR}/perfcounter )
+include_directories ( ${PROJ_DIR}/threadtrace )
+include_directories ( ${PROJ_DIR}/commandwriter )
+include_directories ( ${API_DIR} )
+
+#
+# Build the AQL profile library as a shared library linked with the helper libraries
+#
+set ( LIB_LIST ${PMC_LIB} ${SQTT_LIB} ${CMDWRITER_LIB} ${UTIL_LIB} )
+add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} )
+target_link_libraries( ${TARGET_LIB} ${LIB_LIST} c stdc++ dl pthread rt )
@@ -0,0 +1,398 @@
+#include <string>
+
+#include "aql_profile.h"
+#include "pm4_factory.h"
+#include "cmdwriter.h" // commandwriter
+#include "hsa_perf.h" // perfcounter
+#include "thread_trace.h" // threadtrace
+#include "gpu_enum.h"
+#include "gpu_blockinfo.h"
+
+#define PUBLIC_API __attribute__((visibility("default")))
+
+namespace aql_profile {
+
+// Command buffer partitioning manager: [pre cmds][0x100-aligned post cmds] ... [postfix]
+// The postfix holds info_t with the pre/post sizes, so another instance over the same buffer
+// finds the post cmds. NOTE(review): a grown postfix still overlaps info_t - confirm intended
+class CommandBufferMgr {
+  const static uint32_t align_size = 0x100;
+  const static uint32_t align_mask = align_size - 1;
+
+  struct info_t {
+    uint32_t precmds_size;
+    uint32_t postcmds_size;
+  };
+
+  descriptor_t buffer;
+  uint32_t postfix_size;
+  info_t* info;
+
+  uint32_t align(const uint32_t& size) { return (size + align_mask) & ~align_mask; }
+
+ public:
+  CommandBufferMgr(const profile_t* profile)
+      : buffer(profile->command_buffer), postfix_size(0), info(NULL) {
+    info = (info_t*)setPostfix(sizeof(info_t));
+  }
+  // Size of the commands area (the buffer without the postfix), 0 if the postfix did not fit
+  uint32_t getSize() { return buffer.size; }
+  // Grows the postfix to 'size' bytes (never shrinks it); returns its start or NULL if no room
+  void* setPostfix(const uint32_t& size) {
+    if (size > postfix_size) {
+      const uint32_t delta = size - postfix_size;
+      postfix_size = size;
+      buffer.size -= (delta < buffer.size) ? delta : buffer.size;
+    }
+    return (buffer.size != 0) ? (char*)buffer.ptr + buffer.size : NULL;  // byte offset, no void* math
+  }
+  // Records the pre (start) commands size; fails if it exceeds the commands area
+  bool setPreSize(const uint32_t& size) {
+    bool suc = (size <= buffer.size);
+    if (suc) info->precmds_size = size;
+    return suc;
+  }
+  // Post (stop) commands start at the pre commands size rounded up to align_size
+  uint32_t getPostOffset() { return align(info->precmds_size); }
+  // 'size' is the pre + post commands size; records the post size and checks that both fit
+  bool checkTotalSize(const uint32_t& size) {
+    bool suc = (size <= buffer.size);
+    if (suc) suc = (size >= info->precmds_size);
+    if (suc) {
+      info->postcmds_size = size - info->precmds_size;
+      suc = ((getPostOffset() + info->postcmds_size) <= buffer.size);
+    }
+    return suc;
+  }
+  // Location and size of the pre (start) commands
+  descriptor_t getPreDescr() {
+    descriptor_t descr;
+    descr.ptr = buffer.ptr;
+    descr.size = info->precmds_size;
+    return descr;
+  }
+  // Location and size of the post (stop) commands
+  descriptor_t getPostDescr() {
+    descriptor_t descr;
+    descr.ptr = (char*)buffer.ptr + getPostOffset();
+    descr.size = info->postcmds_size;
+    return descr;
+  }
+};
+
+static inline bool is_event_match(const event_t& event1, const event_t& event2) {
+  return (event1.block_name == event2.block_name) && (event1.block_index == event2.block_index) &&
+      (event1.counter_id == event2.counter_id);
+}
+
+hsa_status_t default_pmcdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type,
+                                      hsa_ext_amd_aql_profile_info_data_t* info_data,
+                                      void* callback_data) {
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  hsa_ext_amd_aql_profile_info_data_t* passed_data =
+      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data);
+  // 'passed_data' names the wanted event/sample_id on input and receives the PMC result
+  if (info_type == HSA_EXT_AQL_PROFILE_INFO_PMC_DATA) {
+    if (is_event_match(info_data->pmc_data.event, passed_data->pmc_data.event)) {
+      if (passed_data->sample_id == UINT32_MAX) {
+        passed_data->pmc_data.result += info_data->pmc_data.result;  // accumulate every match
+      } else if (passed_data->sample_id == info_data->sample_id) {
+        passed_data->pmc_data.result = info_data->pmc_data.result;  // the one requested sample
+        status = HSA_STATUS_INFO_BREAK;
+      }
+    }
+  }
+
+  return status;
+}
+
+struct sqtt_ctrl_t {
+  uint32_t status;
+  uint32_t counter;
+  uint32_t writePtr;
+};
+
+hsa_status_t default_sqttdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type,
+                                       hsa_ext_amd_aql_profile_info_data_t* info_data,
+                                       void* callback_data) {
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  hsa_ext_amd_aql_profile_info_data_t* passed_data =
+      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data);
+  // 'passed_data' names the wanted sample_id on input and receives the SQTT data descriptor
+  if (info_type == HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA) {
+    if (info_data->sample_id == passed_data->sample_id) {
+      passed_data->sqtt_data = info_data->sqtt_data;
+      status = HSA_STATUS_INFO_BREAK;  // requested entry found
+    }
+  }
+
+  return status;
+}
+
+}  // aql_profile
+
+extern "C" {
+
+// Check if event is valid for the specific GPU (stub: no validation is performed here)
+PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_validate_event(
+    hsa_agent_t agent, const hsa_ext_amd_aql_profile_event_t* event, bool* result) {
+  return HSA_STATUS_SUCCESS;  // NOTE(review): '*result' is never written - callers read garbage
+}
+
+// Generates start+stop PM4 commands into the command buffer and populates the start AQL packet
+PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_start(
+    const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_start_packet) {
+  // Returns HSA_STATUS_ERROR on any failure: missing manager, bad type/parameter, buffer too small
+  aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile);
+  if (pm4_factory == NULL) return HSA_STATUS_ERROR;
+
+  pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter();
+  if (cmdWriter == NULL) return HSA_STATUS_ERROR;
+
+  pm4_profile::DefaultCmdBuf commands;
+  aql_profile::CommandBufferMgr cmdBufMgr(profile);
+  if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;  // no room for the postfix info
+
+  if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) {
+    pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr();
+    if (pmcMgr == NULL) return HSA_STATUS_ERROR;
+
+    pmcMgr->setPmcDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size);
+    // Create and enable one counter per requested event
+    for (const hsa_ext_amd_aql_profile_event_t* p = profile->events;
+         p < profile->events + profile->event_count; ++p) {
+      pm4_profile::CounterBlock* block =
+          pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p));
+      if (block == NULL) return HSA_STATUS_ERROR;
+
+      pm4_profile::Counter* counter = block->createCounter();
+      if (counter == NULL) return HSA_STATUS_ERROR;
+
+      counter->setParameter(HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX, sizeof(uint32_t),
+                            &(p->counter_id));
+      counter->setEnable(true);
+    }
+
+    // Generate start commands
+    pmcMgr->begin(&commands, cmdWriter);
+    cmdBufMgr.setPreSize(commands.Size());  // result unchecked; checkTotalSize() below re-checks
+    // Generate stop commands
+    pmcMgr->end(&commands, cmdWriter);
+  } else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) {
+    pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr();
+    if (sqttMgr == NULL) return HSA_STATUS_ERROR;
+
+    pm4_profile::ThreadTraceConfig sqtt_config;
+    sqttMgr->InitThreadTraceConfig(&sqtt_config);
+    if (profile->parameters) {  // optional overrides of the initialized config
+      for (const hsa_ext_amd_aql_profile_parameters_t* p = profile->parameters;
+           p < (profile->parameters + profile->parameter_count); ++p) {
+        switch (p->parameter_name) {
+          case HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET:
+            sqtt_config.threadTraceTargetCu = p->value;
+            break;
+          case HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK:
+            sqtt_config.threadTraceVmIdMask = p->value;
+            break;
+          case HSA_EXT_AQL_PROFILE_PARAM_MASK:
+            sqtt_config.threadTraceMask = p->value;
+            break;
+          case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK:
+            sqtt_config.threadTraceTokenMask = p->value;
+            break;
+          case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2:
+            sqtt_config.threadTraceTokenMask2 = p->value;
+            break;
+          default:
+            return HSA_STATUS_ERROR;  // unknown parameter name
+        }
+      }
+    }
+    sqttMgr->Init(&sqtt_config);
+
+    sqttMgr->setSqttDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size);
+    // The control (status) data is placed in the command buffer postfix
+    const uint32_t status_size = sqttMgr->StatusSizeInfo();
+    void* status_ptr = cmdBufMgr.setPostfix(status_size);
+    if (status_ptr == NULL) return HSA_STATUS_ERROR;
+    // Control buffer registering
+    sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr);
+
+    // Generate start commands
+    sqttMgr->BeginSession(&commands, cmdWriter);
+    cmdBufMgr.setPreSize(commands.Size());  // result unchecked; checkTotalSize() below re-checks
+    // Generate stop commands
+    sqttMgr->StopSession(&commands, cmdWriter);
+  } else
+    return HSA_STATUS_ERROR;  // unsupported event type
+
+  if (!cmdBufMgr.checkTotalSize(commands.Size())) return HSA_STATUS_ERROR;
+  // Copy the start and the stop commands to their partitions of the command buffer
+  const aql_profile::descriptor_t pre_descr = cmdBufMgr.getPreDescr();
+  const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr();
+  memcpy(pre_descr.ptr, commands.Base(), pre_descr.size);
+  memcpy(post_descr.ptr, commands.Base() + pre_descr.size, post_descr.size);
+  // Populate start aql packet
+  aql_profile::populateAql(pre_descr.ptr, pre_descr.size, cmdWriter, aql_start_packet);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Method to populate the provided AQL packet with profiling stop commands.
+//
+// The stop commands are not generated here: the function only wraps the "post"
+// partition of the profile command buffer, as described by CommandBufferMgr,
+// into an indirect-buffer AQL packet.
+//
+// profile         - profile descriptor; must not be NULL
+// aql_stop_packet - AQL packet to populate; must not be NULL
+//
+// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL argument, HSA_STATUS_ERROR
+// when no factory/command writer is available or the command buffer is empty,
+// HSA_STATUS_SUCCESS otherwise.
+PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_stop(
+    const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_stop_packet) {
+  // Reject missing arguments up front instead of dereferencing them below
+  if (profile == NULL || aql_stop_packet == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  aql_profile::Pm4Factory* pm4_factory = aql_profile::Pm4Factory::Create(profile);
+  if (pm4_factory == NULL) return HSA_STATUS_ERROR;
+
+  pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter();
+  if (cmdWriter == NULL) return HSA_STATUS_ERROR;
+
+  aql_profile::CommandBufferMgr cmdBufMgr(profile);
+  if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;
+
+  const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr();
+  // Populate stop aql packet
+  aql_profile::populateAql(post_descr.ptr, post_descr.size, cmdWriter, aql_stop_packet);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Conversion of a profiling AQL packet to a PM4 packet (GFX8 support).
+// Not implemented: neither argument is read or written and every call
+// reports HSA_STATUS_ERROR.
+PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4(
+    const aql_profile::packet_t* aql_packet, void* pm4) {
+  const hsa_status_t not_supported = HSA_STATUS_ERROR;
+  return not_supported;
+}
+
+// Method for getting the profile info.
+//
+// profile   - profile descriptor; only used by the *_DATA attributes, where it is
+//             forwarded to hsa_ext_amd_aql_profile_iterate_data()
+// attribute - selects which piece of information is written to 'value'
+// value     - output storage; must not be NULL. It is written as a uint32_t for the
+//             size attributes and is handed to the default data callbacks as their
+//             user data for the *_DATA attributes.
+//
+// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL 'value' or an unknown
+// attribute, otherwise the status of the selected query.
+PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_get_info(
+    const hsa_ext_amd_aql_profile_profile_t* profile, hsa_ext_amd_aql_profile_info_type_t attribute,
+    void* value) {
+  // Every supported attribute writes through 'value', so a NULL output is never valid
+  if (value == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+
+  switch (attribute) {
+    case HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE:
+      *(uint32_t*)value = 0x1000;  // a current approximation as 4K is big enough
+      break;
+    case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE:
+      *(uint32_t*)value = 0x1000;  // a current approximation as 4K is big enough
+      break;
+    case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA:
+      // Clear the result before the default PMC callback is run over the samples
+      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(value)->pmc_data.result = 0;
+      status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_pmcdata_callback,
+                                                    value);
+      break;
+    case HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA:
+      status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_sqttdata_callback,
+                                                    value);
+      break;
+    default:
+      status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
+      break;
+  }
+
+  return status;
+}
+
+// Method for iterating the events output data.
+//
+// Invokes 'callback' once per output sample of the profile:
+//  - PMC:  one sample per event, or one per ShaderEngine for blocks whose control
+//          method is CntlMethodBySe / CntlMethodBySeAndInstance; samples are read
+//          as consecutive uint64_t values from the profile output buffer.
+//  - SQTT: one sample per ShaderEngine; the output buffer is split into equal
+//          per-ShaderEngine partitions and the sample size is derived from the
+//          write pointer stored in the control buffer.
+//
+// profile  - profile descriptor; must not be NULL
+// callback - invoked for every sample; must not be NULL. Returning
+//            HSA_STATUS_INFO_BREAK stops the whole iteration and is reported as
+//            success, any other non-success status stops the iteration and is
+//            returned to the caller.
+// data     - opaque user data forwarded to 'callback'
+//
+// NOTE(review): the factory and the managers obtained from it are heap allocated
+// by the factory classes and are not released here - confirm the intended ownership.
+PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_iterate_data(
+    const hsa_ext_amd_aql_profile_profile_t* profile,
+    hsa_ext_amd_aql_profile_data_callback_t callback, void* data) {
+  if (profile == NULL || callback == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  aql_profile::Pm4Factory* pm4_factory = aql_profile::Pm4Factory::Create(profile);
+  if (pm4_factory == NULL) return HSA_STATUS_ERROR;
+
+  if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) {
+    uint32_t info_size = 0;
+    void* info_data = NULL;
+    uint64_t* samples = (uint64_t*)profile->output_buffer.ptr;
+    const uint32_t sample_count = profile->output_buffer.size / sizeof(uint64_t);
+    uint32_t sample_index = 0;
+
+    pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr();
+    if (pmcMgr == NULL) return HSA_STATUS_ERROR;
+
+    // Set once the callback asks to stop (break or error) so that the events
+    // loop terminates as well, not only the per-sample loop
+    bool stop = false;
+    for (const hsa_ext_amd_aql_profile_event_t* p = profile->events;
+         !stop && p < (profile->events + profile->event_count); ++p) {
+      pm4_profile::CounterBlock* block =
+          pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p));
+      if (block == NULL) return HSA_STATUS_ERROR;
+      if (!block->getInfo(pm4_profile::GPU_BLK_INFO_CONTROL_METHOD, info_size, &info_data) ||
+          info_data == NULL) {
+        return HSA_STATUS_ERROR;
+      }
+      const pm4_profile::CntlMethod method =
+          static_cast<pm4_profile::CntlMethod>(*(static_cast<uint32_t*>(info_data)));
+      // A perfcounter data sample per ShaderEngine
+      const uint32_t block_samples_count = (method == pm4_profile::CntlMethodBySe ||
+                                            method == pm4_profile::CntlMethodBySeAndInstance)
+          ? pmcMgr->getNumSe()
+          : 1;
+      for (uint32_t i = 0; i < block_samples_count; ++i) {
+        assert(sample_index < sample_count);
+        if (sample_index >= sample_count) return HSA_STATUS_ERROR;
+
+        hsa_ext_amd_aql_profile_info_data_t sample_info;
+        sample_info.sample_id = i;
+        sample_info.pmc_data.event = *p;
+        sample_info.pmc_data.result = samples[sample_index];
+        status = callback(HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, &sample_info, data);
+        if (status != HSA_STATUS_SUCCESS) {
+          stop = true;
+          break;
+        }
+        ++sample_index;
+      }
+    }
+    // A break request is a normal way to end the iteration
+    if (status == HSA_STATUS_INFO_BREAK) status = HSA_STATUS_SUCCESS;
+  } else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) {
+    pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr();
+    if (sqttMgr == NULL) return HSA_STATUS_ERROR;
+
+    aql_profile::CommandBufferMgr cmdBufMgr(profile);
+    if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;
+
+    const uint32_t status_size = sqttMgr->StatusSizeInfo();
+    // Control buffer was allocated as the CmdBuffer postfix partition
+    void* status_ptr = cmdBufMgr.setPostfix(status_size);
+    if (status_ptr == NULL) return HSA_STATUS_ERROR;
+    // Control buffer registering
+    sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr);
+    // Validate SQTT status and normalize WRPTR
+    if (sqttMgr->Validate() == false) return HSA_STATUS_ERROR;
+
+    const uint32_t se_number = sqttMgr->getNumSe();
+    // The output buffer is divided by the ShaderEngine count below
+    if (se_number == 0) return HSA_STATUS_ERROR;
+    // Casting status pointer to SQTT control per ShaderEngine array
+    aql_profile::sqtt_ctrl_t* sqtt_ctrl = (aql_profile::sqtt_ctrl_t*)status_ptr;
+    assert(status_size == sizeof(aql_profile::sqtt_ctrl_t) * se_number);
+    if (status_size != sizeof(aql_profile::sqtt_ctrl_t) * se_number) {
+      return HSA_STATUS_ERROR;
+    }
+    // SQTT output buffer and capacity per ShaderEngine; a byte pointer is used
+    // because pointer arithmetic on void* is not standard C++
+    uint8_t* sample_ptr = (uint8_t*)profile->output_buffer.ptr;
+    const uint32_t sample_capacity = profile->output_buffer.size / se_number;
+    // The samples sizes are returned in the control buffer
+    for (uint32_t i = 0; i < se_number; ++i) {
+      // WPTR specifies the index in thread trace buffer where next token will be
+      // written by hardware. The index is incremented by size of 32 bytes.
+      uint32_t sample_size = sqtt_ctrl[i].writePtr * TT_WRITE_PTR_BLK;
+
+      hsa_ext_amd_aql_profile_info_data_t sample_info;
+      sample_info.sample_id = i;
+      sample_info.sqtt_data.ptr = sample_ptr;
+      sample_info.sqtt_data.size = sample_size;
+      status = callback(HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA, &sample_info, data);
+      if (status != HSA_STATUS_SUCCESS) break;
+
+      sample_ptr += sample_capacity;
+    }
+    // A break request is a normal way to end the iteration
+    if (status == HSA_STATUS_INFO_BREAK) status = HSA_STATUS_SUCCESS;
+  } else {
+    status = HSA_STATUS_ERROR;
+  }
+
+  return status;
+}
+}
@@ -0,0 +1,23 @@
+#ifndef _AQL_PROFILE_H_
+#define _AQL_PROFILE_H_
+
+#include "hsa_ext_amd_aql_profile.h"
+
+namespace pm4_profile {
+class CommandWriter;
+}
+
+namespace aql_profile {  // short internal aliases for the public hsa_ext_amd_aql_profile_* types
+
+typedef hsa_ext_amd_aql_profile_descriptor_t descriptor_t;  // buffer descriptor
+typedef hsa_ext_amd_aql_profile_profile_t profile_t;  // profile object passed to the API entry points
+typedef hsa_ext_amd_aql_profile_info_type_t info_type_t;  // attribute selector of the get_info query
+typedef hsa_ext_amd_aql_profile_data_callback_t data_callback_t;  // per-sample data callback
+typedef hsa_ext_amd_aql_pm4_packet_t packet_t;  // AQL packet populated by the library
+typedef hsa_ext_amd_aql_profile_event_t event_t;  // profiling event descriptor
+
+void populateAql(void* cmdBuffer, uint32_t cmdSz, pm4_profile::CommandWriter* cmdWriter,
+                 packet_t* aqlPkt);  // populates aqlPkt for the cmdSz-byte command buffer at cmdBuffer
+}
+
+#endif  // _AQL_PROFILE_H_
@@ -0,0 +1,43 @@
+#include "pm4_factory.h"
+// Commandwriter includes
+#include "gfx8_cmdwriter.h"
+// PMC includes
+#include "vi_pmu.h"
+// SQTT includes
+#include "gfx8_thread_trace.h"
+
+namespace aql_profile {
+
+// GFX8 block ID mapping table.
+// Indexed by the event block name; each entry is the VI counter block ID to which
+// the event block index is added (see getBlockId).
+uint32_t Gfx8Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
+    pm4_profile::kHsaViCounterBlockIdCb0,    pm4_profile::kHsaViCounterBlockIdCpf,
+    pm4_profile::kHsaViCounterBlockIdDb0,    pm4_profile::kHsaViCounterBlockIdGrbm,
+    pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
+    pm4_profile::kHsaViCounterBlockIdPaSc,   pm4_profile::kHsaViCounterBlockIdSpi,
+    pm4_profile::kHsaViCounterBlockIdSq,     pm4_profile::kHsaViCounterBlockIdSqGs,
+    pm4_profile::kHsaViCounterBlockIdSqVs,   pm4_profile::kHsaViCounterBlockIdSqPs,
+    pm4_profile::kHsaViCounterBlockIdSqHs,   pm4_profile::kHsaViCounterBlockIdSqCs,
+    pm4_profile::kHsaViCounterBlockIdSx,     pm4_profile::kHsaViCounterBlockIdTa0,
+    pm4_profile::kHsaViCounterBlockIdTca0,   pm4_profile::kHsaViCounterBlockIdTcc0,
+    pm4_profile::kHsaViCounterBlockIdTd0,    pm4_profile::kHsaViCounterBlockIdTcp0,
+    pm4_profile::kHsaViCounterBlockIdGds,    pm4_profile::kHsaViCounterBlockIdVgt,
+    pm4_profile::kHsaViCounterBlockIdIa,     pm4_profile::kHsaViCounterBlockIdMc,
+    pm4_profile::kHsaViCounterBlockIdTcs,    pm4_profile::kHsaViCounterBlockIdWd};
+
+// Returns a newly allocated GFX8 command writer
+pm4_profile::CommandWriter * Gfx8Factory::getCommandWriter() {
+  return new pm4_profile::gfx8::Gfx8CmdWriter(false, true);
+}
+
+// Returns a newly allocated VI perfcounter manager
+pm4_profile::Pmu * Gfx8Factory::getPmcMgr() {
+  return new pm4_profile::ViPmu();
+}
+
+// Returns a newly allocated GFX8 threadtrace manager
+pm4_profile::ThreadTrace * Gfx8Factory::getSqttMgr() {
+  return new pm4_profile::Gfx8ThreadTrace();
+}
+
+// Translates an event (block name + block index) to the VI counter block ID.
+// The block name selects the table entry, so it must be below
+// HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER; this is asserted in debug builds.
+uint32_t Gfx8Factory::getBlockId(const event_t* event) {
+  assert(static_cast<uint32_t>(event->block_name) <
+         static_cast<uint32_t>(HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER));
+  return block_id_table[event->block_name] + event->block_index;
+}
+
+} // aql_profile
@@ -0,0 +1,70 @@
+#include "pm4_factory.h"
+// Commandwriter includes
+#include "gfx8_cmdwriter.h"
+#include "gfx9_cmdwriter.h"
+// PMC includes
+#include "vi_pmu.h"
+#include "ai_pmu.h"
+// SQTT includes
+#include "gfx8_thread_trace.h"
+#include "gfx9_thread_trace.h"
+
+namespace aql_profile {
+
+// NOTE(review): this file implements Pm4Factory with an 'is_gfx9' selector, while
+// pm4_factory.h in the same change declares these methods pure virtual and has
+// no such member - confirm whether this file is still part of the build.
+
+// GFX8 block ID mapping table, indexed by the event block name
+uint32_t gfx8_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
+    pm4_profile::kHsaViCounterBlockIdCb0,    pm4_profile::kHsaViCounterBlockIdCpf,
+    pm4_profile::kHsaViCounterBlockIdDb0,    pm4_profile::kHsaViCounterBlockIdGrbm,
+    pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
+    pm4_profile::kHsaViCounterBlockIdPaSc,   pm4_profile::kHsaViCounterBlockIdSpi,
+    pm4_profile::kHsaViCounterBlockIdSq,     pm4_profile::kHsaViCounterBlockIdSqGs,
+    pm4_profile::kHsaViCounterBlockIdSqVs,   pm4_profile::kHsaViCounterBlockIdSqPs,
+    pm4_profile::kHsaViCounterBlockIdSqHs,   pm4_profile::kHsaViCounterBlockIdSqCs,
+    pm4_profile::kHsaViCounterBlockIdSx,     pm4_profile::kHsaViCounterBlockIdTa0,
+    pm4_profile::kHsaViCounterBlockIdTca0,   pm4_profile::kHsaViCounterBlockIdTcc0,
+    pm4_profile::kHsaViCounterBlockIdTd0,    pm4_profile::kHsaViCounterBlockIdTcp0,
+    pm4_profile::kHsaViCounterBlockIdGds,    pm4_profile::kHsaViCounterBlockIdVgt,
+    pm4_profile::kHsaViCounterBlockIdIa,     pm4_profile::kHsaViCounterBlockIdMc,
+    pm4_profile::kHsaViCounterBlockIdTcs,    pm4_profile::kHsaViCounterBlockIdWd};
+
+// GFX9 block ID mapping table, indexed by the event block name
+uint32_t gfx9_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
+    pm4_profile::kHsaAiCounterBlockIdCb0,    pm4_profile::kHsaAiCounterBlockIdCpf,
+    pm4_profile::kHsaAiCounterBlockIdDb0,    pm4_profile::kHsaAiCounterBlockIdGrbm,
+    pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu,
+    pm4_profile::kHsaAiCounterBlockIdPaSc,   pm4_profile::kHsaAiCounterBlockIdSpi,
+    pm4_profile::kHsaAiCounterBlockIdSq,     pm4_profile::kHsaAiCounterBlockIdSqGs,
+    pm4_profile::kHsaAiCounterBlockIdSqVs,   pm4_profile::kHsaAiCounterBlockIdSqPs,
+    pm4_profile::kHsaAiCounterBlockIdSqHs,   pm4_profile::kHsaAiCounterBlockIdSqCs,
+    pm4_profile::kHsaAiCounterBlockIdSx,     pm4_profile::kHsaAiCounterBlockIdTa0,
+    pm4_profile::kHsaAiCounterBlockIdTca0,   pm4_profile::kHsaAiCounterBlockIdTcc0,
+    pm4_profile::kHsaAiCounterBlockIdTd0,    pm4_profile::kHsaAiCounterBlockIdTcp0,
+    pm4_profile::kHsaAiCounterBlockIdGds,    pm4_profile::kHsaAiCounterBlockIdVgt,
+    pm4_profile::kHsaAiCounterBlockIdIa,     pm4_profile::kHsaAiCounterBlockIdMc,
+    pm4_profile::kHsaAiCounterBlockIdTcs,    pm4_profile::kHsaAiCounterBlockIdWd};
+
+// Returns a newly allocated command writer matching the selected GFX generation
+pm4_profile::CommandWriter * Pm4Factory::getCommandWriter() {
+  return (is_gfx9 == true) ?
+    new pm4_profile::gfx9::Gfx9CmdWriter(false, true) :
+    new pm4_profile::gfx8::Gfx8CmdWriter(false, true);
+}
+
+// Returns a newly allocated perfcounter manager matching the selected GFX generation
+pm4_profile::Pmu * Pm4Factory::getPmcMgr() {
+  return (is_gfx9 == true) ?
+    new pm4_profile::AiPmu() :
+    new pm4_profile::ViPmu();
+}
+
+// Returns a newly allocated threadtrace manager matching the selected GFX generation
+pm4_profile::ThreadTrace * Pm4Factory::getSqttMgr() {
+  return (is_gfx9 == true) ?
+    new pm4_profile::Gfx9ThreadTrace() :
+    new pm4_profile::Gfx8ThreadTrace();
+}
+
+// Translates an event (block name + block index) to the counter block ID of the
+// selected GFX generation. The block name selects the table entry, so it must be
+// below HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER; this is asserted in debug builds.
+uint32_t Pm4Factory::getBlockId(const event_t* event) {
+  assert(static_cast<uint32_t>(event->block_name) <
+         static_cast<uint32_t>(HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER));
+  const uint32_t* table = (is_gfx9 == true) ? gfx9_block_id_table : gfx8_block_id_table;
+  return table[event->block_name] + event->block_index;
+}
+
+} // aql_profile
@@ -0,0 +1,43 @@
+#include "pm4_factory.h"
+// Commandwriter includes
+#include "gfx9_cmdwriter.h"
+// PMC includes
+#include "ai_pmu.h"
+// SQTT includes
+#include "gfx9_thread_trace.h"
+
+namespace aql_profile {
+
+// GFX9 block ID mapping table.
+// Indexed by the event block name; each entry is the AI counter block ID to which
+// the event block index is added (see getBlockId).
+uint32_t Gfx9Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
+    pm4_profile::kHsaAiCounterBlockIdCb0,    pm4_profile::kHsaAiCounterBlockIdCpf,
+    pm4_profile::kHsaAiCounterBlockIdDb0,    pm4_profile::kHsaAiCounterBlockIdGrbm,
+    pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu,
+    pm4_profile::kHsaAiCounterBlockIdPaSc,   pm4_profile::kHsaAiCounterBlockIdSpi,
+    pm4_profile::kHsaAiCounterBlockIdSq,     pm4_profile::kHsaAiCounterBlockIdSqGs,
+    pm4_profile::kHsaAiCounterBlockIdSqVs,   pm4_profile::kHsaAiCounterBlockIdSqPs,
+    pm4_profile::kHsaAiCounterBlockIdSqHs,   pm4_profile::kHsaAiCounterBlockIdSqCs,
+    pm4_profile::kHsaAiCounterBlockIdSx,     pm4_profile::kHsaAiCounterBlockIdTa0,
+    pm4_profile::kHsaAiCounterBlockIdTca0,   pm4_profile::kHsaAiCounterBlockIdTcc0,
+    pm4_profile::kHsaAiCounterBlockIdTd0,    pm4_profile::kHsaAiCounterBlockIdTcp0,
+    pm4_profile::kHsaAiCounterBlockIdGds,    pm4_profile::kHsaAiCounterBlockIdVgt,
+    pm4_profile::kHsaAiCounterBlockIdIa,     pm4_profile::kHsaAiCounterBlockIdMc,
+    pm4_profile::kHsaAiCounterBlockIdTcs,    pm4_profile::kHsaAiCounterBlockIdWd};
+
+// Returns a newly allocated GFX9 command writer
+pm4_profile::CommandWriter * Gfx9Factory::getCommandWriter() {
+  return new pm4_profile::gfx9::Gfx9CmdWriter(false, true);
+}
+
+// Returns a newly allocated AI perfcounter manager
+pm4_profile::Pmu * Gfx9Factory::getPmcMgr() {
+  return new pm4_profile::AiPmu();
+}
+
+// Returns a newly allocated GFX9 threadtrace manager
+pm4_profile::ThreadTrace * Gfx9Factory::getSqttMgr() {
+  return new pm4_profile::Gfx9ThreadTrace();
+}
+
+// Translates an event (block name + block index) to the AI counter block ID.
+// The block name selects the table entry, so it must be below
+// HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER; this is asserted in debug builds.
+uint32_t Gfx9Factory::getBlockId(const event_t* event) {
+  assert(static_cast<uint32_t>(event->block_name) <
+         static_cast<uint32_t>(HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER));
+  return block_id_table[event->block_name] + event->block_index;
+}
+
+} // aql_profile
@@ -0,0 +1,62 @@
+#ifndef _PM4_FACTORY_H_
+#define _PM4_FACTORY_H_
+
+#include <string.h>
+#include <assert.h>
+
+#include "aql_profile.h"
+
+namespace pm4_profile {
+class CommandWriter;
+class Pmu;
+class ThreadTrace;
+}
+
+namespace aql_profile {
+
+// Abstract factory of the PM4 helper objects (command writer, perfcounter
+// manager, threadtrace manager) and of the block ID translation for one GFX
+// generation. Instances are obtained through Create().
+class Pm4Factory {
+ public:
+  // Instances are created with 'new' and handled through base pointers, so the
+  // destructor has to be virtual for a delete through Pm4Factory* to be valid
+  virtual ~Pm4Factory() {}
+
+  // Returns a factory matching the agent of 'profile', or NULL when the profile
+  // is NULL, the agent name cannot be queried or the agent is not supported
+  static Pm4Factory* Create(const hsa_ext_amd_aql_profile_profile_t* profile);
+  virtual pm4_profile::CommandWriter* getCommandWriter() = 0;
+  virtual pm4_profile::Pmu* getPmcMgr() = 0;
+  virtual pm4_profile::ThreadTrace* getSqttMgr() = 0;
+  // Translates an event (block name + block index) to the PM4 counter block ID
+  virtual uint32_t getBlockId(const event_t* event) = 0;
+};
+
+// Factory used for agents whose name starts with "gfx8"
+class Gfx8Factory : public Pm4Factory {
+ public:
+  pm4_profile::CommandWriter* getCommandWriter();
+  pm4_profile::Pmu* getPmcMgr();
+  pm4_profile::ThreadTrace* getSqttMgr();
+  uint32_t getBlockId(const event_t* event);
+
+ private:
+  // Block ID mapping table, indexed by the event block name
+  static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER];
+};
+
+// Factory used for agents whose name starts with "gfx9"
+class Gfx9Factory : public Pm4Factory {
+ public:
+  pm4_profile::CommandWriter* getCommandWriter();
+  pm4_profile::Pmu* getPmcMgr();
+  pm4_profile::ThreadTrace* getSqttMgr();
+  uint32_t getBlockId(const event_t* event);
+
+ private:
+  // Block ID mapping table, indexed by the event block name
+  static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER];
+};
+
+inline Pm4Factory* Pm4Factory::Create(const hsa_ext_amd_aql_profile_profile_t* profile) {
+  if (profile == NULL) return NULL;
+
+  // Zero-initialized so that the comparisons below never read indeterminate bytes
+  char agent_name[64] = {0};
+  const hsa_status_t status =
+      hsa_agent_get_info(profile->agent, HSA_AGENT_INFO_NAME, agent_name);
+  // Without a valid agent name there is nothing to select a factory by
+  if (status != HSA_STATUS_SUCCESS) return NULL;
+
+  Pm4Factory* instance = NULL;
+  if (strncmp(agent_name, "gfx8", 4) == 0) {
+    instance = new Gfx8Factory();
+  } else if (strncmp(agent_name, "gfx9", 4) == 0) {
+    instance = new Gfx9Factory();
+  }
+  return instance;
+}
+
+}  // aql_profile
+
+#endif  // _PM4_FACTORY_H_
@@ -0,0 +1,41 @@
+#include <iostream>
+#include <iomanip>
+
+#include "aql_profile.h"
+#include "cmdwriter.h"
+#include "amd_aql_pm4_ib_packet.h"
+
+namespace aql_profile {
+
+// Populates 'aql_packet' as an AQL PM4 IB packet carrying the indirect buffer
+// command stored at 'ib_packet', then dumps the packet dwords to std::clog.
+//
+// ib_packet  - indirect buffer command; exactly four dwords are read
+// aql_packet - AQL packet to populate; header and completion signal are not set
+void populateAql(uint32_t* ib_packet, packet_t* aql_packet) {
+  // Populate relevant fields of Aql pkt
+  // Size of IB pkt is four DWords
+  // Header and completion signal are not set
+  amd_aql_pm4_ib_packet_t* aql_pm4_ib = reinterpret_cast<amd_aql_pm4_ib_packet_t*>(aql_packet);
+  aql_pm4_ib->pm4_ib_format = AMD_AQL_PM4_IB_FORMAT;
+  aql_pm4_ib->pm4_ib_command[0] = ib_packet[0];
+  aql_pm4_ib->pm4_ib_command[1] = ib_packet[1];
+  aql_pm4_ib->pm4_ib_command[2] = ib_packet[2];
+  aql_pm4_ib->pm4_ib_command[3] = ib_packet[3];
+  aql_pm4_ib->dw_count_remain = AMD_AQL_PM4_IB_DW_COUNT_REMAIN;
+  for (int i = 0; i < AMD_AQL_PM4_IB_RESERVED_COUNT; ++i) {
+    aql_pm4_ib->reserved[i] = 0;
+  }
+
+  // Dump of the packet dwords. The stream formatting state is saved and restored
+  // so that the hex/left/fill settings used here do not leak into later output.
+  const std::ios_base::fmtflags saved_flags = std::clog.flags();
+  const char saved_fill = std::clog.fill();
+
+  uint32_t* words = (uint32_t*)aql_packet;
+  std::clog << std::setw(40) << std::left << "AQL 'IB' size(16)"
+            << ":";
+  // Switch back to right adjustment: with left adjustment still active the zero
+  // fill would be appended after the digits and change how the value reads
+  std::clog << std::right << std::hex << std::setfill('0');
+  for (int idx = 0; idx < 16; idx++) {
+    std::clog << " " << std::setw(8) << words[idx];
+  }
+  std::clog << std::setfill(' ') << std::endl;
+
+  std::clog.flags(saved_flags);
+  std::clog.fill(saved_fill);
+}
+
+// Builds an indirect buffer command referencing the 'cmd_size' bytes of commands
+// at 'cmd_buffer' using 'cmd_writer' and populates 'ppt_packet' with it.
+void populateAql(void* cmd_buffer, uint32_t cmd_size,
+                 pm4_profile::CommandWriter* cmd_writer, packet_t* ppt_packet) {
+  pm4_profile::DefaultCmdBuf ib_buffer;
+  cmd_writer->BuildIndirectBufferCmd(&ib_buffer, cmd_buffer, (size_t)cmd_size);
+  // Four dwords of the IB command are copied into the AQL packet
+  assert(ib_buffer.Size() >= 4 * sizeof(uint32_t));
+  uint32_t* ib_cmds = (uint32_t*)ib_buffer.Base();
+  populateAql(ib_cmds, ppt_packet);
+}
+}
@@ -0,0 +1,15 @@
+#
+# Rocr Cmdwriter sources: the GFX8 and GFX9 command writers
+#
+set ( CmdWriterSrcs
+      gfx8_cmdwriter.cpp
+      gfx9_cmdwriter.cpp )
+
+#
+# Header search path(s), taken from the ROCR_INC_DIR environment variable
+#
+include_directories ( $ENV{ROCR_INC_DIR} )
+
+#
+# Cmdwriter is built as a static library; the target name comes from CMDWRITER_LIB
+#
+add_library ( ${CMDWRITER_LIB} STATIC ${CmdWriterSrcs} )
@@ -0,0 +1,515 @@
+// cmdwriter.h
+// Header file for CommandWriter and CmdBuf interfaces
+
+#ifndef _CMDWRITER_H_
+#define _CMDWRITER_H_
+
+#include <vector>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
+namespace pm4_profile {
+
+// User defined options for flushing caches; every option starts out as false
+typedef struct FlushCacheOptions_ {
+  bool l1, l2;
+  bool icache, kcache;
+  bool l1_vol, l2_vol, kcache_vol;
+  FlushCacheOptions_()
+      : l1(false),
+        l2(false),
+        icache(false),
+        kcache(false),
+        l1_vol(false),
+        l2_vol(false),
+        kcache_vol(false) {}
+} FlushCacheOptions;
+
+/// @brief Interface to build a list of Gpu commands into a byte
+/// buffer. Classes implementing this interface are used to translate
+/// various Gpu commands into a byte stream.
+///
+/// @note: The Api does not require implementations to be thread safe.
+/// Users are therefore required to serialize access to an instance.
+class CmdBuf {
+ public:
+  /// Default destructor.
+  virtual ~CmdBuf() {}
+
+  /// @brief Resets the command buffer object. All of the commands
+  /// previously packed into the buffer are lost i.e. the number of
+  /// bytes in the command stream is reset.
+  ///
+  /// @note: This convenience Api is provided to allow reuse of the
+  /// command buffer object.
+  ///
+  /// @return bool true if successful, false otherwise.
+  virtual bool Reset(void) = 0;
+
+  /// @brief Appends the input command to a buffer that can
+  /// be queried for its size and other properties. The append
+  /// does not verify the contents.
+  ///
+  /// @param cmd Buffer containing one or more instances of Gpu commands
+  ///
+  /// @param size Size of the Gpu commands in bytes.
+  ///
+  /// @return void
+  virtual void AppendCommand(const void* cmd, uint32_t size) = 0;
+
+  /// @brief Returns the total size (in bytes) of the accumulated commands.
+  ///
+  /// @return size_t Size of Gpu commands in bytes
+  virtual size_t Size() const = 0;
+
+ private:
+  /// Indexes the command buffer by dwords.  Allows accessing constants
+  /// in an assembled command buffer.
+  virtual uint32_t& operator[](size_t index) = 0;
+
+  friend class CommandWriter;  // grants CommandWriter access to the private operator[]
+};
+
+/// @brief Implements the interface CmdBuf and thus can be used to
+/// translate various Gpu commands into a byte stream. Commands are
+/// stored in a vector of dwords.
+///
+/// @note: The Api does not require implementations to be thread safe.
+/// Users are therefore required to serialize access to an instance.
+class DefaultCmdBuf : public CmdBuf {
+ public:
+  /// @brief Append the command into the underlying buffer. Storage
+  /// grows in whole dwords: a size which is not a multiple of the
+  /// dword size is rounded up and the extra bytes are zero.
+  ///
+  /// @param cmd Buffer containing one or more instances of Gpu commands
+  ///
+  /// @param size Size of Gpu command(s) in bytes
+  ///
+  /// @return void
+  virtual void AppendCommand(const void* cmd, uint32_t size) {
+    // Nothing to copy; also avoids asking for a pointer into an ungrown buffer
+    if (cmd == NULL || size == 0) return;
+    memcpy(ReserveCmdbufSpace(size), cmd, size);
+  }
+
+  /// @brief Resets the Gpu command buffer
+  bool Reset() {
+    cmdbuf_.clear();
+    return true;
+  }
+
+  /// Size of Gpu commands in bytes in the underlying buffer
+  size_t Size() const { return cmdbuf_.size() * sizeof(StorageType); }
+
+  /// Address of the start of accumulated commands, NULL while the
+  /// buffer is empty.
+  const void* Base() const {
+    if (cmdbuf_.empty()) return NULL;
+    return &cmdbuf_[0];
+  }
+
+ private:
+  /// @brief Returns reference to the value of Gpu command buffer
+  /// at specified index
+  ///
+  /// @param index Specifies the buffer index whose value is needed
+  ///
+  /// @return uint32_t & Reference of the value being returned
+  uint32_t& operator[](size_t index) { return cmdbuf_[index]; }
+
+  /// @brief Increase Gpu command buffer by specified size
+  ///
+  /// @param size Size in bytes by which command buffer should
+  /// be resized. Rounded up to a whole number of dwords so that the
+  /// returned region always holds at least 'size' bytes.
+  ///
+  /// @return void * Pointer into the buffer where the next
+  /// command can be written, NULL when 'size' is zero
+  void* ReserveCmdbufSpace(std::size_t size) {
+    const size_t len = cmdbuf_.size();
+    const size_t dwords = (size + sizeof(StorageType) - 1) / sizeof(StorageType);
+    if (dwords == 0) return NULL;
+    cmdbuf_.resize(len + dwords);
+    return &cmdbuf_[len];
+  }
+
+  /// @brief Defines Gpu command buffer as a vector of StorageType
+  typedef uint32_t StorageType;
+  std::vector<StorageType> cmdbuf_;
+};
+
+/// @brief Specifies the public interface of CommandWriter for use by
+/// clients to build Gpu command streams.
+class CommandWriter {
+ public:
+  /// @brief These enums specify the operation to perform in the packet
+  /// generated by BuildAtomicPacket. The commenting for each enum uses
+  /// the arguments to the function BuildAtomicPacket to express the
+  /// resulting operation.
+  enum AtomicType {
+
+    /// *destination = *destination + 1;
+    kAtomicTypeIncrement,
+
+    /// *destination = *destination - 1;
+    kAtomicTypeDecrement,
+
+    /// if (*destination == compare) *destination = value;
+    kAtomicTypeCompareAndSwap,
+
+    /// while (*destination != compare);
+    /// *destination = value;
+    kAtomicTypeBlockingCompareAndSwap,
+
+    /// *destination = *destination + value;
+    kAtomicAdd,
+
+    /// *destination = *destination - value;
+    kAtomicSubtract,
+
+    /// *destination = value;
+    kAtomicSwap
+  };
+
+  /// @brief These enums specify the VGT EVENT TYPE to issue and wait for.
+  /// Command Processor (CP) uses these events to communicate with SPI to
+  /// learn about outstanding waves and determine kernel completion.
+  enum VgtEventType {
+
+    /// Enable Performance Counters
+    kPerfCntrsStart,
+
+    /// Disable Performance Counters
+    kPerfCntrsStop,
+
+    /// Read Performance Counters
+    kPerfCntrsSample,
+
+    /// Enable a Thread Trace session
+    kThrdTraceStart,
+
+    /// Disable a Thread Trace session
+    kThrdTraceStop,
+
+    /// Enable flushing of thread trace buffers
+    kThrdTraceFlush,
+
+    /// Enables resetting of BASE register to its last value
+    /// including flushing of thread trace buffers. This could
+    /// be used to toggle between two buffers so as to allow
+    /// collection of large token data
+    kThrdTraceFinish
+  };
+
+  /// @brief Returns the Dword that encodes a No-Op for the CP
+  ///
+  /// @return uint32_t Dword that can be used to populate a Pm4
+  /// command queue.
+  ///
+  virtual uint32_t GetNoOpCmd() = 0;
+
+  /// @brief Build an instance of Barrier command and copy it into
+  /// the input command buffer
+  ///
+  /// @param cmdbuf Pointer to command buffer which is updated with
+  /// an instance of Barrier command.
+  ///
+  /// @return void
+  virtual void BuildBarrierCommand(CmdBuf* cmdbuf) = 0;
+
+  /// @brief Builds the Gpu command to reference indirectly a stream
+  /// of other Gpu commands. The launch command is then copied into
+  /// the command buffer parameter.
+  ///
+  /// @param cmdBuf command buffer to be appended with launch command
+  ///
+  /// @param cmd_addr Address of command buffer carrying command stream
+  ///
+  /// @param cmd_size Size of dispatch command stream in bytes
+  ///
+  /// @return void
+  virtual void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
+                                      std::size_t cmd_size) = 0;
+
+  /// @brief Build a Gpu command that triggers an event whose type
+  /// is specified by input parameter. It then copies it into the input
+  /// command buffer
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param event Id of Event to be triggered by Gpu
+  ///
+  /// @return void
+  virtual void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) = 0;
+
+  /// @brief Builds a Gpu command to wait until condition is realized
+  ///
+  /// @param cmdbuf command buffer to be appended with launch command
+  ///
+  /// @param mem_space if the address is in memory or is a register offset
+  ///
+  /// @param wait_addr address to wait on
+  ///
+  /// @param func_eq true means equal, false means not-equal
+  ///
+  /// @param mask_val Mask to apply on value from addr in comparison
+  ///
+  /// @param wait_val value to apply for the func given above
+  virtual void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
+                                      bool func_eq, uint32_t mask_val, uint32_t wait_val) = 0;
+
+  virtual void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) = 0;
+
+  /// @brief Build CP command to program a Gpu register
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  /// @param addr Register to be programmed
+  /// @param value Value to write into register
+  ///
+  /// @return void
+  virtual void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;
+
+  /// @brief Build and copy WriteShReg command
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param addr Offset of the register
+  ///
+  /// @param value Value to write into register
+  ///
+  /// @return void
+  virtual void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;
+
+  /// @brief Builds a Gpu command to flush Gpu caches and write a
+  /// user defined value at a configurable location that is Gpu
+  /// accessible.
+  ///
+  /// @param cmdBuf Command buffer to be appended with bottom of pipe
+  /// notification command
+  ///
+  /// @param write_addr Address into which Gpu should write
+  ///
+  /// @param write_val Value to write into user provided address
+  ///
+  /// @param interrupt True if Gpu should raise an interrupt upon writing
+  /// the user value
+  ///
+  /// @return void
+  virtual void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
+                                 bool intrpt) = 0;
+
+
+  /// @brief Build a Gpu command that copies data from a specified
+  /// source to destination
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param reg_to_mem flag to indicate if values are being read from a
+  /// Register or a memory location
+  ///
+  /// @param src_addr_lo Low 32-bit Source address of the data to read from
+  ///
+  /// @param src_addr_hi High 32-bit Source address of the data to read from
+  ///
+  /// @param dst_addr Destination address for the data to be written to
+  ///
+  /// @param size Size of the data to be written
+  ///
+  /// @param wait True if Gpu command should confirm the write
+  /// operation has completed successfully
+  ///
+  /// @return void
+  ///
+  /// @NOTE Change interface to use void* for Src and void* for Dest
+  virtual void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
+                                   uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
+                                   bool wait) = 0;
+
+  /// @brief Build and copy a WaitIdle Gpu command into command buffer
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @return void
+  virtual void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) = 0;
+
+  // Will issue a VGT event including a cache flush later on
+  virtual void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) = 0;
+
+  /// @brief Build and copy a WriteRegister Gpu command into command buffer
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param addr Register into which to write
+  ///
+  /// @param value Value to write into register
+  ///
+  /// @return void
+  virtual void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;
+
+  /// @brief Build and copy a Gpu command to query the status of a
+  /// WriteEvent into command buffer
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param event Id of Event whose status is to be queried
+  ///
+  /// @param addr Address to update the status of WriteEvent operation
+  ///
+  /// @return void
+  virtual void BuildWriteEventQueryPacket(CmdBuf* cmdBuf, uint32_t event, uint32_t* addr) = 0;
+
+  /// @brief Builds and copies a Gpu command to perform user specified
+  /// operation atomically. The various atomic operations on integers
+  /// that are supported include: increment, decrement, add, subtract,
+  /// compare-and-swap and swap. The operation to perform is specified
+  /// by the enum AtomicType.
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param atomic_op Id of the atomic operation to perform
+  ///
+  /// @param addr Pointer to the memory block where atomic operation
+  /// would be performed
+  ///
+  /// @param value New value to write if atomic operation can be performed
+  ///
+  /// @param compare Value to compare if atomic operation is a compare-and-swap
+  ///
+  /// @return void
+  virtual void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
+                                 uint32_t value = 0, uint32_t compare = 0) = 0;
+
+  /// @brief Builds and copies a Gpu command to perform a user specified
+  /// operation atomically on a 64-bit integer. The various atomic
+  /// operations that are supported include: increment, decrement, add,
+  /// subtract, compare-and-swap and swap. The operation to perform is
+  /// specified by the enum AtomicType.
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param atomic_op Id of the atomic operation to perform
+  ///
+  /// @param addr Pointer to the 64-bit memory block where atomic operation
+  /// would be performed
+  ///
+  /// @param value New 64-bit value to write if atomic operation can be
+  /// performed (defaults to 0)
+  ///
+  /// @param compare 64-bit value to compare if atomic operation is a
+  /// compare-and-swap (defaults to 0)
+  ///
+  /// @return void
+  virtual void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
+                                   uint64_t value = 0, uint64_t compare = 0) = 0;
+
+  /// @brief Returns the size of an atomic packet
+  ///
+  /// @note NOTE(review): the unit is not stated by this interface; the
+  /// Gfx8 implementation appears to return a count of 32-bit dwords
+  /// rather than bytes - confirm against callers.
+  ///
+  /// @return size_t Size of atomic packet
+  virtual size_t SizeOfAtomicPacket() const = 0;
+
+  /// @brief Build and copy a Gpu command that will tell command processor
+  /// to conditionally execute or skip the next sequence of packets.
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param signal Pointer to an integer that tells the command processor
+  /// whether to skip or execute the next block of packets. If it is set
+  /// to 0 the following packets will be skipped, otherwise the following
+  /// packets will be executed
+  ///
+  /// @param count The number of dwords in the following packet stream
+  /// that will be conditionally executed
+  ///
+  /// @return void
+  virtual void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) = 0;
+
+  /// @brief Builds a CP command to write a user specified 32-bit value
+  /// at a user specified address. The command is then copied
+  /// into the command buffer for submission to a device queue.
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param write_addr Address into which CP will write the user
+  /// specified value
+  ///
+  /// @param write_value 32-bit value to write into the user specified
+  /// address
+  ///
+  /// @return void
+  virtual void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
+                                     uint32_t write_value) = 0;
+
+  /// @brief Builds a CP command to write a user specified 64-bit value
+  /// at a user specified address. The command is then copied
+  /// into the command buffer for submission to a device queue.
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  ///
+  /// @param write_addr Address into which CP will write the user
+  /// specified value
+  ///
+  /// @param write_value 64-bit value to write into the user specified
+  /// address
+  ///
+  /// @return void
+  virtual void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
+                                       uint64_t write_value) = 0;
+
+  /// @brief Writes into input buffer Gpu commands to flush its cache. It
+  /// is necessary that the buffer provided for flush commands is large
+  /// enough to accommodate the full set of commands. It should be at
+  /// least 512 bytes.
+  ///
+  /// @param cmdbuf Buffer to write commands to.
+  /// @param options Flush options passed through to the implementation
+  /// (their meaning is defined by FlushCacheOptions, not visible here).
+  /// @param writeAddr Registered address into which GPU should write
+  /// a user provided value upon executing the flush commands.
+  /// @param writeVal User provided value written by GPU at user provided
+  /// address, upon executing the flush commands.
+  ///
+  /// @return void
+  virtual void BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options, uint32_t* writeAddr,
+                                  uint32_t writeVal) = 0;
+
+  /// @brief Builds Gpu command to copy data from source to destination
+  /// buffer using DMA engine.
+  ///
+  /// @param cmdbuf Buffer updated with Gpu copy command
+  /// @param srcAddrLo Address of source buffer
+  /// @param dstAddr Address of destination buffer
+  /// @param copySize Size of data to copy in bytes
+  /// @param waitForCompletion if command should wait for copying to complete
+  ///
+  /// @return void
+  virtual void BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddrLo, uint32_t* dstAddr,
+                                  uint32_t copySize, bool waitForCompletion) = 0;
+
+  /// @brief Virtual destructor so that concrete command writers can be
+  /// destroyed through a CommandWriter pointer. The base class itself
+  /// has nothing to release here.
+  virtual ~CommandWriter() {}
+
+ protected:
+  /// @brief Return a reference to the value stored at a given index of
+  /// the command buffer
+  ///
+  /// @param cmdbuf Pointer to the command buffer to index
+  ///
+  /// @param index Index passed to the buffer's subscript operator
+  ///
+  /// @return uint32_t& Reference to the indexed value
+  uint32_t& IndexBuffer(CmdBuf* cmdbuf, uint32_t index) {
+    CmdBuf& buffer = *cmdbuf;
+    return buffer[index];
+  }
+};
+
+/// @brief Rounds a value up to a multiple of the rounding factor
+///
+/// @param u Value to round up
+///
+/// @param r Rounding factor. The mask arithmetic used here produces a
+/// multiple of r only when r is a power of two.
+///
+/// @return uint32_t The value u rounded up per the rounding factor
+inline uint32_t RoundUp(uint32_t u, uint32_t r) {
+  const uint32_t mask = r - 1;
+  return (u + mask) & ~mask;
+}
+
+/// @brief Returns the lower 32-bits of a 64-bit value
+///
+/// @param u 64-bit value to split
+///
+/// @return uint32_t Bits [31:0] of u
+inline uint32_t Low32(uint64_t u) {
+  return static_cast<uint32_t>(u & 0xFFFFFFFFULL);
+}
+
+/// @brief Returns the upper 32-bits of a 64-bit value
+///
+/// @param u 64-bit value to split
+///
+/// @return uint32_t Bits [63:32] of u
+inline uint32_t High32(uint64_t u) {
+  return static_cast<uint32_t>(u >> 32);
+}
+
+/// @brief Returns bits [39:8] of an address, i.e. the low 32 bits of
+/// the address once it has been shifted right by 8
+///
+/// @param p Address to encode. Asserted (in debug builds) to have only
+/// bits [47:8] set, i.e. to fit in 48 bits with the low 8 bits clear.
+///
+/// @return uint32_t Bits [39:8] of the address
+inline uint32_t Ptr48Low32(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  assert((address & 0xFFFFFFFFFF00ULL) == address);
+  // Truncation to 32 bits drops everything above address bit 39
+  return static_cast<uint32_t>(static_cast<uint64_t>(address) >> 8);
+}
+
+/// @brief Returns the upper 8-bits of a 48-bit address
+///
+/// @param p Address to encode
+///
+/// @return uint8_t Bits [47:40] of the address
+inline uint8_t Ptr48High8(const void* p) {
+  const uint64_t address = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p));
+  // Truncation to 8 bits drops everything above address bit 47
+  return static_cast<uint8_t>(address >> 40);
+}
+
+/// @brief Returns the lower 32-bits of an address
+///
+/// @param p Address to split
+///
+/// @return uint32_t Bits [31:0] of the address
+inline uint32_t PtrLow32(const void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>(address);
+}
+
+/// @brief Returns the upper 32-bits of an address
+///
+/// When HSA_LARGE_MODEL is defined, pointers are required (at compile
+/// time) to be 8 bytes wide and bits [63:32] are returned. Otherwise
+/// pointers are required to be 4 bytes wide and the result is 0.
+///
+/// @param p Address to split
+///
+/// @return uint32_t Upper half of the address, or 0 in the small model
+inline uint32_t PtrHigh32(const void* p) {
+#ifdef HSA_LARGE_MODEL
+  static_assert(sizeof(void*) == 8, "HSA_LARGE_MODEL is not set properly here!");
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>(address >> 32);
+#else
+  static_assert(sizeof(void*) == 4, "HSA_LARGE_MODEL is not set properly here!");
+  (void)p;
+  return 0;
+#endif
+}
+
+}  // pm4_profile
+
+#endif  // _CMDWRITER_H_
@@ -0,0 +1,161 @@
+#ifndef _GFX8_CMDS_H_
+#define _GFX8_CMDS_H_
+
+#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
+#include "gfxip/gfx8/si_ci_vi_merged_mask.h"
+#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
+#include "gfxip/gfx8/si_ci_vi_merged_registers.h"
+#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
+#include "gfxip/gfx8/si_ci_vi_merged_pm4_it_opcodes.h"
+#include "gfxip/gfx8/si_pm4defs.h"
+
+namespace pm4_profile {
+
+namespace gfx8 {
+
+// Desc: Defines the Gpu command to dispatch a kernel. It embeds
+// various Gpu hardware specific data structures for initialization
+// and configuration before a dispatch begins to run.
+// The template is a sequence of four register-write groups, each
+// starting with its own PM4CMDSETDATA header, followed by the
+// dispatch packet itself. Member order defines the packet layout.
+struct DispatchTemplate {
+  // Desc: Structure used to initialize the group dimensions
+  // of a kernel dispatch and if performance counters are enabled
+  struct DispatchDimensionRegs {
+    PM4CMDSETDATA cmd_set_data;
+    regCOMPUTE_START_X compute_start_x;
+    regCOMPUTE_START_Y compute_start_y;
+    regCOMPUTE_START_Z compute_start_z;
+    regCOMPUTE_NUM_THREAD_X compute_num_thread_x;
+    regCOMPUTE_NUM_THREAD_Y compute_num_thread_y;
+    regCOMPUTE_NUM_THREAD_Z compute_num_thread_z;
+    regCOMPUTE_PIPELINESTAT_ENABLE__CI__VI compute_pipelinestat_enable;
+  } dimension_regs;
+
+  // Desc: Structure used to initialize kernel Isa, trap
+  // handler, trap handler buffer, number of SGPR and VGPR
+  // registers needed, amount of Group memory and LDS needed,
+  // Rounding mode for Floating point numbers, etc.
+  struct DispatchProgramRegs {
+    PM4CMDSETDATA cmd_set_data;
+    regCOMPUTE_PGM_LO compute_pgm_lo;
+    regCOMPUTE_PGM_HI compute_pgm_hi;
+    regCOMPUTE_TBA_LO compute_tba_lo;
+    regCOMPUTE_TBA_HI compute_tba_hi;
+    regCOMPUTE_TMA_LO compute_tma_lo;
+    regCOMPUTE_TMA_HI compute_tma_hi;
+    regCOMPUTE_PGM_RSRC1 compute_pgm_rsrc1;
+    regCOMPUTE_PGM_RSRC2 compute_pgm_rsrc2;
+  } program_regs;
+
+  // Desc: Structure used to initialize parameters related to
+  // thread management i.e. number of waves to issue and number
+  // of Compute Units to use
+  struct DispatchResourceRegs {
+    PM4CMDSETDATA cmd_set_data;
+    regCOMPUTE_RESOURCE_LIMITS compute_resource_limits;
+    regCOMPUTE_STATIC_THREAD_MGMT_SE0 compute_static_thread_mgmt_se0;
+    regCOMPUTE_STATIC_THREAD_MGMT_SE1 compute_static_thread_mgmt_se1;
+    regCOMPUTE_TMPRING_SIZE compute_tmpring_size;
+    regCOMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI compute_static_thread_mgmt_se2;
+    regCOMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI compute_static_thread_mgmt_se3;
+    regCOMPUTE_RESTART_X__CI__VI compute_restart_x;
+    regCOMPUTE_RESTART_Y__CI__VI compute_restart_y;
+    regCOMPUTE_RESTART_Z__CI__VI compute_restart_z;
+    regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI compute_thread_trace_enable;
+  } resource_regs;
+
+  // Desc: Structure used to pass handles of the Aql dispatch
+  // packet, Aql queue, Kernel argument address block, Scratch
+  // buffer. Holds 16 user data dwords after the header.
+  struct DispatchComputeUserDataRegs {
+    PM4CMDSETDATA cmd_set_data;
+    uint32_t compute_user_data[16];
+  } compute_user_data_regs;
+
+  // Desc: Structure used to configure Cache flush policy
+  // and dimensions of total work size
+  PM4CMDDISPATCHDIRECT dispatch_direct;
+};
+
+// Desc: Structure used to issue a Gpu Barrier command. Wraps a
+// single PM4 EVENT_WRITE packet.
+struct BarrierTemplate {
+  PM4CMDEVENTWRITE event_write;
+};
+
+// Desc: Structure used to configure the flushing
+// of various caches - instruction, constants, L1
+// and L2. Wraps a single PM4 ACQUIRE_MEM packet.
+struct AcquireMemTemplate {
+  PM4CMDACQUIREMEM acquire_mem;
+};
+
+// Desc: Structure used to reference another Gpu command
+// indirectly. Generally used to reference a list of Gpu
+// commands (dispatch cmds) indirectly. Wraps a single PM4
+// INDIRECT_BUFFER packet.
+struct LaunchTemplate {
+  PM4CMDINDIRECTBUFFER indirect_buffer;
+};
+
+// Desc: Structure used to determine the end of
+// a kernel including cache flushes and writing to
+// a user configurable memory location. Wraps a single
+// PM4 RELEASE_MEM packet.
+struct EndofKernelNotifyTemplate {
+  PM4CMDRELEASEMEM release_mem;
+};
+
+// Desc: Structure used to perform various atomic
+// operations - add, subtract, increment, etc. Wraps a
+// single PM4 atomic packet.
+struct AtomicTemplate {
+  PM4CMDATOMIC atomic;
+};
+
+// Desc: Structure used to make the execution of a Gpu
+// command stream conditional. Wraps a single PM4
+// conditional-execute packet.
+struct ConditionalExecuteTemplate {
+  PM4CMDCONDEXEC_CI conditional;
+};
+
+// Desc: PM4 command to write a 32-bit value into a memory
+// location accessible to Gpu. The WRITE_DATA packet is
+// immediately followed by the 32-bit payload.
+struct WriteDataTemplate {
+  PM4CMDWRITEDATA write_data;
+  uint32_t write_data_value;
+};
+
+// Desc: PM4 command to write a 64-bit value into a memory
+// location accessible to Gpu. The WRITE_DATA packet is
+// followed by the 64-bit payload.
+// NOTE(review): the uint64_t member is 8 byte aligned on typical
+// ABIs, so padding could appear after write_data if its size is not
+// a multiple of 8 - confirm sizeof() of this struct matches the
+// intended dword count, since packet headers are derived from it.
+struct WriteData64Template {
+  PM4CMDWRITEDATA write_data;
+  uint64_t write_data_value;
+};
+
+// Desc: PM4 command to wait for a certain event before proceeding
+// to process another command on the queue. Wraps a single PM4
+// WAIT_REG_MEM packet.
+struct WaitRegMemTemplate {
+  PM4CMDWAITREGMEM wait_reg_mem;
+};
+
+// Desc: Initializer for commands that set shader registers.
+// Writes an IT_SET_SH_REG type-3 header sized from sizeof(T) (in
+// dwords) into pm4->cmd_set_data and stores the register offset
+// relative to PERSISTENT_SPACE_START.
+template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
+  const uint32_t dword_count = sizeof(T) / sizeof(uint32_t);
+  const uint32_t reg_offset = reg_addr - PERSISTENT_SPACE_START;
+
+  pm4->cmd_set_data.header.u32All = PM4_TYPE_3_HDR(IT_SET_SH_REG, dword_count, ShaderCompute, 0);
+  pm4->cmd_set_data.regOffset = reg_offset;
+}
+
+// Desc: Initializer for various Gpu command headers.
+// Writes a type-3 header for the given opcode, sized from sizeof(T)
+// (in dwords), into pm4->header.
+template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) {
+  const uint32_t dword_count = sizeof(T) / sizeof(uint32_t);
+  pm4->header.u32All = PM4_TYPE_3_HDR(op_code, dword_count, ShaderCompute, 0);
+}
+
+// Desc: Initializer for commands that set configuration registers.
+// Writes an IT_SET_CONFIG_REG type-3 header sized from sizeof(T) (in
+// dwords) into pm4->cmd_set_data and stores the register offset
+// relative to CONFIG_SPACE_START.
+template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) {
+  const uint32_t dword_count = sizeof(T) / sizeof(uint32_t);
+  const uint32_t reg_offset = reg_addr - CONFIG_SPACE_START;
+
+  pm4->cmd_set_data.header.u32All =
+      PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, dword_count, ShaderCompute, 0);
+  pm4->cmd_set_data.regOffset = reg_offset;
+}
+
+
+}  // gfx8
+
+}  // pm4_profile
+
+#endif  //  _GFX8_CMDS_H_
@@ -0,0 +1,768 @@
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+
+#include "gfx8_cmdwriter.h"
+#include "gfxip/gfx8/gfx8_utils.h"
+
+// RELEASE MEM DST SEL Definitions - values for the dstSel field of a
+// RELEASE_MEM packet (destination of the end-of-pipe write)
+#define RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER 0
+#define RELEASE_MEM_DST_SEL_TC_L2 1
+
+// RELEASE MEM CACHE POLICY Definitions - values for the cachePolicy
+// field of a RELEASE_MEM packet
+#define RELEASE_MEM_CACHE_POLICY_LRU 0
+#define RELEASE_MEM_CACHE_POLICY_STREAM 1
+#define RELEASE_MEM_CACHE_POLICY_BYPASS 2
+
+// Debug helper: dumps a PM4 packet to std::clog as a row of 32-bit
+// words printed in zero padded hex, prefixed by the caller supplied
+// name and the packet size in dwords. Does nothing when NDEBUG is
+// defined.
+//
+// command - packet to dump; its size must be a multiple of 4 bytes for
+//           every byte to be shown
+// name    - label printed in front of the dump
+//
+// The format flags and fill character of std::clog are saved on entry
+// and restored on exit so that the hex / left settings used here do not
+// leak into unrelated log output.
+template <class T>
+static void PrintPm4Packet(const T& command, const char* name) {
+#if ! defined(NDEBUG)
+  // View the packet as raw dwords without casting away const
+  const uint32_t* cmd = reinterpret_cast<const uint32_t*>(&command);
+  const uint32_t size = sizeof(command) / sizeof(uint32_t);
+
+  const std::ios_base::fmtflags saved_flags = std::clog.flags();
+  const char saved_fill = std::clog.fill();
+
+  std::ostringstream oss;
+  oss << "'" << name << "' size(" << std::dec << size << ")";
+  std::clog << std::setw(40) << std::left << oss.str() << ":";
+  for (uint32_t idx = 0; idx < size; idx++) {
+    // std::left is sticky: switch back to right adjustment so the zero
+    // padding goes in front of the digits (0x10 -> 00000010)
+    std::clog << " " << std::hex << std::right << std::setw(8) << std::setfill('0') << cmd[idx];
+  }
+  std::clog << std::endl;
+
+  std::clog.flags(saved_flags);
+  std::clog.fill(saved_fill);
+#else
+  (void)command;
+  (void)name;
+#endif
+}
+
+#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \
+  PrintPm4Packet(command, __FUNCTION__); \
+  AppendCommand(cmdbuf, command);
+
+namespace pm4_profile {
+namespace gfx8 {
+
+// Appends the raw bytes of a packet structure to the command buffer
+template <class T> void Gfx8CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) {
+  const T* packet = &command;
+  cmdbuf->AppendCommand(packet, sizeof(T));
+}
+
+// Zeroes the atomic command template, stamps its PM4 header with the
+// IT_ATOMIC_MEM__CI opcode and, when ATC is supported, sets bit 24 of
+// ordinal2 in the template.
+void Gfx8CmdWriter::InitializeAtomicTemplate() {
+  // Pointer and size refer to the same object, matching the other
+  // template initializers, so the clear stays in bounds even if the
+  // template ever gains members
+  memset(&atomic_template_, 0, sizeof(atomic_template_));
+  GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM__CI);
+
+  if (atc_support_) {
+    const uint32_t kAtcShift = 24;
+    atomic_template_.atomic.ordinal2 |= 1 << kAtcShift;
+  }
+}
+
+// Zeroes the conditional-execute command template, stamps its PM4
+// header with the IT_COND_EXEC opcode and, when ATC is supported, sets
+// bit 24 of ordinal4 in the template.
+void Gfx8CmdWriter::InitializeConditionalTemplate() {
+  // Pointer and size refer to the same object, matching the other
+  // template initializers, so the clear stays in bounds even if the
+  // template ever gains members
+  memset(&conditional_template_, 0, sizeof(conditional_template_));
+  gfx8::GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC);
+
+  if (atc_support_) {
+    const uint32_t kAtcShift = 24;
+    conditional_template_.conditional.ordinal4 |= 1 << kAtcShift;
+  }
+}
+
+// Prepares the indirect buffer (launch) template: all fields cleared,
+// header set to IT_INDIRECT_BUFFER and the CI "valid" bit raised.
+void Gfx8CmdWriter::InitializeLaunchTemplate() {
+  memset(&launch_template_, 0, sizeof(launch_template_));
+  GenerateCmdHeader(&launch_template_.indirect_buffer, IT_INDIRECT_BUFFER);
+
+  launch_template_.indirect_buffer.CI.valid = true;
+}
+
+// Prepares the 32-bit WRITE_DATA template. Everything that does not
+// depend on the destination address or the value is filled in here.
+void Gfx8CmdWriter::InitializeWriteDataTemplate() {
+  memset(&write_data_template_, 0, sizeof(write_data_template_));
+
+  PM4CMDWRITEDATA& packet = write_data_template_.write_data;
+
+  // Header: the size covers the packet plus the trailing payload dword
+  const uint32_t dword_count = sizeof(write_data_template_) / sizeof(uint32_t);
+  packet.ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, dword_count, ShaderCompute, 0);
+
+  // ATC bit - specifies if the address belongs to system memory
+  packet.atc__CI = (atc_support_) ? 1 : 0;
+
+  // Ask for write confirmation and bypass the cache
+  packet.wrConfirm = 1;
+  packet.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS;
+
+  // The ME engine executes the write
+  packet.engineSel = WRITE_DATA_ENGINE_ME;
+
+  // Destination class of the write
+  packet.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC;
+}
+
+// Prepares the 64-bit WRITE_DATA template. Everything that does not
+// depend on the destination address or the value is filled in here.
+void Gfx8CmdWriter::InitializeWriteData64Template() {
+  memset(&write_data64_template_, 0, sizeof(write_data64_template_));
+
+  PM4CMDWRITEDATA& packet = write_data64_template_.write_data;
+
+  // Header: the size covers the packet plus the trailing 64-bit payload
+  const uint32_t dword_count = sizeof(write_data64_template_) / sizeof(uint32_t);
+  packet.ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, dword_count, ShaderCompute, 0);
+
+  // ATC bit - specifies if the address belongs to system memory
+  packet.atc__CI = (atc_support_) ? 1 : 0;
+
+  // Ask for write confirmation and bypass the cache
+  packet.wrConfirm = 1;
+  packet.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS;
+
+  // The ME engine executes the write
+  packet.engineSel = WRITE_DATA_ENGINE_ME;
+
+  // Destination class of the write.
+  // TODO: For Hawaii bring up only; the intended destination was
+  // WRITE_DATA_DST_SEL_TCL2.
+  packet.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC;
+}
+
+// Prepares the barrier template: an EVENT_WRITE packet carrying the
+// CS_PARTIAL_FLUSH event and its matching event index.
+void Gfx8CmdWriter::InitializeBarrierTemplate() {
+  memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_));
+  gfx8::GenerateCmdHeader(&pending_dispatch_template_.event_write, IT_EVENT_WRITE);
+
+  pending_dispatch_template_.event_write.eventIndex = EventTypeToIndexTable[CS_PARTIAL_FLUSH];
+  pending_dispatch_template_.event_write.eventType = CS_PARTIAL_FLUSH;
+}
+
+// Prepares the ACQUIRE_MEM template used for cache invalidation: base
+// 0, size fields set to 0xFFFFFFFF / 0xFF and a poll interval of 0.
+void Gfx8CmdWriter::InitializeAcquireMemTemplate() {
+  memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_));
+  gfx8::GenerateCmdHeader(&invalidate_cache_template_.acquire_mem, IT_ACQUIRE_MEM__CI__VI);
+
+  // Coherency range: base address
+  invalidate_cache_template_.acquire_mem.cpCoherBase.u32All = 0x00;
+  invalidate_cache_template_.acquire_mem.cpCoherBaseHi.u32All = 0x00;
+
+  // Coherency range: size
+  invalidate_cache_template_.acquire_mem.cpCoherSize.u32All = 0xFFFFFFFF;
+  invalidate_cache_template_.acquire_mem.cpCoherSizeHi.u32All = 0xFF;
+
+  invalidate_cache_template_.acquire_mem.pollInterval = 0;
+}
+
+// Prepares the WAIT_REG_MEM template: ATC bit per configuration, cache
+// policy 2 (bypass), poll interval 0, executed by the ME engine.
+void Gfx8CmdWriter::InitializeWaitRegMemTemplate() {
+  memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_));
+  gfx8::GenerateCmdHeader(&wait_reg_mem_template_.wait_reg_mem, IT_WAIT_REG_MEM);
+
+  wait_reg_mem_template_.wait_reg_mem.atc__CI = (atc_support_) ? 1 : 0;
+  wait_reg_mem_template_.wait_reg_mem.cachePolicy__CI = 2;  // bypass
+  wait_reg_mem_template_.wait_reg_mem.pollInterval = 0;
+  wait_reg_mem_template_.wait_reg_mem.engine = WAIT_REG_MEM_ENGINE_ME;
+}
+
+// Records the ATC and PCIe atomic capabilities and then pre-builds
+// every command template. The capability flags are stored first
+// because several template initializers read atc_support_.
+Gfx8CmdWriter::Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support) {
+  atc_support_ = atc_support;
+  pcie_atomic_support_ = pcie_atomic_support;
+
+  // Templates for launch, atomic and conditional execution
+  InitializeLaunchTemplate();
+  InitializeAtomicTemplate();
+  InitializeConditionalTemplate();
+
+  // Templates for memory writes
+  InitializeWriteDataTemplate();
+  InitializeWriteData64Template();
+
+  // Templates for synchronization
+  InitializeBarrierTemplate();
+  InitializeAcquireMemTemplate();
+  InitializeWaitRegMemTemplate();
+}
+
+// Builds a WAIT_REG_MEM packet from the pre-initialized template and
+// appends it to the command buffer.
+//   mem_space - true: wait_addr is a memory address, false: a register
+//   wait_addr - address / register to poll
+//   func_eq   - true: wait until equal, false: wait until not equal
+//   mask_val  - mask applied to the polled value
+//   wait_val  - reference value used by the comparison
+void Gfx8CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
+                                              bool func_eq, uint32_t mask_val, uint32_t wait_val) {
+  gfx8::WaitRegMemTemplate packet = wait_reg_mem_template_;
+
+  // Address space and comparison function requested by the caller
+  packet.wait_reg_mem.memSpace =
+      mem_space ? WAIT_REG_MEM_SPACE_MEMORY : WAIT_REG_MEM_SPACE_REGISTER;
+  packet.wait_reg_mem.function = func_eq ? WAIT_REG_MEM_FUNC_EQUAL : WAIT_REG_MEM_FUNC_NOT_EQUAL;
+
+  // Mask and reference value of the comparison
+  packet.wait_reg_mem.mask = mask_val;
+  packet.wait_reg_mem.reference = wait_val;
+
+  // The low dword is always written; the alignment check and the high
+  // dword only apply when polling memory
+  assert((!mem_space || !(wait_addr & 0x3)) && "WaitRegMem address must be 4 byte aligned");
+  packet.wait_reg_mem.pollAddressLo = Low32(wait_addr);
+  if (mem_space) {
+    packet.wait_reg_mem.pollAddressHi = High32(wait_addr);
+  }
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a command that stores a 64-bit value at a host address: an
+// atomic swap when PCIe atomics are supported, a plain 64-bit write
+// data command otherwise.
+void Gfx8CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) {
+  if (!pcie_atomic_support_) {
+    BuildWriteData64Command(cmdbuf, addr, value);
+    return;
+  }
+
+  BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr,
+                      value);
+}
+
+// Appends an INDIRECT_BUFFER packet that references the command list at
+// cmd_addr. cmd_size is given in bytes and stored in the packet as a
+// count of dwords.
+void Gfx8CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
+                                              std::size_t cmd_size) {
+  gfx8::LaunchTemplate packet = launch_template_;
+
+  packet.indirect_buffer.CI.ibSize = cmd_size / sizeof(uint32_t);
+  packet.indirect_buffer.ibBaseLo = PtrLow32(cmd_addr);
+  packet.indirect_buffer.ibBaseHi = PtrHigh32(cmd_addr);
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a RELEASE_MEM packet for the BOTTOM_OF_PIPE_TS event that
+// writes a 32-bit value to a user supplied address, with L2 write-back
+// and invalidate requested, and optionally raises an interrupt.
+//   write_addr - address to be written
+//   write_val  - 32-bit value to write
+//   interrupt  - true to send an interrupt on write confirm
+void Gfx8CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
+                                         bool interrupt) {
+  // Start from a zeroed packet with a RELEASE_MEM header
+  gfx8::EndofKernelNotifyTemplate notify;
+  memset(&notify, 0, sizeof(notify));
+  gfx8::GenerateCmdHeader(&notify.release_mem, IT_RELEASE_MEM__CI__VI);
+
+  PM4CMDRELEASEMEM& release = notify.release_mem;
+
+  // Event the packet is tied to
+  release.eventType = BOTTOM_OF_PIPE_TS;
+  release.eventIndex = EventTypeToIndexTable[BOTTOM_OF_PIPE_TS];
+
+  // Cache operations requested by the packet
+  release.atc = atc_support_;
+  release.l2Invlidate = true;
+  release.l2WriteBack = true;
+
+  // Destination is memory, with the write bypassing the cache
+  release.cachePolicy = RELEASE_MEM_CACHE_POLICY_BYPASS;
+  release.dstSel = RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER;
+
+  // Address and 32-bit payload; the upper data dword is zero
+  const uint64_t destination = uint64_t(write_addr);
+  release.ordinal4 = Low32(destination);
+  release.addrHi = High32(destination);
+  release.dataLo = write_val;
+  release.dataHi = 0;
+  release.dataSel = EVENTWRITEEOP_DATA_SEL_SEND_DATA32;
+
+  // Host either polls or waits for an interrupt
+  release.intSel =
+      interrupt ? EVENTWRITEEOP_INT_SEL_SEND_INT_ON_CONFIRM : EVENTWRITEEOP_INT_SEL_NONE;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, notify);
+}
+
+
+// Appends an ACQUIRE_MEM packet, built from the invalidate-cache
+// template, with the TC action and TC write-back action bits enabled.
+void Gfx8CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) {
+  gfx8::AcquireMemTemplate fence = invalidate_cache_template_;
+
+  // wbINVL2 by default writes-back and invalidates both L1 and L2
+  fence.acquire_mem.coherCntl =
+      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, fence);
+}
+
+// PM4 packet for profilers.
+// PM4_PACKET3 is the type-3 packet marker (top two bits set); the
+// opcode is placed at bit 8 and the count field at bit 16.
+#define PM4_PACKET3 (0xC0000000)
+#define PM4_PACKET3_CMD_SHIFT 8
+#define PM4_PACKET3_COUNT_SHIFT 16
+
+// Builds a type-3 header dword for opcode 'cmd'. The count field is
+// encoded as (count - 1).
+#define PACKET3(cmd, count) \
+  (PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | ((cmd) << PM4_PACKET3_CMD_SHIFT))
+
+// Raw 3 dword packet used by the register write builders in this file:
+// header, register offset, value
+typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket;
+
+// Raw 7 dword packet; used in this file to hold the ACQUIRE_MEM cache
+// flush packet
+typedef struct WriteEventPacket_ { uint32_t item[7]; } WriteEventPacket;
+
+// Appends an EVENT_WRITE packet for one of the performance counter
+// events (start, stop or sample). Any other event id trips an assert
+// in debug builds.
+void Gfx8CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) {
+  // Translate the profiler event id into the hardware VGT event type
+  VGT_EVENT_TYPE vgt_event = Reserved_0x00;
+  if (event == kPerfCntrsStart) {
+    vgt_event = PERFCOUNTER_START;
+  } else if (event == kPerfCntrsStop) {
+    vgt_event = PERFCOUNTER_STOP;
+  } else if (event == kPerfCntrsSample) {
+    vgt_event = PERFCOUNTER_SAMPLE;
+  } else {
+    assert(false && "Illegal VGT Event Id");
+  }
+
+  // Header first, then clear the event dword before filling its fields
+  PM4CMDEVENTWRITE packet;
+  packet.ordinal1 = PACKET3(IT_EVENT_WRITE, 1);
+  packet.ordinal2 = 0;
+  packet.eventType = vgt_event;
+  packet.eventIndex = EventTypeToIndexTable[vgt_event];
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a SET_UCONFIG_REG packet, issued with the ShaderGraphics
+// shader type, that writes 'value' to the register at 'addr'. The
+// register offset is stored relative to UCONFIG_SPACE_START__CI__VI.
+void Gfx8CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  const uint32_t header = PM4_TYPE_3_HDR(IT_SET_UCONFIG_REG__CI__VI,
+                                         1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0);
+
+  WriteRegPacket packet;
+  packet.item[0] = header;
+  packet.item[1] = addr - UCONFIG_SPACE_START__CI__VI;
+  packet.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a SET_UCONFIG_REG packet, issued with the ShaderCompute
+// shader type, that writes 'value' to the register at 'addr'. The
+// register offset is stored relative to UCONFIG_SPACE_START__CI__VI.
+void Gfx8CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  const uint32_t header = PM4_TYPE_3_HDR(IT_SET_UCONFIG_REG__CI__VI,
+                                         1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderCompute, 0);
+
+  WriteRegPacket packet;
+  packet.item[0] = header;
+  packet.item[1] = addr - UCONFIG_SPACE_START__CI__VI;
+  packet.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a SET_SH_REG packet that writes 'value' to the shader
+// register at 'addr'. The register offset is stored relative to
+// PERSISTENT_SPACE_START.
+void Gfx8CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  const uint32_t header =
+      PM4_TYPE_3_HDR(IT_SET_SH_REG, 1 + PM4_CMD_SET_SH_REG_DWORDS, ShaderCompute, 0);
+
+  WriteRegPacket packet;
+  packet.item[0] = header;
+  packet.item[1] = addr - PERSISTENT_SPACE_START;
+  packet.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a COPY_DATA packet that copies from the given source to the
+// memory at dst_addr, bypassing the cache on both sides.
+//   src_sel               - source selector, stored as is
+//   src_addr_lo / _hi     - low / high dword of the source address
+//   dst_addr              - destination address
+//   size                  - stored in the countSel field
+//   wait                  - stored in the wrConfirm field
+void Gfx8CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
+                                           uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
+                                           bool wait) {
+  PM4CMDCOPYDATA packet;
+  memset(&packet, 0, sizeof(PM4CMDCOPYDATA));
+  packet.header.u32All = PACKET3(IT_COPY_DATA, 5);
+
+  // Source description
+  packet.srcAtc__CI = atc_support_;
+  packet.srcCachePolicy__CI = COPY_DATA_SRC_CACHE_POLICY_BYPASS;
+  packet.srcSel = src_sel;
+
+  // Destination description
+  packet.dstAtc__CI = atc_support_;
+  packet.dstSel = COPY_DATA_SEL_DST_ASYNC_MEMORY;
+  packet.dstCachePolicy__CI = COPY_DATA_DST_CACHE_POLICY_BYPASS;
+
+  // Addresses
+  packet.srcAddressLo = src_addr_lo;
+  packet.srcAddressHi = src_addr_hi;
+  packet.dstAddressLo = PtrLow32(dst_addr);
+  packet.dstAddressHi = PtrHigh32(dst_addr);
+
+  // Transfer control
+  packet.countSel = size;
+  packet.wrConfirm = wait;
+  packet.engineSel = COPY_DATA_ENGINE_ME;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a hand-encoded 7 dword ACQUIRE_MEM packet used as a cache
+// flush. The dword values are fixed constants.
+void Gfx8CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) {
+  const uint32_t kFlushDwords[7] = {
+      PACKET3(IT_ACQUIRE_MEM__CI__VI, 6),  // header
+      0x28C00000,
+      0xFFFFFFFF,
+      0,
+      0,
+      0,
+      0x00000004,
+  };
+
+  WriteEventPacket packet;
+  for (uint32_t idx = 0; idx < 7; ++idx) {
+    packet.item[idx] = kFlushDwords[idx];
+  }
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a wait-idle sequence: the barrier command followed by the
+// cache flush packet.
+void Gfx8CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) {
+  BuildBarrierCommand(cmdbuf);
+
+  BuildCacheFlushPacket(cmdbuf);
+}
+
+// Appends an EVENT_WRITE packet for a performance counter or thread
+// trace event and follows it with a cache flush packet. An unknown
+// event id trips an assert in debug builds.
+void Gfx8CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) {
+  // Translate the profiler event id into the hardware VGT event type
+  VGT_EVENT_TYPE vgt_type = Reserved_0x00;
+  switch (vgtEvent) {
+    case kPerfCntrsStart:  vgt_type = PERFCOUNTER_START;   break;
+    case kPerfCntrsStop:   vgt_type = PERFCOUNTER_STOP;    break;
+    case kPerfCntrsSample: vgt_type = PERFCOUNTER_SAMPLE;  break;
+    case kThrdTraceStart:  vgt_type = THREAD_TRACE_START;  break;
+    case kThrdTraceStop:   vgt_type = THREAD_TRACE_STOP;   break;
+    case kThrdTraceFlush:  vgt_type = THREAD_TRACE_FLUSH;  break;
+    case kThrdTraceFinish: vgt_type = THREAD_TRACE_FINISH; break;
+    default:
+      assert(false && "Illegal VGT Event Id");
+  }
+
+  // Header first, then clear the event dword before filling its fields
+  PM4CMDEVENTWRITE packet;
+  packet.ordinal1 = PACKET3(IT_EVENT_WRITE, 1);
+  packet.ordinal2 = 0;
+  packet.eventType = vgt_type;
+  packet.eventIndex = EventTypeToIndexTable[vgt_type];
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+
+  // Check If I should be issuing a cache flush operation as well
+  // test and remove it
+  BuildCacheFlushPacket(cmdbuf);
+}
+
+// Appends a SET_CONFIG_REG packet, issued with the ShaderGraphics
+// shader type, that writes 'value' to the register at 'addr'. The
+// register offset is stored relative to CONFIG_SPACE_START.
+void Gfx8CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  const uint32_t header =
+      PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0);
+
+  WriteRegPacket packet;
+  packet.item[0] = header;
+  packet.item[1] = addr - CONFIG_SPACE_START;
+  packet.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends an EVENT_WRITE query packet that reports the status of an
+// event at a user supplied address.
+//   event - id of the event to query
+//   addr  - address receiving the status; must be 8 byte aligned
+//
+// NOTE: no event id is mapped yet - the switch below only has a default
+// branch, so every call trips the assert in debug builds and falls back
+// to Reserved_0x00 in release builds.
+void Gfx8CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) {
+  PM4CMDEVENTWRITEQUERY cp_event_initiator;
+  cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 3);
+  cp_event_initiator.ordinal2 = 0;
+
+  // Update switch statements you want to support
+  VGT_EVENT_TYPE eventType = Reserved_0x00;
+  switch (event) {
+    default:
+      assert(false && "Illegal VGT Event Id");
+  }
+
+  cp_event_initiator.eventType = eventType;
+  cp_event_initiator.eventIndex = EventTypeToIndexTable[eventType];
+
+  // Set the address; the low three bits must be clear
+  uint32_t addrLo = PtrLow32(addr);
+  uint32_t addrHi = PtrHigh32(addr);
+  assert(((addrLo & 0x7) == 0) && "query address must be 8 byte aligned");
+
+  cp_event_initiator.ordinal3 = 0;
+  cp_event_initiator.ordinal4 = 0;
+  cp_event_initiator.addressLo = addrLo;
+  cp_event_initiator.addressHi = addrHi;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
+
+  return;
+}
+
+// Appends the pre-built barrier packet to the command buffer
+void Gfx8CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) {
+  const auto& barrier = pending_dispatch_template_;
+  APPEND_COMMAND_WRAPPER(cmdBuf, barrier);
+}
+
+// Copies 'count' dwords of user data from src_addr to dst_addr
+void Gfx8CmdWriter::WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr) {
+  const size_t byte_count = count * sizeof(uint32_t);
+  memcpy(dst_addr, src_addr, byte_count);
+}
+
+
+// Appends a 32-bit atomic packet built from the atomic template.
+//   atomic_op - operation to perform
+//   addr      - destination; asserted to be 8 byte aligned
+//   value     - operand (ignored by increment / decrement, which use 1)
+//   compare   - comparand, only used by the compare-and-swap variants
+// An atomic_op without a matching case leaves the operation fields as
+// they are in the template.
+void Gfx8CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op,
+                                         volatile uint32_t* addr, uint32_t value,
+                                         uint32_t compare) {
+  gfx8::AtomicTemplate packet = atomic_template_;
+
+  // make sure the destination address is aligned
+  const uint32_t addr_lo = PtrLow32((void*)addr);
+  const uint32_t addr_hi = PtrHigh32((void*)addr);
+  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");
+
+  packet.atomic.addressLo = addr_lo;
+  packet.atomic.addressHi = addr_hi;
+
+  switch (atomic_op) {
+    case CommandWriter::kAtomicTypeIncrement:
+      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_32;
+      packet.atomic.srcDataLo = 1;
+      break;
+
+    case CommandWriter::kAtomicTypeDecrement:
+      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_32;
+      packet.atomic.srcDataLo = 1;
+      break;
+
+    case CommandWriter::kAtomicTypeCompareAndSwap:
+      packet.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32;
+      packet.atomic.srcDataLo = value;
+      packet.atomic.cmpDataLo = compare;
+      break;
+
+    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
+      // Same as compare-and-swap, plus the command / loop interval fields
+      packet.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32;
+      packet.atomic.srcDataLo = value;
+      packet.atomic.cmpDataLo = compare;
+      packet.atomic.command = 1;
+      packet.atomic.loopInterval = 128;
+      break;
+
+    case CommandWriter::kAtomicAdd:
+      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_32;
+      packet.atomic.srcDataLo = value;
+      break;
+
+    case CommandWriter::kAtomicSubtract:
+      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_32;
+      packet.atomic.srcDataLo = value;
+      break;
+
+    case CommandWriter::kAtomicSwap:
+      packet.atomic.atomOp = TC_OP_ATOMIC_SWAP_RTN_32;
+      packet.atomic.srcDataLo = value;
+      break;
+  }
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a 64-bit atomic packet built from the atomic template.
+//   atomic_op - operation to perform
+//   addr      - destination; asserted to be 8 byte aligned
+//   value     - operand (ignored by increment / decrement, which use 1)
+//   compare   - comparand, only used by the compare-and-swap variants
+// An atomic_op without a matching case leaves the operation fields as
+// they are in the template.
+void Gfx8CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op,
+                                           volatile uint64_t* addr, uint64_t value,
+                                           uint64_t compare) {
+  AtomicTemplate packet = atomic_template_;
+
+  // make sure the destination address is aligned
+  const uint32_t addr_lo = PtrLow32((void*)addr);
+  const uint32_t addr_hi = PtrHigh32((void*)addr);
+  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");
+
+  packet.atomic.addressLo = addr_lo;
+  packet.atomic.addressHi = addr_hi;
+
+  packet.atomic.atc = (atc_support_) ? 1 : 0;
+  packet.atomic.cachePolicy = 2;
+
+  switch (atomic_op) {
+    case CommandWriter::kAtomicTypeIncrement:
+      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_64;
+      packet.atomic.srcDataLo = 1;
+      break;
+
+    case CommandWriter::kAtomicTypeDecrement:
+      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_64;
+      packet.atomic.srcDataLo = 1;
+      break;
+
+    case CommandWriter::kAtomicTypeCompareAndSwap:
+      packet.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64;
+      packet.atomic.srcDataLo = Low32(value);
+      packet.atomic.srcDataHi = High32(value);
+      packet.atomic.cmpDataLo = Low32(compare);
+      packet.atomic.cmpDataHi = High32(compare);
+      break;
+
+    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
+      // Same as compare-and-swap, plus the command / loop interval fields
+      packet.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64;
+      packet.atomic.srcDataLo = Low32(value);
+      packet.atomic.srcDataHi = High32(value);
+      packet.atomic.cmpDataLo = Low32(compare);
+      packet.atomic.cmpDataHi = High32(compare);
+      packet.atomic.command = 1;
+      packet.atomic.loopInterval = 128;
+      break;
+
+    case CommandWriter::kAtomicAdd:
+      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_64;
+      packet.atomic.srcDataLo = Low32(value);
+      packet.atomic.srcDataHi = High32(value);
+      break;
+
+    case CommandWriter::kAtomicSubtract:
+      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_64;
+      packet.atomic.srcDataLo = Low32(value);
+      packet.atomic.srcDataHi = High32(value);
+      break;
+
+    case CommandWriter::kAtomicSwap:
+      packet.atomic.atomOp = TC_OP_ATOMIC_SWAP_RTN_64;
+      packet.atomic.srcDataLo = Low32(value);
+      packet.atomic.srcDataHi = High32(value);
+      break;
+  }
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+/// @brief Returns the size of the atomic packet template in 32-bit words
+size_t Gfx8CmdWriter::SizeOfAtomicPacket() const {
+  const size_t packet_bytes = sizeof(AtomicTemplate);
+  return packet_bytes / sizeof(uint32_t);
+}
+
+/// @brief Builds a conditional-execute packet from the stored template and
+/// appends it to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param signal Address stored in the packet's boolAddrLo/boolAddrHi fields;
+/// asserted to be 8 byte aligned
+/// @param count Value stored in the packet's execCount field
+///
+/// @return void
+void Gfx8CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) {
+  // Validate the address before touching the packet
+  const uint32_t address_low = PtrLow32(signal);
+  const uint32_t address_high = PtrHigh32(signal);
+  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
+
+  // Fill a private copy of the template so the stored one stays pristine
+  ConditionalExecuteTemplate packet = conditional_template_;
+  packet.conditional.execCount = count;
+  packet.conditional.boolAddrLo = address_low;
+  packet.conditional.boolAddrHi = address_high;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+/// @brief Builds a write-data packet carrying a 32-bit value and appends it
+/// to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param write_addr Address the packet is told to write to
+/// @param write_value 32-bit value to write
+///
+/// @return void
+void Gfx8CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
+                                             uint32_t write_value) {
+  // Work on a private copy of the pre-initialized packet
+  gfx8::WriteDataTemplate packet = write_data_template_;
+
+  // Destination address, split with PtrLow32 / PtrHigh32
+  packet.write_data.dstAddrLo = PtrLow32(write_addr);
+  packet.write_data.dstAddrHi = PtrHigh32(write_addr);
+
+  // Payload supplied by the caller
+  packet.write_data_value = write_value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+/// @brief Builds a write-data packet carrying a 64-bit value and appends it
+/// to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param write_addr Address the packet is told to write to
+/// @param write_value 64-bit value to write
+///
+/// @return void
+void Gfx8CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
+                                               uint64_t write_value) {
+  // Work on a private copy of the pre-initialized packet
+  gfx8::WriteData64Template packet = write_data64_template_;
+
+  // Destination address, split with PtrLow32 / PtrHigh32
+  packet.write_data.dstAddrLo = PtrLow32(write_addr);
+  packet.write_data.dstAddrHi = PtrHigh32(write_addr);
+
+  // Payload supplied by the caller
+  packet.write_data_value = write_value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+/// @brief Builds an ACQUIRE_MEM packet whose coherence control bits are
+/// selected by @p options and appends it to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param options Selects the caches to act on (l1, l2, icache, kcache);
+/// must not be NULL
+/// @param writeAddr Not used by this implementation; kept so the interface
+/// matches the other chip families
+/// @param writeVal Not used by this implementation
+///
+/// @return void
+void Gfx8CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options,
+                                          uint32_t* writeAddr, uint32_t writeVal) {
+  // The write back address and value are NOT used on CI. They are kept in
+  // the signature to have the same interface as that on SI, so any value
+  // (including NULL) is accepted for them.
+  (void)writeAddr;
+  (void)writeVal;
+
+  // The options are dereferenced below, so they are mandatory
+  assert(options != NULL && "flush cache options must be provided");
+
+  PM4CMDACQUIREMEM flushCmd;
+  memset(&flushCmd, 0, sizeof(flushCmd));
+
+  // Initialize the command header
+  gfx8::GenerateCmdHeader(&flushCmd, IT_ACQUIRE_MEM__CI__VI);
+
+  // Specify the base address of memory being synchronized.
+  // The starting address is indicated as follows: bits [0-48].
+  flushCmd.cpCoherBase.u32All = 0;
+  flushCmd.cpCoherBaseHi.u32All = 0;
+
+  // Specify the size of memory being synchronized. It is indicated
+  // as follows:
+  //    COHER_SIZE_256B_MASK = 0xffffffffL
+  //    COHER_SIZE_HI_256B_MASK__CI__VI = 0x000000ffL
+  flushCmd.cpCoherSize.u32All = CP_COHER_SIZE__COHER_SIZE_256B_MASK;
+  flushCmd.cpCoherSizeHi.u32All = CP_COHER_SIZE_HI__COHER_SIZE_HI_256B_MASK__CI__VI;
+
+  // Periodicity of polling - interval to wait from the time
+  // of unsuccessful polling result is returned and a new
+  // poll is issued
+  flushCmd.pollInterval = 0x04;
+
+  // Program Coherence Control Register: one action bit (or bit pair for L2)
+  // per cache requested in the options
+  uint32_t coher_cntl = 0;
+
+  coher_cntl |= (options->l1) ? CP_COHER_CNTL__TCL1_ACTION_ENA_MASK : 0;
+  coher_cntl |= (options->l2)
+      ? (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI)
+      : 0;
+  coher_cntl |= (options->icache) ? CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK : 0;
+  coher_cntl |= (options->kcache) ? CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK : 0;
+  flushCmd.coherCntl = coher_cntl;
+
+  // Copy AcquireMem command buffer stream
+  APPEND_COMMAND_WRAPPER(cmdbuf, flushCmd);
+}
+
+/// @brief Builds a DMA_DATA packet that copies @p copySize bytes from
+/// @p srcAddr to @p dstAddr and appends it to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param srcAddr Address of the source buffer
+/// @param dstAddr Address of the destination buffer
+/// @param copySize Number of bytes to copy; asserted to be below 0x1FFFFF
+/// @param waitForConfirm Stored in the packet's rawWait field
+///
+/// @return void
+void Gfx8CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr,
+                                          uint32_t copySize, bool waitForConfirm) {
+  // The command restricts the size to be (2 MB - 1) - 21 Bits.
+  // NOTE(review): the check also rejects 0x1FFFFF itself although that
+  // value fits in 21 bits - confirm whether this is intended.
+  assert(copySize < 0x1FFFFF);
+
+  PM4CMDDMADATA dma;
+  memset(&dma, 0, sizeof(dma));
+  dma.header.u32All = PM4_TYPE_3_HDR(IT_DMA_DATA__CI__VI, PM4_CMD_DMA_DATA_DWORDS, ShaderCompute, 0);
+
+  // Id of Micro Engine
+  dma.engine = 0;
+
+  // Source buffer: location, ATC property, cache policy and volatile.
+  // A cache policy of 1 means to Stream.
+  dma.srcSel = 0;
+  dma.srcATC = atc_support_;
+  dma.srcCachePolicy = 1;
+  dma.srcVolatile = 0;
+  dma.srcAddrLoOrData = PtrLow32(srcAddr);
+  dma.srcAddrHi = PtrHigh32(srcAddr);
+
+  // Destination buffer: location, ATC property, cache policy and volatile.
+  // A cache policy of 1 means to Stream.
+  dma.dstSel = 0;
+  dma.dstATC = atc_support_;
+  dma.dstCachePolicy = 1;
+  dma.dstVolatile = 0;
+  dma.dstAddrLo = PtrLow32(dstAddr);
+  dma.dstAddrHi = PtrHigh32(dstAddr);
+
+  // Number of bytes to copy
+  dma.command.byteCount = copySize;
+
+  // Indicate that DMA Cmd should wait if its source
+  // is the destination of a previous DMA Cmd
+  dma.command.rawWait = waitForConfirm;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, dma);
+}
+
+}  // gfx8
+}  // pm4_profile
@@ -0,0 +1,201 @@
+#ifndef _GFX8_CMDWRITER_H_
+#define _GFX8_CMDWRITER_H_
+
+#include "cmdwriter.h"
+#include "gfx8_cmds.h"
+
+namespace pm4_profile {
+
+namespace gfx8 {
+
+/// @brief class Gfx8CmdWriter implements the virtual class CommandWriter
+/// for Sea Islands (CI) and VI chipset
+class Gfx8CmdWriter : public CommandWriter {
+ public:
+  /// @brief Constructs the command writer
+  ///
+  /// @param atc_support ATC support flag (see atc_support_)
+  /// @param pcie_atomic_support PCIe atomic support flag (see
+  /// pcie_atomic_support_)
+  Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support);
+
+  /// @brief Dword specifying NOOP command for SI/CI/VI chipsets. The macro
+  /// populates the NOOP command which is 32-bits wide. The second parameter,
+  /// the COUNT field of NOOP command, specifies the number of Dwords to skip.
+  /// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro
+  /// decrements the second parameter by TWO, an artifact of its definition,
+  /// the value is incremented by TWO to 0x4001 (0x3FFF + 2).
+  ///
+  /// @return The 32-bit NOOP command header
+  inline uint32_t GetNoOpCmd() {
+    static const uint32_t nopCmd = PM4_TYPE_3_HDR(IT_NOP, 0x4001, ShaderCompute, 0);
+    return nopCmd;
+  }
+
+  void BuildBarrierCommand(CmdBuf* cmdBuf);
+
+  void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size);
+
+  void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
+                         bool interrupt);
+
+  void BuildBarrierFenceCommands(CmdBuf* cmdbuf);
+
+  void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event);
+
+  void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq,
+                              uint32_t mask_val, uint32_t wait_val);
+
+  void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  /// @brief Build CP command to program a Gpu register
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  /// @param addr Register to be programmed
+  /// @param value Value to write into register
+  ///
+  /// @return void
+  void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
+                           uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait);
+
+  void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf);
+
+  // Will issue a VGT event including a cache flush later on
+  void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent);
+
+  void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr);
+
+  /// @brief Builds a 32-bit atomic packet and appends it to the command buffer
+  ///
+  /// @param cmdbuf Command buffer to be appended
+  /// @param atomic_op Atomic operation to encode
+  /// @param addr Destination address of the atomic operation
+  /// @param value Source operand
+  /// @param compare Compare operand of the compare-and-swap operations
+  void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
+                         uint32_t value, uint32_t compare);
+
+  /// @brief Builds a 64-bit atomic packet and appends it to the command buffer
+  ///
+  /// @param cmdbuf Command buffer to be appended
+  /// @param atomic_op Atomic operation to encode
+  /// @param addr Destination address; must be 8 byte aligned
+  /// @param value Source operand; increment and decrement use 1 instead
+  /// @param compare Compare operand of the compare-and-swap operations
+  void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
+                           uint64_t value = 0, uint64_t compare = 0);
+
+  /// @brief Returns the size of the atomic packet in 32-bit words
+  size_t SizeOfAtomicPacket() const;
+
+  /// @brief Builds a conditional execution packet
+  ///
+  /// @param cmdbuf Command buffer to be appended
+  /// @param signal Address encoded into the packet; must be 8 byte aligned
+  /// @param count Execution count encoded into the packet
+  void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count);
+
+  /// @brief Builds a command writing a 32-bit value to an address
+  ///
+  /// @param cmdbuf Command buffer to be appended
+  /// @param write_addr Address to write to
+  /// @param write_value Value to write
+  void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value);
+
+  /// @brief Builds a command writing a 64-bit value to an address
+  ///
+  /// @param cmdbuf Command buffer to be appended
+  /// @param write_addr Address to write to
+  /// @param write_value Value to write
+  void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value);
+
+  void BuildCacheFlushPacket(CmdBuf* cmdbuf);
+
+  /// Writes into input buffer Gpu commands to flush its cache. It is
+  /// necessary that the buffer provided for flush commands is large
+  /// enough to accommodate the full set of commands. It should be at
+  /// least 512 bytes.
+  ///
+  /// @param cmdBuf Buffer to write commands to.
+  /// @param options Selects which caches (l1, l2, icache, kcache) to act on.
+  /// @param writeAddr Registered address into which GPU should write
+  /// a user provided value upon executing the flush commands. The Gfx8
+  /// implementation does not use this argument.
+  /// @param writeVal User provided value written by GPU at user provided
+  /// address, upon executing the flush commands. The Gfx8 implementation
+  /// does not use this argument.
+  ///
+  /// @return void
+  void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr,
+                          uint32_t writeVal);
+
+  /// Builds Gpu command to copy data from source to destination buffer
+  /// using DMA engine.
+  ///
+  /// @param cmdBuf Buffer updated with Gpu copy command
+  /// @param srcAddr Address of source buffer
+  /// @param dstAddr Address of destination buffer
+  /// @param copySize Size of data to copy in bytes
+  /// @param waitForCompletion if the command should wait when its source
+  /// is the destination of a previous DMA command
+  void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize,
+                          bool waitForCompletion);
+
+ protected:
+  /// @brief Copies data from source buffer to destination buffer
+  ///
+  /// @param dst_addr Address of destination buffer data
+  ///
+  /// @param count Size of data to copy in 32-bit words
+  ///
+  /// @param src_addr Address of buffer containing source data
+  ///
+  /// @return void
+  virtual void WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr);
+
+  /// @brief Append an instance of Gpu command into input command buffer stream.
+  ///
+  /// @param cmdbuf Command buffer appended with another Gpu command
+  ///
+  /// @param cmd Gpu command to be appended into command buffer
+  ///
+  /// @return void
+  template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd);
+
+ private:
+  /// @brief Initializes a Gpu command which can be used to
+  /// reference a Gpu command stream indirectly
+  void InitializeLaunchTemplate();
+
+  /// @brief Initializes a Gpu command to perform atomic operations
+  ///
+  void InitializeAtomicTemplate();
+
+  /// @brief Initializes a Gpu command to allow conditional execution
+  /// of a Gpu command stream
+  void InitializeConditionalTemplate();
+
+  /// @brief Initializes a Gpu command to let command processor
+  /// wait for some update before letting other commands to be
+  /// processed
+  void InitializeWaitRegMemTemplate();
+
+  /// @brief Initializes the template for Barrier command.
+  /// Applications can use Barrier command to ensure their
+  /// command is executed only after all other commands have
+  /// completed their execution.
+  void InitializeBarrierTemplate();
+
+  void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value);
+
+  /// @brief Initializes Acquire Memory command template. Users
+  /// can submit this command to invalidate Gpu caches - L1 and
+  /// or L2.
+  void InitializeAcquireMemTemplate();
+
+  /// @brief Initializes an instance of Write Data command
+  /// for use by an application
+  void InitializeWriteDataTemplate();
+  void InitializeWriteData64Template();
+
+  /// @brief Instance of Gpu command to reference dispatch commands
+  LaunchTemplate launch_template_;
+
+  /// @brief Instance of Gpu command to use in performing atomic operations
+  AtomicTemplate atomic_template_;
+
+  /// @brief Instance of Gpu command to use in conditional execution
+  /// of a command stream
+  ConditionalExecuteTemplate conditional_template_;
+
+  /// @brief Instances of Pm4 command WRITE_DATA, for 32-bit and 64-bit values
+  WriteDataTemplate write_data_template_;
+  WriteData64Template write_data64_template_;
+
+  /// @brief Instance of Pm4 command EVENT_WRITE
+  BarrierTemplate pending_dispatch_template_;
+
+  /// @brief Instance of Pm4 command ACQUIRE_MEM
+  AcquireMemTemplate invalidate_cache_template_;
+
+  /// @brief Instance of Pm4 command WAIT_REG_MEM
+  WaitRegMemTemplate wait_reg_mem_template_;
+
+  /// @brief ATC support.
+  bool atc_support_;
+
+  /// @brief PCIe atomic support.
+  bool pcie_atomic_support_;
+};
+
+}  // gfx8
+
+}  // pm4_profile
+
+#endif  //  _GFX8_CMDWRITER_H_
@@ -0,0 +1,90 @@
+#ifndef _GFX9_CMDS_H_
+#define _GFX9_CMDS_H_
+
+#include "gfxip/gfx9/gfx9_utils.h"
+#include "gfxip/gfx9/gfx9_enum.h"
+#include "gfxip/gfx9/gfx9_mask.h"
+#include "gfxip/gfx9/gfx9_offset.h"
+#include "gfxip/gfx9/gfx9_typedef.h"
+#include "gfxip/gfx9/gfx9_registers.h"
+#include "gfxip/gfx9/gfx9_pm4_it_opcodes.h"
+#include "gfxip/gfx9/f32_mec_pm4_packets_vg10.h"
+#include "gfxip/gfx9/f32_pfp_pm4_packets_vg10.h"
+
+namespace pm4_profile {
+
+namespace gfx9 {
+
+/// @brief Initializer for commands that set shader registers
+///
+/// @param pm4 Packet to initialize; its header is sized from sizeof(T)
+/// @param reg_addr Register address, stored relative to PERSISTENT_SPACE_START
+template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
+  pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, sizeof(T) / sizeof(uint32_t));
+  pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START;
+}
+
+/// @brief Initializer for various Gpu command headers
+///
+/// @param pm4 Packet to initialize; its header is sized from sizeof(T)
+/// @param op_code Opcode stored in the packet header
+template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) {
+  pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t));
+}
+
+/// @brief Initializer for commands that set configuration registers
+///
+/// @param pm4 Packet to initialize; its header is sized from sizeof(T)
+/// @param reg_addr Register address, stored relative to CONFIG_SPACE_START
+template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) {
+  pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_CONFIG_REG, sizeof(T) / sizeof(uint32_t));
+  pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - CONFIG_SPACE_START;
+}
+
+/// @brief Structure used to issue a Gpu Barrier command
+struct BarrierTemplate {
+  PM4MEC_EVENT_WRITE event_write;
+};
+
+/// @brief Structure used to configure the flushing of
+/// various caches - instruction, constants, L1 and L2
+struct AcquireMemTemplate {
+  PM4MEC_ACQUIRE_MEM acquire_mem;
+};
+
+/// @brief Structure used to reference another Gpu command
+/// indirectly. Generally used to reference a list of Gpu
+/// commands (dispatch cmds) indirectly
+struct LaunchTemplate {
+  PM4MEC_INDIRECT_BUFFER indirect_buffer;
+};
+
+/// @brief Structure used to determine the end of
+/// a kernel including cache flushes and writing to
+/// a user configurable memory location
+struct EndofKernelNotifyTemplate {
+  PM4MEC_RELEASE_MEM release_mem;
+};
+
+/// @brief Structure used to perform various atomic
+/// operations - add, subtract, increment, etc
+struct AtomicTemplate {
+  PM4MEC_ATOMIC_MEM atomic;
+};
+
+/// @brief PM4 command to write a 32-bit value into a memory
+/// location accessible to Gpu
+struct WriteDataTemplate {
+  PM4MEC_WRITE_DATA write_data;
+  // Payload that follows the WRITE_DATA packet
+  uint32_t write_data_value;
+};
+
+/// @brief PM4 command to write a 64-bit value into a memory
+/// location accessible to Gpu
+struct WriteData64Template {
+  PM4MEC_WRITE_DATA write_data;
+  // Payload that follows the WRITE_DATA packet
+  uint64_t write_data_value;
+};
+
+/// @brief PM4 command to wait for a certain event before proceeding
+/// to process another command on the queue
+struct WaitRegMemTemplate {
+  PM4MEC_WAIT_REG_MEM wait_reg_mem;
+};
+
+}  // gfx9
+
+}  // pm4_profile
+
+#endif  //  _GFX9_CMDS_H_
@@ -0,0 +1,743 @@
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+
+#include "gfx9_cmdwriter.h"
+
+/// @brief Debug helper that dumps a packet to std::clog as a line of 32-bit
+/// hexadecimal words; it does nothing when NDEBUG is defined.
+///
+/// The line is formatted in a private string stream and written to
+/// std::clog in one piece. This keeps the sticky manipulators (left/right,
+/// hex, fill) away from std::clog, so later log output is unaffected, and
+/// it lets the words be right-aligned so they are zero padded on the left.
+///
+/// @param command Packet to dump; read as sizeof(command) / 4 32-bit words
+/// @param name Label printed in front of the words
+///
+/// @return void
+template <class T>
+static void PrintPm4Packet(const T& command, const char* name) {
+#if ! defined(NDEBUG)
+  const uint32_t* words = reinterpret_cast<const uint32_t*>(&command);
+  const uint32_t size = sizeof(command) / sizeof(uint32_t);
+
+  std::ostringstream label;
+  label << "'" << name << "' size(" << std::dec << size << ")";
+
+  std::ostringstream line;
+  // Label column: left-aligned, padded with spaces to 40 characters
+  line << std::setw(40) << std::left << label.str() << ":";
+  // Packet words: right-aligned so the zero fill lands in front of the digits
+  line << std::right << std::hex << std::setfill('0');
+  for (uint32_t idx = 0; idx < size; idx++) {
+    line << " " << std::setw(8) << words[idx];
+  }
+
+  std::clog << line.str() << std::endl;
+#else
+  // Nothing is printed in release builds
+  (void)command;
+  (void)name;
+#endif
+}
+
+#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \
+  PrintPm4Packet(command, __FUNCTION__); \
+  AppendCommand(cmdbuf, command);
+
+namespace pm4_profile {
+namespace gfx9 {
+
+/// @brief Hands the bytes of a packet over to the command buffer
+///
+/// @param cmdbuf Command buffer receiving the packet
+/// @param command Packet to append; sizeof(T) bytes are passed on
+template <class T> void Gfx9CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) {
+  const T* packet = &command;
+  cmdbuf->AppendCommand(packet, sizeof(T));
+}
+
+/// @brief Zeroes the launch template and writes its INDIRECT_BUFFER header
+void Gfx9CmdWriter::InitializeLaunchTemplate() {
+  memset(&launch_template_, 0, sizeof(launch_template_));
+
+  PM4MEC_INDIRECT_BUFFER* indirect_buffer = &launch_template_.indirect_buffer;
+  GenerateCmdHeader(indirect_buffer, IT_INDIRECT_BUFFER);
+}
+
+/// @brief Zeroes the atomic template, writes its ATOMIC_MEM header and
+/// selects the stream cache policy
+void Gfx9CmdWriter::InitializeAtomicTemplate() {
+  PM4MEC_ATOMIC_MEM* atomicCmd = &atomic_template_.atomic;
+
+  // The cleared size is that of the whole template
+  memset(atomicCmd, 0, sizeof(atomic_template_));
+  GenerateCmdHeader(atomicCmd, IT_ATOMIC_MEM);
+
+  // Specify the cache policy
+  atomicCmd->bitfields2.cache_policy = cache_policy__mec_atomic_mem__stream;
+}
+
+/// @brief Zeroes the barrier template and sets it up as an EVENT_WRITE
+/// packet carrying the CS_PARTIAL_FLUSH event
+void Gfx9CmdWriter::InitializeBarrierTemplate() {
+  memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_));
+
+  PM4MEC_EVENT_WRITE* event_write = &pending_dispatch_template_.event_write;
+  GenerateCmdHeader(event_write, IT_EVENT_WRITE);
+
+  const MEC_EVENT_WRITE_event_index_enum partial_flush_index =
+      event_index__mec_event_write__cs_partial_flush;
+  event_write->bitfields2.event_type = CS_PARTIAL_FLUSH;
+  event_write->bitfields2.event_index = partial_flush_index;
+}
+
+/// @brief Zeroes the ACQUIRE_MEM template and presets its header, the
+/// memory range it covers and its poll interval
+void Gfx9CmdWriter::InitializeAcquireMemTemplate() {
+  PM4MEC_ACQUIRE_MEM* acquire_mem = &invalidate_cache_template_.acquire_mem;
+
+  memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_));
+  GenerateCmdHeader(acquire_mem, IT_ACQUIRE_MEM);
+
+  // Start of the range to invalidate. The address must be 256 byte aligned.
+  acquire_mem->coher_base_lo = 0x00;
+  acquire_mem->bitfields6.coher_base_hi = 0x00;
+
+  // Size of the range to invalidate, given in 256 byte chunks:
+  // a coher_size of 0xFFFFFFFF stands for 0xFFFFFFFF00 (40 bits) of
+  // memory, and coher_size_hi supplies the bits above that.
+  acquire_mem->coher_size = 0xFFFFFFFF;
+  acquire_mem->bitfields4.coher_size_hi = 0xFFFFFF;
+
+  // Poll interval used while determining if the operation is complete
+  acquire_mem->bitfields7.poll_interval = 0x04;
+}
+
+/// @brief Zeroes the WAIT_REG_MEM template and presets its header,
+/// operation and poll interval
+void Gfx9CmdWriter::InitializeWaitRegMemTemplate() {
+  PM4MEC_WAIT_REG_MEM* packet = &wait_reg_mem_template_.wait_reg_mem;
+
+  memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_));
+  GenerateCmdHeader(packet, IT_WAIT_REG_MEM);
+
+  packet->bitfields2.operation = operation__mec_wait_reg_mem__wait_reg_mem;
+  packet->bitfields7.poll_interval = 0x04;
+}
+
+/// @brief Zeroes a WRITE_DATA packet and presets its header and control bits
+///
+/// @param write_data Packet to initialize
+/// @param bit32 true when the payload is one 32-bit DWord, false when it
+/// is two 32-bit DWords
+void Gfx9CmdWriter::InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32) {
+  memset(write_data, 0, sizeof(PM4MEC_WRITE_DATA));
+
+  // The size in the header covers the packet itself plus its payload
+  const uint32_t payload_dwords = bit32 ? 1 : 2;
+  const uint32_t cmd_size = payload_dwords + (sizeof(PM4MEC_WRITE_DATA) / sizeof(uint32_t));
+  write_data->ordinal1 = PM4_TYPE3_HDR(IT_WRITE_DATA, cmd_size);
+
+  // The write destination is memory
+  write_data->bitfields2.dst_sel = dst_sel__mec_write_data__memory;
+
+  // Increment the address if writing more than one DWord
+  write_data->bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;
+
+  // Confirm the write operation and use the stream cache policy
+  write_data->bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
+  write_data->bitfields2.cache_policy = cache_policy__mec_write_data__stream;
+}
+
+/// @brief Initializes the template used for 32-bit WRITE_DATA commands
+void Gfx9CmdWriter::InitializeWriteDataTemplate() {
+  const bool kPayloadIs32Bit = true;
+  InitializeWriteDataTemplate(&write_data_template_.write_data, kPayloadIs32Bit);
+}
+
+/// @brief Initializes the template used for 64-bit WRITE_DATA commands
+void Gfx9CmdWriter::InitializeWriteData64Template() {
+  const bool kPayloadIs32Bit = false;
+  InitializeWriteDataTemplate(&write_data64_template_.write_data, kPayloadIs32Bit);
+}
+
+/// @brief Placeholder for the conditional execution template setup.
+/// The whole body is commented out, so calling this method currently
+/// has no effect; the disabled code is kept for reference.
+void Gfx9CmdWriter::InitializeConditionalTemplate() {
+  /*
+  memset(&conditional_template_.conditional, 0, sizeof(conditional_template_));
+  GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC);
+
+  if (atc_support_) {
+    const uint32_t kAtcShift = 24;
+    conditional_template_.conditional.ordinal4 |= 1 << kAtcShift;
+  }
+  */
+}
+
+/// @brief Zeroes the end-of-kernel notify template and sets it up as a
+/// RELEASE_MEM packet for the bottom of pipe event
+void Gfx9CmdWriter::InitializeEndOfKernelNotifyTemplate() {
+  PM4MEC_RELEASE_MEM* rel_mem = &notify_template_.release_mem;
+
+  memset(&notify_template_, 0, sizeof(notify_template_));
+  GenerateCmdHeader(rel_mem, IT_RELEASE_MEM);
+
+  // Event: bottom of pipe, signalled at end of pipe, stream cache policy
+  rel_mem->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
+  rel_mem->bitfields2.event_type = BOTTOM_OF_PIPE_TS;
+  rel_mem->bitfields2.cache_policy = cache_policy__mec_release_mem__stream;
+
+  // Attributes of source and destination of data
+  rel_mem->bitfields3.dst_sel = dst_sel__mec_release_mem__memory_controller;
+  rel_mem->bitfields3.data_sel = data_sel__mec_release_mem__none;
+  rel_mem->bitfields3.int_sel = int_sel__mec_release_mem__none;
+}
+
+/// @brief Stores the capability flags and prepares every command template
+///
+/// @param atc_support Stored in atc_support_
+/// @param pcie_atomic_support Stored in pcie_atomic_support_
+Gfx9CmdWriter::Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support) {
+  // Capability flags come first, ahead of the template setup below
+  atc_support_ = atc_support;
+  pcie_atomic_support_ = pcie_atomic_support;
+
+  // Command templates, set up once and copied by the Build* methods
+  InitializeLaunchTemplate();
+  InitializeAtomicTemplate();
+  InitializeBarrierTemplate();
+  InitializeAcquireMemTemplate();
+  InitializeWaitRegMemTemplate();
+  InitializeWriteDataTemplate();
+  InitializeWriteData64Template();
+  InitializeConditionalTemplate();
+  InitializeEndOfKernelNotifyTemplate();
+}
+
+/// @brief Builds an INDIRECT_BUFFER packet that references a command stream
+/// and appends it to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param cmd_addr Address of the referenced commands; asserted to be
+/// 4 byte aligned
+/// @param cmd_size Size of the referenced commands in bytes; stored in the
+/// packet as a count of 32-bit words
+void Gfx9CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
+                                           std::size_t cmd_size) {
+  // Verify the address is 4-byte aligned
+  uint64_t addr = uintptr_t(cmd_addr);
+  assert(!(addr & 0x3) && "IndirectBuffer address must be 4 byte aligned");
+
+  LaunchTemplate launch = launch_template_;
+  PM4MEC_INDIRECT_BUFFER* ib = &launch.indirect_buffer;
+
+  // Address of the indirect buffer; the low part is stored shifted right by 2
+  ib->ib_base_hi = PtrHigh32(cmd_addr);
+  ib->bitfields2.ib_base_lo = (PtrLow32(cmd_addr) >> 2);
+
+  // Size of the indirect buffer and the cache policy to set
+  // upon executing its commands
+  ib->bitfields4.ib_size = cmd_size / sizeof(uint32_t);
+  ib->bitfields4.cache_policy = cache_policy__mec_indirect_buffer__stream;
+  ib->bitfields4.valid = 1;
+  ib->bitfields4.priv = 0;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, launch);
+}
+
+/// @brief Builds a 32-bit ATOMIC_MEM packet and appends it to the command
+/// buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param atomic_op Atomic operation to encode
+/// @param addr Destination address; asserted to be 8 byte aligned
+/// @param value Source operand. Increment and decrement expect 1 (checked
+/// by an assert) and always encode 1, so a release build cannot silently
+/// add or subtract some other amount
+/// @param compare Compare operand; used only by the compare-and-swap variants
+void Gfx9CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
+                                      uint32_t value, uint32_t compare) {
+  AtomicTemplate atomicTemplate = atomic_template_;
+  PM4MEC_ATOMIC_MEM* atomicCmd = &atomicTemplate.atomic;
+
+  // make sure the destination address is aligned
+  uint32_t address_low = PtrLow32((void*)addr);
+  uint32_t address_high = PtrHigh32((void*)addr);
+  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
+  atomicCmd->addr_lo = address_low;
+  atomicCmd->addr_hi = address_high;
+
+  switch (atomic_op) {
+    case CommandWriter::kAtomicTypeIncrement:
+      assert(!(value != 0x01) && "Atomic Increment value should be 1");
+      atomicCmd->src_data_lo = 1;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_32;
+      break;
+    case CommandWriter::kAtomicAdd:
+      atomicCmd->src_data_lo = value;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_32;
+      break;
+    case CommandWriter::kAtomicTypeDecrement:
+      assert(!(value != 0x01) && "Atomic Decrement value should be 1");
+      atomicCmd->src_data_lo = 1;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_32;
+      break;
+    case CommandWriter::kAtomicSubtract:
+      atomicCmd->src_data_lo = value;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_32;
+      break;
+    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
+      atomicCmd->bitfields9.loop_interval = 128;
+      atomicCmd->bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied;
+      // Intentional fall through: the blocking variant shares the operands
+      // and opcode of the plain compare-and-swap
+    case CommandWriter::kAtomicTypeCompareAndSwap:
+      atomicCmd->src_data_lo = value;
+      atomicCmd->cmp_data_lo = compare;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_32;
+      break;
+    case CommandWriter::kAtomicSwap:
+      atomicCmd->src_data_lo = value;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_32;
+      break;
+    default:
+      assert((false) && "Atomic operation id is invalid");
+  }
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, atomicTemplate);
+}
+
+/// @brief Builds a 64-bit ATOMIC_MEM packet and appends it to the command
+/// buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param atomic_op Atomic operation to encode
+/// @param addr Destination address; asserted to be 8 byte aligned
+/// @param value Source operand. Increment and decrement expect 1 (checked
+/// by an assert) and always encode 1, so a release build cannot silently
+/// add or subtract some other amount
+/// @param compare Compare operand; used only by the compare-and-swap variants
+void Gfx9CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op,
+                                        volatile uint64_t* addr, uint64_t value, uint64_t compare) {
+  AtomicTemplate atomicTemplate = atomic_template_;
+  PM4MEC_ATOMIC_MEM* atomicCmd = &atomicTemplate.atomic;
+
+  // make sure the destination address is aligned
+  uint32_t address_low = PtrLow32((void*)addr);
+  uint32_t address_high = PtrHigh32((void*)addr);
+  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
+  atomicCmd->addr_lo = address_low;
+  atomicCmd->addr_hi = address_high;
+
+  switch (atomic_op) {
+    case CommandWriter::kAtomicTypeIncrement:
+      assert(!(value != 0x01) && "Atomic Increment value should be 1");
+      atomicCmd->src_data_lo = 1;
+      atomicCmd->src_data_hi = 0;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_64;
+      break;
+    case CommandWriter::kAtomicAdd:
+      atomicCmd->src_data_lo = Low32(value);
+      atomicCmd->src_data_hi = High32(value);
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_64;
+      break;
+    case CommandWriter::kAtomicTypeDecrement:
+      assert(!(value != 0x01) && "Atomic Decrement value should be 1");
+      atomicCmd->src_data_lo = 1;
+      atomicCmd->src_data_hi = 0;
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_64;
+      break;
+    case CommandWriter::kAtomicSubtract:
+      atomicCmd->src_data_lo = Low32(value);
+      atomicCmd->src_data_hi = High32(value);
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_64;
+      break;
+    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
+      atomicCmd->bitfields9.loop_interval = 128;
+      atomicCmd->bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied;
+      // Intentional fall through: the blocking variant shares the operands
+      // and opcode of the plain compare-and-swap
+    case CommandWriter::kAtomicTypeCompareAndSwap:
+      atomicCmd->src_data_lo = Low32(value);
+      atomicCmd->src_data_hi = High32(value);
+      atomicCmd->cmp_data_lo = Low32(compare);
+      atomicCmd->cmp_data_hi = High32(compare);
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_64;
+      break;
+    case CommandWriter::kAtomicSwap:
+      atomicCmd->src_data_lo = Low32(value);
+      atomicCmd->src_data_hi = High32(value);
+      atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_64;
+      break;
+    default:
+      assert((false) && "Atomic operation id is invalid");
+  }
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, atomicTemplate);
+}
+
+/// @brief Appends the pre-built barrier packet to the command buffer
+///
+/// @param cmdBuf Command buffer the packet is appended to
+void Gfx9CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) {
+  // The stored template needs no per-call changes, so it is appended as is
+  const BarrierTemplate& barrier = pending_dispatch_template_;
+  APPEND_COMMAND_WRAPPER(cmdBuf, barrier);
+}
+
+/// @brief Builds a WRITE_DATA packet carrying a 32-bit value and appends it
+/// to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param write_addr Address to write to; asserted to be 4 byte aligned
+/// @param write_value 32-bit value to write
+void Gfx9CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
+                                          uint32_t write_value) {
+  // The destination must be DWord aligned
+  uint64_t addr = uintptr_t(write_addr);
+  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");
+
+  // Fill a private copy of the initialized packet and its payload
+  WriteDataTemplate packet = write_data_template_;
+  PM4MEC_WRITE_DATA* write_data = &packet.write_data;
+
+  // Destination address; the low part is stored shifted right by 2
+  write_data->bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
+  write_data->dst_mem_addr_hi = PtrHigh32(write_addr);
+
+  // Value to write
+  packet.write_data_value = write_value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+/// @brief Builds a WRITE_DATA packet carrying a 64-bit value and appends it
+/// to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param write_addr Address to write to; asserted to be 4 byte aligned
+/// @param write_value 64-bit value to write
+void Gfx9CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
+                                            uint64_t write_value) {
+  // The destination must be DWord aligned
+  uint64_t addr = uintptr_t(write_addr);
+  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");
+
+  // Fill a private copy of the initialized packet and its payload
+  WriteData64Template packet = write_data64_template_;
+  PM4MEC_WRITE_DATA* write_data = &packet.write_data;
+
+  // Value to write
+  packet.write_data_value = write_value;
+
+  // Destination address; the low part is stored shifted right by 2
+  write_data->dst_mem_addr_hi = PtrHigh32(write_addr);
+  write_data->bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+/// @brief Builds a WAIT_REG_MEM packet and appends it to the command buffer.
+///
+/// @param cmdbuf Command buffer the packet is appended to
+/// @param mem_space true when wait_addr is a memory address, false when it
+/// is a register
+/// @param wait_addr Address or register to poll; a memory address is
+/// asserted to be 4 byte aligned
+/// @param func_eq true to compare for equal to the reference value, false
+/// for not equal
+/// @param mask_val Mask applied on the value at the address/register
+/// @param wait_val Reference value used by the comparison
+void Gfx9CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
+                                           bool func_eq, uint32_t mask_val, uint32_t wait_val) {
+  WaitRegMemTemplate wait_cmd = wait_reg_mem_template_;
+  PM4MEC_WAIT_REG_MEM* packet = &wait_cmd.wait_reg_mem;
+
+  // Comparison requested by the user, with its mask and reference value
+  if (func_eq) {
+    packet->bitfields2.function = function__mec_wait_reg_mem__equal_to_the_reference_value;
+  } else {
+    packet->bitfields2.function = function__mec_wait_reg_mem__not_equal_reference_value;
+  }
+  packet->mask = mask_val;
+  packet->reference = wait_val;
+
+  // Low part of the address to poll; written for both spaces
+  packet->bitfields3a.mem_poll_addr_lo = (Low32(wait_addr) >> 2);
+
+  if (mem_space) {
+    // A memory address must be DWord (4 byte) aligned and also
+    // provides the upper 32 bits
+    assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned");
+    packet->bitfields2.mem_space = mem_space__mec_wait_reg_mem__memory_space;
+    packet->mem_poll_addr_hi = High32(wait_addr);
+  } else {
+    packet->bitfields2.mem_space = mem_space__mec_wait_reg_mem__register_space;
+  }
+
+  // Append the command to cmd stream
+  APPEND_COMMAND_WRAPPER(cmdbuf, wait_cmd);
+}
+
+void Gfx9CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) {
+  assert(false && "BuildConditionalExecute method is not implemented");  // stub, args unused
+  /* Disabled reference implementation, nothing below is compiled:
+  ConditionalExecuteTemplate conditional = conditional_template_;
+
+  uint32_t address_low = PtrLow32(signal);
+  uint32_t address_high = PtrHigh32(signal);
+  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
+
+  conditional.conditional.boolAddrLo = address_low;
+  conditional.conditional.boolAddrHi = address_high;
+  conditional.conditional.execCount = count;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, conditional);
+  */
+}
+
+// Appends a command that writes value to addr.
+//
+// An atomic swap packet is used when PCIe atomics are supported; otherwise
+// a plain 64-bit WRITE_DATA command is emitted.
+void Gfx9CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) {
+  if (!pcie_atomic_support_) {
+    // No atomics available: fall back to a regular 64-bit write
+    BuildWriteData64Command(cmdbuf, addr, value);
+    return;
+  }
+
+  BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr,
+                      value);
+}
+
+// Appends a RELEASE_MEM based notification that writes a 32-bit value to
+// write_addr with cache actions enabled, optionally requesting an interrupt.
+//
+//   cmdbuf      - command buffer receiving the packet
+//   write_addr  - destination of the write; programmed in 8 byte units
+//   write_value - 32-bit value to write
+//   interrupt   - when true, request an interrupt after the write is confirmed
+void Gfx9CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_value,
+                                      bool interrupt) {
+  // Work on a private copy of the pre-initialized notify template
+  EndofKernelNotifyTemplate notify = notify_template_;
+  PM4MEC_RELEASE_MEM& release = notify.release_mem;
+
+  // Cache operations the CP performs before the write operation commences
+  release.bitfields2.tc_action_ena = true;
+  release.bitfields2.tc_wb_action_ena = true;
+
+  // Payload: the user specified 32-bit value
+  release.data_lo = write_value;
+  release.bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
+
+  // Destination address, split into the high dword and the shifted low part.
+  // NOTE(review): the low part drops the bottom three address bits and nothing
+  // here checks alignment - presumably callers pass 8 byte aligned addresses.
+  const uint64_t dst = uint64_t(write_addr);
+  release.address_hi = High32(dst);
+  release.bitfields4b.address_lo_64b = (Low32(dst >> 3));
+
+  // Raise an interrupt only if the user has requested it
+  if (interrupt) {
+    release.bitfields3.int_sel = int_sel__mec_release_mem__send_interrupt_after_write_confirm;
+  }
+
+  // Serialize the command as stream of Dwords
+  APPEND_COMMAND_WRAPPER(cmdbuf, notify);
+}
+
+// Appends an ACQUIRE_MEM command whose coherence control field has the TC
+// action and TC write-back action bits set.
+void Gfx9CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) {
+  // TODO: temporarily remove the check because some OpenCL tests
+  // (test_buffers, test_relationals) are failing.
+  //    if (using_cc_memory_policy_)
+  //        return;
+  AcquireMemTemplate fence = invalidate_cache_template_;
+
+  // wbINVL2 by default writes-back and invalidates both L1 and L2
+  fence.acquire_mem.bitfields2.coher_cntl =
+      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, fence);
+}
+
+/*
+// PM4 packet for profilers
+#define PM4_PACKET3 (0xC0000000)
+#define PM4_PACKET3_CMD_SHIFT 8
+#define PM4_PACKET3_COUNT_SHIFT 16
+
+#define PACKET3(cmd, count)                                 \
+  (PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | \
+   ((cmd) << PM4_PACKET3_CMD_SHIFT))
+*/
+
+// PM4 register write packet: item[0] header, item[1] register offset, item[2] value
+typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket;
+
+// Appends an EVENT_WRITE command carrying one of the perf counter events.
+//
+//   cmdbuf - command buffer receiving the packet
+//   event  - kPerfCntrsStart, kPerfCntrsStop or kPerfCntrsSample; any other
+//            value trips an assert
+void Gfx9CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) {
+  // Translate the caller's event id into the hardware event type
+  VGT_EVENT_TYPE hw_event = Reserved_0x00;
+  switch (event) {
+    case kPerfCntrsStart:
+      hw_event = PERFCOUNTER_START;
+      break;
+    case kPerfCntrsStop:
+      hw_event = PERFCOUNTER_STOP;
+      break;
+    case kPerfCntrsSample:
+      hw_event = PERFCOUNTER_SAMPLE;
+      break;
+    default:
+      assert(false && "Illegal VGT Event Id");
+  }
+
+  // Zero the whole packet (this covers ordinal2 as well), then fill in the
+  // header, the event type and the generic "other" event index
+  PM4MEC_EVENT_WRITE packet;
+  memset(&packet, 0, sizeof(packet));
+  packet.ordinal1 = PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(packet) / sizeof(uint32_t)));
+  packet.bitfields2.event_index = event_index__mec_event_write__other;
+  packet.bitfields2.event_type = hw_event;
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends a SET_UCONFIG_REG packet that writes value to the register addr.
+// The register is encoded as an offset from UCONFIG_SPACE_START.
+void Gfx9CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  // Header dword: opcode plus packet size
+  const uint32_t header =
+      PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));
+
+  WriteRegPacket reg_write;
+  reg_write.item[0] = header;
+  reg_write.item[1] = (addr - UCONFIG_SPACE_START);
+  reg_write.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
+}
+
+// Appends a SET_UCONFIG_REG packet that programs the register addr with
+// value. The register is encoded as an offset from UCONFIG_SPACE_START.
+void Gfx9CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  // Header dword: opcode plus packet size
+  const uint32_t header =
+      PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));
+
+  WriteRegPacket reg_write;
+  reg_write.item[0] = header;
+  reg_write.item[1] = (addr - UCONFIG_SPACE_START);
+  reg_write.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
+}
+
+// Appends a SET_SH_REG packet that programs the register addr with value.
+// The register is encoded as an offset from PERSISTENT_SPACE_START.
+void Gfx9CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  // Header dword: opcode plus packet size
+  const uint32_t header =
+      PM4_TYPE3_HDR(IT_SET_SH_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));
+
+  WriteRegPacket reg_write;
+  reg_write.item[0] = header;
+  reg_write.item[1] = (addr - PERSISTENT_SPACE_START);
+  reg_write.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
+}
+
+// Appends a COPY_DATA command that copies a register / perf counter value
+// into memory.
+//
+//   cmdbuf      - command buffer receiving the packet
+//   src_sel     - 0 selects a memory mapped register, 4 selects perf
+//                 counters; any other value trips an assert
+//   src_addr_lo - source register offset
+//   src_addr_hi - not used by this implementation
+//   dst_addr    - destination memory address
+//   size        - 0 copies 32 bits of data, any other value copies 64 bits
+//   wait        - value programmed into the write confirm field
+void Gfx9CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
+                                        uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
+                                        bool wait) {
+  // Map the numeric source selector onto the packet enum
+  MEC_COPY_DATA_src_sel_enum source = src_sel__mec_copy_data__memory;
+  if (src_sel == 0) {
+    source = src_sel__mec_copy_data__mem_mapped_register;
+  } else if (src_sel == 4) {
+    source = src_sel__mec_copy_data__perfcounters;
+  } else {
+    assert(false && "CopyData Illegal value for source of data");
+  }
+
+  // Zeroed packet with its header
+  PM4MEC_COPY_DATA packet;
+  memset(&packet, 0, sizeof(packet));
+  packet.ordinal1 = PM4_TYPE3_HDR(IT_COPY_DATA, (sizeof(packet) / sizeof(uint32_t)));
+
+  // Source and destination selectors with their cache policies
+  packet.bitfields2.src_sel = source;
+  packet.bitfields2.src_cache_policy = src_cache_policy__mec_copy_data__stream;
+  packet.bitfields2.dst_sel = dst_sel__mec_copy_data__memory;
+  packet.bitfields2.dst_cache_policy = dst_cache_policy__mec_copy_data__stream;
+  packet.bitfields2.wr_confirm = (MEC_COPY_DATA_wr_confirm_enum)wait;
+
+  // Source register offset and upper half of the destination address
+  packet.bitfields3a.src_reg_offset = src_addr_lo;
+  packet.dst_addr_hi = PtrHigh32(dst_addr);
+
+  // Data width decides both the count selector and how the low part of the
+  // destination address is encoded (4 byte vs 8 byte units)
+  if (size == 0) {
+    packet.bitfields2.count_sel = count_sel__mec_copy_data__32_bits_of_data;
+    packet.bitfields5b.dst_32b_addr_lo = (PtrLow32(dst_addr) >> 2);
+  } else {
+    packet.bitfields2.count_sel = count_sel__mec_copy_data__64_bits_of_data;
+    packet.bitfields5c.dst_64b_addr_lo = (PtrLow32(dst_addr) >> 3);
+  }
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+}
+
+// Appends an ACQUIRE_MEM command with the TC, TCL1, TC write-back, SH
+// ICACHE and SH KCACHE action bits set in its coherence control field.
+void Gfx9CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) {
+  // Coherence Control Register value: every cache action handled here
+  const uint32_t coher_cntl =
+      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TCL1_ACTION_ENA_MASK |
+      CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK | CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK |
+      CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK;
+
+  // Start from the AcquireMem part of the invalidate template
+  PM4MEC_ACQUIRE_MEM cache_flush = invalidate_cache_template_.acquire_mem;
+  cache_flush.bitfields2.coher_cntl = coher_cntl;
+
+  // Copy AcquireMem command buffer stream
+  APPEND_COMMAND_WRAPPER(cmdbuf, cache_flush);
+}
+
+void Gfx9CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) {
+  BuildBarrierCommand(cmdbuf);    // barrier command goes in first
+  BuildCacheFlushPacket(cmdbuf);  // followed by the ACQUIRE_MEM cache flush
+}
+
+// Appends an EVENT_WRITE command for the requested perf counter or thread
+// trace event and follows it with a cache flush packet.
+//
+//   cmdbuf   - command buffer receiving the packets
+//   vgtEvent - one of the kPerfCntrs* / kThrdTrace* ids handled below; any
+//              other value trips an assert
+void Gfx9CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) {
+  // Translate the caller's id into the hardware event type
+  VGT_EVENT_TYPE hw_event = Reserved_0x00;
+  switch (vgtEvent) {
+    case kPerfCntrsStart:
+      hw_event = PERFCOUNTER_START;
+      break;
+    case kPerfCntrsStop:
+      hw_event = PERFCOUNTER_STOP;
+      break;
+    case kPerfCntrsSample:
+      hw_event = PERFCOUNTER_SAMPLE;
+      break;
+    case kThrdTraceStart:
+      hw_event = THREAD_TRACE_START;
+      break;
+    case kThrdTraceStop:
+      hw_event = THREAD_TRACE_STOP;
+      break;
+    case kThrdTraceFlush:
+      hw_event = THREAD_TRACE_FLUSH;
+      break;
+    case kThrdTraceFinish:
+      hw_event = THREAD_TRACE_FINISH;
+      break;
+    default:
+      assert(false && "Illegal VGT Event Id");
+  }
+
+  // Zero the whole packet (this covers ordinal2 as well), then fill in the
+  // header, the event type and the generic "other" event index
+  PM4MEC_EVENT_WRITE packet;
+  memset(&packet, 0, sizeof(packet));
+  packet.ordinal1 = PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(packet) / sizeof(uint32_t)));
+  packet.bitfields2.event_index = event_index__mec_event_write__other;
+  packet.bitfields2.event_type = hw_event;
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+
+  // Check If I should be issuing a cache flush operation as well
+  // test and remove it
+  BuildCacheFlushPacket(cmdbuf);
+}
+
+void Gfx9CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
+  /* Currently a no-op: the SET_CONFIG_REG implementation below is disabled.
+  WriteRegPacket packet;
+  packet.item[0] = (PM4_TYPE3_HDR(
+      IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0));
+  packet.item[1] = addr - CONFIG_SPACE_START;
+  packet.item[2] = value;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
+
+  return;
+  */
+}
+
+// Appends an EVENT_WRITE query command that carries a destination address.
+//
+// No event id is supported yet: the switch below only has a default branch,
+// so every call trips the "Illegal VGT Event Id" assert in debug builds.
+//
+//   cmdbuf - command buffer receiving the packet
+//   event  - event id to translate (none accepted at present)
+//   addr   - destination address; asserted to be 8 byte aligned
+void Gfx9CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) {
+  PM4MEC_EVENT_WRITE_QUERY cp_event_initiator;
+  memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE_QUERY));
+  cp_event_initiator.ordinal1 =
+      PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE_QUERY) / sizeof(uint32_t)));
+  cp_event_initiator.ordinal2 = 0;
+
+  // Update switch statements you want to support
+  VGT_EVENT_TYPE eventType = Reserved_0x00;
+  switch (event) {
+    default:
+      assert(false && "Illegal VGT Event Id");
+  }
+
+  // The event index is looked up from the event type
+  MEC_EVENT_WRITE_event_index_enum index;
+  cp_event_initiator.bitfields2.event_type = eventType;
+  index = (MEC_EVENT_WRITE_event_index_enum)EventTypeToIndexTable[eventType];
+  cp_event_initiator.bitfields2.event_index = index;
+
+  // set the address; the low part is stored in 8 byte units, so the three
+  // least significant bits must be clear
+  uint32_t addrLo = PtrLow32(addr);
+  uint32_t addrHi = PtrHigh32(addr);
+  assert(!(addrLo & 0x7) && "EventWriteQuery address must be 8 byte aligned");
+
+  cp_event_initiator.address_hi = addrHi;
+  cp_event_initiator.bitfields3.address_lo = (addrLo >> 3);
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
+}
+
+size_t Gfx9CmdWriter::SizeOfAtomicPacket() const {
+  return sizeof(AtomicTemplate) / sizeof(uint32_t);  // size in 32-bit words, not bytes
+}
+
+// Appends an ACQUIRE_MEM command that flushes the caches selected in options.
+//
+//   cmdbuf    - command buffer receiving the packet
+//   options   - its l1, l2, icache and kcache flags choose which cache
+//               action bits get set; must not be NULL
+//   writeAddr - unused; kept so the interface stays the same as on other
+//               chip generations
+//   writeVal  - unused, see writeAddr
+void Gfx9CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options,
+                                       uint32_t* writeAddr, uint32_t writeVal) {
+  // The write back address / value pair is not consumed here, so any
+  // address (NULL included) is acceptable.
+  (void)writeAddr;
+  (void)writeVal;
+  assert(options != NULL && "FlushCacheOptions must be provided");
+
+  PM4MEC_ACQUIRE_MEM cache_flush = invalidate_cache_template_.acquire_mem;
+
+  // Program Coherence Control Register. Initialize L2 Cache flush
+  // for Non-Coherent memory blocks
+  uint32_t coher_cntl = 0;
+  coher_cntl |= (options->l1) ? CP_COHER_CNTL__TCL1_ACTION_ENA_MASK : 0;
+  coher_cntl |= (options->l2)
+      ? (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK)
+      : 0;
+  coher_cntl |= (options->icache) ? CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK : 0;
+  coher_cntl |= (options->kcache) ? CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK : 0;
+  cache_flush.bitfields2.coher_cntl = coher_cntl;
+
+  // Append the built command into output Command Buffer
+  APPEND_COMMAND_WRAPPER(cmdbuf, cache_flush);
+}
+
+// Appends a DMA_DATA command copying copySize bytes from srcAddr to dstAddr.
+//
+//   cmdbuf         - command buffer receiving the packet
+//   srcAddr        - source buffer address
+//   dstAddr        - destination buffer address
+//   copySize       - number of bytes to copy; asserted to be below 0x1FFFFF
+//   waitForConfirm - value programmed into the packet's raw_wait bit
+void Gfx9CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr,
+                                       uint32_t copySize, bool waitForConfirm) {
+  // NOTE(review): the previous comment here cited a 26 bit (64 MB - 1) limit
+  // while the check only admits sizes below 0x1FFFFF - confirm which bound
+  // matches the width of the byte_count field.
+  assert(copySize < 0x1FFFFF);
+
+  // Zeroed packet with its header
+  PM4MEC_DMA_DATA dma;
+  memset(&dma, 0, sizeof(dma));
+  dma.header.u32All = PM4_TYPE3_HDR(IT_DMA_DATA, (sizeof(dma) / sizeof(uint32_t)));
+
+  // Source buffer: location selector, cache policy and address
+  dma.bitfields2.src_sel = src_sel__mec_dma_data__src_addr_using_sas;
+  dma.bitfields2.src_cache_policy = src_cache_policy__mec_dma_data__stream;
+  dma.src_addr_lo_or_data = PtrLow32(srcAddr);
+  dma.src_addr_hi = PtrHigh32(srcAddr);
+
+  // Destination buffer: location selector, cache policy and address
+  dma.bitfields2.dst_sel = dst_sel__mec_dma_data__dst_addr_using_das;
+  dma.bitfields2.dst_cache_policy = dst_cache_policy__mec_dma_data__stream;
+  dma.dst_addr_lo = PtrLow32(dstAddr);
+  dma.dst_addr_hi = PtrHigh32(dstAddr);
+
+  // Number of bytes to copy
+  dma.bitfields7.byte_count = copySize;
+
+  // Indicate that DMA Cmd should wait if its source
+  // is the destination of a previous DMA Cmd
+  dma.bitfields7.raw_wait = waitForConfirm;
+
+  APPEND_COMMAND_WRAPPER(cmdbuf, dma);
+}
+
+
+}  // gfx9 namespace
+
+}  // pm4_profile
@@ -0,0 +1,199 @@
+#ifndef _GFX9_CMDWRITER_H_
+#define _GFX9_CMDWRITER_H_
+
+#include "cmdwriter.h"
+#include "gfx9_cmds.h"
+
+namespace pm4_profile {
+
+namespace gfx9 {
+
+
+/// @brief class Gfx9CmdWriter implements the virtual class CommandWriter,
+/// building PM4 command packets for GFX9 chipsets
+class Gfx9CmdWriter : public CommandWriter {
+ public:
+  Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support);
+
+  /// @brief Dword specifying NOOP command for GFX9 chipsets. The macro
+  /// populates the NOOP command which is 32-bits wide. The second parameter,
+  /// the COUNT field of NOOP command, specifies the number of Dwords to skip.
+  /// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro
+  /// decrements the second parameter by TWO, an artifact of its definition,
+  /// the value is incremented by TWO to 0x4001 (0x3FFF + 2).
+  ///
+  inline uint32_t GetNoOpCmd() {
+    static const uint32_t nopCmd = PM4_TYPE3_HDR(IT_NOP, 0x4001);
+    return nopCmd;
+  }
+
+  void BuildBarrierCommand(CmdBuf* cmdBuf);
+
+  void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size);
+
+  void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
+                         bool interrupt);
+
+  void BuildBarrierFenceCommands(CmdBuf* cmdbuf);
+
+  void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event);
+
+  void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq,
+                              uint32_t mask_val, uint32_t wait_val);
+
+  void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  /// @brief Build CP command to program a Gpu register
+  ///
+  /// @param cmdbuf Pointer to command buffer to be appended
+  /// @param addr Register to be programmed
+  /// @param value Value to write into register
+  ///
+  /// @return void
+  void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
+                           uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait);
+
+  void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf);
+
+  /// @brief Appends an EVENT_WRITE for the VGT event, followed by a cache flush packet
+  void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent);
+
+  void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
+
+  void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr);
+
+  void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
+                         uint32_t value, uint32_t compare);
+
+  void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
+                           uint64_t value = 0, uint64_t compare = 0);
+
+  size_t SizeOfAtomicPacket() const;
+
+  void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count);
+
+  void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value);
+
+  void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value);
+
+  void BuildCacheFlushPacket(CmdBuf* cmdbuf);
+
+  /// Writes into input buffer Gpu commands to flush its caches. It is
+  /// necessary that the buffer provided for flush commands is large
+  /// enough to accommodate the full set of commands. It should be at
+  /// least 512 bytes.
+  ///
+  /// @param cmdBuf Buffer to write commands to.
+  /// @param options Selects which caches (l1, l2, icache, kcache) to flush.
+  /// @param writeAddr Kept for interface compatibility; the GFX9
+  /// implementation does not write to it.
+  /// @param writeVal Kept for interface compatibility; unused on GFX9.
+  ///
+  /// @return void
+  void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr,
+                          uint32_t writeVal);
+
+  /// Builds Gpu command to copy data from source to destination buffer
+  /// using DMA engine.
+  ///
+  /// @param cmdBuf Buffer updated with Gpu copy command
+  /// @param srcAddr Address of source buffer
+  /// @param dstAddr Address of destination buffer
+  /// @param copySize Size of data to copy in bytes
+  /// @param waitForCompletion Value of raw_wait: wait on a prior DMA writing the source
+  void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize,
+                          bool waitForCompletion);
+
+ protected:
+  /// @brief Append an instance of Gpu command into input command buffer stream.
+  ///
+  /// @param cmdbuf Command buffer appended with another Gpu command
+  ///
+  /// @param cmd Gpu command to be appended into command buffer
+  ///
+  /// @return void
+  template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd);
+
+ private:
+  /// @brief Initializes a Gpu command which can be used to
+  /// reference a Gpu command stream indirectly
+  void InitializeLaunchTemplate();
+
+  /// @brief Initializes a Gpu command which can be used to
+  /// flush Gpu caches and write to a user configurable address
+  /// to indicate an end of kernel
+  void InitializeEndOfKernelNotifyTemplate();
+
+  /// @brief Initializes a Gpu command to perform atomic operations
+  ///
+  void InitializeAtomicTemplate();
+
+  /// @brief Initializes a Gpu command to allow conditional execution
+  /// of a Gpu command stream
+  void InitializeConditionalTemplate();
+
+  /// @brief Initializes a Gpu command to let command processor
+  /// wait for some update before letting other commands to be
+  /// processed
+  void InitializeWaitRegMemTemplate();
+
+  /// @brief Initializes the template for Barrier command.
+  /// Applications can use Barrier command to ensure their
+  /// command is executed only after all other commands have
+  /// completed their execution.
+  void InitializeBarrierTemplate();
+
+  void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value);
+
+  /// @brief Initializes Acquire Memory command template. Users
+  /// can submit this command to invalidate Gpu caches - L1 and
+  /// or L2.
+  void InitializeAcquireMemTemplate();
+
+  /// @brief Initializes an instance of Write Data command
+  /// for use by an application
+  void InitializeWriteDataTemplate();
+  void InitializeWriteData64Template();
+  void InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32);
+
+  /// @brief Builds wait_reg_mem with EQUALS condition
+  void BuildWaitRegMemCommand(CmdBuf* cmdbuf, uint64_t wait_addr, uint32_t wait_value);
+
+  /// @brief Instance of Gpu command to reference dispatch commands
+  LaunchTemplate launch_template_;
+
+  /// @brief Instance of Gpu command to use in determining end of kernel
+  EndofKernelNotifyTemplate notify_template_;
+
+  /// @brief Instance of Gpu command to use in performing atomic operations
+  AtomicTemplate atomic_template_;
+
+  /// @brief Instance of Pm4 command WRITE_DATA
+  WriteDataTemplate write_data_template_;
+  WriteData64Template write_data64_template_;
+
+  /// @brief Instance of Pm4 command EVENT_WRITE
+  BarrierTemplate pending_dispatch_template_;
+
+  /// @brief Instance of Pm4 command ACQUIRE_MEM
+  AcquireMemTemplate invalidate_cache_template_;
+
+  /// @brief Instance of Pm4 command WAIT_REG_MEM
+  WaitRegMemTemplate wait_reg_mem_template_;
+
+  /// @brief ATC support.
+  bool atc_support_;
+
+  /// @brief PCIe atomic support.
+  bool pcie_atomic_support_;
+};
+
+}  // gfx9
+
+}  // pm4_profile
+
+#endif  //  _GFX9_CMDWRITER_H_
@@ -0,0 +1,24 @@
+#
+# Sources that make up the Rocr PerfCntr library
+#
+set ( LIB_SRC
+      var_data.cpp
+      info_set.cpp
+      parameter_set.cpp
+      gpu_counter.cpp
+      gpu_countergroup.cpp
+      vi_blockinfo.cpp
+      vi_pmu.cpp
+      ai_blockinfo.cpp
+      ai_pmu.cpp )
+
+#
+# Include search path(s) for the headers used by the sources above.
+# ROCR_INC_DIR is read from the environment at configure time.
+#
+include_directories ( $ENV{ROCR_INC_DIR}
+                      ${PROJ_DIR}/commandwriter
+                      ${CORE_UTIL_DIR} )
+
+#
+# PerfCntr is built as a static library whose name comes from PMC_LIB
+#
+add_library ( ${PMC_LIB} STATIC ${LIB_SRC} )
@@ -0,0 +1,555 @@
+#include "ai_blockinfo.h"
+#include "gfxip/gfx9/gfx9_offset.h"
+#include "gfxip/gfx9/gfx9_typedef.h"
+
+namespace pm4_profile {
+/**
+ * Table containing CounterGroups which represent AI hardware blocks
+ * as defined by \ref GpuBlockInfo structure
+ *
+ * Every row is a positional initializer, so the meaning of each slot is fixed
+ * by the GpuBlockInfo declaration (not visible in this file). As written here a
+ * row carries, in order: the block name string, the kHsaAiCounterBlockId* value,
+ * AI_MAX_NUM_SHADER_ENGINES, the literal 2, an instance count (an AI_NUM_* macro
+ * or 1), a CntlMethod* selector, a per-block integer (presumably the number of
+ * selectable events — TODO confirm), the AI_COUNTER_NUM_PER_* counter count, and
+ * eight trailing values that are identical for all real blocks.
+ *
+ * The table is terminated by a row whose name is the empty string.
+ */
+GpuBlockInfo AiPmuHwBlocks[] = {
+    // Counter block CB
+    {"AI_CB0", kHsaAiCounterBlockIdCb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_CB1", kHsaAiCounterBlockIdCb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_CB2", kHsaAiCounterBlockIdCb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_CB3", kHsaAiCounterBlockIdCb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Temp commented for Vega10
+    // Counter block CPF
+    /*
+    {"AI_CPF", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
+    AI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0},
+    */
+    // NOTE(review): the row registered under kHsaAiCounterBlockIdCpf below reuses
+    // the CB settings and the name "AI_CB3" — presumably a placeholder that keeps
+    // the CPF slot occupied while the real CPF row is disabled; confirm.
+    {"AI_CB3", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block DB
+    {"AI_DB0", kHsaAiCounterBlockIdDb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_DB1", kHsaAiCounterBlockIdDb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_DB2", kHsaAiCounterBlockIdDb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_DB3", kHsaAiCounterBlockIdDb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block GRBM
+    {"AI_GRBM", kHsaAiCounterBlockIdGrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33,
+     AI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block GRBMSE
+    {"AI_GRBMSE", kHsaAiCounterBlockIdGrbmSe, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14,
+     AI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block PA_SU
+    {"AI_PA_SU", kHsaAiCounterBlockIdPaSu, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152,
+     AI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block PA_SC
+    {"AI_PA_SC", kHsaAiCounterBlockIdPaSc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396,
+     AI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SPI
+    {"AI_SPI", kHsaAiCounterBlockIdSpi, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196,
+     AI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SQ
+    {"AI_SQ", kHsaAiCounterBlockIdSq, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_SQ_GS", kHsaAiCounterBlockIdSqGs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_SQ_VS", kHsaAiCounterBlockIdSqVs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_SQ_PS", kHsaAiCounterBlockIdSqPs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_SQ_HS", kHsaAiCounterBlockIdSqHs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_SQ_CS", kHsaAiCounterBlockIdSqCs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SX
+    {"AI_SX", kHsaAiCounterBlockIdSx, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33,
+     AI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TA
+    {"AI_TA0", kHsaAiCounterBlockIdTa0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA1", kHsaAiCounterBlockIdTa1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA2", kHsaAiCounterBlockIdTa2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA3", kHsaAiCounterBlockIdTa3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA4", kHsaAiCounterBlockIdTa4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA5", kHsaAiCounterBlockIdTa5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA6", kHsaAiCounterBlockIdTa6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA7", kHsaAiCounterBlockIdTa7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA8", kHsaAiCounterBlockIdTa8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA9", kHsaAiCounterBlockIdTa9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA10", kHsaAiCounterBlockIdTa10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA11", kHsaAiCounterBlockIdTa11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA12", kHsaAiCounterBlockIdTa12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA13", kHsaAiCounterBlockIdTa13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA14", kHsaAiCounterBlockIdTa14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TA15", kHsaAiCounterBlockIdTa15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TCA
+    {"AI_TCA0", kHsaAiCounterBlockIdTca0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA,
+     CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCA1", kHsaAiCounterBlockIdTca1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA,
+     CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TCC
+    {"AI_TCC0", kHsaAiCounterBlockIdTcc0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC1", kHsaAiCounterBlockIdTcc1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC2", kHsaAiCounterBlockIdTcc2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC3", kHsaAiCounterBlockIdTcc3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC4", kHsaAiCounterBlockIdTcc4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC5", kHsaAiCounterBlockIdTcc5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC6", kHsaAiCounterBlockIdTcc6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC7", kHsaAiCounterBlockIdTcc7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC8", kHsaAiCounterBlockIdTcc8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC9", kHsaAiCounterBlockIdTcc9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC10", kHsaAiCounterBlockIdTcc10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC11", kHsaAiCounterBlockIdTcc11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC12", kHsaAiCounterBlockIdTcc12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC13", kHsaAiCounterBlockIdTcc13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC14", kHsaAiCounterBlockIdTcc14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCC15", kHsaAiCounterBlockIdTcc15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
+     CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TD
+    {"AI_TD0", kHsaAiCounterBlockIdTd0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD1", kHsaAiCounterBlockIdTd1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD2", kHsaAiCounterBlockIdTd2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD3", kHsaAiCounterBlockIdTd3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD4", kHsaAiCounterBlockIdTd4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD5", kHsaAiCounterBlockIdTd5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD6", kHsaAiCounterBlockIdTd6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD7", kHsaAiCounterBlockIdTd7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD8", kHsaAiCounterBlockIdTd8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD9", kHsaAiCounterBlockIdTd9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD10", kHsaAiCounterBlockIdTd10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD11", kHsaAiCounterBlockIdTd11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD12", kHsaAiCounterBlockIdTd12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD13", kHsaAiCounterBlockIdTd13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD14", kHsaAiCounterBlockIdTd14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TD15", kHsaAiCounterBlockIdTd15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TCP
+    {"AI_TCP0", kHsaAiCounterBlockIdTcp0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP1", kHsaAiCounterBlockIdTcp1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP2", kHsaAiCounterBlockIdTcp2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP3", kHsaAiCounterBlockIdTcp3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP4", kHsaAiCounterBlockIdTcp4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP5", kHsaAiCounterBlockIdTcp5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP6", kHsaAiCounterBlockIdTcp6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP7", kHsaAiCounterBlockIdTcp7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP8", kHsaAiCounterBlockIdTcp8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP9", kHsaAiCounterBlockIdTcp9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP10", kHsaAiCounterBlockIdTcp10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP11", kHsaAiCounterBlockIdTcp11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP12", kHsaAiCounterBlockIdTcp12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP13", kHsaAiCounterBlockIdTcp13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP14", kHsaAiCounterBlockIdTcp14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"AI_TCP15", kHsaAiCounterBlockIdTcp15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block GDS
+    {"AI_GDS", kHsaAiCounterBlockIdGds, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120,
+     AI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block VGT
+    {"AI_VGT", kHsaAiCounterBlockIdVgt, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
+     AI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block IA
+    {"AI_IA", kHsaAiCounterBlockIdIa, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
+     AI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block MC
+    {"AI_MC", kHsaAiCounterBlockIdMc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
+     AI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Temp commented out for Vega10
+    // Counter block SRBM
+    /*
+    {"AI_SRBM", kHsaAiCounterBlockIdSrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
+    AI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
+    */
+
+    // Counter block WD
+    {"AI_WD", kHsaAiCounterBlockIdWd, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
+     AI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block CPG
+    // Temp commented for Vega10
+    /*
+    {"AI_CPG", kHsaAiCounterBlockIdCpg, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
+    AI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
+    */
+
+    // Counter block CPC
+    // Temp commented for Vega10
+    /*
+    {"AI_CPC", kHsaAiCounterBlockIdCpc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
+    AI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
+    */
+
+    // Counter block IOMMUV2
+    {"AI_IOMMUV2", kHsaAiCounterBlockIdIommuV2, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
+     8, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block KernelDriver
+    {"AI_KD", kHsaAiCounterBlockIdKernelDriver, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
+     0, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Name of the last line should be empty to indicate end of all counter groups
+    {"", kHsaAiCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
+     0}};
+
+/*
+ * The following tables contain register addresses of the SQ counter registers
+ */
+
+/*
+ * SQ
+ */
+GpuCounterRegInfo AiSqCounterRegAddr[] = {
+    {mmSQ_PERFCOUNTER0_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER0_LO, mmSQ_PERFCOUNTER0_HI},
+    {mmSQ_PERFCOUNTER1_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER1_LO, mmSQ_PERFCOUNTER1_HI},
+    {mmSQ_PERFCOUNTER2_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER2_LO, mmSQ_PERFCOUNTER2_HI},
+    {mmSQ_PERFCOUNTER3_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER3_LO, mmSQ_PERFCOUNTER3_HI},
+    {mmSQ_PERFCOUNTER4_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER4_LO, mmSQ_PERFCOUNTER4_HI},
+    {mmSQ_PERFCOUNTER5_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER5_LO, mmSQ_PERFCOUNTER5_HI},
+    {mmSQ_PERFCOUNTER6_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER6_LO, mmSQ_PERFCOUNTER6_HI},
+    {mmSQ_PERFCOUNTER7_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER7_LO, mmSQ_PERFCOUNTER7_HI},
+    {mmSQ_PERFCOUNTER8_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER8_LO, mmSQ_PERFCOUNTER8_HI},
+    {mmSQ_PERFCOUNTER9_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER9_LO, mmSQ_PERFCOUNTER9_HI},
+    {mmSQ_PERFCOUNTER10_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER10_LO,
+     mmSQ_PERFCOUNTER10_HI},
+    {mmSQ_PERFCOUNTER11_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER11_LO,
+     mmSQ_PERFCOUNTER11_HI},
+    {mmSQ_PERFCOUNTER12_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER12_LO,
+     mmSQ_PERFCOUNTER12_HI},
+    {mmSQ_PERFCOUNTER13_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER13_LO,
+     mmSQ_PERFCOUNTER13_HI},
+    {mmSQ_PERFCOUNTER14_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER14_LO,
+     mmSQ_PERFCOUNTER14_HI},
+    {mmSQ_PERFCOUNTER15_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER15_LO,
+     mmSQ_PERFCOUNTER15_HI}};
+
+/*
+ * DRMDMA
+ */
+GpuCounterRegInfo AiDrmdmaCounterRegAddr[] = {
+    {mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER0_RESULT, 0},
+    {mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER1_RESULT, 0},
+    {mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER0_RESULT, 0},
+    {mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER1_RESULT, 0},
+};
+
+/*
+ * IH
+ */
+GpuCounterRegInfo AiIhCounterRegAddr[] = {{mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER0_RESULT, 0},
+                                          {mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER1_RESULT, 0}};
+
+/*
+ * CPF
+ */
+GpuCounterRegInfo AiCpfCounterRegAddr[] = {
+    {mmCPF_PERFCOUNTER0_SELECT, 0, mmCPF_PERFCOUNTER0_LO, mmCPF_PERFCOUNTER0_HI},
+    {mmCPF_PERFCOUNTER1_SELECT, 0, mmCPF_PERFCOUNTER1_LO, mmCPF_PERFCOUNTER1_HI}};
+
+/*
+ * DRM
+ *
+ * Both register rows are commented out, so the initializer list is empty.
+ * NOTE(review): an empty initializer for an array of unknown bound yields a
+ * zero-length array, which ISO C++ does not permit (GCC and Clang accept it as
+ * an extension) — confirm that nothing indexes this table.
+ */
+GpuCounterRegInfo AiDrmCounterRegAddr[] = {
+    /*
+    {mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
+    {mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}
+    */
+};
+
+/*
+ * GRBM
+ */
+GpuCounterRegInfo AiGrbmCounterRegAddr[] = {
+    {mmGRBM_PERFCOUNTER0_SELECT, 0, mmGRBM_PERFCOUNTER0_LO, mmGRBM_PERFCOUNTER0_HI},
+    {mmGRBM_PERFCOUNTER1_SELECT, 0, mmGRBM_PERFCOUNTER1_LO, mmGRBM_PERFCOUNTER1_HI}};
+
+/*
+ * GRBM_SE
+ */
+GpuCounterRegInfo AiGrbmSeCounterRegAddr[] = {
+    {mmGRBM_SE0_PERFCOUNTER_SELECT, 0, mmGRBM_SE0_PERFCOUNTER_LO, mmGRBM_SE0_PERFCOUNTER_HI},
+    {mmGRBM_SE1_PERFCOUNTER_SELECT, 0, mmGRBM_SE1_PERFCOUNTER_LO, mmGRBM_SE1_PERFCOUNTER_HI},
+    {mmGRBM_SE2_PERFCOUNTER_SELECT, 0, mmGRBM_SE2_PERFCOUNTER_LO, mmGRBM_SE2_PERFCOUNTER_HI},
+    {mmGRBM_SE3_PERFCOUNTER_SELECT, 0, mmGRBM_SE3_PERFCOUNTER_LO, mmGRBM_SE3_PERFCOUNTER_HI}};
+
+/*
+ * PA_SU
+ */
+GpuCounterRegInfo AiPaSuCounterRegAddr[] = {
+    {mmPA_SU_PERFCOUNTER0_SELECT, 0, mmPA_SU_PERFCOUNTER0_LO, mmPA_SU_PERFCOUNTER0_HI},
+    {mmPA_SU_PERFCOUNTER1_SELECT, 0, mmPA_SU_PERFCOUNTER1_LO, mmPA_SU_PERFCOUNTER1_HI},
+    {mmPA_SU_PERFCOUNTER2_SELECT, 0, mmPA_SU_PERFCOUNTER2_LO, mmPA_SU_PERFCOUNTER2_HI},
+    {mmPA_SU_PERFCOUNTER3_SELECT, 0, mmPA_SU_PERFCOUNTER3_LO, mmPA_SU_PERFCOUNTER3_HI}};
+
+/*
+ * PA_SC
+ */
+GpuCounterRegInfo AiPaScCounterRegAddr[] = {
+    {mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
+    {mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
+    {mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
+    {mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI}};
+
+/*
+ * SPI
+ */
+GpuCounterRegInfo AiSpiCounterRegAddr[] = {
+    {mmSPI_PERFCOUNTER0_SELECT, 0, mmSPI_PERFCOUNTER0_LO, mmSPI_PERFCOUNTER0_HI},
+    {mmSPI_PERFCOUNTER1_SELECT, 0, mmSPI_PERFCOUNTER1_LO, mmSPI_PERFCOUNTER1_HI},
+    {mmSPI_PERFCOUNTER2_SELECT, 0, mmSPI_PERFCOUNTER2_LO, mmSPI_PERFCOUNTER2_HI},
+    {mmSPI_PERFCOUNTER3_SELECT, 0, mmSPI_PERFCOUNTER3_LO, mmSPI_PERFCOUNTER3_HI},
+    {mmSPI_PERFCOUNTER4_SELECT, 0, mmSPI_PERFCOUNTER4_LO, mmSPI_PERFCOUNTER4_HI},
+    {mmSPI_PERFCOUNTER5_SELECT, 0, mmSPI_PERFCOUNTER5_LO, mmSPI_PERFCOUNTER5_HI}};
+
+/*
+ * TCA
+ */
+GpuCounterRegInfo AiTcaCounterRegAddr[] = {
+    {mmTCA_PERFCOUNTER0_SELECT, 0, mmTCA_PERFCOUNTER0_LO, mmTCA_PERFCOUNTER0_HI},
+    {mmTCA_PERFCOUNTER1_SELECT, 0, mmTCA_PERFCOUNTER1_LO, mmTCA_PERFCOUNTER1_HI},
+    {mmTCA_PERFCOUNTER2_SELECT, 0, mmTCA_PERFCOUNTER2_LO, mmTCA_PERFCOUNTER2_HI},
+    {mmTCA_PERFCOUNTER3_SELECT, 0, mmTCA_PERFCOUNTER3_LO, mmTCA_PERFCOUNTER3_HI}};
+
+/*
+ * TCC
+ */
+GpuCounterRegInfo AiTccCounterRegAddr[] = {
+    {mmTCC_PERFCOUNTER0_SELECT, 0, mmTCC_PERFCOUNTER0_LO, mmTCC_PERFCOUNTER0_HI},
+    {mmTCC_PERFCOUNTER1_SELECT, 0, mmTCC_PERFCOUNTER1_LO, mmTCC_PERFCOUNTER1_HI},
+    {mmTCC_PERFCOUNTER2_SELECT, 0, mmTCC_PERFCOUNTER2_LO, mmTCC_PERFCOUNTER2_HI},
+    {mmTCC_PERFCOUNTER3_SELECT, 0, mmTCC_PERFCOUNTER3_LO, mmTCC_PERFCOUNTER3_HI}};
+
+/*
+ * TCP
+ */
+GpuCounterRegInfo AiTcpCounterRegAddr[] = {
+    {mmTCP_PERFCOUNTER0_SELECT, 0, mmTCP_PERFCOUNTER0_LO, mmTCP_PERFCOUNTER0_HI},
+    {mmTCP_PERFCOUNTER1_SELECT, 0, mmTCP_PERFCOUNTER1_LO, mmTCP_PERFCOUNTER1_HI},
+    {mmTCP_PERFCOUNTER2_SELECT, 0, mmTCP_PERFCOUNTER2_LO, mmTCP_PERFCOUNTER2_HI},
+    {mmTCP_PERFCOUNTER3_SELECT, 0, mmTCP_PERFCOUNTER3_LO, mmTCP_PERFCOUNTER3_HI}};
+
+/*
+ * CB
+ */
+GpuCounterRegInfo AiCbCounterRegAddr[] = {
+    {mmCB_PERFCOUNTER0_SELECT, 0, mmCB_PERFCOUNTER0_LO, mmCB_PERFCOUNTER0_HI},
+    {mmCB_PERFCOUNTER1_SELECT, 0, mmCB_PERFCOUNTER1_LO, mmCB_PERFCOUNTER1_HI},
+    {mmCB_PERFCOUNTER2_SELECT, 0, mmCB_PERFCOUNTER2_LO, mmCB_PERFCOUNTER2_HI},
+    {mmCB_PERFCOUNTER3_SELECT, 0, mmCB_PERFCOUNTER3_LO, mmCB_PERFCOUNTER3_HI}};
+
+/*
+ * DB
+ */
+GpuCounterRegInfo AiDbCounterRegAddr[] = {
+    {mmDB_PERFCOUNTER0_SELECT, 0, mmDB_PERFCOUNTER0_LO, mmDB_PERFCOUNTER0_HI},
+    {mmDB_PERFCOUNTER1_SELECT, 0, mmDB_PERFCOUNTER1_LO, mmDB_PERFCOUNTER1_HI},
+    {mmDB_PERFCOUNTER2_SELECT, 0, mmDB_PERFCOUNTER2_LO, mmDB_PERFCOUNTER2_HI},
+    {mmDB_PERFCOUNTER3_SELECT, 0, mmDB_PERFCOUNTER3_LO, mmDB_PERFCOUNTER3_HI}};
+
+/*
+ * RLC
+ */
+GpuCounterRegInfo AiRlcCounterRegAddr[] = {
+    {mmRLC_PERFCOUNTER0_SELECT, 0, mmRLC_PERFCOUNTER0_LO, mmRLC_PERFCOUNTER0_HI},
+    {mmRLC_PERFCOUNTER1_SELECT, 0, mmRLC_PERFCOUNTER1_LO, mmRLC_PERFCOUNTER1_HI}};
+
+/*
+ * SC
+ */
+GpuCounterRegInfo AiScCounterRegAddr[] = {
+    {mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
+    {mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
+    {mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
+    {mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI},
+    {mmPA_SC_PERFCOUNTER4_SELECT, 0, mmPA_SC_PERFCOUNTER4_LO, mmPA_SC_PERFCOUNTER4_HI},
+    {mmPA_SC_PERFCOUNTER5_SELECT, 0, mmPA_SC_PERFCOUNTER5_LO, mmPA_SC_PERFCOUNTER5_HI},
+    {mmPA_SC_PERFCOUNTER6_SELECT, 0, mmPA_SC_PERFCOUNTER6_LO, mmPA_SC_PERFCOUNTER6_HI},
+    {mmPA_SC_PERFCOUNTER7_SELECT, 0, mmPA_SC_PERFCOUNTER7_LO, mmPA_SC_PERFCOUNTER7_HI}};
+
+/*
+ * SX
+ */
+GpuCounterRegInfo AiSxCounterRegAddr[] = {
+    {mmSX_PERFCOUNTER0_SELECT, 0, mmSX_PERFCOUNTER0_LO, mmSX_PERFCOUNTER0_HI},
+    {mmSX_PERFCOUNTER1_SELECT, 0, mmSX_PERFCOUNTER1_LO, mmSX_PERFCOUNTER1_HI},
+    {mmSX_PERFCOUNTER2_SELECT, 0, mmSX_PERFCOUNTER2_LO, mmSX_PERFCOUNTER2_HI},
+    {mmSX_PERFCOUNTER3_SELECT, 0, mmSX_PERFCOUNTER3_LO, mmSX_PERFCOUNTER3_HI}};
+
+/*
+ * TA
+ */
+GpuCounterRegInfo AiTaCounterRegAddr[] = {
+    {mmTA_PERFCOUNTER0_SELECT, 0, mmTA_PERFCOUNTER0_LO, mmTA_PERFCOUNTER0_HI},
+    {mmTA_PERFCOUNTER1_SELECT, 0, mmTA_PERFCOUNTER1_LO, mmTA_PERFCOUNTER1_HI}};
+
+/*
+ * TD
+ */
+GpuCounterRegInfo AiTdCounterRegAddr[] = {
+    {mmTD_PERFCOUNTER0_SELECT, 0, mmTD_PERFCOUNTER0_LO, mmTD_PERFCOUNTER0_HI},
+    {mmTD_PERFCOUNTER1_SELECT, 0, mmTD_PERFCOUNTER1_LO, mmTD_PERFCOUNTER1_HI}};
+
+/*
+ * GDS
+ */
+GpuCounterRegInfo AiGdsCounterRegAddr[] = {
+    {mmGDS_PERFCOUNTER0_SELECT, 0, mmGDS_PERFCOUNTER0_LO, mmGDS_PERFCOUNTER0_HI},
+    {mmGDS_PERFCOUNTER1_SELECT, 0, mmGDS_PERFCOUNTER1_LO, mmGDS_PERFCOUNTER1_HI},
+    {mmGDS_PERFCOUNTER2_SELECT, 0, mmGDS_PERFCOUNTER2_LO, mmGDS_PERFCOUNTER2_HI},
+    {mmGDS_PERFCOUNTER3_SELECT, 0, mmGDS_PERFCOUNTER3_LO, mmGDS_PERFCOUNTER3_HI}};
+
+/*
+ * VGT
+ */
+GpuCounterRegInfo AiVgtCounterRegAddr[] = {
+    {mmVGT_PERFCOUNTER0_SELECT, 0, mmVGT_PERFCOUNTER0_LO, mmVGT_PERFCOUNTER0_HI},
+    {mmVGT_PERFCOUNTER1_SELECT, 0, mmVGT_PERFCOUNTER1_LO, mmVGT_PERFCOUNTER1_HI},
+    {mmVGT_PERFCOUNTER2_SELECT, 0, mmVGT_PERFCOUNTER2_LO, mmVGT_PERFCOUNTER2_HI},
+    {mmVGT_PERFCOUNTER3_SELECT, 0, mmVGT_PERFCOUNTER3_LO, mmVGT_PERFCOUNTER3_HI}};
+
+/*
+ * IA
+ */
+GpuCounterRegInfo AiIaCounterRegAddr[] = {
+    {mmIA_PERFCOUNTER0_SELECT, 0, mmIA_PERFCOUNTER0_LO, mmIA_PERFCOUNTER0_HI},
+    {mmIA_PERFCOUNTER1_SELECT, 0, mmIA_PERFCOUNTER1_LO, mmIA_PERFCOUNTER1_HI},
+    {mmIA_PERFCOUNTER2_SELECT, 0, mmIA_PERFCOUNTER2_LO, mmIA_PERFCOUNTER2_HI},
+    {mmIA_PERFCOUNTER3_SELECT, 0, mmIA_PERFCOUNTER3_LO, mmIA_PERFCOUNTER3_HI}};
+
+/*
+ * MC
+ *
+ * All four register rows (which use register names with an __VI suffix) are
+ * commented out, so the initializer list is empty.
+ * NOTE(review): an empty initializer for an array of unknown bound yields a
+ * zero-length array, which ISO C++ does not permit (GCC and Clang accept it as
+ * an extension). The "AI_MC" block is still registered in AiPmuHwBlocks with
+ * AI_COUNTER_NUM_PER_MC counters — confirm that nothing indexes this table.
+ */
+GpuCounterRegInfo AiMcCounterRegAddr[] = {
+    /*
+
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}
+
+     */
+};
+
+/*
+ * SRBM
+ *
+ * Both register rows are commented out, so the initializer list is empty.
+ * NOTE(review): an empty initializer for an array of unknown bound yields a
+ * zero-length array, which ISO C++ does not permit (GCC and Clang accept it as
+ * an extension) — confirm that nothing indexes this table.
+ */
+GpuCounterRegInfo AiSrbmCounterRegAddr[] = {
+    /*
+    {mmSRBM_PERFCOUNTER0_SELECT, 0, mmSRBM_PERFCOUNTER0_LO,
+     mmSRBM_PERFCOUNTER0_HI},
+    {mmSRBM_PERFCOUNTER1_SELECT, 0, mmSRBM_PERFCOUNTER1_LO,
+     mmSRBM_PERFCOUNTER1_HI}
+     */
+};
+
+/*
+ * WD
+ */
+GpuCounterRegInfo AiWdCounterRegAddr[] = {
+    {mmWD_PERFCOUNTER0_SELECT, 0, mmWD_PERFCOUNTER0_LO, mmWD_PERFCOUNTER0_HI},
+    {mmWD_PERFCOUNTER1_SELECT, 0, mmWD_PERFCOUNTER1_LO, mmWD_PERFCOUNTER1_HI},
+    {mmWD_PERFCOUNTER2_SELECT, 0, mmWD_PERFCOUNTER2_LO, mmWD_PERFCOUNTER2_HI},
+    {mmWD_PERFCOUNTER3_SELECT, 0, mmWD_PERFCOUNTER3_LO, mmWD_PERFCOUNTER3_HI}};
+
+/*
+ * CPG
+ */
+GpuCounterRegInfo AiCpgCounterRegAddr[] = {
+    {mmCPG_PERFCOUNTER0_SELECT, 0, mmCPG_PERFCOUNTER0_LO, mmCPG_PERFCOUNTER0_HI},
+    {mmCPG_PERFCOUNTER1_SELECT, 0, mmCPG_PERFCOUNTER1_LO, mmCPG_PERFCOUNTER1_HI}};
+
+/*
+ * CPC
+ */
+GpuCounterRegInfo AiCpcCounterRegAddr[] = {
+    {mmCPC_PERFCOUNTER0_SELECT, 0, mmCPC_PERFCOUNTER0_LO, mmCPC_PERFCOUNTER0_HI},
+    {mmCPC_PERFCOUNTER1_SELECT, 0, mmCPC_PERFCOUNTER1_LO, mmCPC_PERFCOUNTER1_HI}};
+
+// Private counter block identifiers for the SQ, MC, IOMMUv2 and kernel-driver
+// blocks. Each is initialized from four 32-bit hexadecimal words (presumably a
+// 128-bit unique id — TODO confirm against the GpuPrivCounterBlockId declaration).
+GpuPrivCounterBlockId AiBlockIdSq = {{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}};
+GpuPrivCounterBlockId AiBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}};
+GpuPrivCounterBlockId AiBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}};
+GpuPrivCounterBlockId AiBlockIdKernelDriver = {{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}};
+
+}  // pm4_profile
@@ -0,0 +1,252 @@
+#ifndef _AI_BLOCKINFO_H_
+#define _AI_BLOCKINFO_H_
+
+#include <stdint.h>
+#include "rocr_profiler.h"
+#include "gpu_enum.h"
+#include "gpu_blockinfo.h"
+
+namespace pm4_profile {
+
+// MAX Number of block instances for ARCTIC ISLANDS (From Vega10)
+// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
+
+// @brief Number of block instances.
+
+// Number of CB block instances per SE
+// and number of Perf Cntrs per CB block
+#define AI_NUM_CB 4
+#define AI_COUNTER_NUM_PER_CB 4
+
+// Number of DB block instances per SE
+// and number of Perf Cntrs per DB block
+#define AI_NUM_DB 4
+#define AI_COUNTER_NUM_PER_DB 4
+
+// Number of TA block instances per SE
+// and number of Perf Cntrs per TA block
+#define AI_NUM_TA 16
+#define AI_COUNTER_NUM_PER_TA 2
+
+// Number of TD block instances per SE
+// and number of Perf Cntrs per TD block
+#define AI_NUM_TD 16
+#define AI_COUNTER_NUM_PER_TD 2
+
+// Number of TCP block instances per SE
+// and number of Perf Cntrs per TCP block
+#define AI_NUM_TCP 16
+#define AI_COUNTER_NUM_PER_TCP 4
+
+// Number of TCA block instances per chip
+// and number of Perf Cntrs per TCA block
+#define AI_NUM_TCA 2
+#define AI_COUNTER_NUM_PER_TCA 4
+
+// Number of TCC block instances per chip
+// and number of Perf Cntrs per TCC block
+#define AI_NUM_TCC 16
+#define AI_COUNTER_NUM_PER_TCC 4
+
+// Number of SDMA block instances per chip
+// (no per-block Perf Cntr count is defined for SDMA here)
+#define AI_NUM_SDMA 2
+
+// Number of counter registers per block for arctic islands
+#define AI_COUNTER_NUM_PER_DRM 2
+#define AI_COUNTER_NUM_PER_DRMDMA 2
+#define AI_COUNTER_NUM_PER_IH 2
+#define AI_COUNTER_NUM_PER_SRBM 2
+#define AI_COUNTER_NUM_PER_CPF 2
+#define AI_COUNTER_NUM_PER_GRBM 2
+#define AI_COUNTER_NUM_PER_GRBMSE 4
+#define AI_COUNTER_NUM_PER_PA_SU 4
+#define AI_COUNTER_NUM_PER_RLC 2
+#define AI_COUNTER_NUM_PER_PA_SC 8
+#define AI_COUNTER_NUM_PER_SPI 6  // [Shucai: To do: double check the value]
+#define AI_COUNTER_NUM_PER_SQ 16
+#define AI_COUNTER_NUM_PER_SX 4
+#define AI_COUNTER_NUM_PER_GDS 4
+#define AI_COUNTER_NUM_PER_VGT 4
+#define AI_COUNTER_NUM_PER_IA 4
+#define AI_COUNTER_NUM_PER_MC 4
+#define AI_COUNTER_NUM_PER_TCS 4
+#define AI_COUNTER_NUM_PER_WD 4
+#define AI_COUNTER_NUM_PER_CPG 2
+#define AI_COUNTER_NUM_PER_CPC 2
+#define AI_COUNTER_NUM_PER_VM 1
+#define AI_COUNTER_NUM_PER_VM_MD 1
+#define AI_COUNTER_NUM_PER_PIPESTATS 12
+
+#define AI_MAX_NUM_SHADER_ENGINES 1
+
+// Enumeration of AI hardware counter blocks.
+// Enumerator values are implicit and sequential starting at 0, so adding,
+// removing or commenting out an entry renumbers every entry after it.
+typedef enum HsaAiCounterBlockId {
+  kHsaAiCounterBlockIdCb0 = 0,
+  kHsaAiCounterBlockIdCb1,
+  kHsaAiCounterBlockIdCb2,
+  kHsaAiCounterBlockIdCb3,
+
+  // NOTE(review): this entry was annotated "Temp commented for Vega10", but
+  // the enumerator is active - confirm whether it is meant to be enabled.
+  kHsaAiCounterBlockIdCpf,
+
+  kHsaAiCounterBlockIdDb0,
+  kHsaAiCounterBlockIdDb1,
+  kHsaAiCounterBlockIdDb2,
+  kHsaAiCounterBlockIdDb3,
+
+  kHsaAiCounterBlockIdGrbm,
+  kHsaAiCounterBlockIdGrbmSe,
+  kHsaAiCounterBlockIdPaSu,
+  kHsaAiCounterBlockIdPaSc,
+  kHsaAiCounterBlockIdSpi,
+
+  kHsaAiCounterBlockIdSq,
+  kHsaAiCounterBlockIdSqGs,
+  kHsaAiCounterBlockIdSqVs,
+  kHsaAiCounterBlockIdSqPs,
+  kHsaAiCounterBlockIdSqHs,
+  kHsaAiCounterBlockIdSqCs,
+
+  kHsaAiCounterBlockIdSx,
+
+  kHsaAiCounterBlockIdTa0,
+  kHsaAiCounterBlockIdTa1,
+  kHsaAiCounterBlockIdTa2,
+  kHsaAiCounterBlockIdTa3,
+  kHsaAiCounterBlockIdTa4,
+  kHsaAiCounterBlockIdTa5,
+  kHsaAiCounterBlockIdTa6,
+  kHsaAiCounterBlockIdTa7,
+  kHsaAiCounterBlockIdTa8,
+  kHsaAiCounterBlockIdTa9,
+  kHsaAiCounterBlockIdTa10,
+  kHsaAiCounterBlockIdTa11,
+  kHsaAiCounterBlockIdTa12,
+  kHsaAiCounterBlockIdTa13,
+  kHsaAiCounterBlockIdTa14,
+  kHsaAiCounterBlockIdTa15,
+
+  kHsaAiCounterBlockIdTca0,
+  kHsaAiCounterBlockIdTca1,
+
+  kHsaAiCounterBlockIdTcc0,
+  kHsaAiCounterBlockIdTcc1,
+  kHsaAiCounterBlockIdTcc2,
+  kHsaAiCounterBlockIdTcc3,
+  kHsaAiCounterBlockIdTcc4,
+  kHsaAiCounterBlockIdTcc5,
+  kHsaAiCounterBlockIdTcc6,
+  kHsaAiCounterBlockIdTcc7,
+  kHsaAiCounterBlockIdTcc8,
+  kHsaAiCounterBlockIdTcc9,
+  kHsaAiCounterBlockIdTcc10,
+  kHsaAiCounterBlockIdTcc11,
+  kHsaAiCounterBlockIdTcc12,
+  kHsaAiCounterBlockIdTcc13,
+  kHsaAiCounterBlockIdTcc14,
+  kHsaAiCounterBlockIdTcc15,
+
+  kHsaAiCounterBlockIdTd0,
+  kHsaAiCounterBlockIdTd1,
+  kHsaAiCounterBlockIdTd2,
+  kHsaAiCounterBlockIdTd3,
+  kHsaAiCounterBlockIdTd4,
+  kHsaAiCounterBlockIdTd5,
+  kHsaAiCounterBlockIdTd6,
+  kHsaAiCounterBlockIdTd7,
+  kHsaAiCounterBlockIdTd8,
+  kHsaAiCounterBlockIdTd9,
+  kHsaAiCounterBlockIdTd10,
+  kHsaAiCounterBlockIdTd11,
+  kHsaAiCounterBlockIdTd12,
+  kHsaAiCounterBlockIdTd13,
+  kHsaAiCounterBlockIdTd14,
+  kHsaAiCounterBlockIdTd15,
+
+  kHsaAiCounterBlockIdTcp0,
+  kHsaAiCounterBlockIdTcp1,
+  kHsaAiCounterBlockIdTcp2,
+  kHsaAiCounterBlockIdTcp3,
+  kHsaAiCounterBlockIdTcp4,
+  kHsaAiCounterBlockIdTcp5,
+  kHsaAiCounterBlockIdTcp6,
+  kHsaAiCounterBlockIdTcp7,
+  kHsaAiCounterBlockIdTcp8,
+  kHsaAiCounterBlockIdTcp9,
+  kHsaAiCounterBlockIdTcp10,
+  kHsaAiCounterBlockIdTcp11,
+  kHsaAiCounterBlockIdTcp12,
+  kHsaAiCounterBlockIdTcp13,
+  kHsaAiCounterBlockIdTcp14,
+  kHsaAiCounterBlockIdTcp15,
+
+  kHsaAiCounterBlockIdGds,
+  kHsaAiCounterBlockIdVgt,
+  kHsaAiCounterBlockIdIa,
+  kHsaAiCounterBlockIdMc,
+
+  // Temp commented out for Vega10
+  // kHsaAiCounterBlockIdSrbm,
+
+  kHsaAiCounterBlockIdTcs,
+  kHsaAiCounterBlockIdWd,
+
+  // Temp commented out for Vega10
+  // kHsaAiCounterBlockIdCpg,
+
+  // NOTE(review): this entry was annotated "Temp commented for Vega10", but
+  // the enumerator is active - confirm whether it is meant to be enabled.
+  kHsaAiCounterBlockIdCpc,
+
+  // Counters retrieved by KFD
+  kHsaAiCounterBlockIdIommuV2,
+  kHsaAiCounterBlockIdKernelDriver,
+
+  kHsaAiCounterBlockIdCpPipeStats,
+  kHsaAiCounterBlockIdHwInfo,
+
+  // Aliases for the first and last enumerators above (both inclusive).
+  kHsaAiCounterBlockIdBlocksFirst = kHsaAiCounterBlockIdCb0,
+  kHsaAiCounterBlockIdBlocksLast = kHsaAiCounterBlockIdHwInfo
+} HsaAiCounterBlockId;
+
+// Block description table for the AI PMU.
+extern GpuBlockInfo AiPmuHwBlocks[];
+
+// Per-block counter register address tables. Each name is declared exactly
+// once (the Cpg/Cpc tables were previously declared twice in this header).
+extern GpuCounterRegInfo AiSqCounterRegAddr[];
+extern GpuCounterRegInfo AiCbCounterRegAddr[];
+extern GpuCounterRegInfo AiDrmdmaCounterRegAddr[];
+extern GpuCounterRegInfo AiIhCounterRegAddr[];
+extern GpuCounterRegInfo AiCpfCounterRegAddr[];
+extern GpuCounterRegInfo AiCpgCounterRegAddr[];
+extern GpuCounterRegInfo AiCpcCounterRegAddr[];
+extern GpuCounterRegInfo AiDrmCounterRegAddr[];
+extern GpuCounterRegInfo AiGrbmCounterRegAddr[];
+extern GpuCounterRegInfo AiGrbmSeCounterRegAddr[];
+extern GpuCounterRegInfo AiPaSuCounterRegAddr[];
+extern GpuCounterRegInfo AiPaScCounterRegAddr[];
+extern GpuCounterRegInfo AiSpiCounterRegAddr[];
+extern GpuCounterRegInfo AiTcaCounterRegAddr[];
+extern GpuCounterRegInfo AiTccCounterRegAddr[];
+extern GpuCounterRegInfo AiTcpCounterRegAddr[];
+extern GpuCounterRegInfo AiDbCounterRegAddr[];
+extern GpuCounterRegInfo AiRlcCounterRegAddr[];
+extern GpuCounterRegInfo AiScCounterRegAddr[];
+extern GpuCounterRegInfo AiSxCounterRegAddr[];
+extern GpuCounterRegInfo AiTaCounterRegAddr[];
+extern GpuCounterRegInfo AiTdCounterRegAddr[];
+extern GpuCounterRegInfo AiGdsCounterRegAddr[];
+extern GpuCounterRegInfo AiVgtCounterRegAddr[];
+extern GpuCounterRegInfo AiIaCounterRegAddr[];
+extern GpuCounterRegInfo AiMcCounterRegAddr[];
+extern GpuCounterRegInfo AiSrbmCounterRegAddr[];
+
+// No Tcs Counter block on AI
+// extern GpuCounterRegInfo AiTcsCounterRegAddr[];
+extern GpuCounterRegInfo AiWdCounterRegAddr[];
+
+// Privileged counter block IDs (four dwords each, see GpuPrivCounterBlockId).
+extern GpuPrivCounterBlockId AiBlockIdSq;
+extern GpuPrivCounterBlockId AiBlockIdMc;
+extern GpuPrivCounterBlockId AiBlockIdIommuV2;
+extern GpuPrivCounterBlockId AiBlockIdKernelDriver;
+}
+
+#endif  //  _AI_BLOCKINFO_H_
@@ -0,0 +1,137 @@
+#ifndef _AI_PMU_H_
+#define _AI_PMU_H_
+
+#include "hsa.h"
+#include "cmdwriter.h"
+#include "hsa_perf.h"
+#include "info_set.h"
+#include "parameter_set.h"
+#include "ai_blockinfo.h"
+#include "rocr_profiler.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <map>
+
+namespace pm4_profile {
+// Maps an AI counter block ID to the CounterBlock object representing it.
+typedef std::map<HsaAiCounterBlockId, pm4_profile::CounterBlock*> AiCounterBlockMap;
+
+// This class implements the AI PMU.  It is responsible for setting up
+// CounterGroups to represent each AI hardware block which exposes performance
+// counters.
+class AiPmu : public pm4_profile::Pmu {
+ public:
+  AiPmu();
+
+  ~AiPmu();
+
+  // Returns the stored number of shader engines (num_se_), which applies to
+  // the blocks that are instanced per shader engine
+  uint32_t getNumSe() { return num_se_; }
+
+  // Initializes the handle of buffer used to collect PMC data
+  // NOTE(review): the buffer is passed as uint8_t* but stored as uint32_t*
+  // (pmcData_) - confirm the alignment expectations of callers.
+  bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz);
+
+  int getLastError();
+
+  std::string getErrorString(int error);
+
+  virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true);
+
+  virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter);
+
+  // IPMU inherits IParameterSet and IInfoSet, so we implement them
+  // through composition and function forwarding
+  bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
+
+  bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
+
+  bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
+
+  pm4_profile::CounterBlock* getCounterBlockById(uint32_t id);
+
+  // Returns the stored profiling state (profiler_state_)
+  rocr_pmu_state_t getCurrentState() { return profiler_state_; }
+
+  pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups);
+
+ private:
+  // Addr of Counter Data Buffer
+  uint32_t* pmcData_;
+
+  // Size of Counter Data Buffer
+  uint32_t pmcDataSz_;
+
+  void Init();
+
+  bool initCounterBlock();
+
+  bool isResultReady();
+
+  // Clear CounterBlockMap
+  void clearCounterBlockMap();
+
+  // Reset SQ and CB counters
+  void ResetCounterBlocks(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter);
+
+  // Program SQ block related counters
+  uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                          uint32_t blkCntrIdx);
+
+  // Program TA block related counters
+  uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                          uint32_t blkCntrIdx);
+
+  // Program TCA block related counters
+  uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                           uint32_t blkCntrIdx);
+
+  // Program TCC block related counters
+  uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                           uint32_t blkCntrIdx);
+
+  // Program TCP block related counters
+  uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                           uint32_t blkCntrIdx);
+
+  // Program TD block related counters
+  uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                          uint32_t blkCntrIdx);
+
+  // Build counter selection register, return how many registers are built
+  uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal,
+                                   uint32_t blkId, pm4_profile::Counter* blkCntr);
+
+  // Build counter read registers
+  // NOTE(review): presumably returns how many registers are built, like
+  // BuildCounterSelRegister - confirm against the implementation.
+  uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr,
+                                     uint32_t* reg_val);
+
+ private:
+  // Delete counter blocks in the PMU
+  hsa_status_t RemoveCounterBlocks();
+
+ private:
+  // This contains the available counter groups.
+  AiCounterBlockMap blk_map_;
+
+  // This stores the current profiling state.
+  rocr_pmu_state_t profiler_state_;
+
+  // Backing object for getParameter()/setParameter()
+  pm4_profile::ParameterSet* parameter_set_;
+
+  // Backing object for getInfo()
+  pm4_profile::InfoSet* info_set_;
+
+  int error_code_;
+
+  // Pointer used to store counter block list internally
+  uint32_t blk_list_size_;
+  pm4_profile::CounterBlock** blk_list_;
+
+  // Indicates the number of Shader Engines Present
+  uint32_t num_se_;
+
+  // Used to reset GRBM to its default state
+  uint32_t reset_grbm_;
+};
+}
+
+#endif  // _AI_PMU_H_
@@ -0,0 +1,101 @@
+#ifndef _GPU_BLOCKINFO_H_
+#define _GPU_BLOCKINFO_H_
+
+#include "rocr_profiler.h"
+#include "gpu_enum.h"
+
+#include <stdint.h>
+
+namespace pm4_profile {
+
+// Counter control method of a block (see GpuBlockInfo::method).
+// NOTE(review): the numeric values look like a bit mask (instance = 1,
+// shader engine = 2, both = 3) - confirm before relying on that.
+typedef enum CntlMethod {
+  CntlMethodNone = 0,
+  CntlMethodByInstance = 1,
+  CntlMethodBySe = 2,
+  CntlMethodBySeAndInstance = 3
+} CntlMethod;
+
+// Structure which contains information about a specific GPU hardware block.
+// Size in bytes of the GpuBlockInfo::blockName buffer.
+#define GPU_BLOCK_NAME_SIZE 15
+
+typedef struct GpuBlockInfo_ {
+  // Unique string identifier of the block.
+  const char blockName[GPU_BLOCK_NAME_SIZE];
+
+  // Numeric identifier of the block's counter group.
+  // NOTE(review): the original comment duplicated the blockName text;
+  // presumably this holds a per-ASIC block ID enum value - confirm.
+  uint32_t counterGroupId;
+
+  // Maximum number of shader engines
+  uint32_t maxShaderEngineCount;
+
+  // Maximum number of shader arrays
+  uint32_t maxShaderArrayCount;
+
+  // Maximum number of block instances in the group per shader array
+  uint32_t maxInstanceCount;
+
+  // Counter control method
+  CntlMethod method;
+
+  // Maximum counter event ID
+  uint32_t maxEventId;
+
+  // Maximum number of counters that can be enabled at once
+  uint32_t maxSimultaneousCounters;
+
+  // Maximum number of streaming counters that can be enabled at once
+  uint32_t maxStreamingCounters;
+
+  // The number of hardware counters that are shared
+  // between regular and streaming counters.
+  // This is important so that resources are not double-booked
+  // between the two types of counters.
+  uint32_t sharedHWCounters;
+
+  // Block counters can be configured with additional filters
+  bool hasFilters;
+
+  //------------------------------------------
+  // Trace specific stuff regarding when they get locked
+
+  // Buffer size in bytes
+  uint32_t bufferSize;
+
+  // Current write pointer offset from beginning of the buffer
+  uint32_t wptrOffset;
+
+  // Flag that buffer might have wrapped
+  bool wrapped;
+
+  // If buffer has wrapped, this could indicate approximate
+  // total amount of data that was dumped in the trace buffer
+  uint32_t dataSizeEstimate;
+
+  // Buffer data pointer
+  void* pData;
+} GpuBlockInfo;
+
+// Register address corresponding to each counter.
+// Field order matters: the register tables are brace-initialized
+// positionally as {select, control, read-lo, read-hi}.
+typedef struct GpuCounterRegInfo_ {
+  // counter select register address
+  uint32_t counterSelRegAddr;
+
+  // counter control register address
+  uint32_t counterCntlRegAddr;
+
+  // counter read register address low
+  uint32_t counterReadRegAddrLo;
+
+  // counter read register address high
+  uint32_t counterReadRegAddrHi;
+} GpuCounterRegInfo;
+
+// Gpu Privileged Block ID info. This number should be the same as that
+// defined in KFD
+typedef struct GpuPrivCounterBlockId_ {
+  // Block ID consists of 4 dwords
+  uint32_t items[4];
+} GpuPrivCounterBlockId;
+
+}  // pm4_profile
+#endif
@@ -0,0 +1,73 @@
+#include "gpu_counter.h"
+
+using namespace pm4_profile;
+
+namespace pm4_profile {
+// Human-readable messages, indexed by the counter error code.
+static char error_string[][64] = {
+    {"No error"}, {"Counter generic error"}, {"Counter is already set"}, {"Counter not ready"},
+};
+
+// Creates a disabled counter with a zero result, no owning block and no
+// pending error. Every member is initialized here because getResult(),
+// isResultReady(), getCounterBlock() and getLastError() may be called before
+// the matching setter; previously they returned indeterminate values then.
+// Initializers are listed in member declaration order.
+GpuCounter::GpuCounter()
+    : Counter(),
+      counter_enabled_(false),
+      is_result_ready_(false),
+      result_(0),
+      parameter_set_(new ParameterSet()),
+      counter_block_(NULL),
+      error_code_(kHsaCounterErrorCodeNoError) {}
+
+// Releases the parameter set owned by this counter.
+GpuCounter::~GpuCounter() { delete parameter_set_; }
+
+// Copies the stored result into *p_result.
+// Returns false (and writes nothing) when p_result is NULL.
+bool GpuCounter::getResult(uint64_t* p_result) {
+  if (!p_result) {
+    return false;
+  }
+
+  *p_result = result_;
+
+  return true;
+}
+
+// Records the block this counter belongs to; the pointer is stored, not owned.
+// Returns false (and leaves the current block unchanged) when given NULL.
+bool GpuCounter::setCounterBlock(pm4_profile::CounterBlock* p_cntr_group) {
+  if (!p_cntr_group) {
+    return false;
+  }
+
+  counter_block_ = p_cntr_group;
+
+  return true;
+}
+
+// Returns the block set by setCounterBlock(), or NULL if none was set.
+pm4_profile::CounterBlock* GpuCounter::getCounterBlock() { return counter_block_; }
+
+// Enables or disables the counter. Always succeeds; no validation is done yet.
+bool GpuCounter::setEnable(bool b) {
+  // TODO: Validate counter
+  counter_enabled_ = b;
+  return true;
+}
+
+// Stores the counter value later returned by getResult().
+void GpuCounter::setResult(uint64_t result) { result_ = result; }
+
+// Returns the error code recorded by the last setParameter() call.
+int GpuCounter::getLastError() { return error_code_; }
+
+// Maps an error code to its message; out-of-range codes yield a fixed
+// "Incorrect error index" string.
+std::string GpuCounter::getErrorString(int error) {
+  if ((error >= 0) && (error < kHsaCounterErrorCodeMax)) {
+    std::string err_string(error_string[error]);
+    return err_string;
+  }
+  return "Incorrect error index";
+}
+
+// Forwards to the owned ParameterSet.
+bool GpuCounter::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
+  return parameter_set_->getParameter(param, ret_size, pp_data);
+}
+
+// Forwards to the owned ParameterSet and updates the last-error code:
+// cleared on entry, set to kHsaCounterErrorCodeAlreadySet on any failure.
+bool GpuCounter::setParameter(uint32_t param, uint32_t param_size, const void* p_data) {
+  bool ret_code;
+
+  error_code_ = kHsaCounterErrorCodeNoError;
+
+  ret_code = parameter_set_->setParameter(param, param_size, p_data);
+  if (ret_code == false) {
+    error_code_ = kHsaCounterErrorCodeAlreadySet;
+  }
+
+  return ret_code;
+}
+}
@@ -0,0 +1,52 @@
+#ifndef _GPU_COUNTER_H_
+#define _GPU_COUNTER_H_
+
+#include "hsa_perf.h"
+#include "parameter_set.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <list>
+
+namespace pm4_profile {
+// @brief This class represents a single GPU performance counter.
+// It stores an enabled flag, a 64-bit result, the owning CounterBlock
+// (not owned) and a ParameterSet (owned) that backs get/setParameter.
+class GpuCounter : public pm4_profile::Counter {
+ public:
+  GpuCounter();
+
+  virtual ~GpuCounter();
+
+  // Returns the error code recorded by the last setParameter() call
+  virtual int getLastError();
+
+  // Returns the message for an error code
+  virtual std::string getErrorString(int error);
+
+  // Copies the stored result into *p_result; false if p_result is NULL
+  virtual bool getResult(uint64_t* p_result);
+
+  virtual pm4_profile::CounterBlock* getCounterBlock();
+
+  virtual bool setEnable(bool b);
+
+  virtual bool isEnabled() { return counter_enabled_; }
+
+  virtual bool isResultReady() { return is_result_ready_; }
+
+  virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
+
+  virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
+
+  // Records the owning block; false if p_cntr_group is NULL
+  bool setCounterBlock(pm4_profile::CounterBlock* p_cntr_group);
+
+  // Stores the value later returned by getResult()
+  void setResult(uint64_t result);
+
+ private:
+  bool counter_enabled_;
+  bool is_result_ready_;
+  uint64_t result_;
+  // Owned; deleted in the destructor
+  pm4_profile::ParameterSet* parameter_set_;
+  // Not owned
+  pm4_profile::CounterBlock* counter_block_;
+  // NOTE(review): stored as uint32_t but returned as int by getLastError()
+  uint32_t error_code_;
+};
+
+// List of counter pointers, as held by a counter block
+typedef std::list<GpuCounter*> GpuCounterList;
+}
+#endif  // _GPU_COUNTER_H_
@@ -0,0 +1,215 @@
+#include "gpu_countergroup.h"
+#include "gpu_counter.h"
+#include "gpu_enum.h"
+
+using namespace pm4_profile;
+
+namespace pm4_profile {
+
+// Human-readable messages, indexed by HsaCounterBlockErrorCode.
+static char error_string[][64] = {
+    {"No error"}, {"Counter block error"}, {"Max counter reached"}, {"Unknown counter"}};
+
+// Creates an empty block with its own parameter and info sets, no cached
+// counter array and no pending error. Initializers follow the member
+// declaration order of the class.
+GpuCounterBlock::GpuCounterBlock()
+    : CounterBlock(),
+      parameter_set_(new ParameterSet()),
+      info_set_(new InfoSet()),
+      error_code_(kHsaCounterBlockErrorCodeNoError),
+      pp_cntrs_(NULL) {
+  _initCounterBlockType();
+}
+
+// Destroys every counter created by this block, then the owned sets and the
+// cached counter array.
+GpuCounterBlock::~GpuCounterBlock() {
+  // Explicitly qualified: inside a destructor the call would resolve to this
+  // class's implementation anyway, the qualification just makes that obvious.
+  GpuCounterBlock::destroyAllCounters();
+
+  delete parameter_set_;
+  delete info_set_;
+
+  // free(NULL) is a no-op, so no guard is needed.
+  free(pp_cntrs_);
+  pp_cntrs_ = NULL;
+}
+
+// Sets the block type to the asynchronous counter block type.
+void GpuCounterBlock::_initCounterBlockType() {
+  block_type_ = HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC;
+}
+
+// Creates a new counter owned by this block and returns it, or NULL when the
+// number of enabled counters has already reached the block's maximum (or the
+// maximum cannot be queried).
+// NOTE(review): the new counter is not given a back-pointer via
+// setCounterBlock(this) here - confirm that a caller does this.
+Counter* GpuCounterBlock::createCounter() {
+  if (!_checkMaxNumOfCounters()) {
+    return NULL;
+  }
+
+  GpuCounter* p_cntr = new GpuCounter();
+  if (!p_cntr) {
+    return NULL;
+  }
+
+  cntr_list_.push_back(p_cntr);
+
+  return (Counter*)p_cntr;
+}
+
+// Deletes p_cntr and removes it from this block.
+// Returns false when p_cntr is NULL or does not belong to this block.
+bool GpuCounterBlock::destroyCounter(Counter* p_cntr) {
+  if (!p_cntr) {
+    return false;
+  }
+
+  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
+    if (*it == p_cntr) {
+      delete (*it);
+      cntr_list_.erase(it);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Deletes every counter of this block and empties the list. Always returns true.
+bool GpuCounterBlock::destroyAllCounters() {
+  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
+    delete (*it);  // deleting NULL is a no-op
+  }
+
+  cntr_list_.clear();
+
+  return true;
+}
+
+// Returns an array of the currently enabled counters and stores its length
+// in num. The array is owned by this block and is invalidated by the next
+// call to getEnabledCounters() or getAllCounters().
+// Returns NULL with num == 0 when no counter is enabled or on allocation
+// failure; num is always written.
+Counter** GpuCounterBlock::getEnabledCounters(uint32_t& num) {
+  num = 0;
+
+  free(pp_cntrs_);
+  pp_cntrs_ = NULL;
+
+  // Avoid malloc(0), whose result is implementation-defined.
+  if (cntr_list_.empty()) {
+    return NULL;
+  }
+
+  pp_cntrs_ = (Counter**)malloc(sizeof(Counter*) * cntr_list_.size());
+  if (!pp_cntrs_) {
+    return NULL;
+  }
+
+  uint32_t cnt = 0;
+  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
+    if ((*it)->isEnabled()) {
+      pp_cntrs_[cnt++] = (Counter*)*it;
+    }
+  }
+
+  num = cnt;
+  return (cnt == 0) ? NULL : pp_cntrs_;
+}
+
+// Returns an array of all counters of this block (enabled or not) and stores
+// its length in num. The array is owned by this block and is invalidated by
+// the next call to getEnabledCounters() or getAllCounters().
+// Returns NULL with num == 0 when the block has no counters or on allocation
+// failure; num is always written.
+Counter** GpuCounterBlock::getAllCounters(uint32_t& num) {
+  num = 0;
+
+  free(pp_cntrs_);
+  pp_cntrs_ = NULL;
+
+  // Avoid malloc(0), whose result is implementation-defined.
+  if (cntr_list_.empty()) {
+    return NULL;
+  }
+
+  pp_cntrs_ = (Counter**)malloc(sizeof(Counter*) * cntr_list_.size());
+  if (!pp_cntrs_) {
+    return NULL;
+  }
+
+  uint32_t cnt = 0;
+  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
+    pp_cntrs_[cnt++] = (Counter*)*it;
+  }
+
+  num = cnt;
+  return pp_cntrs_;
+}
+
+// Forwards to the owned InfoSet.
+bool GpuCounterBlock::setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data) {
+  return info_set_->setInfo(blk_info, size, data);
+}
+
+// Returns true when one more counter may be created, i.e. the number of
+// enabled counters is below GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS.
+// Returns false when that limit is reached or cannot be read.
+bool GpuCounterBlock::_checkMaxNumOfCounters() {
+  uint32_t* p_num_max = NULL;
+  uint32_t size = 0;
+
+  if (!getInfo(GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, size, (void**)&p_num_max)) {
+    return false;
+  }
+
+  // Guard against a successful query that hands back no data.
+  if (!p_num_max) {
+    return false;
+  }
+
+  return _getNumOfEnabledCounters() < *p_num_max;
+}
+
+// Counts the counters of this block that are currently enabled.
+uint32_t GpuCounterBlock::_getNumOfEnabledCounters() {
+  uint32_t cnt = 0;
+
+  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
+    if ((*it)->isEnabled()) {
+      cnt++;
+    }
+  }
+
+  return cnt;
+}
+
+// Maps an error code to its message; out-of-range codes yield a fixed
+// "incorrect error code" string.
+std::string GpuCounterBlock::getErrorString(int error) {
+  if ((error >= 0) && (error < kHsaCounterBlockErrorCodeMaxError)) {
+    std::string err_string(error_string[error]);
+    return err_string;
+  }
+  return "incorrect error code";
+}
+
+// Forwards to the owned ParameterSet.
+bool GpuCounterBlock::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
+  return parameter_set_->getParameter(param, ret_size, pp_data);
+}
+
+// Forwards to the owned ParameterSet.
+bool GpuCounterBlock::setParameter(uint32_t param, uint32_t param_size, const void* pData) {
+  return parameter_set_->setParameter(param, param_size, pData);
+}
+
+// Forwards to the owned InfoSet.
+bool GpuCounterBlock::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) {
+  return info_set_->getInfo(info, ret_size, pp_data);
+}
+}
@@ -0,0 +1,70 @@
+#ifndef _GPU_COUNTER_GROUP_H_
+#define _GPU_COUNTER_GROUP_H_
+
+// This file contains declaration of Sea Island (CI) CounterBlock class.
+#include "hsa_perf.h"
+#include "gpu_counter.h"
+#include "parameter_set.h"
+#include "info_set.h"
+#include "gpu_enum.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+
+namespace pm4_profile {
+// This class represents one GPU hardware block. Each block contains
+// multiple performance counters. The block owns the counters it creates,
+// a ParameterSet and an InfoSet.
+class GpuCounterBlock : public pm4_profile::CounterBlock {
+ public:
+  GpuCounterBlock();
+  ~GpuCounterBlock();
+
+  // NOTE [Suravee] : We specify CiPmu as a friend
+  // because the CiPmu needs to be able to setup info of
+  // the counter block.
+  // The same applies to ViPmu and AiPmu (setInfo() is protected).
+  friend class CiPmu;
+  friend class ViPmu;
+  friend class AiPmu;
+
+  // Returns the message for a HsaCounterBlockErrorCode value
+  std::string getErrorString(int error);
+
+  // Creates a counter owned by this block; NULL if the limit is reached
+  pm4_profile::Counter* createCounter();
+
+  // Deletes one counter of this block; false if it is NULL or not found
+  virtual bool destroyCounter(pm4_profile::Counter* p_cntr);
+
+  // Deletes every counter of this block
+  virtual bool destroyAllCounters();
+
+  // Returns a block-owned array of the enabled counters; num receives its length
+  virtual pm4_profile::Counter** getEnabledCounters(uint32_t& num);
+
+  // Returns a block-owned array of all counters; num receives its length
+  virtual pm4_profile::Counter** getAllCounters(uint32_t& num);
+
+  virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
+
+  virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
+
+  virtual bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
+
+ protected:
+  void _initCounterBlockType();
+
+  // Stores one GPU_BLK_INFOS entry in the block's InfoSet
+  bool setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data);
+
+  hsa_ext_tools_counter_block_type_t block_type_;
+
+ private:
+  // True when another counter may still be created
+  bool _checkMaxNumOfCounters();
+
+  uint32_t _getNumOfEnabledCounters();
+
+  pm4_profile::ParameterSet* parameter_set_;
+  pm4_profile::InfoSet* info_set_;
+  // Counters created by this block (owned)
+  GpuCounterList cntr_list_;
+  uint32_t error_code_;
+
+  // Pointer of buffer to store counter list
+  // (malloc'ed; reused by getEnabledCounters() and getAllCounters())
+  pm4_profile::Counter** pp_cntrs_;
+};
+
+}  // pm4_profile
+
+#endif  // _GPU_COUNTER_GROUP_H_
@@ -0,0 +1,65 @@
+#ifndef _GPU_ENUM_H_
+#define _GPU_ENUM_H_
+
+namespace pm4_profile {
+
+// Enumeration containing GPU hardware block information
+// Values are implicit and sequential from 0, so the order of entries must
+// not change without updating every user of these IDs.
+// NOTE(review): the entries appear to mirror the fields of GpuBlockInfo in
+// declaration order - confirm against gpu_blockinfo.h.
+enum GPU_BLK_INFOS {
+  GPU_BLK_INFO_BLOCK_NAME,
+  GPU_BLK_INFO_ID,
+  GPU_BLK_INFO_MAX_SHADER_ENGINE_COUNT,
+  GPU_BLK_INFO_MAX_SHADER_ARRAY_COUNT,
+  GPU_BLK_INFO_MAX_INSTANCE_COUNT,
+  GPU_BLK_INFO_CONTROL_METHOD,
+  GPU_BLK_INFO_MAX_EVENT_ID,
+  GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS,
+  GPU_BLK_INFO_MAX_STREAMING_COUNTERS,
+  GPU_BLK_INFO_SHARED_HW_COUNTERS,
+  GPU_BLK_INFO_HAS_FILTERS,
+
+  // Trace-specific stuff
+  GPU_TRC_BLK_INFO_BUFFER_SIZE,
+  GPU_TRC_BLK_INFO_BUFFER_WRITE_POINTER_OFFSET,
+  GPU_TRC_BLK_INFO_BUFFER_WRAPPED,
+  GPU_TRC_BLK_INFO_DATA_SIZE_ESTIMATE,
+  GPU_TRC_BLK_INFO_DATA_POINTER,
+};
+
+
+/**
+ * Trace buffer parameters
+ * (values are implicit and sequential from 0)
+ */
+enum GPU_BLK_PARAMS {
+  // Allows user to specify the size of the trace buffers.
+  GPU_BLK_PARAM_TRACE_BUFFER_SIZE,
+
+  // If we decide to implement this functionality, this will allow the user
+  // to specify the number of trace buffers to create.
+  GPU_BLK_PARAM_TRACE_BUFFER_ARRAY,
+
+  // Specifies whether a new trace buffer should be used for each cmd buffer.
+  // This allows for better correlation of data back to the host application
+  // If this is enabled, and the user does not explicitly specify a
+  // TRACE_BUFFER_ARRAY, then the driver should automatically allocate
+  // additional buffers as needed so that as much of the application
+  // can be traced as possible, until the PerfExperiment is ended.
+  // If a TRACE_BUFFER_ARRAY is specified, then only as many buffers
+  // as specified should be created. If more cmd buffers get submitted
+  // than there are trace buffers, then the later cmd buffers should
+  // not be traced.
+  GPU_BLK_PARAM_TRACE_NEW_BUFFER_ON_SUBMIT,
+};
+
+
+// Enumeration containing GPU counter parameters
+// (values are implicit and sequential from 0)
+enum GPU_CNTR_PARAMS {
+  GPU_CNTR_PARAM_SHADERENGINE_ID,
+  GPU_CNTR_PARAM_SHADERARRAY_ID,
+  GPU_CNTR_PARAM_INSTANCE_ID,
+  GPU_CNTR_PARAM_EVENT_SELECT_ID,
+  GPU_CNTR_PARAM_SIMD_MASK,
+  GPU_CNTR_PARAM_PERF_MODE,
+  GPU_CNTR_PARAM_TRACE_TYPE,
+};
+}
+#endif
@@ -0,0 +1,436 @@
+#ifndef _HSA_PERF_H_
+#define _HSA_PERF_H_
+
+#include "rocr_profiler.h"
+
+#if !defined(AMD_AMP_HSA_INCLUDES)
+#include <map>
+#include <string>
+#include <stdlib.h>
+#include <stdint.h>
+#endif
+
+namespace pm4_profile {
+class Pmu;
+class Counter;
+class CounterBlock;
+class TraceGroup;
+class CommandWriter;
+class DefaultCmdBuf;
+
+
+// @brief This is an abstract class for defining a CounterBlock. Each
+// CounterBlock contains a set of Counters that often belong to the
+// same functional unit.
+//
+// For AMD GPU, this can represent blocks of Counters in each HW block
+// (e.g. SQ, SQI, CP, etc.).
+// For AMD CPU, this can represent blocks of core PMCs, NB PMCs, L2I PMCs
+// on each CPU device.
+//
+// Generally, CounterBlocks are created and initialized by the \ref Pmu class.
+// Users can query them by calling \ref Pmu::getAllCounterBlocks() or
+// \ref Pmu::getCounterBlockById(). A CounterBlock is enabled if it contains
+// enabled Counters in the block.
+//
+// Users can manage Counters in each CounterBlock (e.g. create, destroy,
+// enable and disable).  To specify a Counter, users simply call \ref
+// createCounter. Then it can be enabled or disabled using \ref
+// Counter::setEnable.  When a Counter is enabled, the CounterBlock checks
+// that it is valid and does not conflict with the Counters already in the
+// block.
+class CounterBlock {
+ public:
+  typedef enum HsaCounterBlockErrorCode {
+    // No error
+    kHsaCounterBlockErrorCodeNoError = 0x0,
+
+    // Generic CounterBlock error
+    kHsaCounterBlockErrorCodeGenericError,
+
+    // The maximum number of Counters in the block is reached.
+    kHsaCounterBlockErrorCodeMaxNumCounterReached,
+
+    // The counter does not belong to this block.
+    kHsaCounterBlockErrorCodeUnknownCounter,
+
+    // End marker of the error code list.
+    kHsaCounterBlockErrorCodeMaxError
+  } HsaCounterBlockErrorCode;
+
+  // Destructor of CounterBlock.
+  virtual ~CounterBlock() {}
+
+  // Given an error number reported from getLastError or returned from a
+  // function call, retrieve the corresponding std::string.
+  // @param[in] error The error corresponding to a call to getLastError
+  // or a return code from a function call.
+  // Return A std::string holding the text corresponding to the error
+  // number.
+  // If an invalid error code is given, the returned string is empty.
+  virtual std::string getErrorString(int error) = 0;
+
+  // Create a Counter object and return a pointer to it to the caller.
+  // Return On success, this function returns a pointer to Counter
+  //        On failure, this function returns NULL
+  // Possible error codes are:
+  //        kHSAPerfErrorCodesUnmodifiableState
+  //        kHsaCounterBlockErrorCodeMaxNumCounterReached
+  virtual Counter* createCounter() = 0;
+
+  // Destroy the Counter. The CounterBlock which owns the Counter must be in
+  // disabled state.
+  // Return true or false
+  // Possible error codes are:
+  //   kHSAPerfErrorCodesInvalidAargs
+  //   kHSAPerfErrorCodesUnmodifiableState
+  //   kHsaCounterBlockErrorCodeUnknownCounter
+  virtual bool destroyCounter(Counter* p_counter) = 0;
+
+  // Destroy all counters in the block. The CounterBlock must be in disabled
+  // state.
+  // Return true or false.
+  // Possible error codes are:
+  //   kHSAPerfErrorCodesUnmodifiableState
+  virtual bool destroyAllCounters() = 0;
+
+  // Get a list of pointers to the enabled Counters in this CounterBlock.
+  // note The Counter must be created by the same CounterBlock object using
+  // createCounter().
+  // @param[out] num The number of Counter pointers returned.
+  // Return
+  //   return a list of pointers to the enabled Counters.
+  //   return NULL if no counter is enabled.
+  virtual Counter** getEnabledCounters(uint32_t& num) = 0;
+
+  // Get a list of pointers to all Counters in this CounterBlock.
+  // note The Counter must be created by the same CounterBlock object using
+  // createCounter().
+  // @param[out] num The number of Counter pointers returned.
+  // Return
+  //   return a list of pointers in the CounterBlock.
+  //   return NULL if no counter is enabled (TODO confirm: maybe "exists").
+  virtual Counter** getAllCounters(uint32_t& num) = 0;
+
+  // Query value of the parameter specified by param
+  // @param[in] param The enumeration of parameter to be queried
+  // @param[out] return_size The returned size of data
+  // @param[out] pp_data The pointer to the returned data. The API is
+  // responsible for managing the memory to store the information as specified
+  // by return_size.
+  //
+  // Return true or false
+  // Possible error codes are:
+  //   kHSAPerfErrorCodesInvalidParam
+  //   kHSAPerfErrorCodesInvalidParamSize
+  //   kHSAPerfErrorCodesInvalidParamData
+  virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
+
+  // Set value for the parameter specified by param
+  // @param[in] param The enumeration of parameter to be set
+  // @param[in] param_size The size of data
+  // @param[in] p_data The pointer to the data to be set. Users are responsible
+  // for deallocating the memory of p_data after calling the API.
+  // Return true or false
+  // Possible error codes are:
+  //   kHSAPerfErrorCodesUnmodifiableState
+  //   kHSAPerfErrorCodesInvalidParam
+  //   kHSAPerfErrorCodesInvalidParamSize
+  //   kHSAPerfErrorCodesInvalidParamData
+  virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
+
+  // Query value of the information specified by info
+  // @param[in] info The enumeration of information to be queried
+  // @param[out] return_size The returned size of data
+  // @param[out] pp_data The pointer to the returned data
+  // Return true or false
+  // Possible error codes are:
+  //   kHSAPerfErrorCodesInvalidInfo
+  //   kHSAPerfErrorCodesInvalidInfoSize
+  //   kHSAPerfErrorCodesInvalidInfoData
+  virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0;
+};  // class CounterBlock
+
+
+// This is an abstract class for defining a TraceGroup. TraceGroup inherits
+// CounterBlock and adds interfaces for managing trace buffers. It also supports
+// user-data insertion into the trace.  This allows users to insert arbitrary
+// data (e.g. markers) into the trace, which can be used to correlate specific
+// events with the collected trace data.
+class TraceGroup : public CounterBlock {
+ public:
+  typedef enum HsaTraceGroupErrorCode {
+    // Generic TraceGroup error (TraceGroup codes start at 0x100)
+    HsaTraceGroupErrorCodeGenericError = 0x100,
+  } HsaTraceGroupErrorCode;
+
+  // Destructor of TraceGroup.
+  virtual ~TraceGroup() {}
+
+  // Obtains the number of buffers which were collected as part of
+  // the trace.
+  // Return The number of collected buffers.
+  virtual uint32_t getCollectedBufferCount() = 0;
+
+  // Locks a trace buffer for host access.
+  // @param[in] buffer_id The index of the buffer to be locked.
+  // Return true or false
+  virtual bool lock(uint32_t buffer_id) = 0;
+
+  // Unlocks a trace buffer that was previously locked.
+  // @param[in] buffer_id The index of the buffer to be unlocked.
+  // Return true or false
+  virtual bool unlock(uint32_t buffer_id) = 0;
+
+  // Inserts data (e.g. trace marker) into the trace.
+  // @param[in] type The type of data to be inserted.
+  // @param[in] p_data The data to be inserted.
+  // @param[in] data_size The size of data to be inserted.
+  // Return true or false
+  virtual bool insertUserData(uint32_t type, void* p_data, uint32_t data_size) = 0;
+};  // class TraceGroup
+
+
+// This is an abstract class for defining a performance Counter.
+// Users can obtain a Counter from \ref CounterBlock::createCounter().
+// Once obtained, users can set up Counter parameters, and enable it using
+// \ref Counter::setEnable().
+//
+// There are several types of Counter as defined in \ref
+//    HsaCounterBlockTypeMask.
+// Only the supported Counter type can be added to the CounterBlock.
+//
+// Each Counter can store Counter-specific parameters.  The Counter is used to
+// specify the types of event to be counted.
+class Counter {
+ public:
+  typedef enum HsaCounterErrorCode {
+    // No error
+    kHsaCounterErrorCodeNoError = 0x0,
+
+    // Generic Counter error
+    kHsaCounterErrorCodeGenericError = 0x1,
+
+    // Counter is already set
+    kHsaCounterErrorCodeAlreadySet = 0x2,
+
+    // Counter result is not ready.
+    kHsaCounterErrorCodeResultNotReady = 0x3,
+
+    // End marker of the error code list
+    kHsaCounterErrorCodeMax,
+  } HsaCounterErrorCode;
+
+  // Destructor of Counter
+  virtual ~Counter() {}
+
+  // Retrieve the last error code generated.  This should be checked when
+  // values returned are NULL or void.
+  // Return an integer corresponding to the last error reported.
+  virtual int getLastError() = 0;
+
+  // Given an error number reported from getLastError or returned from a
+  // function call, retrieve the corresponding std::string.
+  // @param[in] error The error corresponding to a call to getLastError
+  // or a return code from a function call.
+  // Return A std::string holding the text corresponding to the error
+  // number. If an invalid error code is given, the returned string is empty.
+  virtual std::string getErrorString(int error) = 0;
+
+  // Get the \ref CounterBlock which owns this counter.
+  // Return
+  //   On success, it returns a pointer to the CounterBlock.
+  //   On Failure, it returns NULL.
+  virtual CounterBlock* getCounterBlock() = 0;
+
+  // Enable or disable the Counter.
+  // @param[in] b Set to true to enable the Counter, false to disable it.
+  // Return
+  //   return true when successfully set the state.
+  //   return false otherwise.
+  //   In case the current state is already set to the specified value,
+  //   the API returns true.
+  //   Possible error codes are:
+  //     kHSAPerfErrorCodesUnmodifiableState
+  virtual bool setEnable(bool b) = 0;
+
+  // Return the current state of the Counter.
+  // Return true or false
+  virtual bool isEnabled() = 0;
+
+  // Return the status of this Counter whether the result is available.
+  // Return true or false
+  virtual bool isResultReady() = 0;
+
+  // Query Counter result
+  // note Must be implemented by derived classes
+  // @param[out] p_result The pointer containing the returned result.
+  // Return true or false
+  // Possible error codes are:
+  //   kHSAPerfErrorCodesInvalidAargs
+  //   kHsaCounterErrorCodeResultNotReady
+  virtual bool getResult(uint64_t* p_result) = 0;
+
+  // Query value of the parameter specified by param
+  // @param[in] param The enumeration of parameter to be queried
+  // @param[out] return_size The returned size of data
+  // @param[out] pp_data The pointer to the returned data. The API is
+  // responsible for managing the memory to store the information as
+  // specified by return_size.
+  // Return true or false
+  //   Possible error codes are:
+  //     kHSAPerfErrorCodesInvalidParam
+  //     kHSAPerfErrorCodesInvalidParamSize
+  //     kHSAPerfErrorCodesInvalidParamData
+  virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
+
+  // Set value for the parameter specified by param
+  // @param[in] param The enumeration of parameter to be set
+  // @param[in] param_size The size of data
+  // @param[in] p_data The pointer to the data to be set. Users are responsible
+  // for deallocating the memory of p_data after calling the API.
+  // Return true or false
+  //   Possible error codes are:
+  //     kHSAPerfErrorCodesUnmodifiableState
+  //     kHSAPerfErrorCodesInvalidParam
+  //     kHSAPerfErrorCodesInvalidParamSize
+  //     kHSAPerfErrorCodesInvalidParamData
+  virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
+};  // class Counter
+
+// Abstract interface of a performance monitor unit (PMU).
+class Pmu {
+ public:
+  // Enumeration of Pmu error codes
+  typedef enum HsaPmuErrorCode {
+    // No error
+    kHsaPmuErrorCodeNoError = 0x0,
+
+    // Unknown CounterBlock ID
+    kHsaPmuErrorCodeUnknownCounterBlockId,
+
+    // No CounterBlock exists
+    kHsaPmuErrorCodeNoCounterBlock,
+
+    // The requested operation is not valid. This could be due to
+    // an invalid transition from the current state.
+    kHsaPmuErrorCodeInvalidOperation,
+
+    // PMU is not currently available (e.g. PMU is currently
+    // in use by others)
+    kHsaPmuErrorCodeNotAvailable,
+
+    // PMU is in an error state (NOTE(review): inferred from the name; confirm)
+    kHsaPmuErrorCodeErrorState,
+
+    // Waiting for the PMU result timed out
+    kHsaPmuErrorCodeTimeOut,
+
+    // Max error count
+    kHsaPmuErrorCodeMax
+  } HsaPmuErrorCode;
+
+  // Destructor of PMU.
+  // note This stops the performance counters if running and releases
+  // any resources used by the PMU.
+  virtual ~Pmu() {}
+
+  // Retrieve the last error code generated.  This should be checked when
+  // values returned are NULL or void.
+  // Return an integer corresponding to the last error reported.
+  virtual int getLastError() = 0;
+
+  // Given an error number reported from getLastError or returned from a
+  // function call, retrieve the corresponding std::string.
+  // @param[in] error The error corresponding to a call to getLastError
+  // or a return code from a function call.
+  // Return A std::string holding the text corresponding to the error
+  //   number. If an invalid error code is given, the returned string is empty.
+  virtual std::string getErrorString(int error) = 0;
+
+  // Get CounterBlock from Id
+  // @param[in] id ID of the target CounterBlock
+  // Return
+  //   On success, it returns a pointer to specified CounterBlock.
+  //   On Failure, it returns NULL.
+  //   Possible error codes are:
+  //     kHsaPmuErrorCodeUnknownCounterBlockId.
+  virtual CounterBlock* getCounterBlockById(uint32_t id) = 0;
+
+  // Get all available CounterBlocks
+  // @param[out] num_block The returned number of CounterBlocks
+  // Return On success, it returns an array of CounterBlock pointers.
+  //   On Failure, it returns NULL.
+  virtual CounterBlock** getAllCounterBlocks(uint32_t& num_block) = 0;
+
+  // Get current PMU profiling state.
+  // Return The PMU profiling state as defined in \ref rocr_pmu_state_t
+  virtual rocr_pmu_state_t getCurrentState() = 0;
+
+  // Start profiling on the PMU.
+  // @param[in] reset Indicates whether to reset the counters before
+  // recording. The default is to reset them.
+  // note This function must be implemented by child classes.
+  // Return true or false
+  //   Possible error codes are:
+  //     kHsaPmuErrorCodeInvalidOperation
+  //     kHsaPmuErrorCodeNotAvailable
+  virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true) = 0;
+
+  // Stop profiling on the PMU.
+  // note This function must be called after \ref begin().
+  // note This function must be implemented by child classes.
+  // Return true or false
+  //   Possible error codes are:
+  //     kHsaPmuErrorCodeInvalidOperation
+  virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) = 0;
+
+  // Initializes the handle of the buffer used to collect PMC data
+  // @param pmcBuffer The buffer pointer
+  // @param pmcBuffSz Size in terms of bytes
+  virtual bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz) = 0;
+
+  // Query value of the parameter specified by param
+  // @param[in] param The enumeration of parameter to be queried
+  // @param[out] return_size The returned size of data
+  // @param[out] pp_data The pointer to the returned data. The API is
+  // responsible for managing the memory to store the information as
+  // specified by return_size.
+  // Return true or false
+  //   Possible error codes are:
+  //     kHSAPerfErrorCodesInvalidParam
+  //     kHSAPerfErrorCodesInvalidParamSize
+  //     kHSAPerfErrorCodesInvalidParamData
+  virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
+
+  // Set value for the parameter specified by param
+  // @param[in] param The enumeration of parameter to be set
+  // @param[in] param_size The size of data
+  // @param[in] p_data The pointer to the data to be set. Users are responsible
+  // for deallocating the memory of p_data after calling the API.
+  // Return true or false
+  //   Possible error codes are:
+  //     kHSAPerfErrorCodesUnmodifiableState
+  //     kHSAPerfErrorCodesInvalidParam
+  //     kHSAPerfErrorCodesInvalidParamSize
+  //     kHSAPerfErrorCodesInvalidParamData
+  virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
+
+  // Query value of the information specified by info
+  // @param[in] info The enumeration of information to be queried
+  // @param[out] return_size The returned size of data
+  // @param[out] pp_data The pointer to the returned data
+  // Return true or false
+  //   Possible error codes are:
+  //     kHSAPerfErrorCodesInvalidInfo
+  //     kHSAPerfErrorCodesInvalidInfoSize
+  //     kHSAPerfErrorCodesInvalidInfoData
+  virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0;
+
+  // Returns the number of shader engines per block
+  // for blocks that feature shader engine instancing
+  virtual uint32_t getNumSe() = 0;
+
+};  // class Pmu
+}  // pm4_profile
+#endif  // _HSA_PERF_H_
@@ -0,0 +1,74 @@
+#include "info_set.h"
+#include "var_data.h"
+using namespace std;
+
+namespace pm4_profile {
+InfoSet::InfoSet() : p_data_(NULL) {  // no query buffer until getInfo runs
+  // Same cleanup as the destructor; presumably no-ops on the new table.
+  releaseParameters();
+  info_table_.clear();
+}
+
+InfoSet::~InfoSet() {
+  free(p_data_);  // buffer handed out by getInfo; free(NULL) is a no-op
+  p_data_ = NULL;
+  releaseParameters();  // release the payload of every stored entry
+  info_table_.clear();
+}
+
+// Adds an entry for info holding a copy of the info_size bytes at p_data.
+// Fails if the key is already present or VarData::set rejects the input.
+bool InfoSet::setInfo(uint32_t info, uint32_t info_size, void* p_data) {
+  const bool key_taken = (info_table_.find(info) != info_table_.end());
+  if (key_taken) {
+    return false;
+  }
+
+  VarData entry;
+  const bool copied = entry.set(info_size, p_data);
+  if (copied) info_table_.insert(VarDataMap::value_type(info, entry));
+  return copied;
+}
+
+// Copies the data stored under info into a buffer owned by this object and
+// hands it out via pp_data (byte count in ret_size). The buffer is valid only
+// until the next getInfo call or destruction; callers must not free it.
+bool InfoSet::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) {
+  if (!pp_data) {
+    return false;
+  }
+
+  VarDataMap::iterator it = info_table_.find(info);
+  if (it == info_table_.end()) {
+    return false;
+  }
+
+  // uint32_t, not int: a size >= 2^31 must not turn negative before malloc.
+  const uint32_t size = it->second.getSize();
+  if (size == 0) {
+    return false;
+  }
+
+  free(p_data_);  // the buffer of the previous query is released here
+  p_data_ = malloc(size);
+  if (!p_data_) {
+    return false;
+  }
+
+  *pp_data = p_data_;
+  ret_size = it->second.get(size, p_data_);  // reuse it; no second lookup
+  return true;
+}
+
+// Frees the payload held by every entry of the info table. The entries
+// themselves are left in place (emptied); erasing them is up to the caller.
+void InfoSet::releaseParameters() {
+  const VarDataMap::iterator stop = info_table_.end();
+  VarDataMap::iterator entry = info_table_.begin();
+  while (entry != stop) {
+    entry->second.clear();
+    entry++;
+  }
+}
+
+}  // pm4_profile
@@ -0,0 +1,48 @@
+#ifndef _INFO_SET_H_
+#define _INFO_SET_H_
+
+// This file contains the declaration of the InfoSet class.
+#include "hsa_perf.h"
+#include "var_data.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+
+namespace pm4_profile {
+// A container holding an information data set (e.g. PMU info, CounterGroup
+// info, etc.). NOTE(review): the old text said only child classes may set
+// information, but setInfo is public here -- confirm the intended access.
+class InfoSet {
+ public:
+  // InfoSet constructor
+  InfoSet();
+
+  // InfoSet destructor; releases the stored data and the getInfo buffer
+  virtual ~InfoSet();
+
+  // Query value of the information specified by info
+  // @param[in] info The enumeration of information to be queried
+  // @param[out] ret_size The returned size of data
+  // @param[out] pp_data The pointer to the returned data (owned by InfoSet)
+  // \return true or false
+  bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
+
+  // Set value for the information specified by info
+  // @param[in] info The enumeration of information to be set
+  // @param[in] info_size The size of data
+  // @param[in] p_data The pointer to the data to be set
+  // \return true on success; false if info is already set or the data is bad
+  bool setInfo(uint32_t info, uint32_t info_size, void* p_data);
+
+ private:
+  // Release the data held by every entry of the info table
+  void releaseParameters();
+
+  // InfoSet property: The info table
+  VarDataMap info_table_;
+
+  // Buffer handed out by getInfo; freed on the next call and on destruction
+  void* p_data_;
+};
+}
+#endif
@@ -0,0 +1,74 @@
+#include "parameter_set.h"
+
+using namespace std;
+
+namespace pm4_profile {
+ParameterSet::ParameterSet() : p_data_(NULL) {  // no query buffer yet
+  // Same cleanup as the destructor; presumably no-ops on the new table.
+  releaseParameters();
+  param_table_.clear();
+}
+
+ParameterSet::~ParameterSet() {
+  free(p_data_);  // buffer handed out by getParameter; free(NULL) is a no-op
+  p_data_ = NULL;
+  releaseParameters();  // release the payload of every stored entry
+  param_table_.clear();
+}
+
+// Adds an entry for param holding a copy of the param_size bytes at p_data.
+// Fails if the key is already present or VarData::set rejects the input.
+bool ParameterSet::setParameter(uint32_t param, uint32_t param_size, const void* p_data) {
+  const bool key_taken = (param_table_.find(param) != param_table_.end());
+  if (key_taken) {
+    return false;
+  }
+
+  VarData entry;
+  const bool copied = entry.set(param_size, p_data);
+  if (copied) param_table_.insert(VarDataMap::value_type(param, entry));
+  return copied;
+}
+
+// Copies the data stored under param into a buffer owned by this object and
+// hands it out via pp_data (byte count in ret_size). The buffer is valid only
+// until the next getParameter call or destruction; callers must not free it.
+bool ParameterSet::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
+  if (!pp_data) {
+    return false;
+  }
+
+  VarDataMap::iterator it = param_table_.find(param);
+  if (it == param_table_.end()) {
+    return false;
+  }
+
+  // uint32_t, not int: a size >= 2^31 must not turn negative before malloc.
+  const uint32_t size = it->second.getSize();
+  if (size == 0) {
+    return false;
+  }
+
+  free(p_data_);  // the buffer of the previous query is released here
+  p_data_ = malloc(size);
+  if (!p_data_) {
+    return false;
+  }
+
+  *pp_data = p_data_;
+  ret_size = it->second.get(size, p_data_);  // reuse it; no second lookup
+  return true;
+}
+
+// Frees the payload held by every entry of the parameter table; the emptied
+// entries stay in the table. Always returns true.
+bool ParameterSet::releaseParameters() {
+  const VarDataMap::iterator stop = param_table_.end();
+  for (VarDataMap::iterator entry = param_table_.begin(); entry != stop;
+       entry++) {
+    entry->second.clear();
+  }
+  return true;
+}
+
+}  // pm4_profile
@@ -0,0 +1,75 @@
+#ifndef _PARAMETER_SET_H_
+#define _PARAMETER_SET_H_
+
+/*!
+   \note This file contains the declaration of the ParameterSet class.
+ */
+#include "hsa_perf.h"
+#include "var_data.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+
+namespace pm4_profile {
+/*!
+   A class defining a container to hold a parameter data set
+   (e.g. PMU parameter, CounterGroup parameter, etc.).
+ */
+class ParameterSet {
+ public:
+  /*!
+     Enumeration containing types of parameters (only the end marker so far)
+   */
+  enum parameter {
+    PARAM_MAX,
+  };
+
+  /*! ParameterSet constructor */
+  ParameterSet();
+
+  /*! ParameterSet destructor; releases stored data and the query buffer */
+  virtual ~ParameterSet();
+
+  /*!
+     Query value of the parameter specified by param
+     @param[in] param The enumeration of parameter to be queried
+     @param[out] ret_size The returned size of data
+     @param[out] pp_data The pointer to the returned data (owned by this object)
+     \return true or false
+   */
+  bool getParameter(
+      /*in*/ uint32_t param,
+      /*out*/ uint32_t& ret_size,
+      /*out*/ void** pp_data);
+
+  /*!
+     Set value for the parameter specified by param
+     @param[in] param The enumeration of parameter to be set
+     @param[in] param_size The size of data
+     @param[in] p_data The pointer to the data to be set
+     \return true on success; false if param is already set or the data is bad
+   */
+  bool setParameter(
+      /*in*/ uint32_t param,
+      /*in*/ uint32_t param_size,
+      /*in*/ const void* p_data);
+
+ private:
+  /*!
+     Release the data held by every entry of the parameter table
+  */
+  bool releaseParameters();
+
+  /*!
+     ParameterSet property: The parameter table
+   */
+  VarDataMap param_table_;
+
+  /*!
+    Buffer handed out by getParameter; freed on the next call and on destruction
+   */
+  void* p_data_;
+};
+}
+
+#endif  // _PARAMETER_SET_H_
@@ -0,0 +1,254 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _ROCR_PROFILER_H_
+#define _ROCR_PROFILER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if defined _WIN32 || defined __CYGWIN__
+#ifdef __GNUC__
+#define HSA_TOOLS_API __attribute__((dllexport))
+#else
+#define HSA_TOOLS_API __declspec(dllexport)  // Note: gcc seems to
+// support this syntax
+// as well.
+#endif  // __GNUC__
+#ifndef DLL_LOCAL
+#define DLL_LOCAL
+#endif  // DLL_LOCAL
+
+#else  // defined _WIN32 || defined __CYGWIN__
+#if __GNUC__ >= 4
+#define HSA_TOOLS_API __attribute__((visibility("default")))
+#ifndef DLL_LOCAL
+#define DLL_LOCAL __attribute__((visibility("hidden")))
+#endif  // DLL_LOCAL
+#else
+#define HSA_TOOLS_API
+#ifndef DLL_LOCAL
+#define DLL_LOCAL
+#endif  // DLL_LOCAL
+#endif  // __GNUC__ >= 4
+#endif  // defined _WIN32 || defined __CYGWIN__
+
+//---------------------------------------------------------------------------//
+// @brief Enumeration of various information that is set for a counter.      //
+// @details This enumeration defines the various counter info that could be  //
+//         used in a counter. This is used by a counter object to specify    //
+//         its type and other conditions that are needed to retrieve a       //
+//         counter value.                                                    //
+//---------------------------------------------------------------------------//
+typedef enum hsa_ext_tools_counter_parameter_s {
+  // Event index of a counter
+  HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX = 0,
+
+  // SIMD mask of a counter
+  HSA_EXT_TOOLS_COUNTER_PARAMETER_SIMD_MASK = 1,
+
+  // Shader engine mask of a counter
+  HSA_EXT_TOOLS_COUNTER_PARAMETER_SHADER_MASK = 2,
+
+  // Number of counter parameters (one past the last valid index)
+  HSA_EXT_TOOLS_COUNTER_PARAMETER_INFO_MAX
+} hsa_ext_tools_counter_parameter_t;
+
+//---------------------------------------------------------------------------//
+// @brief Enumeration of counter block type mask                             //
+// @details This enumeration defines the bit mask representing types of      //
+// counter group supported by HSA. This is used by counter block object to   //
+// specify its type.                                                         //
+//---------------------------------------------------------------------------//
+typedef enum hsa_ext_tools_counter_block_type_s {
+  // Unknown counter block type
+  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_UNKNOWN = 0,
+
+  // A CounterBlock of this type can be accessed at any time.
+  // note Examples are software Counters and CPU Counters.
+  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_SYNC = 1,
+
+  // A CounterBlock of this type can be accessed asynchronously.
+  // It is required that the Counter must be stopped
+  // before accessing.
+  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC = 2,
+
+  // A CounterBlock of this type is used for generating a trace.
+  // NOTE(review): 3 == SYNC | ASYNC, so the values are not distinct bits.
+  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_TRACE = 3,
+
+  // Number of CounterBlock types (one past the last valid value)
+  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_MAX
+} hsa_ext_tools_counter_block_type_t;
+
+//---------------------------------------------------------------------------//
+// @brief Enumeration of various information that is set for a counter block.//
+// @detail This enumeration defines the various info that could be used      //
+// in a counter block. This is used by a counter object to specify its type  //
+// and other conditions that are needed for a counter block.                 //
+//---------------------------------------------------------------------------//
+/*
+typedef enum hsa_ext_tools_counter_block_info_s {
+  // Index of a counter block
+  HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_EVENT_INDEX = 0,
+
+  // Shader bits of a counter block
+  HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_SHADER_BITS = 1,
+
+  // Simd mask of a counter
+  HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_CONTROL_METHOD = 2,
+
+  // Max index of counter block info
+  HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_MAX
+} hsa_ext_tools_counter_block_info_t;
+*/
+
+//---------------------------------------------------------------------------//
+// Enumeration for the methods used to index into the correct registers.    //
+//---------------------------------------------------------------------------//
+/*
+typedef enum hsa_ext_tools_counter_index_method_s {
+  // No index
+  HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_NONE = 0,
+
+  // Index by block instance
+  HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_INSTANCE = 1,
+
+  // Index by shader engine
+  HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE = 2,
+
+  // Index by shader and instance
+  HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE_ANDINSTANCE = 3
+} hsa_ext_tools_counter_index_method_t;
+*/
+
+//---------------------------------------------------------------------------//
+// Enumeration for the HSAPerf generic error codes                           //
+//---------------------------------------------------------------------------//
+/*
+typedef enum hsa_ext_tools_error_codes_s {
+  // Successful
+  HSA_EXT_TOOLS_ERROR_CODE_OK = 0,
+
+  // Generic error code
+  HSA_EXT_TOOLS_ERROR_CODE_ERROR,
+
+  // Generic invalid HSAPerf API arguments
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_ARGS,
+
+  // The operation is not permit due to currently in the unmodifiable
+  // HSAPerf state .
+  HSA_EXT_TOOLS_ERROR_CODE_UNMODIFIABLE_STATE,
+
+  // The hsa_ext_tools_set_pmu_parameter() or
+  // hsa_ext_tools_get_pmu_parameter() API contains invalid parameter value.
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM,
+
+  // The hsa_ext_tools_set_pmu_parameter() or
+  // hsa_ext_tools_get_pmu_parameter() API contains invalid parameter size
+  // or return size.
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_SIZE,
+
+  // The hsa_ext_tools_set_pmu_parameter() or
+  // hsa_ext_tools_get_pmu_parameter() API contains invalid
+  // pointer (e.g. NULL).
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_DATA,
+
+  // The hsa_ext_tools_get_pmu_info() API contains invalid info value.
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO,
+
+  // The hsa_ext_tools_get_pmu_info() API contains invalid info
+  // size (e.g. zero).
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_SIZE,
+
+  // The hsa_ext_tools_get_pmu_info() API contains invalid
+  // data (e.g. NULL).
+  HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_DATA
+} hsa_ext_tools_error_codes_t;
+*/
+
+//---------------------------------------------------------------------------//
+// Enumeration for Pmu profiling state                                       //
+//---------------------------------------------------------------------------//
+typedef enum rocr_pmu_state_s {
+  // Profiling idle. In this state, changes can be made to
+  // the PMU, counter blocks and counters. This state represents
+  // the moment prior to calling begin or after calling
+  // hsa_ext_tools_pmu_wait_for_completion().
+  ROCR_PMU_STATE_IDLE,
+
+  // Profiling start. In this state, changes cannot be made to
+  // the PMU, counter blocks and counters. The PMU is collecting
+  // performance counter data. This state represents
+  // the moment after calling hsa_ext_tools_pmu_begin() and before calling
+  // hsa_ext_tools_pmu_end().
+  ROCR_PMU_STATE_START,
+
+  // Profiling stop. In this state, changes cannot be made to
+  // the PMU, counter blocks and counters. The PMU has stopped the
+  // performance counter data collection. However, the result
+  // might not yet be available. This state represents
+  // the moment after calling hsa_ext_tools_pmu_end() and before the call
+  // to hsa_ext_tools_pmu_wait_for_completion() has returned success.
+  ROCR_PMU_STATE_STOP
+} rocr_pmu_state_t;
+
+//---------------------------------------------------------------------------//
+//  Opaque pointer to HSA performance monitor unit (PMU)                     //
+//---------------------------------------------------------------------------//
+// typedef void *  hsa_ext_tools_pmu_t;
+
+//---------------------------------------------------------------------------//
+// Opaque pointer to HSA counter block                                       //
+//---------------------------------------------------------------------------//
+// typedef void *  hsa_ext_tools_counter_block_t;
+
+//---------------------------------------------------------------------------//
+// Opaque pointer to HSA counter                                             //
+//---------------------------------------------------------------------------//
+// typedef void *  hsa_ext_tools_counter_t;
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+#endif  // _ROCR_PROFILER_H_
@@ -0,0 +1,48 @@
+#include <string.h>
+#include "var_data.h"
+
+namespace pm4_profile {
+// Creates an empty object: no buffer attached and a stored size of zero.
+VarData::VarData() : size_(0), p_data_(NULL) {}
+
+// Destructor. The body is empty, so the buffer held in p_data_ is NOT
+// released here; call clear() to free it.
+// NOTE(review): this may be deliberate. VarData declares no copy constructor
+// or assignment operator, so copies share the same p_data_ pointer and a
+// free() here could release a buffer another copy still uses — confirm before
+// changing.
+VarData::~VarData() {}
+
+// Releases the stored buffer, if there is one, and resets the object to the
+// empty state (NULL pointer, size zero).
+void VarData::clear() {
+  if (p_data_ != NULL) {
+    free(p_data_);
+  }
+  p_data_ = NULL;
+  size_ = 0;
+}
+
+// Replaces the stored data with a private copy of `size` bytes read from
+// `p_data`.
+//
+// Returns false, leaving the current contents untouched, when `p_data` is
+// NULL or `size` is zero. Returns false with the object left empty when the
+// allocation fails. Returns true once the copy has been stored.
+bool VarData::set(uint32_t size, const void* p_data) {
+  const bool nothing_to_store = (size == 0) || (p_data == NULL);
+  if (nothing_to_store) {
+    return false;
+  }
+
+  // The previous buffer is dropped before the replacement is allocated.
+  clear();
+
+  p_data_ = malloc(size);
+  if (p_data_ == NULL) {
+    return false;
+  }
+
+  memcpy(p_data_, p_data, size);
+  size_ = size;
+  return true;
+}
+
+// Copies the stored data into the caller's buffer `p_data`, which can hold
+// `size` bytes. At most `size` bytes are written, so a buffer smaller than
+// the stored data receives a truncated copy.
+//
+// Returns the number of bytes copied, or 0 when the destination is NULL or
+// zero-sized, or when nothing is stored.
+uint32_t VarData::get(uint32_t size, void* p_data) {
+  const bool have_destination = (p_data != NULL) && (size != 0);
+  const bool have_source = (p_data_ != NULL) && (size_ != 0);
+  if (!(have_destination && have_source)) {
+    return 0;
+  }
+
+  uint32_t copy_len = size_;
+  if (size < copy_len) {
+    copy_len = size;
+  }
+
+  memcpy(p_data, p_data_, copy_len);
+  return copy_len;
+}
+}  // pm4_profile
@@ -0,0 +1,65 @@
+#ifndef _VAR_DATA_H_
+#define _VAR_DATA_H_
+
+/*!
+   \note This file contains the declaration of the VarData class.
+ */
+
+#include "hsa_perf.h"
+
+#include <map>
+#include <stdlib.h>
+#include <stdint.h>
+
+namespace pm4_profile {
+/*!
+   Variable-size storage for information and parameter sets.
+
+   The object keeps a size and a pointer to the stored bytes. It declares no
+   copy constructor or assignment operator, so the compiler-generated ones
+   are used and a copy holds the same pointer as the object it was copied
+   from.
+ */
+class VarData {
+ public:
+  /*! Constructor for VarData */
+  VarData();
+
+  /*! Destructor for VarData */
+  ~VarData();
+
+  /*! Deallocate the memory and clean up */
+  void clear();
+
+  /*!
+     Set the data to be stored.
+     @param[in] size Size (in bytes) of the data to be stored.
+     @param[in] p_data Pointer to the data to be stored.
+     \return true if the data was stored, false otherwise.
+   */
+  bool set(uint32_t size, const void* p_data);
+
+  /*!
+     Query the data that was stored.
+     @param[in] size Size (in bytes) of the memory pointed to by p_data.
+       This determines the maximum size of the returned data.
+     @param[out] p_data Pointer to the result buffer.
+     \return Size (in bytes) of the returned result which is copied into
+       the buffer pointed to by p_data.
+   */
+  uint32_t get(uint32_t size, void* p_data);
+
+  /*!
+     Get size of the current data stored.
+     Const-qualified so it can be called through a const reference, for
+     example while iterating over a const VarDataMap.
+     \return Size (in bytes) of the data stored.
+   */
+  uint32_t getSize() const { return size_; }
+
+ private:
+  /*! Size (in bytes) of the data being stored */
+  uint32_t size_;
+
+  /*! Pointer to the stored data */
+  void* p_data_;
+};
+
+/*!
+   Ordered map from a 32-bit key to a VarData value.
+   NOTE(review): what the key identifies is decided by the code that uses
+   this map, which is not visible in this header - confirm.
+ */
+typedef std::map<uint32_t, VarData> VarDataMap;
+}
+#endif
@@ -0,0 +1,622 @@
+#include "vi_blockinfo.h"
+#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
+
+namespace pm4_profile {
+/**
+ * Table containing CounterGroups which represent VI hardware blocks
+ * as defined by \ref GpuBlockInfo structure
+ *
+ * There is one row per exposed counter block. Blocks listed with several
+ * instances (CB, DB, TA, TCA, TCC, TD, TCP) get one row per instance, with
+ * the instance number appended to the name. The table ends with a sentinel
+ * row whose name is the empty string and whose id is
+ * kHsaViCounterBlockIdBlocksLast.
+ *
+ * NOTE(review): the meaning of each column is fixed by the GpuBlockInfo
+ * declaration in gpu_blockinfo.h, which is not shown here. Judging only from
+ * the values, a row starts with the block name and its kHsaViCounterBlockId*
+ * id, and later columns carry the instance count (VI_NUM_* or 1), the
+ * CntlMethod* selector and the VI_COUNTER_NUM_PER_* register count - confirm
+ * against the struct before relying on this.
+ */
+GpuBlockInfo ViPmuHwBlocks[] = {
+    // Counter block CB
+    {"VI_CB0", kHsaViCounterBlockIdCb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_CB1", kHsaViCounterBlockIdCb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_CB2", kHsaViCounterBlockIdCb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_CB3", kHsaViCounterBlockIdCb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
+     CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block CPF
+    {"VI_CPF", kHsaViCounterBlockIdCpf, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
+     VI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block DB
+    {"VI_DB0", kHsaViCounterBlockIdDb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_DB1", kHsaViCounterBlockIdDb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_DB2", kHsaViCounterBlockIdDb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_DB3", kHsaViCounterBlockIdDb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
+     CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block GRBM
+    {"VI_GRBM", kHsaViCounterBlockIdGrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33,
+     VI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block GRBMSE
+    {"VI_GRBMSE", kHsaViCounterBlockIdGrbmSe, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14,
+     VI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block PA_SU
+    {"VI_PA_SU", kHsaViCounterBlockIdPaSu, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152,
+     VI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block PA_SC
+    {"VI_PA_SC", kHsaViCounterBlockIdPaSc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396,
+     VI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SPI
+    {"VI_SPI", kHsaViCounterBlockIdSpi, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196,
+     VI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SQ
+    {"VI_SQ", kHsaViCounterBlockIdSq, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_ES", kHsaViCounterBlockIdSqEs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_GS", kHsaViCounterBlockIdSqGs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_VS", kHsaViCounterBlockIdSqVs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_PS", kHsaViCounterBlockIdSqPs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_LS", kHsaViCounterBlockIdSqLs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_HS", kHsaViCounterBlockIdSqHs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_SQ_CS", kHsaViCounterBlockIdSqCs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
+     VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SX
+    {"VI_SX", kHsaViCounterBlockIdSx, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33,
+     VI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TA
+    {"VI_TA0", kHsaViCounterBlockIdTa0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA1", kHsaViCounterBlockIdTa1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA2", kHsaViCounterBlockIdTa2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA3", kHsaViCounterBlockIdTa3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA4", kHsaViCounterBlockIdTa4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA5", kHsaViCounterBlockIdTa5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA6", kHsaViCounterBlockIdTa6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA7", kHsaViCounterBlockIdTa7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA8", kHsaViCounterBlockIdTa8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA9", kHsaViCounterBlockIdTa9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA10", kHsaViCounterBlockIdTa10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA11", kHsaViCounterBlockIdTa11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA12", kHsaViCounterBlockIdTa12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA13", kHsaViCounterBlockIdTa13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA14", kHsaViCounterBlockIdTa14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TA15", kHsaViCounterBlockIdTa15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
+     CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TCA
+    {"VI_TCA0", kHsaViCounterBlockIdTca0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA,
+     CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCA1", kHsaViCounterBlockIdTca1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA,
+     CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TCC
+    {"VI_TCC0", kHsaViCounterBlockIdTcc0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC1", kHsaViCounterBlockIdTcc1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC2", kHsaViCounterBlockIdTcc2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC3", kHsaViCounterBlockIdTcc3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC4", kHsaViCounterBlockIdTcc4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC5", kHsaViCounterBlockIdTcc5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC6", kHsaViCounterBlockIdTcc6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC7", kHsaViCounterBlockIdTcc7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC8", kHsaViCounterBlockIdTcc8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC9", kHsaViCounterBlockIdTcc9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC10", kHsaViCounterBlockIdTcc10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC11", kHsaViCounterBlockIdTcc11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC12", kHsaViCounterBlockIdTcc12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC13", kHsaViCounterBlockIdTcc13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC14", kHsaViCounterBlockIdTcc14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCC15", kHsaViCounterBlockIdTcc15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
+     CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TD
+    {"VI_TD0", kHsaViCounterBlockIdTd0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD1", kHsaViCounterBlockIdTd1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD2", kHsaViCounterBlockIdTd2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD3", kHsaViCounterBlockIdTd3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD4", kHsaViCounterBlockIdTd4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD5", kHsaViCounterBlockIdTd5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD6", kHsaViCounterBlockIdTd6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD7", kHsaViCounterBlockIdTd7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD8", kHsaViCounterBlockIdTd8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD9", kHsaViCounterBlockIdTd9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD10", kHsaViCounterBlockIdTd10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD11", kHsaViCounterBlockIdTd11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD12", kHsaViCounterBlockIdTd12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD13", kHsaViCounterBlockIdTd13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD14", kHsaViCounterBlockIdTd14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TD15", kHsaViCounterBlockIdTd15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
+     CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block TCP
+    {"VI_TCP0", kHsaViCounterBlockIdTcp0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP1", kHsaViCounterBlockIdTcp1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP2", kHsaViCounterBlockIdTcp2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP3", kHsaViCounterBlockIdTcp3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP4", kHsaViCounterBlockIdTcp4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP5", kHsaViCounterBlockIdTcp5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP6", kHsaViCounterBlockIdTcp6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP7", kHsaViCounterBlockIdTcp7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP8", kHsaViCounterBlockIdTcp8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP9", kHsaViCounterBlockIdTcp9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP10", kHsaViCounterBlockIdTcp10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP11", kHsaViCounterBlockIdTcp11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP12", kHsaViCounterBlockIdTcp12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP13", kHsaViCounterBlockIdTcp13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP14", kHsaViCounterBlockIdTcp14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+    {"VI_TCP15", kHsaViCounterBlockIdTcp15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
+     CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block GDS
+    {"VI_GDS", kHsaViCounterBlockIdGds, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120,
+     VI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block VGT
+    {"VI_VGT", kHsaViCounterBlockIdVgt, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
+     VI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block IA
+    {"VI_IA", kHsaViCounterBlockIdIa, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
+     VI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block MC
+    {"VI_MC", kHsaViCounterBlockIdMc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
+     VI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SRBM
+    {"VI_SRBM", kHsaViCounterBlockIdSrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
+     VI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block WD
+    {"VI_WD", kHsaViCounterBlockIdWd, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
+     VI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block CPG
+    {"VI_CPG", kHsaViCounterBlockIdCpg, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
+     VI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block CPC
+    {"VI_CPC", kHsaViCounterBlockIdCpc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
+     VI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block IOMMUV2
+    {"VI_IOMMUV2", kHsaViCounterBlockIdIommuV2, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
+     8, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block KernelDriver
+    {"VI_KD", kHsaViCounterBlockIdKernelDriver, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
+     0, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Name of the last line should be empty to indicate end of all counter groups
+    {"", kHsaViCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
+     0}};
+
+/*
+ * The following tables contain the register addresses of the performance
+ * counter registers, one table per hardware block.
+ */
+
+/*
+ * SQ
+ *
+ * One row per SQ performance counter (16 rows, counters 0-15). Going by the
+ * register names, each row lists the counter's SELECT register, the
+ * SQ_PERFCOUNTER_CTRL register (the same one in every row) and the counter's
+ * LO and HI result registers.
+ * NOTE(review): the field names belong to GpuCounterRegInfo, which is declared
+ * outside this file - confirm the column order against that struct.
+ */
+GpuCounterRegInfo ViSqCounterRegAddr[] = {
+    {mmSQ_PERFCOUNTER0_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER0_LO__CI__VI,
+     mmSQ_PERFCOUNTER0_HI__CI__VI},
+    {mmSQ_PERFCOUNTER1_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER1_LO__CI__VI,
+     mmSQ_PERFCOUNTER1_HI__CI__VI},
+    {mmSQ_PERFCOUNTER2_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER2_LO__CI__VI,
+     mmSQ_PERFCOUNTER2_HI__CI__VI},
+    {mmSQ_PERFCOUNTER3_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER3_LO__CI__VI,
+     mmSQ_PERFCOUNTER3_HI__CI__VI},
+    {mmSQ_PERFCOUNTER4_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER4_LO__CI__VI,
+     mmSQ_PERFCOUNTER4_HI__CI__VI},
+    {mmSQ_PERFCOUNTER5_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER5_LO__CI__VI,
+     mmSQ_PERFCOUNTER5_HI__CI__VI},
+    {mmSQ_PERFCOUNTER6_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER6_LO__CI__VI,
+     mmSQ_PERFCOUNTER6_HI__CI__VI},
+    {mmSQ_PERFCOUNTER7_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER7_LO__CI__VI,
+     mmSQ_PERFCOUNTER7_HI__CI__VI},
+    {mmSQ_PERFCOUNTER8_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER8_LO__CI__VI,
+     mmSQ_PERFCOUNTER8_HI__CI__VI},
+    {mmSQ_PERFCOUNTER9_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER9_LO__CI__VI,
+     mmSQ_PERFCOUNTER9_HI__CI__VI},
+    {mmSQ_PERFCOUNTER10_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER10_LO__CI__VI, mmSQ_PERFCOUNTER10_HI__CI__VI},
+    {mmSQ_PERFCOUNTER11_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER11_LO__CI__VI, mmSQ_PERFCOUNTER11_HI__CI__VI},
+    {mmSQ_PERFCOUNTER12_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER12_LO__CI__VI, mmSQ_PERFCOUNTER12_HI__CI__VI},
+    {mmSQ_PERFCOUNTER13_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER13_LO__CI__VI, mmSQ_PERFCOUNTER13_HI__CI__VI},
+    {mmSQ_PERFCOUNTER14_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER14_LO__CI__VI, mmSQ_PERFCOUNTER14_HI__CI__VI},
+    {mmSQ_PERFCOUNTER15_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER15_LO__CI__VI, mmSQ_PERFCOUNTER15_HI__CI__VI}};
+
+/*
+ * DRMDMA
+ *
+ * Four rows: counters 0 and 1 of SDMA0 followed by counters 0 and 1 of SDMA1.
+ * Each row lists the engine's PERFMON_CNTL register, 0, the counter's RESULT
+ * register and 0 (no separate high-word register is listed).
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViDrmdmaCounterRegAddr[] = {
+    {mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER0_RESULT__VI, 0},
+    {mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER1_RESULT__VI, 0},
+    {mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER0_RESULT__VI, 0},
+    {mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER1_RESULT__VI, 0},
+};
+
+/*
+ * IH
+ *
+ * Two rows, one per IH counter. Each row lists IH_PERFMON_CNTL (shared by
+ * both rows), 0, the counter's RESULT register and 0.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViIhCounterRegAddr[] = {
+    {mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER0_RESULT__VI, 0},
+    {mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER1_RESULT__VI, 0}};
+
+/*
+ * CPF
+ *
+ * Two rows, one per CPF counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViCpfCounterRegAddr[] = {
+    {mmCPF_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER0_LO__CI__VI,
+     mmCPF_PERFCOUNTER0_HI__CI__VI},
+    {mmCPF_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER1_LO__CI__VI,
+     mmCPF_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * DRM
+ *
+ * Two rows: SELECT register, 0, LO result register, HI result register.
+ * Unlike the other tables, the registers used here are numbered 1 and 2
+ * rather than starting at 0.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViDrmCounterRegAddr[] = {
+    {mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
+    {mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}};
+
+/*
+ * GRBM
+ *
+ * Two rows, one per GRBM counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViGrbmCounterRegAddr[] = {
+    {mmGRBM_PERFCOUNTER0_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER0_LO__CI__VI,
+     mmGRBM_PERFCOUNTER0_HI__CI__VI},
+    {mmGRBM_PERFCOUNTER1_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER1_LO__CI__VI,
+     mmGRBM_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * GRBM_SE
+ *
+ * Four rows. Here the row index selects the shader engine (SE0-SE3) rather
+ * than a counter number: each row lists that engine's SELECT register, 0,
+ * and its LO and HI result registers.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViGrbmSeCounterRegAddr[] = {
+    {mmGRBM_SE0_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE0_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE0_PERFCOUNTER_HI__CI__VI},
+    {mmGRBM_SE1_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE1_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE1_PERFCOUNTER_HI__CI__VI},
+    {mmGRBM_SE2_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE2_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE2_PERFCOUNTER_HI__CI__VI},
+    {mmGRBM_SE3_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE3_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE3_PERFCOUNTER_HI__CI__VI}};
+
+/*
+ * PA_SU
+ *
+ * Four rows, one per PA_SU counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViPaSuCounterRegAddr[] = {
+    {mmPA_SU_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER0_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER0_HI__CI__VI},
+    {mmPA_SU_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER1_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER1_HI__CI__VI},
+    {mmPA_SU_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER2_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER2_HI__CI__VI},
+    {mmPA_SU_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER3_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * PA_SC
+ *
+ * Four rows, for PA_SC counters 0-3: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): vi_blockinfo.h defines VI_COUNTER_NUM_PER_PA_SC as 8, while
+ * this table has only 4 rows; ViScCounterRegAddr lists PA_SC counters 0-7.
+ * Confirm which table the PA_SC block indexes and that no index above 3 is
+ * used with this one.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViPaScCounterRegAddr[] = {
+    {mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER0_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER1_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER2_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * SPI
+ *
+ * Six rows, one per SPI counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViSpiCounterRegAddr[] = {
+    {mmSPI_PERFCOUNTER0_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER0_LO__CI__VI,
+     mmSPI_PERFCOUNTER0_HI__CI__VI},
+    {mmSPI_PERFCOUNTER1_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER1_LO__CI__VI,
+     mmSPI_PERFCOUNTER1_HI__CI__VI},
+    {mmSPI_PERFCOUNTER2_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER2_LO__CI__VI,
+     mmSPI_PERFCOUNTER2_HI__CI__VI},
+    {mmSPI_PERFCOUNTER3_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER3_LO__CI__VI,
+     mmSPI_PERFCOUNTER3_HI__CI__VI},
+    {mmSPI_PERFCOUNTER4_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER4_LO__CI__VI,
+     mmSPI_PERFCOUNTER4_HI__CI__VI},
+    {mmSPI_PERFCOUNTER5_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER5_LO__CI__VI,
+     mmSPI_PERFCOUNTER5_HI__CI__VI}};
+
+/*
+ * TCA
+ *
+ * Four rows, one per TCA counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViTcaCounterRegAddr[] = {
+    {mmTCA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER0_LO__CI__VI,
+     mmTCA_PERFCOUNTER0_HI__CI__VI},
+    {mmTCA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER1_LO__CI__VI,
+     mmTCA_PERFCOUNTER1_HI__CI__VI},
+    {mmTCA_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER2_LO__CI__VI,
+     mmTCA_PERFCOUNTER2_HI__CI__VI},
+    {mmTCA_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER3_LO__CI__VI,
+     mmTCA_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * TCC
+ *
+ * Four rows, one per TCC counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViTccCounterRegAddr[] = {
+    {mmTCC_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER0_LO__CI__VI,
+     mmTCC_PERFCOUNTER0_HI__CI__VI},
+    {mmTCC_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER1_LO__CI__VI,
+     mmTCC_PERFCOUNTER1_HI__CI__VI},
+    {mmTCC_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER2_LO__CI__VI,
+     mmTCC_PERFCOUNTER2_HI__CI__VI},
+    {mmTCC_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER3_LO__CI__VI,
+     mmTCC_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * TCP
+ *
+ * Four rows, one per TCP counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViTcpCounterRegAddr[] = {
+    {mmTCP_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER0_LO__CI__VI,
+     mmTCP_PERFCOUNTER0_HI__CI__VI},
+    {mmTCP_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER1_LO__CI__VI,
+     mmTCP_PERFCOUNTER1_HI__CI__VI},
+    {mmTCP_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER2_LO__CI__VI,
+     mmTCP_PERFCOUNTER2_HI__CI__VI},
+    {mmTCP_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER3_LO__CI__VI,
+     mmTCP_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * CB
+ *
+ * Four rows, one per CB counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViCbCounterRegAddr[] = {
+    {mmCB_PERFCOUNTER0_SELECT__CI__VI, 0, mmCB_PERFCOUNTER0_LO__CI__VI,
+     mmCB_PERFCOUNTER0_HI__CI__VI},
+    {mmCB_PERFCOUNTER1_SELECT__CI__VI, 0, mmCB_PERFCOUNTER1_LO__CI__VI,
+     mmCB_PERFCOUNTER1_HI__CI__VI},
+    {mmCB_PERFCOUNTER2_SELECT__CI__VI, 0, mmCB_PERFCOUNTER2_LO__CI__VI,
+     mmCB_PERFCOUNTER2_HI__CI__VI},
+    {mmCB_PERFCOUNTER3_SELECT__CI__VI, 0, mmCB_PERFCOUNTER3_LO__CI__VI,
+     mmCB_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * DB
+ *
+ * Four rows, one per DB counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViDbCounterRegAddr[] = {
+    {mmDB_PERFCOUNTER0_SELECT__CI__VI, 0, mmDB_PERFCOUNTER0_LO__CI__VI,
+     mmDB_PERFCOUNTER0_HI__CI__VI},
+    {mmDB_PERFCOUNTER1_SELECT__CI__VI, 0, mmDB_PERFCOUNTER1_LO__CI__VI,
+     mmDB_PERFCOUNTER1_HI__CI__VI},
+    {mmDB_PERFCOUNTER2_SELECT__CI__VI, 0, mmDB_PERFCOUNTER2_LO__CI__VI,
+     mmDB_PERFCOUNTER2_HI__CI__VI},
+    {mmDB_PERFCOUNTER3_SELECT__CI__VI, 0, mmDB_PERFCOUNTER3_LO__CI__VI,
+     mmDB_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * RLC
+ *
+ * Two rows, one per RLC counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViRlcCounterRegAddr[] = {
+    {mmRLC_PERFCOUNTER0_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER0_LO__CI__VI,
+     mmRLC_PERFCOUNTER0_HI__CI__VI},
+    {mmRLC_PERFCOUNTER1_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER1_LO__CI__VI,
+     mmRLC_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * SC
+ *
+ * Eight rows built from the PA_SC register set (PA_SC counters 0-7):
+ * SELECT register, 0, LO result register, HI result register. The first four
+ * rows are identical to the rows of ViPaScCounterRegAddr.
+ * NOTE(review): it is not visible here which of the two tables the PA_SC
+ * counter block uses - confirm against the code that indexes them.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViScCounterRegAddr[] = {
+    {mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER0_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER1_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER2_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER3_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER4_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER4_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER4_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER5_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER5_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER5_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER6_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER6_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER6_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER7_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER7_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER7_HI__CI__VI}};
+
+/*
+ * SX
+ *
+ * Four rows, one per SX counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViSxCounterRegAddr[] = {
+    {mmSX_PERFCOUNTER0_SELECT__CI__VI, 0, mmSX_PERFCOUNTER0_LO__CI__VI,
+     mmSX_PERFCOUNTER0_HI__CI__VI},
+    {mmSX_PERFCOUNTER1_SELECT__CI__VI, 0, mmSX_PERFCOUNTER1_LO__CI__VI,
+     mmSX_PERFCOUNTER1_HI__CI__VI},
+    {mmSX_PERFCOUNTER2_SELECT__CI__VI, 0, mmSX_PERFCOUNTER2_LO__CI__VI,
+     mmSX_PERFCOUNTER2_HI__CI__VI},
+    {mmSX_PERFCOUNTER3_SELECT__CI__VI, 0, mmSX_PERFCOUNTER3_LO__CI__VI,
+     mmSX_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * TA
+ *
+ * Two rows, one per TA counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViTaCounterRegAddr[] = {
+    {mmTA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTA_PERFCOUNTER0_LO__CI__VI,
+     mmTA_PERFCOUNTER0_HI__CI__VI},
+    {mmTA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTA_PERFCOUNTER1_LO__CI__VI,
+     mmTA_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * TD
+ *
+ * Two rows, one per TD counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViTdCounterRegAddr[] = {
+    {mmTD_PERFCOUNTER0_SELECT__CI__VI, 0, mmTD_PERFCOUNTER0_LO__CI__VI,
+     mmTD_PERFCOUNTER0_HI__CI__VI},
+    {mmTD_PERFCOUNTER1_SELECT__CI__VI, 0, mmTD_PERFCOUNTER1_LO__CI__VI,
+     mmTD_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * GDS
+ *
+ * Four rows, one per GDS counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViGdsCounterRegAddr[] = {
+    {mmGDS_PERFCOUNTER0_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER0_LO__CI__VI,
+     mmGDS_PERFCOUNTER0_HI__CI__VI},
+    {mmGDS_PERFCOUNTER1_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER1_LO__CI__VI,
+     mmGDS_PERFCOUNTER1_HI__CI__VI},
+    {mmGDS_PERFCOUNTER2_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER2_LO__CI__VI,
+     mmGDS_PERFCOUNTER2_HI__CI__VI},
+    {mmGDS_PERFCOUNTER3_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER3_LO__CI__VI,
+     mmGDS_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * VGT
+ *
+ * Four rows, one per VGT counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViVgtCounterRegAddr[] = {
+    {mmVGT_PERFCOUNTER0_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER0_LO__CI__VI,
+     mmVGT_PERFCOUNTER0_HI__CI__VI},
+    {mmVGT_PERFCOUNTER1_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER1_LO__CI__VI,
+     mmVGT_PERFCOUNTER1_HI__CI__VI},
+    {mmVGT_PERFCOUNTER2_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER2_LO__CI__VI,
+     mmVGT_PERFCOUNTER2_HI__CI__VI},
+    {mmVGT_PERFCOUNTER3_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER3_LO__CI__VI,
+     mmVGT_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * IA
+ *
+ * Four rows, one per IA counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViIaCounterRegAddr[] = {
+    {mmIA_PERFCOUNTER0_SELECT__CI__VI, 0, mmIA_PERFCOUNTER0_LO__CI__VI,
+     mmIA_PERFCOUNTER0_HI__CI__VI},
+    {mmIA_PERFCOUNTER1_SELECT__CI__VI, 0, mmIA_PERFCOUNTER1_LO__CI__VI,
+     mmIA_PERFCOUNTER1_HI__CI__VI},
+    {mmIA_PERFCOUNTER2_SELECT__CI__VI, 0, mmIA_PERFCOUNTER2_LO__CI__VI,
+     mmIA_PERFCOUNTER2_HI__CI__VI},
+    {mmIA_PERFCOUNTER3_SELECT__CI__VI, 0, mmIA_PERFCOUNTER3_LO__CI__VI,
+     mmIA_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * MC
+ *
+ * Four rows, for MC sequencer counters A, B, C and D. Every row lists the
+ * same MC_SEQ_PERF_SEQ_CTL register first, then 0, then that counter's
+ * CNT_x_I0 and CNT_x_I1 registers.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file); whether I0/I1 play the low/high role used by the other tables
+ * is not visible here - confirm.
+ */
+GpuCounterRegInfo ViMcCounterRegAddr[] = {
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}};
+
+/*
+ * SRBM
+ *
+ * Two rows, one per SRBM counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViSrbmCounterRegAddr[] = {
+    {mmSRBM_PERFCOUNTER0_SELECT__VI, 0, mmSRBM_PERFCOUNTER0_LO__VI, mmSRBM_PERFCOUNTER0_HI__VI},
+    {mmSRBM_PERFCOUNTER1_SELECT__VI, 0, mmSRBM_PERFCOUNTER1_LO__VI, mmSRBM_PERFCOUNTER1_HI__VI}};
+
+/*
+ * WD
+ *
+ * Four rows, one per WD counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViWdCounterRegAddr[] = {
+    {mmWD_PERFCOUNTER0_SELECT__CI__VI, 0, mmWD_PERFCOUNTER0_LO__CI__VI,
+     mmWD_PERFCOUNTER0_HI__CI__VI},
+    {mmWD_PERFCOUNTER1_SELECT__CI__VI, 0, mmWD_PERFCOUNTER1_LO__CI__VI,
+     mmWD_PERFCOUNTER1_HI__CI__VI},
+    {mmWD_PERFCOUNTER2_SELECT__CI__VI, 0, mmWD_PERFCOUNTER2_LO__CI__VI,
+     mmWD_PERFCOUNTER2_HI__CI__VI},
+    {mmWD_PERFCOUNTER3_SELECT__CI__VI, 0, mmWD_PERFCOUNTER3_LO__CI__VI,
+     mmWD_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * CPG
+ *
+ * Two rows, one per CPG counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViCpgCounterRegAddr[] = {
+    {mmCPG_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER0_LO__CI__VI,
+     mmCPG_PERFCOUNTER0_HI__CI__VI},
+    {mmCPG_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER1_LO__CI__VI,
+     mmCPG_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * CPC
+ *
+ * Two rows, one per CPC counter: SELECT register, 0, LO result register,
+ * HI result register.
+ * NOTE(review): column meaning is set by GpuCounterRegInfo (declared outside
+ * this file) - confirm.
+ */
+GpuCounterRegInfo ViCpcCounterRegAddr[] = {
+    {mmCPC_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER0_LO__CI__VI,
+     mmCPC_PERFCOUNTER0_HI__CI__VI},
+    {mmCPC_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER1_LO__CI__VI,
+     mmCPC_PERFCOUNTER1_HI__CI__VI}};
+
+// Private counter-block identifiers for the SQ, MC, IOMMUv2 and kernel-driver
+// blocks. Each one is initialised from four 32-bit constants.
+// NOTE(review): presumably these are fixed 128-bit GUID-style ids that must
+// match values used elsewhere - confirm against GpuPrivCounterBlockId and its
+// users, which are not shown in this file.
+GpuPrivCounterBlockId ViBlockIdSq = {{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}};
+GpuPrivCounterBlockId ViBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}};
+GpuPrivCounterBlockId ViBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}};
+GpuPrivCounterBlockId ViBlockIdKernelDriver = {{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}};
+
+}  // pm4_profile
@@ -0,0 +1,230 @@
+#ifndef _VI_BLOCKINFO_H_
+#define _VI_BLOCKINFO_H_
+
+#include <stdint.h>
+#include "rocr_profiler.h"
+#include "gpu_enum.h"
+#include "gpu_blockinfo.h"
+
+namespace pm4_profile {
+
+// MAX Number of block instances for VOLCANIC ISLANDS (From Fiji)
+// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
+
+// @brief Number of block instances.
+
+// We index per SE and instance
+#define VI_NUM_CB 4  // CB has 4 instances per SE
+#define VI_NUM_DB 4  // DB has 4 instances per SE
+
+// For TA, TD and TCP, the values below are the same as the number of CUs
+// per SH. We index per SE and instance.
+// NOTE(review): these comments previously said "11 instances" while the value
+// is 16; the value agrees with the 16 enum entries per block (e.g. Ta0..Ta15).
+#define VI_NUM_TA 16   // TA: 16 entries (Ta0..Ta15 in HsaViCounterBlockId)
+#define VI_NUM_TD 16   // TD: 16 entries (Td0..Td15 in HsaViCounterBlockId)
+#define VI_NUM_TCP 16  // TCP: 16 entries (Tcp0..Tcp15 in HsaViCounterBlockId)
+
+// These values are per chip, we index directly per instance
+#define VI_NUM_TCA 2   // TCA has 2 instances per chip
+#define VI_NUM_TCC 16  // TCC has 16 instances per chip
+#define VI_NUM_SDMA 2  // There are two SDMA blocks on VI, exposed as 2
+                       // instances here
+
+// Number of counter registers per block for volcanic islands.
+// NOTE(review): for the tables visible next to this header (SRBM, WD, CPG, CPC)
+// the value equals the number of rows in Vi<Block>CounterRegAddr; presumably
+// the same holds for the other blocks -- confirm when changing either side.
+#define VI_COUNTER_NUM_PER_DRM 2
+#define VI_COUNTER_NUM_PER_DRMDMA 2
+#define VI_COUNTER_NUM_PER_IH 2
+#define VI_COUNTER_NUM_PER_SRBM 2
+#define VI_COUNTER_NUM_PER_CB 4
+#define VI_COUNTER_NUM_PER_CPF 2
+#define VI_COUNTER_NUM_PER_DB 4
+#define VI_COUNTER_NUM_PER_GRBM 2
+#define VI_COUNTER_NUM_PER_GRBMSE 4
+#define VI_COUNTER_NUM_PER_PA_SU 4
+#define VI_COUNTER_NUM_PER_RLC 2
+#define VI_COUNTER_NUM_PER_PA_SC 8
+#define VI_COUNTER_NUM_PER_SPI 6  // [Shucai: To do: double check the value]
+#define VI_COUNTER_NUM_PER_SQ 16
+#define VI_COUNTER_NUM_PER_SX 4
+#define VI_COUNTER_NUM_PER_TA 2
+#define VI_COUNTER_NUM_PER_TCA 4
+#define VI_COUNTER_NUM_PER_TCC 4
+#define VI_COUNTER_NUM_PER_TD 2  // [Shucai: To do: double check the value]
+#define VI_COUNTER_NUM_PER_TCP 4
+#define VI_COUNTER_NUM_PER_GDS 4
+#define VI_COUNTER_NUM_PER_VGT 4
+#define VI_COUNTER_NUM_PER_IA 4
+#define VI_COUNTER_NUM_PER_MC 4
+#define VI_COUNTER_NUM_PER_TCS 4
+#define VI_COUNTER_NUM_PER_WD 4
+#define VI_COUNTER_NUM_PER_CPG 2
+#define VI_COUNTER_NUM_PER_CPC 2
+#define VI_COUNTER_NUM_PER_VM 1
+#define VI_COUNTER_NUM_PER_VM_MD 1
+#define VI_COUNTER_NUM_PER_PIPESTATS 12
+
+// NOTE(review): Gfx8ThreadTrace in this same change sets its shader engine
+// count to 4; confirm that a maximum of 1 is really intended for the PMU path.
+#define VI_MAX_NUM_SHADER_ENGINES 1
+
+// Enumeration of VI hardware counter blocks.
+// Values are consecutive starting at 0. Blocks with several instances get one
+// enumerator per instance (Cb0..Cb3, Ta0..Ta15, ...), so inserting or
+// reordering entries changes the numeric id of everything that follows.
+typedef enum HsaViCounterBlockId {
+  // CB: 4 instances
+  kHsaViCounterBlockIdCb0 = 0,
+  kHsaViCounterBlockIdCb1,
+  kHsaViCounterBlockIdCb2,
+  kHsaViCounterBlockIdCb3,
+
+  kHsaViCounterBlockIdCpf,
+
+  // DB: 4 instances
+  kHsaViCounterBlockIdDb0,
+  kHsaViCounterBlockIdDb1,
+  kHsaViCounterBlockIdDb2,
+  kHsaViCounterBlockIdDb3,
+
+  kHsaViCounterBlockIdGrbm,
+  kHsaViCounterBlockIdGrbmSe,
+  kHsaViCounterBlockIdPaSu,
+  kHsaViCounterBlockIdPaSc,
+  kHsaViCounterBlockIdSpi,
+
+  // SQ and its per-shader-type variants
+  kHsaViCounterBlockIdSq,
+  kHsaViCounterBlockIdSqEs,
+  kHsaViCounterBlockIdSqGs,
+  kHsaViCounterBlockIdSqVs,
+  kHsaViCounterBlockIdSqPs,
+  kHsaViCounterBlockIdSqLs,
+  kHsaViCounterBlockIdSqHs,
+  kHsaViCounterBlockIdSqCs,
+
+  kHsaViCounterBlockIdSx,
+
+  // TA: 16 instances (VI_NUM_TA)
+  kHsaViCounterBlockIdTa0,
+  kHsaViCounterBlockIdTa1,
+  kHsaViCounterBlockIdTa2,
+  kHsaViCounterBlockIdTa3,
+  kHsaViCounterBlockIdTa4,
+  kHsaViCounterBlockIdTa5,
+  kHsaViCounterBlockIdTa6,
+  kHsaViCounterBlockIdTa7,
+  kHsaViCounterBlockIdTa8,
+  kHsaViCounterBlockIdTa9,
+  kHsaViCounterBlockIdTa10,
+  kHsaViCounterBlockIdTa11,
+  kHsaViCounterBlockIdTa12,
+  kHsaViCounterBlockIdTa13,
+  kHsaViCounterBlockIdTa14,
+  kHsaViCounterBlockIdTa15,
+
+  // TCA: 2 instances (VI_NUM_TCA)
+  kHsaViCounterBlockIdTca0,
+  kHsaViCounterBlockIdTca1,
+
+  // TCC: 16 instances (VI_NUM_TCC)
+  kHsaViCounterBlockIdTcc0,
+  kHsaViCounterBlockIdTcc1,
+  kHsaViCounterBlockIdTcc2,
+  kHsaViCounterBlockIdTcc3,
+  kHsaViCounterBlockIdTcc4,
+  kHsaViCounterBlockIdTcc5,
+  kHsaViCounterBlockIdTcc6,
+  kHsaViCounterBlockIdTcc7,
+  kHsaViCounterBlockIdTcc8,
+  kHsaViCounterBlockIdTcc9,
+  kHsaViCounterBlockIdTcc10,
+  kHsaViCounterBlockIdTcc11,
+  kHsaViCounterBlockIdTcc12,
+  kHsaViCounterBlockIdTcc13,
+  kHsaViCounterBlockIdTcc14,
+  kHsaViCounterBlockIdTcc15,
+
+  // TD: 16 instances (VI_NUM_TD)
+  kHsaViCounterBlockIdTd0,
+  kHsaViCounterBlockIdTd1,
+  kHsaViCounterBlockIdTd2,
+  kHsaViCounterBlockIdTd3,
+  kHsaViCounterBlockIdTd4,
+  kHsaViCounterBlockIdTd5,
+  kHsaViCounterBlockIdTd6,
+  kHsaViCounterBlockIdTd7,
+  kHsaViCounterBlockIdTd8,
+  kHsaViCounterBlockIdTd9,
+  kHsaViCounterBlockIdTd10,
+  kHsaViCounterBlockIdTd11,
+  kHsaViCounterBlockIdTd12,
+  kHsaViCounterBlockIdTd13,
+  kHsaViCounterBlockIdTd14,
+  kHsaViCounterBlockIdTd15,
+
+  // TCP: 16 instances (VI_NUM_TCP)
+  kHsaViCounterBlockIdTcp0,
+  kHsaViCounterBlockIdTcp1,
+  kHsaViCounterBlockIdTcp2,
+  kHsaViCounterBlockIdTcp3,
+  kHsaViCounterBlockIdTcp4,
+  kHsaViCounterBlockIdTcp5,
+  kHsaViCounterBlockIdTcp6,
+  kHsaViCounterBlockIdTcp7,
+  kHsaViCounterBlockIdTcp8,
+  kHsaViCounterBlockIdTcp9,
+  kHsaViCounterBlockIdTcp10,
+  kHsaViCounterBlockIdTcp11,
+  kHsaViCounterBlockIdTcp12,
+  kHsaViCounterBlockIdTcp13,
+  kHsaViCounterBlockIdTcp14,
+  kHsaViCounterBlockIdTcp15,
+
+  kHsaViCounterBlockIdGds,
+  kHsaViCounterBlockIdVgt,
+  kHsaViCounterBlockIdIa,
+  kHsaViCounterBlockIdMc,
+  kHsaViCounterBlockIdSrbm,
+
+  kHsaViCounterBlockIdTcs,
+  kHsaViCounterBlockIdWd,
+  kHsaViCounterBlockIdCpg,
+  kHsaViCounterBlockIdCpc,
+
+  // Counters retrieved by KFD
+  kHsaViCounterBlockIdIommuV2,
+  kHsaViCounterBlockIdKernelDriver,
+
+  kHsaViCounterBlockIdCpPipeStats,
+  kHsaViCounterBlockIdHwInfo,
+
+  // Inclusive bounds of the id range, for iterating over every block
+  kHsaViCounterBlockIdBlocksFirst = kHsaViCounterBlockIdCb0,
+  kHsaViCounterBlockIdBlocksLast = kHsaViCounterBlockIdHwInfo
+} HsaViCounterBlockId;
+
+// Tables describing the VI counter blocks. The objects themselves are defined
+// in the matching source file; this header only declares them.
+extern GpuBlockInfo ViPmuHwBlocks[];
+
+// Per-block counter register address tables. Each name is declared exactly
+// once (ViCpgCounterRegAddr and ViCpcCounterRegAddr used to be repeated at the
+// end of this list).
+extern GpuCounterRegInfo ViSqCounterRegAddr[];
+extern GpuCounterRegInfo ViCbCounterRegAddr[];
+extern GpuCounterRegInfo ViDrmdmaCounterRegAddr[];
+extern GpuCounterRegInfo ViIhCounterRegAddr[];
+extern GpuCounterRegInfo ViCpfCounterRegAddr[];
+extern GpuCounterRegInfo ViCpgCounterRegAddr[];
+extern GpuCounterRegInfo ViCpcCounterRegAddr[];
+extern GpuCounterRegInfo ViDrmCounterRegAddr[];
+extern GpuCounterRegInfo ViGrbmCounterRegAddr[];
+extern GpuCounterRegInfo ViGrbmSeCounterRegAddr[];
+extern GpuCounterRegInfo ViPaSuCounterRegAddr[];
+extern GpuCounterRegInfo ViPaScCounterRegAddr[];
+extern GpuCounterRegInfo ViSpiCounterRegAddr[];
+extern GpuCounterRegInfo ViTcaCounterRegAddr[];
+extern GpuCounterRegInfo ViTccCounterRegAddr[];
+extern GpuCounterRegInfo ViTcpCounterRegAddr[];
+extern GpuCounterRegInfo ViDbCounterRegAddr[];
+extern GpuCounterRegInfo ViRlcCounterRegAddr[];
+extern GpuCounterRegInfo ViScCounterRegAddr[];
+extern GpuCounterRegInfo ViSxCounterRegAddr[];
+extern GpuCounterRegInfo ViTaCounterRegAddr[];
+extern GpuCounterRegInfo ViTdCounterRegAddr[];
+extern GpuCounterRegInfo ViGdsCounterRegAddr[];
+extern GpuCounterRegInfo ViVgtCounterRegAddr[];
+extern GpuCounterRegInfo ViIaCounterRegAddr[];
+extern GpuCounterRegInfo ViMcCounterRegAddr[];
+extern GpuCounterRegInfo ViSrbmCounterRegAddr[];
+
+// No Tcs Counter block on VI
+// extern GpuCounterRegInfo ViTcsCounterRegAddr[];
+extern GpuCounterRegInfo ViWdCounterRegAddr[];
+
+// Identifiers of the SQ, MC, IOMMUv2 and kernel-driver counter blocks
+extern GpuPrivCounterBlockId ViBlockIdSq;
+extern GpuPrivCounterBlockId ViBlockIdMc;
+extern GpuPrivCounterBlockId ViBlockIdIommuV2;
+extern GpuPrivCounterBlockId ViBlockIdKernelDriver;
+}
+#endif
@@ -0,0 +1,141 @@
+#ifndef _VI_PMU_H_
+#define _VI_PMU_H_
+
+#include "hsa.h"
+#include "cmdwriter.h"
+#include "hsa_perf.h"
+#include "info_set.h"
+#include "parameter_set.h"
+#include "vi_blockinfo.h"
+#include "rocr_profiler.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <map>
+#include <string>
+
+namespace pm4_profile {
+// Maps a VI counter block id to a CounterBlock pointer.
+typedef std::map<HsaViCounterBlockId, pm4_profile::CounterBlock*> ViCounterBlockMap;
+
+// This class implements the VI PMU.  It is responsible for setting up
+// CounterGroups to represent each VI hardware block which exposes performance
+// counters.
+class ViPmu : public pm4_profile::Pmu {
+ public:
+  ViPmu();
+  ~ViPmu();
+
+  // Returns num_se_, the number of shader engines, for the blocks that are
+  // instanced per shader engine
+  uint32_t getNumSe() { return num_se_; }
+
+  // Initializes the handle of buffer used to collect PMC data
+  bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz);
+
+  // Returns an error code.
+  // NOTE(review): presumably the value kept in error_code_ -- confirm in the
+  // implementation.
+  int getLastError();
+
+  // Returns a textual description for the given error code
+  std::string getErrorString(int error);
+
+  // Builds, through cmdWriter into cmdBuff, the commands that start counter
+  // collection.
+  // NOTE(review): `reset` presumably selects whether the counter blocks are
+  // reset first (see ResetCounterBlocks) -- confirm in the implementation.
+  virtual bool begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter,
+                     bool reset = true);
+
+  // Builds, through cmdWriter into cmdBuff, the commands that end counter
+  // collection
+  virtual bool end(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
+
+  // IPMU inherits the IParameterSet and IInfoSet, so we implement it
+  // through composition and function forwarding
+  bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
+
+  bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
+
+  bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
+
+  // Looks up a counter block by id.
+  // NOTE(review): `id` is presumably an HsaViCounterBlockId value, since
+  // blk_map_ is keyed by that enum -- confirm in the implementation.
+  pm4_profile::CounterBlock* getCounterBlockById(uint32_t id);
+
+  // Returns profiler_state_, the current profiling state
+  rocr_pmu_state_t getCurrentState() { return profiler_state_; }
+
+  // Returns the list of counter blocks and stores its length in num_groups
+  pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups);
+
+ private:
+  // Addr of Counter Data Buffer
+  uint32_t* pmcData_;
+
+  // Size of Counter Data Buffer
+  uint32_t pmcDataSz_;
+
+  void Init();
+
+  bool initCounterBlock();
+
+  bool isResultReady();
+
+  // Clear CounterBlockMap
+  void clearCounterBlockMap();
+
+  // Reset SQ and CB counters
+  void ResetCounterBlocks(pm4_profile::DefaultCmdBuf* cmdBuff,
+                          pm4_profile::CommandWriter* cmdWriter);
+
+  // Program SQ block related counters
+  uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                          uint32_t blkCntrIdx);
+
+  // Program TA block related counters
+  uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                          uint32_t blkCntrIdx);
+
+  // Program TCA block related counters
+  uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                           uint32_t blkCntrIdx);
+
+  // Program TCC block related counters
+  uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                           uint32_t blkCntrIdx);
+
+  // Program TCP block related counters
+  uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                           uint32_t blkCntrIdx);
+
+  // Program TD block related counters
+  uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
+                          uint32_t blkCntrIdx);
+
+  // Build counter selection register, return how many registers are built
+  uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal,
+                                   uint32_t blkId, pm4_profile::Counter* blkCntr);
+
+  // Build counter read registers, return how many registers are built
+  uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr,
+                                     uint32_t* reg_val);
+
+ private:
+  // Delete counter blocks in the PMU
+  hsa_status_t RemoveCounterBlocks();
+
+ private:
+  // This contains the available counter groups.
+  ViCounterBlockMap blk_map_;
+
+  // This stores the current profiling state.
+  rocr_pmu_state_t profiler_state_;
+
+  pm4_profile::ParameterSet* parameter_set_;
+
+  pm4_profile::InfoSet* info_set_;
+
+  int error_code_;
+
+// A flag to indicate the current packet is for copy register value.
+// Note: being preprocessor macros, these two names are not scoped to the class
+// and are visible to every file that includes this header.
+#define COPY_DATA_FLAG 0xFFFFFFFF
+#define MAX_REG_NUM 100
+
+  // Pointer used to store counter block list internally
+  uint32_t blk_list_size_;
+  pm4_profile::CounterBlock** blk_list_;
+
+  // Indicates the number of Shader Engines Present
+  uint32_t num_se_;
+
+  // Used to reset GRBM to its default state
+  uint32_t reset_grbm_;
+};
+}
+#endif
@@ -0,0 +1,18 @@
+#
+# Rocr ThreadTrace (SQTT) static library
+#
+# Expects SQTT_LIB, PROJ_DIR and HSA_RUNTIME_OSC_DIR to be set by the including
+# project, and ROCR_INC_DIR to be present in the environment at configure time.
+#
+
+#
+# Source files, listed explicitly
+#
+set ( LIB_SRC
+      thread_trace.cpp
+      gfx8_thread_trace.cpp
+      gfx9_thread_trace.cpp )
+
+#
+# Header search path(s), in the order they are searched
+#
+include_directories ( $ENV{ROCR_INC_DIR}
+                      ${PROJ_DIR}/commandwriter
+                      ${HSA_RUNTIME_OSC_DIR} )
+
+#
+# Build ThreadTrace as a Static Library object
+#
+add_library ( ${SQTT_LIB} STATIC ${LIB_SRC} )
@@ -0,0 +1,360 @@
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <random>
+#include <memory>
+
+#include "core/util/os.h"
+#include "gfx8_thread_trace.h"
+
+/// @brief Extracts bits [31:0] of a 64-bit value
+inline uint32_t Low32(uint64_t u) {
+  const uint64_t lowMask = 0xFFFFFFFFULL;
+  return static_cast<uint32_t>(u & lowMask);
+}
+
+/// @brief Extracts bits [63:32] of a 64-bit value
+inline uint32_t High32(uint64_t u) {
+  const uint64_t upperHalf = u >> 32;
+  return static_cast<uint32_t>(upperHalf);
+}
+
+namespace pm4_profile {
+
+// Constructs a thread trace object with every member in a defined state.
+//
+// The number of shader engines is fixed at 4. The status pointer and the
+// per-SE buffer size start out as NULL / 0 until setSqttCtrlBuff() and
+// setSqttDataBuff() are called, and the shadow configuration registers are
+// value-initialized (zeroed) until Init() fills in the defaults. Previously
+// these members held indeterminate values after construction.
+Gfx8ThreadTrace::Gfx8ThreadTrace()
+    : numSE_(4), ttStatus_(NULL), ttBuffSize_(0), ttCfgRegs_() {}
+
+// Nothing to release here: the destructor body is empty, so the buffers passed
+// to setSqttDataBuff() / setSqttCtrlBuff() are not freed by this class.
+Gfx8ThreadTrace::~Gfx8ThreadTrace() {}
+
+// Runs the common ThreadTrace initialization and, only when that succeeds,
+// fills in the Gfx8 register defaults.
+//
+// @param config  Thread trace configuration forwarded to ThreadTrace::Init
+// @return false when the base class initialization fails, true otherwise
+bool Gfx8ThreadTrace::Init(const ThreadTraceConfig* config) {
+  const bool baseReady = ThreadTrace::Init(config);
+  if (baseReady) {
+    // Initialize SQTT Configuration and Register objects
+    InitThreadTraceCfgRegs();
+  }
+  return baseReady;
+}
+
+// Fills ttCfgRegs_ with the default register values for a thread trace
+// session. For the mask, token mask and token mask2 registers, a non-zero
+// value returned by SetMask() / SetTokenMask() / SetTokenMask2() replaces the
+// whole default register value; a returned value of 0 keeps the default.
+void Gfx8ThreadTrace::InitThreadTraceCfgRegs() {
+  // Indicates the size of buffer to use per Shader Engine instance.
+  // The size is specified in terms of 4KB blocks. It stays 0 here and is set
+  // later by setSqttDataBuff().
+  ttCfgRegs_.ttRegSize.u32All = 0;
+
+  // Indicates various attributes of a thread trace session.
+  //
+  // MASK_CS: Which shader types should be enabled for data collection
+  //      Enable CS Shader types.
+  //
+  // WRAP: How trace buffer should be used as a ring buffer or as a linear
+  //      buffer - Disable WRAP mode i.e use it as a linear buffer
+  //
+  // MODE: Enables a thread trace session. Left at OFF here; BeginSession()
+  //      switches it to ON only for the packet that starts the session.
+  //
+  // CAPTURE_MODE: When thread trace data is collected immediately after MODE
+  //      is enabled or wait until a Thread Trace Start event is received
+  //
+  // AUTOFLUSH_EN: Flush thread trace data to buffer often automatically
+  //
+  ttCfgRegs_.ttRegMode.u32All = 0;
+  ttCfgRegs_.ttRegMode.bits.WRAP = 0;
+  ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0;
+  ttCfgRegs_.ttRegMode.bits.MASK_CS = 1;
+  ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1;
+  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
+
+  // VM_ID_MASK is taken from SetVmId()
+  // Enable all of the SIMD's of the compute unit (SIMD_EN = 0xF)
+  // The Compute Unit (CU) used for fine-grained data is taken from SetCuId()
+  // Enable Shader Array (SH) at index Zero to be used for fine-grained data
+  //
+  // @note: The SQ_STALL_EN, SPI_STALL_EN and REG_STALL_EN bits are all set to
+  // 1 here (an earlier version of this comment said they were not enabled).
+  // Because they are set, BeginSession() also programs the HIWATER register.
+  //
+  ttCfgRegs_.ttRegMask.u32All = 0;
+  ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0;
+  ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF;
+  ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId();
+  ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI = 0x1;
+  ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI = 0x1;
+  ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI = 0x1;
+  ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId();
+
+  // Override Mask value if a user value is available.
+  // This replaces every field set above, including the stall-enable bits.
+  uint32_t ttMask = SetMask();
+  if (ttMask) {
+    ttCfgRegs_.ttRegMask.u32All = ttMask;
+  }
+
+  // Mask of compute units to get thread trace data from
+  ttCfgRegs_.ttRegPerfMask.u32All = 0;
+  ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF;
+  ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF;
+
+  // Indicate the different TT messages/tokens that should be enabled/logged
+  // Indicate the different TT tokens that specify register operations to be logged
+  ttCfgRegs_.ttRegTokenMask.u32All = 0;
+  ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF;
+  ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF;
+  ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI = 0x1;
+
+  // Override TokenMask1 value if a user value is available
+  uint32_t tokenMask1 = SetTokenMask();
+  if (tokenMask1) {
+    ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1;
+  }
+
+  // Indicate the different TT tokens that specify instruction operations to be logged
+  // Disabling specifically instruction operations updating Program Counter (PC):
+  // every bit of the value below is set except bit 7.
+  // @note: The field is defined in the spec incorrectly as a 16-bit value
+  ttCfgRegs_.ttRegTokenMask2.u32All = 0;
+  ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F;
+
+  // Override TokenMask2 value if a user value is available
+  uint32_t tokenMask2 = SetTokenMask2();
+  if (tokenMask2) {
+    ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2;
+  }
+}
+
+// Splits the caller's SQTT buffer evenly between the shader engines and
+// records the per-SE start addresses and the per-SE size.
+//
+// @param sqttBuffer  Start of the buffer that receives thread trace data
+// @param sqttBuffSz  Total size of that buffer in bytes
+//
+// NOTE(review): neither the per-SE size nor the per-SE start addresses are
+// rounded to the granularity implied by TT_BUFF_ALIGN_SHIFT; callers
+// presumably pass a suitably aligned buffer and size -- confirm.
+void Gfx8ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) {
+  // Compute the size of buffer available for each shader engine
+  ttBuffSize_ = sqttBuffSz / numSE_;
+
+  // Drop addresses recorded by an earlier call. BeginSession() reads
+  // devMemList_[0 .. numSE_-1], so without this a second call would leave the
+  // addresses of the previous buffer in front of the new ones.
+  devMemList_.clear();
+  devMemList_.reserve(numSE_);
+
+  // Populate the sqtt buffer array submitted to device
+  for (uint32_t idx = 0; idx < numSE_; idx++) {
+    uint64_t sqttSEAddr = uint64_t(sqttBuffer + (ttBuffSize_ * idx));
+    devMemList_.push_back(sqttSEAddr);
+  }
+
+  // Update the size bit-field of sqtt ctrl register
+  ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT;
+}
+
+// Builds, through cmdWriter into cmdBuff, the PM4 packets that configure and
+// start a thread trace session:
+//   1. select broadcast mode in GRBM_GFX_INDEX and wait for idle,
+//   2. write the mask, perf mask, token mask(s) and mode registers (and the
+//      HIWATER register when any stall/drop bit is set),
+//   3. for each shader engine, select it and write its buffer base, size and
+//      a CTRL value with RESET_BUFFER set,
+//   4. return to broadcast mode, then write the mode register with MODE set
+//      to SQ_THREAD_TRACE_MODE_ON.
+//
+// Expects setSqttDataBuff() to have been called, since devMemList_ is indexed
+// for every shader engine.
+//
+// @param cmdBuff    Command buffer that receives the packets
+// @param cmdWriter  Writer used to build each packet
+void Gfx8ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
+  // Program Grbm to broadcast messages to all shader engines
+  regGRBM_GFX_INDEX grbm_gfx_index;
+  grbm_gfx_index.u32All = 0;
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Disable RLC Perfmon Clock Gating
+  // On Vega this is needed to collect Perf Cntrs
+  // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL__VI, 1);
+
+  // Program the Compute register to indicate SQTT is enabled
+  /*
+  regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI enableTT = {0};
+  enableTT.bits.THREAD_TRACE_ENABLE = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
+                                        mmCOMPUTE_THREAD_TRACE_ENABLE__CI__VI,
+                                        enableTT.u32All);
+  */
+
+  // Program the thread trace mask - specifies SH, CU, SIMD and
+  // VM Id masks to apply. Enabling SQ/SPI/REG_STALL_EN bits
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK__VI,
+                                        ttCfgRegs_.ttRegMask.u32All);
+
+  // Program the thread trace Perf mask
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK__VI,
+                                        ttCfgRegs_.ttRegPerfMask.u32All);
+
+  // Program the thread trace token mask
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK__VI,
+                                        ttCfgRegs_.ttRegTokenMask.u32All);
+
+  // Program the thread trace token mask2 to specify the list of instruction
+  // tokens to record. Disabling INST_PC instruction tokens
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2__VI,
+                                        ttCfgRegs_.ttRegTokenMask2.u32All);
+
+  // Program the thread trace mode register (MODE is still OFF at this point)
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
+                                        ttCfgRegs_.ttRegMode.u32All);
+
+  // Program the HiWaterMark register to support stalling.
+  // Only written when at least one stall-enable or drop-on-stall bit is set.
+  if ((ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI) ||
+      (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI) ||
+      (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI) ||
+      (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI)) {
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER__VI, 0x06);
+  }
+
+  // Iterate through the list of SE's and program the register
+  // for carrying address of thread trace buffer which is aligned
+  // to 4KB per thread trace specification
+  uint64_t baseAddr = 0;
+  for (int idx = 0; idx < numSE_; idx++) {
+    // Program Grbm to direct writes to one SE
+    grbm_gfx_index.bitfields.SH_INDEX = 0;
+    grbm_gfx_index.bitfields.SE_INDEX = idx;
+    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
+    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);
+
+    // Program base2 address of buffer to use for thread trace
+    // Encodes ATC bit, so the correct way to program is to use
+    // ATC Bit property of the device
+    /*
+    regSQ_THREAD_TRACE_BASE2__CI__VI sqttBase2 = {};
+    sqttBase2.u32All = 0;
+    sqttBase2.bits.ATC = 0;
+    sqttBase2.bits.ADDR_HI = 0;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
+                                          mmSQ_THREAD_TRACE_BASE2__VI,
+                                          sqttBase2.u32All);
+    */
+
+    // Program the base address to use: the per-SE address recorded by
+    // setSqttDataBuff(), expressed in units of (1 << TT_BUFF_ALIGN_SHIFT) bytes
+    baseAddr = devMemList_[idx] >> TT_BUFF_ALIGN_SHIFT;
+
+    // Program base address of buffer to use for thread trace.
+    // Only the low 32 bits of the shifted address are written; the BASE2
+    // write above is commented out, so any higher bits are not programmed.
+    regSQ_THREAD_TRACE_BASE sqttBase = {};
+    sqttBase.bits.ADDR = Low32(baseAddr);
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE__VI, sqttBase.u32All);
+
+    // Program the size of thread trace buffer
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI,
+                                          ttCfgRegs_.ttRegSize.u32All);
+
+    // Program the thread trace ctrl register
+    regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
+    sqttCtrl.u32All = 0;
+    sqttCtrl.bits.RESET_BUFFER = 1;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All);
+  }
+
+  // Reset the GRBM to broadcast mode
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Program the thread trace mode register.
+  // MODE is switched to ON only for this packet and then restored to OFF in
+  // the shadow copy, so StopSession() can write ttRegMode as-is to disable
+  // the trace.
+  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
+                                        ttCfgRegs_.ttRegMode.u32All);
+  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+  return;
+}
+
+// Builds, through cmdWriter into cmdBuff, the PM4 packets that stop a thread
+// trace session and collect its status:
+//   1. select broadcast mode and write the mode register with MODE = OFF,
+//   2. for each shader engine, select it, wait on the STATUS register and copy
+//      the STATUS, CNTR and WPTR registers into that engine's slot of
+//      ttStatus_ (TT_STATUS_IDX_MAX words per engine),
+//   3. return to broadcast mode, flush caches, write a SIZE of 0 and a CTRL
+//      value with RESET_BUFFER set, then wait for idle.
+//
+// Expects setSqttCtrlBuff() to have been called, since ttStatus_ is used as
+// the destination of the copy packets.
+//
+// @param cmdBuff    Command buffer that receives the packets
+// @param cmdWriter  Writer used to build each packet
+void Gfx8ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
+  // Program Grbm to broadcast messages to all shader engines
+  regGRBM_GFX_INDEX grbm_gfx_index;
+  grbm_gfx_index.u32All = 0;
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Program the thread trace mode register to disable thread trace
+  // The MODE register is set to disable thread trace by default
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
+                                        ttCfgRegs_.ttRegMode.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Iterate through the list of SE's and read the Status, Counter and
+  // Write Pointer registers of Thread Trace subsystem
+  for (int idx = 0; idx < numSE_; idx++) {
+    // Program Grbm to direct register accesses to one SE
+    grbm_gfx_index.bitfields.SH_INDEX = 0;
+    grbm_gfx_index.bitfields.SE_INDEX = idx;
+    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
+    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);
+
+    // Issue WaitRegMem command to wait until SQTT event has completed.
+    // NOTE(review): with funcEq == false this presumably waits until
+    // (STATUS & maskVal) != waitVal -- confirm against BuildWaitRegMemCommand.
+    bool funcEq = false;
+    bool memSpace = false;
+    uint32_t waitVal = 0x01;
+    uint32_t maskVal = 0x40000000L;
+    uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS__VI - UCONFIG_SPACE_START__CI__VI;
+    cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal);
+
+    // Retrieve the values from various status registers
+    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
+                                   mmSQ_THREAD_TRACE_STATUS__VI, 0,
+                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS),
+                                   COPY_DATA_SEL_COUNT_1DW, true);
+
+    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
+                                   mmSQ_THREAD_TRACE_CNTR, 0,
+                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_CNTR),
+                                   COPY_DATA_SEL_COUNT_1DW, true);
+
+    uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
+    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
+                                   mmSQ_THREAD_TRACE_WPTR__VI, 0, ttStatus_ + wptrIdx,
+                                   COPY_DATA_SEL_COUNT_1DW, true);
+  }
+
+  // Reset the GRBM to broadcast mode
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);
+
+  // Initialize cache flush request object
+  FlushCacheOptions flush;
+  flush.l1 = true;
+  flush.l2 = true;
+  flush.icache = true;
+  flush.kcache = true;
+  cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0);
+
+  // Program the size of thread trace buffer (0 once the session is stopped)
+  regSQ_THREAD_TRACE_SIZE ttRegSize = {0};
+  ttRegSize.u32All = 0;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI, ttRegSize.u32All);
+
+  // Program the thread trace ctrl register
+  regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
+  sqttCtrl.u32All = 0;
+  sqttCtrl.bits.RESET_BUFFER = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All);
+
+  // Program the compute_thread_trace_enable register
+  /*
+  regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI disableTT = {0};
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
+                                        mmCOMPUTE_THREAD_TRACE_ENABLE__CI__VI,
+                                        disableTT.u32All);
+  */
+
+  // Disable RLC Perfmon Clock Gating
+  // On Vega this is needed to collect Perf Cntrs
+  // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL__VI, 0);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+  return;
+}
+
+// Checks the status words collected by StopSession() for every shader engine.
+//
+// Returns false as soon as one engine's status word has bit 31 set (the
+// condition the original author describes as the buffer having wrapped).
+// As a side effect, the WPTR word of each engine visited before that point
+// (or of all engines, on success) is reduced to its TT_WRITE_PTR_MASK bits.
+bool Gfx8ThreadTrace::Validate() {
+  for (uint32_t se = 0; se < numSE_; ++se) {
+    // Each shader engine owns TT_STATUS_IDX_MAX consecutive words of ttStatus_
+    uint32_t* seStatus = ttStatus_ + (TT_STATUS_IDX_MAX * se);
+
+    const bool flagged = (seStatus[TT_STATUS_IDX_STATUS] & 0x80000000) != 0;
+    if (flagged) {
+      return false;
+    }
+
+    // Adjust the value of Write Ptr which is bits [29-0]
+    seStatus[TT_STATUS_IDX_WPTR] &= TT_WRITE_PTR_MASK;
+  }
+
+  return true;
+}
+
+}  // pm4_profile
@@ -0,0 +1,101 @@
+#ifndef _GFX8_THREAD_TRACE_H_
+#define _GFX8_THREAD_TRACE_H_
+
+#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
+#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
+#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
+#include "gfxip/gfx8/si_pm4defs.h"
+#include "thread_trace.h"
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace pm4_profile {
+
+// Shadow copies of the SQ thread trace configuration registers. They are
+// filled in by Gfx8ThreadTrace::InitThreadTraceCfgRegs() and
+// setSqttDataBuff(), and their u32All values are written to the hardware
+// registers by BeginSession() / StopSession().
+typedef struct Gfx8ThreadTraceCfgRegs {
+  // Size of thread trace buffer
+  regSQ_THREAD_TRACE_SIZE ttRegSize;
+  // Thread trace mode
+  regSQ_THREAD_TRACE_MODE ttRegMode;
+  // Thread trace mask (SH/CU/SIMD selection, VM id mask, stall enables)
+  regSQ_THREAD_TRACE_MASK ttRegMask;
+  // Thread trace token mask
+  regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask;
+  // Thread trace token mask2
+  regSQ_THREAD_TRACE_TOKEN_MASK2__VI ttRegTokenMask2;
+  // Thread trace perf mask
+  regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask;
+} Gfx8ThreadTraceCfgRegs;
+
+// Encapsulates the various Api and structures used to enable a thread
+// trace session and collect its data.
+//
+// Usage order implied by the implementation: Init(), then setSqttDataBuff()
+// and setSqttCtrlBuff(), then BeginSession() / StopSession(), then Validate().
+class Gfx8ThreadTrace : public ThreadTrace {
+ public:
+  Gfx8ThreadTrace();
+
+  ~Gfx8ThreadTrace();
+
+  // Initializes various data structures and handles that
+  // are needed to support a thread trace session.
+  // Returns false when the base class initialization fails.
+  bool Init(const ThreadTraceConfig* config);
+
+  // Builds Pm4 command stream to program hardware registers that
+  // enable a thread trace session, including the issue of an event
+  // to begin thread session
+  void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
+
+  // Builds Pm4 command stream to program hardware registers that
+  // disable a thread trace session, including the issue of an event
+  // to stop currently ongoing thread session
+  void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
+
+  // Validates that thread trace session ran correctly i.e. did not
+  // encounter any errors. Also masks each engine's write pointer word in the
+  // control buffer down to its write pointer bits.
+  bool Validate();
+
+  // Initializes the handle of buffer used to collect SQTT data.
+  // The buffer is divided evenly between the shader engines.
+  void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz);
+
+  // Initializes the handle of buffer used to read control data of SQTT.
+  // The buffer must hold at least StatusSizeInfo() bytes.
+  void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; }
+
+  // Return status info size in bytes: TT_STATUS_IDX_MAX 32-bit words for
+  // each shader engine
+  uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; }
+
+  // Return number of Shader Engines
+  uint32_t getNumSe() { return numSE_; }
+
+ private:
+  // Holds number of Shader Engines present on device
+  uint32_t numSE_;
+
+  // Thread traces status register indices to determine
+  // status of thread trace run
+  typedef enum {
+    TT_STATUS_IDX_STATUS = 0,
+    TT_STATUS_IDX_CNTR = 1,
+    TT_STATUS_IDX_WPTR = 2,
+    TT_STATUS_IDX_MAX = 3
+  } TTStatusReg;
+
+  // A list of tuples of TT_STATUS_IDX_MAX size,
+  // giving status of thread trace (one tuple per shader engine)
+  uint32_t* ttStatus_;
+
+  // Size of thread trace buffer per shader engine
+  uint32_t ttBuffSize_;
+
+  // Handles of Device memory used for thread trace
+  // (one start address per shader engine)
+  std::vector<uint64_t> devMemList_;
+
+  // Registers that need to be programmed for Thread Trace
+  Gfx8ThreadTraceCfgRegs ttCfgRegs_;
+
+  // Initializes thread trace registers with default parameters.
+  // These are potentially updated based on updates to thread trace
+  // configuration object by user
+  void InitThreadTraceCfgRegs();
+};
+
+}  // pm4_profile
+
+#endif  // _GFX8_THREAD_TRACE_H_
@@ -0,0 +1,356 @@
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <random>
+#include <memory>
+
+#include "core/util/os.h"
+#include "gfx9_thread_trace.h"
+
+/// @brief Extracts bits [31:0] of a 64-bit value
+inline uint32_t Low32(uint64_t u) { return static_cast<uint32_t>(u & 0xFFFFFFFFULL); }
+
+/// @brief Extracts bits [63:32] of a 64-bit value
+inline uint32_t High32(uint64_t u) { return static_cast<uint32_t>(u >> 32); }
+
+namespace pm4_profile {
+
+// Constructs a thread trace object with no buffers attached.
+//
+// Every scalar member is given a defined value: the status pointer and the
+// per-SE buffer size are cleared and the register block is value-initialised,
+// so nothing is left indeterminate before Init(), setSqttDataBuff() and
+// setSqttCtrlBuff() are called.
+Gfx9ThreadTrace::Gfx9ThreadTrace()
+    : numSE_(4),  // number of shader engines is fixed at four here
+      ttStatus_(nullptr),
+      ttBuffSize_(0),
+      ttCfgRegs_() {}
+
+// The destructor performs no cleanup of its own; buffers handed in through
+// the setters are not freed here.
+Gfx9ThreadTrace::~Gfx9ThreadTrace() {
+}
+
+// Stores the session configuration through the base class and, when that
+// succeeds, derives the default register values from it.
+// Returns the result of ThreadTrace::Init().
+bool Gfx9ThreadTrace::Init(const ThreadTraceConfig* config) {
+  const bool baseReady = ThreadTrace::Init(config);
+  if (baseReady) {
+    // Initialize SQTT Register objects from the stored configuration
+    InitThreadTraceCfgRegs();
+  }
+  return baseReady;
+}
+
+// Fills ttCfgRegs_ with the default values of the thread trace registers.
+// The MASK, TOKEN_MASK and TOKEN_MASK2 defaults are replaced wholesale when
+// SetMask(), SetTokenMask() or SetTokenMask2() return a non-zero value.
+//
+// NOTE(review): this also clears ttRegSize, so a size stored earlier by
+// setSqttDataBuff() is lost if Init() runs afterwards.
+void Gfx9ThreadTrace::InitThreadTraceCfgRegs() {
+  // Indicates the size of buffer to use per Shader Engine instance.
+  // The size is specified in terms of 4KB blocks. It is cleared here and
+  // filled in by setSqttDataBuff().
+  ttCfgRegs_.ttRegSize.u32All = 0;
+
+  // Indicates various attributes of a thread trace session.
+  //
+  // MASK_CS: Which shader types should be enabled for data collection
+  //      Enable CS Shader types.
+  //
+  // WRAP: How trace buffer should be used as a ring buffer or as a linear
+  //      buffer - Disable WRAP mode i.e use it as a linear buffer
+  //
+  // MODE: Enables a thread trace session. Left OFF here; BeginSession()
+  //      switches it ON only for the register write that starts tracing
+  //
+  // CAPTURE_MODE: Whether thread trace data is collected immediately after
+  //      MODE is enabled or only once a Thread Trace Start event is received
+  //
+  // AUTOFLUSH_EN: Flush thread trace data to buffer often automatically
+  //
+  ttCfgRegs_.ttRegMode.u32All = 0;
+  ttCfgRegs_.ttRegMode.bits.WRAP = 0;
+  ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0;
+  ttCfgRegs_.ttRegMode.bits.MASK_CS = 1;
+  ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1;
+  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
+
+  // Select what is traced:
+  //   SH_SEL     - Shader Array (SH) at index Zero is used for fine-grained data
+  //   SIMD_EN    - all of the SIMD's of the compute unit are enabled
+  //   CU_SEL     - Compute Unit (CU) index returned by SetCuId()
+  //   VM_ID_MASK - value returned by SetVmId()
+  //
+  // @note: SQ_STALL_EN, SPI_STALL_EN and REG_STALL_EN are all enabled here.
+  // BeginSession() programs the HiWaterMark register when any of them is set.
+  //
+  ttCfgRegs_.ttRegMask.u32All = 0;
+  ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0;
+  ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF;
+  ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId();
+  ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN = 0x1;
+  ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN = 0x1;
+  ttCfgRegs_.ttRegMask.bits.REG_STALL_EN = 0x1;
+  ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId();
+
+  // Override Mask value if a user value is available. A non-zero user value
+  // replaces the whole register, including the fields set just above.
+  uint32_t ttMask = SetMask();
+  if (ttMask) {
+    ttCfgRegs_.ttRegMask.u32All = ttMask;
+  }
+
+  // Mask of compute units to get thread trace data from
+  ttCfgRegs_.ttRegPerfMask.u32All = 0;
+  ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF;
+  ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF;
+
+  // Indicate the different TT messages/tokens that should be enabled/logged
+  // Indicate the different TT tokens that specify register operations to be logged
+  ttCfgRegs_.ttRegTokenMask.u32All = 0;
+  ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF;
+  ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF;
+  ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL = 0x1;
+
+  // Override TokenMask1 value if a user value is available. As with the mask
+  // register, a non-zero user value replaces the whole register.
+  uint32_t tokenMask1 = SetTokenMask();
+  if (tokenMask1) {
+    ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1;
+  }
+
+  // Indicate the different TT tokens that specify instruction operations to be logged
+  // Disabling specifically instruction operations updating Program Counter (PC).
+  // @note: The field is defined in the spec incorrectly as a 16-bit value
+  ttCfgRegs_.ttRegTokenMask2.u32All = 0;
+  ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F;
+
+  // Override TokenMask2 value if a user value is available
+  uint32_t tokenMask2 = SetTokenMask2();
+  if (tokenMask2) {
+    ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2;
+  }
+}
+
+// Splits the caller-provided SQTT buffer evenly across the shader engines
+// and records the per-SE start addresses and the per-SE size.
+//
+//   sqttBuffer - start of the buffer that receives thread trace data
+//   sqttBuffSz - total size of that buffer in bytes
+//
+// NOTE(review): neither the per-SE size nor the per-SE addresses are checked
+// for (1 << TT_BUFF_ALIGN_SHIFT) alignment although both are later shifted
+// right by TT_BUFF_ALIGN_SHIFT - presumably callers pass aligned values;
+// confirm.
+void Gfx9ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) {
+  // Compute the size of buffer available for each shader engine
+  ttBuffSize_ = sqttBuffSz / numSE_;
+
+  // Drop addresses recorded by an earlier call. BeginSession() indexes
+  // devMemList_ from zero, so appending would leave it using stale entries.
+  devMemList_.clear();
+  devMemList_.reserve(numSE_);
+
+  // Populate the sqtt buffer array submitted to device
+  for (uint32_t idx = 0; idx < numSE_; idx++) {
+    uint8_t* seBase = sqttBuffer + (ttBuffSize_ * idx);
+    devMemList_.push_back(reinterpret_cast<uint64_t>(seBase));
+  }
+
+  // Update the size bit-field of sqtt ctrl register
+  ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT;
+}
+
+// Appends to cmdBuff the PM4 packets that configure and start a thread
+// trace session:
+//   1. broadcast the mask / token / mode registers to every shader engine,
+//   2. per shader engine, program the buffer base and size and reset the
+//      buffer,
+//   3. switch back to broadcast and write MODE = ON.
+//
+//   cmdBuff   - command buffer the packets are appended to
+//   cmdWriter - writer used to build each packet
+//
+// NOTE(review): indexes devMemList_ for every shader engine, so
+// setSqttDataBuff() must have been called first.
+void Gfx9ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
+  // Program Grbm to broadcast messages to all shader engines
+  regGRBM_GFX_INDEX grbm_gfx_index;
+  grbm_gfx_index.u32All = 0;
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Disable RLC Perfmon Clock Gating
+  // On Vega this is needed to collect Perf Cntrs
+  // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 1);
+
+  // Program the Compute register to indicate SQTT is enabled
+  /*
+  regCOMPUTE_THREAD_TRACE_ENABLE enableTT = {0};
+  enableTT.bits.THREAD_TRACE_ENABLE = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
+                                        mmCOMPUTE_THREAD_TRACE_ENABLE,
+                                        enableTT.u32All);
+  */
+
+  // Program the thread trace mask - specifies SH, CU, SIMD and
+  // VM Id masks to apply. Enabling SQ/SPI/REG_STALL_EN bits
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK,
+                                        ttCfgRegs_.ttRegMask.u32All);
+
+  // Program the thread trace Perf mask
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK,
+                                        ttCfgRegs_.ttRegPerfMask.u32All);
+
+  // Program the thread trace token mask
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK,
+                                        ttCfgRegs_.ttRegTokenMask.u32All);
+
+  // Program the thread trace token mask2 to specify the list of instruction
+  // tokens to record. Disabling INST_PC instruction tokens
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2,
+                                        ttCfgRegs_.ttRegTokenMask2.u32All);
+
+  // Program the thread trace mode register. MODE is still OFF at this
+  // point; it is switched ON by the final mode write further below.
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
+                                        ttCfgRegs_.ttRegMode.u32All);
+
+  // Program the HiWaterMark register to support stalling
+  if ((ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN) || (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN) ||
+      (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN) ||
+      (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL)) {
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER, 0x06);
+  }
+
+  // Iterate through the list of SE's and program the register
+  // for carrying address of thread trace buffer which is aligned
+  // to 4KB per thread trace specification
+  uint64_t baseAddr = 0;
+  for (int idx = 0; idx < numSE_; idx++) {
+    // Program Grbm to direct writes to one SE
+    grbm_gfx_index.bitfields.SH_INDEX = 0;
+    grbm_gfx_index.bitfields.SE_INDEX = idx;
+    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
+    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
+
+    // Program base2 address of buffer to use for thread trace
+    /*
+    regSQ_THREAD_TRACE_BASE2 sqttBase2 = {};
+    sqttBase2.u32All = 0;
+    sqttBase2.bits.ADDR_HI = 0;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
+                                          mmSQ_THREAD_TRACE_BASE2,
+                                          sqttBase2.u32All);
+    */
+
+    // Program the base address to use, expressed in 4KB blocks
+    baseAddr = devMemList_[idx] >> TT_BUFF_ALIGN_SHIFT;
+
+    // Program base address of buffer to use for thread trace.
+    // Only the low 32 bits of the block address are written; the BASE2
+    // write that would carry the upper bits is commented out above.
+    regSQ_THREAD_TRACE_BASE sqttBase = {};
+    sqttBase.bits.ADDR = Low32(baseAddr);
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE, sqttBase.u32All);
+
+    // Program the size of thread trace buffer
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE,
+                                          ttCfgRegs_.ttRegSize.u32All);
+
+    // Program the thread trace ctrl register
+    regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
+    sqttCtrl.u32All = 0;
+    sqttCtrl.bits.RESET_BUFFER = 1;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All);
+  }
+
+  // Reset the GRBM to broadcast mode
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Program the thread trace mode register with MODE = ON, then put the
+  // cached value back to OFF so that StopSession() writes the disabling value
+  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
+                                        ttCfgRegs_.ttRegMode.u32All);
+  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+  return;
+}
+
+// Appends to cmdBuff the PM4 packets that stop a thread trace session:
+//   1. broadcast MODE = OFF to every shader engine,
+//   2. per shader engine, wait on the status register and copy the STATUS,
+//      CNTR and WPTR registers into the control buffer (ttStatus_),
+//   3. switch back to broadcast, flush caches, clear the buffer size and
+//      reset the buffer.
+//
+//   cmdBuff   - command buffer the packets are appended to
+//   cmdWriter - writer used to build each packet
+//
+// NOTE(review): the copy packets target ttStatus_, so setSqttCtrlBuff() must
+// have been called with a buffer of at least StatusSizeInfo() bytes.
+void Gfx9ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
+  // Program Grbm to broadcast messages to all shader engines
+  regGRBM_GFX_INDEX grbm_gfx_index;
+  grbm_gfx_index.u32All = 0;
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Program the thread trace mode register to disable thread trace
+  // The MODE register is set to disable thread trace by default
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
+                                        ttCfgRegs_.ttRegMode.u32All);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Iterate through the list of SE's and read the Status, Counter and
+  // Write Pointer registers of Thread Trace subsystem
+  for (uint32_t idx = 0; idx < numSE_; idx++) {
+    // Program Grbm to direct writes to one SE
+    grbm_gfx_index.bitfields.SH_INDEX = 0;
+    grbm_gfx_index.bitfields.SE_INDEX = idx;
+    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
+    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
+    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
+
+    // Issue WaitRegMem command to wait until SQTT event has completed.
+    // The wait is on the status register, masked with maskVal; how waitVal
+    // and funcEq are interpreted is defined by BuildWaitRegMemCommand.
+    bool funcEq = false;
+    bool memSpace = false;
+    uint32_t waitVal = 0x01;
+    uint32_t maskVal = 0x40000000L;
+    uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS - UCONFIG_SPACE_START;
+    cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal);
+
+    // Retrieve the values from various status registers. Each shader
+    // engine owns TT_STATUS_IDX_MAX consecutive words of ttStatus_.
+    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
+                                   mmSQ_THREAD_TRACE_STATUS, 0,
+                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS),
+                                   COPY_DATA_SEL_COUNT_1DW, true);
+
+    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
+                                   mmSQ_THREAD_TRACE_CNTR, 0,
+                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_CNTR),
+                                   COPY_DATA_SEL_COUNT_1DW, true);
+
+    uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
+    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
+                                   mmSQ_THREAD_TRACE_WPTR, 0, ttStatus_ + wptrIdx,
+                                   COPY_DATA_SEL_COUNT_1DW, true);
+  }
+
+  // Reset the GRBM to broadcast mode
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
+
+  // Initialize cache flush request object
+  FlushCacheOptions flush;
+  flush.l1 = true;
+  flush.l2 = true;
+  flush.icache = true;
+  flush.kcache = true;
+  cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0);
+
+  // Program the size of thread trace buffer back to zero
+  regSQ_THREAD_TRACE_SIZE ttRegSize = {0};
+  ttRegSize.u32All = 0;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE, ttRegSize.u32All);
+
+  // Program the thread trace ctrl register
+  regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
+  sqttCtrl.u32All = 0;
+  sqttCtrl.bits.RESET_BUFFER = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All);
+
+  // Program the compute_thread_trace_enable register
+  /*
+  regCOMPUTE_THREAD_TRACE_ENABLE disableTT = {0};
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
+                                        mmCOMPUTE_THREAD_TRACE_ENABLE,
+                                        disableTT.u32All);
+  */
+
+  // Disable RLC Perfmon Clock Gating
+  // On Vega this is needed to collect Perf Cntrs
+  // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 0);
+
+  // Issue a CSPartialFlush cmd including cache flush
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+  return;
+}
+
+// Checks the status words captured for every shader engine.
+// Returns false as soon as one engine's STATUS word has bit 31 set; otherwise
+// trims each engine's WPTR word in place to TT_WRITE_PTR_MASK and returns true.
+bool Gfx9ThreadTrace::Validate() {
+  for (uint32_t se = 0; se < numSE_; ++se) {
+    // Each shader engine owns TT_STATUS_IDX_MAX consecutive status words
+    uint32_t* seStatus = ttStatus_ + (TT_STATUS_IDX_MAX * se);
+
+    // Bit 31 of STATUS reports that the buffer has wrapped
+    const bool wrapped = (seStatus[TT_STATUS_IDX_STATUS] & 0x80000000) != 0;
+    if (wrapped) return false;
+
+    // Keep only the Write Ptr bits [29-0]
+    seStatus[TT_STATUS_IDX_WPTR] &= TT_WRITE_PTR_MASK;
+  }
+
+  return true;
+}
+
+}  // pm4_profile
@@ -0,0 +1,104 @@
+#ifndef _GFX9_THREAD_TRACE_H_
+#define _GFX9_THREAD_TRACE_H_
+
+#include "gfxip/gfx9/gfx9_registers.h"
+#include "gfxip/gfx9/gfx9_typedef.h"
+#include "gfxip/gfx9/gfx9_enum.h"
+#include "gfxip/gfx9/gfx9_offset.h"
+#include "gfxip/gfx9/gfx9_pm4defs.h"
+#include "thread_trace.h"
+
+#include <string>
+
+using namespace pm4_profile::gfx9;
+
+namespace pm4_profile {
+
+// Cached values of the SQ thread trace registers that are written into the
+// PM4 command stream when a session is started or stopped.
+struct Gfx9ThreadTraceCfgRegs {
+  regSQ_THREAD_TRACE_SIZE ttRegSize;              // SQ_THREAD_TRACE_SIZE: trace buffer size
+  regSQ_THREAD_TRACE_MODE ttRegMode;              // SQ_THREAD_TRACE_MODE: session mode
+  regSQ_THREAD_TRACE_MASK ttRegMask;              // SQ_THREAD_TRACE_MASK: wave mask
+  regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask;   // SQ_THREAD_TRACE_TOKEN_MASK
+  regSQ_THREAD_TRACE_TOKEN_MASK2 ttRegTokenMask2; // SQ_THREAD_TRACE_TOKEN_MASK2
+  regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask;     // SQ_THREAD_TRACE_PERF_MASK
+};
+
+// Encapsulates the various Api and structures used to enable a thread
+// trace session and collect its data.
+//
+// Every virtual function inherited from ThreadTrace is marked `override`, so
+// a signature that drifts away from the base class is a compile error
+// instead of silently declaring a new, unrelated function.
+class Gfx9ThreadTrace : public ThreadTrace {
+ public:
+  Gfx9ThreadTrace();
+
+  ~Gfx9ThreadTrace() override;
+
+  // Initializes various data structures and handles that
+  // are needed to support a thread trace session.
+  // A null config selects the default configuration.
+  bool Init(const ThreadTraceConfig* config) override;
+
+  // Builds Pm4 command stream to program hardware registers that
+  // enable a thread trace session, including the issue of an event
+  // to begin thread session
+  void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff,
+                    pm4_profile::CommandWriter* cmdWriter) override;
+
+  // Builds Pm4 command stream to program hardware registers that
+  // disable a thread trace session, including the issue of an event
+  // to stop currently ongoing thread session
+  void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff,
+                   pm4_profile::CommandWriter* cmdWriter) override;
+
+  // Validates that thread trace session ran correctly i.e. did not
+  // encounter any errors.
+  bool Validate() override;
+
+  // Initializes the handle of buffer used to collect SQTT data.
+  // sqttBuffSz is the total size; it is divided among the shader engines.
+  void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) override;
+
+  // Initializes the handle of buffer used to read control data of SQTT.
+  // The pointer is stored as-is; no copy is made.
+  void setSqttCtrlBuff(uint32_t* ctrlBuff) override { ttStatus_ = ctrlBuff; }
+
+  // Return status info size in bytes: TT_STATUS_IDX_MAX 32-bit words
+  // for each shader engine
+  uint32_t StatusSizeInfo() const override {
+    return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_;
+  }
+
+  // Return number of Shader Engines
+  uint32_t getNumSe() override { return numSE_; }
+
+ private:
+  // Holds number of Shader Engines present on device
+  uint32_t numSE_;
+
+  // Thread traces status register indices to determine
+  // status of thread trace run
+  typedef enum {
+    TT_STATUS_IDX_STATUS = 0,
+    TT_STATUS_IDX_CNTR = 1,
+    TT_STATUS_IDX_WPTR = 2,
+    TT_STATUS_IDX_MAX = 3
+  } TTStatusReg;
+
+  // A list of tuples of TT_STATUS_IDX_MAX size,
+  // giving status of thread trace (one tuple per shader engine)
+  uint32_t* ttStatus_;
+
+  // Size of thread trace buffer per shader engine
+  uint32_t ttBuffSize_;
+
+  // Handles of Device memory used for thread trace,
+  // one start address per shader engine
+  std::vector<uint64_t> devMemList_;
+
+  // Registers that need to be programmed for Thread Trace
+  Gfx9ThreadTraceCfgRegs ttCfgRegs_;
+
+  // Initializes thread trace registers with default parameters.
+  // These are potentially updated based on updates to thread trace
+  // configuration object by user
+  void InitThreadTraceCfgRegs();
+};
+
+}  // pm4_profile
+
+#endif  // _GFX9_THREAD_TRACE_H_
@@ -0,0 +1,105 @@
+#include <iostream>
+
+#include "core/util/os.h"
+#include "thread_trace.h"
+
+namespace pm4_profile {
+
+// Stores the thread trace configuration. A null config selects the defaults
+// produced by InitThreadTraceConfig(). Always reports success.
+bool ThreadTrace::Init(const ThreadTraceConfig* config) {
+  if (config == nullptr) {
+    InitThreadTraceConfig(&ttConfig_);
+    return true;
+  }
+
+  ttConfig_ = *config;
+  return true;
+}
+
+// Resets the given configuration object so that every field is zero.
+void ThreadTrace::InitThreadTraceConfig(ThreadTraceConfig* config) const {
+  // Value-initialisation zeroes each of the plain uint32_t fields
+  *config = ThreadTraceConfig();
+}
+
+// Returns the compute unit id to use for thread tracing: the configured
+// target CU unless HSA_THREAD_TRACE_SELECT_CU is set, in which case the
+// variable is parsed as a hexadecimal number and takes precedence.
+uint8_t ThreadTrace::SetCuId() {
+  uint32_t cuId = ttConfig_.threadTraceTargetCu;
+
+  // A non-empty environment variable overrides the configured CU
+  const std::string envOverride = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_CU");
+  if (!envOverride.empty()) {
+    cuId = std::stol(envOverride, nullptr, 16);
+    std::cout << "Using " << cuId << " as CUID for Thread Trace" << std::endl;
+  }
+
+  // Range check; only active in builds where assert is enabled
+  assert((cuId <= 15) && "Cu Id must be between 0 and 15");
+
+  return cuId;
+}
+
+// Returns the VM id mask to use for thread tracing: the configured value
+// unless HSA_THREAD_TRACE_SELECT_VMID is set, in which case the variable is
+// parsed as a hexadecimal number and takes precedence.
+uint8_t ThreadTrace::SetVmId() {
+  uint32_t vmId = ttConfig_.threadTraceVmIdMask;
+
+  // A non-empty environment variable overrides the configured value
+  const std::string envOverride = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_VMID");
+  if (!envOverride.empty()) {
+    vmId = std::stol(envOverride, nullptr, 16);
+    std::cout << "Using " << vmId << " as VMID for Thread Trace" << std::endl;
+  }
+
+  // Range check; only active in builds where assert is enabled
+  assert((vmId <= 2) && "VmId must be between 0 and 2");
+
+  return vmId;
+}
+
+// Returns the user-supplied value for the thread trace mask register: the
+// configured mask unless HSA_THREAD_TRACE_SELECT_MASK is set, in which case
+// the variable is parsed as a hexadecimal number and takes precedence.
+// A return value of zero means "no user value".
+uint32_t ThreadTrace::SetMask() {
+  uint32_t ttMask = ttConfig_.threadTraceMask;
+
+  // Bits a user-supplied mask must leave clear: 4, 6, 7, 14 and 15
+  const uint32_t forbiddenBits = 0x00C0D0;
+
+  // Allow users to specify the Mask to choose for configuration parameters
+  std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_MASK");
+  if (var.length() > 0) {
+    ttMask = std::stol(var, nullptr, 16);
+    std::cout << "Using " << ttMask << " as Mask for Thread Trace" << std::endl;
+  }
+
+  // The message now names every bit the constant rejects; it used to list
+  // only [4,6,7]. Only active in builds where assert is enabled.
+  assert(((ttMask & forbiddenBits) == 0) &&
+         "Mask should have bits [4,6,7,14,15] set to Zero");
+
+  return ttMask;
+}
+
+// Returns the user-supplied value for the thread trace token mask register:
+// the configured value unless HSA_THREAD_TRACE_SELECT_TOKEN_MASK1 is set, in
+// which case the variable is parsed as a hexadecimal number and takes
+// precedence. A return value of zero means "no user value".
+uint32_t ThreadTrace::SetTokenMask() {
+  uint32_t tokenMask = ttConfig_.threadTraceTokenMask;
+
+  // Bits a user-supplied token mask must leave clear: [31:24].
+  // NOTE(review): the assert message used to say [31:25]; if bit 24 is meant
+  // to be user-settable the constant should become 0xFE000000 instead -
+  // confirm against the register specification.
+  const uint32_t forbiddenBits = 0xFF000000;
+
+  // Allow users to specify the TokenMask to choose for Target tokens
+  std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK1");
+  if (var.length() > 0) {
+    tokenMask = std::stol(var, nullptr, 16);
+    std::cout << "Using " << tokenMask << " as TokenMask for Thread Trace" << std::endl;
+  }
+
+  // Only active in builds where assert is enabled
+  assert(((tokenMask & forbiddenBits) == 0) &&
+         "TokenMask should have bits [31:24] set to Zero");
+
+  return tokenMask;
+}
+
+// Returns the user-supplied value for the thread trace token mask2 register:
+// the configured value unless HSA_THREAD_TRACE_SELECT_TOKEN_MASK2 is set, in
+// which case the variable is parsed as a hexadecimal number and takes
+// precedence. A return value of zero means "no user value".
+uint32_t ThreadTrace::SetTokenMask2() {
+  // Bits [31:16] must stay clear in a user-supplied value
+  const uint32_t validMask = 0xFFFF0000;
+  uint32_t tokenMask2 = ttConfig_.threadTraceTokenMask2;
+
+  // A non-empty environment variable overrides the configured value
+  const std::string envOverride = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK2");
+  if (!envOverride.empty()) {
+    tokenMask2 = std::stol(envOverride, nullptr, 16);
+    std::cout << "Using " << tokenMask2 << " as TokenMask2 for Thread Trace" << std::endl;
+  }
+
+  // Only active in builds where assert is enabled
+  assert(((tokenMask2 & validMask) == 0) && "TokenMask2 should have bits [31:16] set to Zero");
+
+  return tokenMask2;
+}
+
+}  // pm4_profile
@@ -0,0 +1,104 @@
+#ifndef _THREAD_TRACE_H_
+#define _THREAD_TRACE_H_
+
+#include <stdint.h>
+
+#include "cmdwriter.h"
+
+// Move them as static variables later on
+#define TT_WRITE_PTR_MASK (0x3FFFFFFF)
+#define TT_DEFAULT_BUFF_SIZE_SCALE (16)
+#define TT_DEFAULT_BUFF_SIZE (1024 * 1024 * 8)
+
+// Size of a block, in bytes, per increment of WPTR
+#define TT_WRITE_PTR_BLK (32)
+
+// Factor by which to shift buffer address
+#define TT_BUFF_ALIGN_SHIFT (12)
+
+// Align address to 64 Kilobytes
+#define TT_BUFF_ADDR_ALIGN (0x10000)
+
+namespace pm4_profile {
+
+// User-facing thread trace configuration. Each field is the starting value
+// for the matching ThreadTrace::Set*() accessor, which may replace it with a
+// value taken from an environment variable.
+struct ThreadTraceConfig {
+  uint32_t threadTraceTargetCu;    // read by SetCuId()
+  uint32_t threadTraceVmIdMask;    // read by SetVmId()
+  uint32_t threadTraceMask;        // read by SetMask()
+  uint32_t threadTraceTokenMask;   // read by SetTokenMask()
+  uint32_t threadTraceTokenMask2;  // read by SetTokenMask2()
+};
+
+// Interface for enabling a thread trace session and collecting its data.
+// Device specific subclasses implement the pure virtual functions by
+// programming the registers of their hardware generation.
+class ThreadTrace {
+ private:
+  // Thread Trace configuration captured by Init()
+  // @note: Currently not used i.e. is not exposed to users
+  ThreadTraceConfig ttConfig_;
+
+ public:
+  // Virtual so that implementations can be destroyed through this interface
+  virtual ~ThreadTrace() {}
+
+  // CU id to use for thread tracing
+  uint8_t SetCuId();
+
+  // VM id to use for thread tracing
+  uint8_t SetVmId();
+
+  // Mask to use for thread tracing
+  uint32_t SetMask();
+
+  // Token Mask 1 to use for thread tracing
+  uint32_t SetTokenMask();
+
+  // Token Mask 2 to use for thread tracing
+  uint32_t SetTokenMask2();
+
+  // Prepares the data structures and handles a thread trace session
+  // needs. A null config selects the default configuration.
+  virtual bool Init(const ThreadTraceConfig* config);
+
+  // Fills a thread trace configuration object with default parameters
+  // that could potentially be overridden by the user
+  // @note: Currently not used i.e. is not exposed to users
+  virtual void InitThreadTraceConfig(ThreadTraceConfig* config) const;
+
+  // Placeholder for configuring individual session parameters; it ignores
+  // both arguments and always returns true
+  // @note: Currently not used i.e. is not exposed to users
+  bool Config(uint32_t key, uint32_t value) { return true; }
+
+  // Builds the Pm4 command stream that programs the hardware registers
+  // enabling a thread trace session, including the event that begins it
+  virtual void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff,
+                            pm4_profile::CommandWriter* cmdWriter) = 0;
+
+  // Builds the Pm4 command stream that programs the hardware registers
+  // disabling a thread trace session, including the event that stops
+  // the ongoing session
+  virtual void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff,
+                           pm4_profile::CommandWriter* cmdWriter) = 0;
+
+  // Reports whether the thread trace session ran without errors
+  virtual bool Validate() = 0;
+
+  // Sets the buffer used to collect SQTT data and its size
+  virtual void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) = 0;
+
+  // Sets the buffer used to read back SQTT control data
+  virtual void setSqttCtrlBuff(uint32_t* ctrlBuff) = 0;
+
+  // Number of Shader Engines
+  virtual uint32_t getNumSe() = 0;
+
+  // Size of the status info
+  virtual uint32_t StatusSizeInfo() const = 0;
+};
+
+}  // pm4_profile
+
+#endif  // _THREAD_TRACE_H_
@@ -0,0 +1,17 @@
+#
+# Rocr Utils library: the single source file it is built from
+#
+set ( MODULE_SRC ${CORE_UTIL_DIR}/lnx/os_linux.cpp )
+
+#
+# Header search paths, added in this order for the current directory scope.
+# ROCR_INC_DIR is read from the environment at configure time.
+#
+include_directories (
+  $ENV{ROCR_INC_DIR}
+  ${HSA_RUNTIME_OSC_DIR}
+  ${CORE_UTIL_DIR}
+)
+
+#
+# Static library target and the system libraries it links against
+#
+add_library ( ${UTIL_LIB} STATIC ${MODULE_SRC} )
+target_link_libraries ( ${UTIL_LIB} c stdc++ dl pthread rt )
@@ -0,0 +1,48 @@
+#
+# Header files include path(s).
+# ROCR_INC_DIR is read from the environment at configure time.
+#
+include_directories ( $ENV{ROCR_INC_DIR} )
+include_directories ( ${API_DIR} )
+include_directories ( ${PROJ_DIR}/cmdwriter )
+include_directories ( ${PROJ_DIR}/perfcounter )
+include_directories ( ${PROJ_DIR}/threadtrace )
+include_directories ( ${PROJ_DIR}/aqlprofile )
+include_directories ( ${TEST_DIR}/common )
+include_directories ( ${TEST_DIR}/ctrl )
+include_directories ( ${CORE_UTIL_DIR} )
+
+#
+# Specify the directory containing the libraries of HsaRt
+# to be linked against for building a Hsa Perf application
+#
+link_directories ( $ENV{ROCR_LIB_DIR} )
+find_library ( ROCR_LIB NAMES hsa-runtime64 PATHS $ENV{ROCR_LIB_DIR} )
+
+#
+# Set Name for Common library and build it as a
+# static library to be linked with others
+#
+set ( COMMON_LIB "common${ONLY64STR}" )
+add_subdirectory ( ${TEST_DIR}/common "${PROJECT_BINARY_DIR}/common" )
+
+#
+# Build the test library
+#
+set ( TEST_NAME simple_convolution )
+include_directories ( ${TEST_DIR}/${TEST_NAME} )
+set ( LIB_NAME "${TEST_NAME}${ONLY64STR}" )
+add_library ( ${LIB_NAME} STATIC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp )
+target_link_libraries( ${LIB_NAME} c stdc++ )
+
+#
+# Copy the pre-built code objects of the test into the build tree at
+# configure time. CMake's own file commands are used instead of spawning
+# `sh -c cp`: they need no shell, cope with spaces in paths, and a missing
+# code object is reported instead of being silently ignored.
+#
+file ( GLOB TEST_HSACO_FILES "${TEST_DIR}/${TEST_NAME}/*.hsaco" )
+if ( TEST_HSACO_FILES )
+  file ( COPY ${TEST_HSACO_FILES} DESTINATION "${PROJECT_BINARY_DIR}" )
+else ()
+  message ( WARNING "No .hsaco code objects found in ${TEST_DIR}/${TEST_NAME}" )
+endif ()
+set ( TEST_LIBS ${LIB_NAME} )
+
+#
+# Build the test control
+#
+set ( SRC_LIST
+  ${TEST_DIR}/ctrl/test.cpp
+  ${TEST_DIR}/ctrl/test_pmgr.cpp
+  ${TEST_DIR}/ctrl/test_hsa.cpp
+)
+set ( LIB_LIST ${TEST_LIBS} ${COMMON_LIB} ${CORE_UTILS_LIB} ${ROCR_LIB} ${TARGET_LIB} )
+set ( EXE_NAME "ctrl" )
+add_executable ( ${EXE_NAME} ${SRC_LIST} )
+target_link_libraries( ${EXE_NAME} ${LIB_LIST} c stdc++ dl pthread rt )
@@ -0,0 +1,876 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string>
+#include <iostream>
+#include <climits>
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+#define RET_IF_HSA_ERR(err) { \
+  if ((err) != HSA_STATUS_SUCCESS) { \
+    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
+              __FILE__ << ". Call returned " << err << std::endl; \
+    return (err); \
+  } \
+}
+
+// Default problem parameters for the binary search sample: number of elements
+// in the sorted array, the value searched for, and the work-group size.
+static const uint32_t kBinarySearchLength = 512;
+static const uint32_t kBinarySearchFindMe = 108;
+static const uint32_t kWorkGroupSize = 256;
+
+// Aggregates all state used by the binary search sample: the algorithm
+// parameters, the data buffers, the kernel description and the HSA/RocR
+// handles.
+struct BinarySearch {
+  // Algorithm parameters
+  uint32_t length;
+  uint32_t work_group_size;
+  uint32_t work_grid_size;
+  uint32_t num_sub_divisions;
+  uint32_t find_me;
+
+  // Data buffers used by the application
+  uint32_t* input;
+  uint32_t* input_arr;
+  uint32_t* input_arr_local;
+  uint32_t* output;
+
+  // Kernel argument storage
+  void* kern_arg_buffer;   // Start of the allocation; this is the pointer
+                           // that must be passed to the deallocator
+  void* kern_arg_address;  // Aligned address inside kern_arg_buffer that goes
+                           // into the AQL packet (never deallocate this one)
+
+  // Kernel code description
+  std::string kernel_file_name;
+  std::string kernel_name;
+  uint32_t kernarg_size;
+  uint32_t kernarg_align;
+
+  // HSA/RocR handles
+  hsa_agent_t gpu_dev;
+  hsa_agent_t cpu_dev;
+  hsa_signal_t signal;
+  hsa_queue_t* queue;
+  hsa_amd_memory_pool_t cpu_pool;
+  hsa_amd_memory_pool_t gpu_pool;
+  hsa_amd_memory_pool_t kern_arg_pool;
+
+  // Remaining values needed to fill in the AQL packet
+  uint64_t kernel_object;
+  uint32_t group_segment_size;    ///< Kernel group segment size
+  uint32_t private_segment_size;  ///< Kernel private segment size
+};
+
+// Sets the application-specific defaults of the BinarySearch structure: the
+// kernel file and kernel name, the problem size, the value to search for and
+// the work-group geometry. Buffer pointers and the queue pointer, which are
+// only assigned later, are reset to null so that no code path ever observes
+// an indeterminate pointer.
+void InitializeBinarySearch(BinarySearch* bs) {
+  bs->kernel_file_name = "./binary_search_kernels.hsaco";
+  bs->kernel_name = "binarySearch";
+
+  // Use the named file-level defaults instead of repeating the literals.
+  bs->length = kBinarySearchLength;
+  bs->find_me = kBinarySearchFindMe;
+  bs->work_group_size = kWorkGroupSize;
+  bs->num_sub_divisions = bs->length / bs->work_group_size;
+  // One work-item per sub-division.
+  bs->work_grid_size = bs->num_sub_divisions;
+
+  bs->input = nullptr;
+  bs->input_arr = nullptr;
+  bs->input_arr_local = nullptr;
+  bs->output = nullptr;
+  bs->kern_arg_buffer = nullptr;
+  bs->kern_arg_address = nullptr;
+  bs->queue = nullptr;
+}
+
+// Shared helper behind the agent-search call-backs. It cannot be handed to
+// hsa_iterate_agents() directly because of the extra dev_type parameter; a
+// wrapper with the call-back prototype has to forward to it.
+//
+// "data" must point to an hsa_agent_t that receives the matching agent.
+//
+// Return values:
+//  HSA_STATUS_INFO_BREAK -- "agent" is of the requested type (dev_type)
+//  HSA_STATUS_SUCCESS -- "agent" is of a different type
+//  Other -- "data" was null, or the device type query failed
+static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
+                              hsa_device_type_t dev_type) {
+  hsa_agent_t* found_agent = reinterpret_cast<hsa_agent_t*>(data);
+
+  if (found_agent == nullptr) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  // Query the device type of the agent being visited
+  hsa_device_type_t agent_type;
+  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE,
+                                           &agent_type);
+  RET_IF_HSA_ERR(status);
+
+  if (agent_type != dev_type) {
+    // Not the one we are looking for; keep iterating
+    return HSA_STATUS_SUCCESS;
+  }
+
+  *found_agent = agent;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+// Call-back that stops at the first GPU agent and stores it through "data".
+// Its signature is the one FindDevices() passes to hsa_iterate_agents().
+hsa_status_t FindGPUDevice(hsa_agent_t agent, void* data) {
+  const hsa_device_type_t wanted_type = HSA_DEVICE_TYPE_GPU;
+  return FindAgent(agent, data, wanted_type);
+}
+
+// Call-back that stops at the first CPU agent and stores it through "data".
+// Its signature is the one FindDevices() passes to hsa_iterate_agents().
+hsa_status_t FindCPUDevice(hsa_agent_t agent, void* data) {
+  const hsa_device_type_t wanted_type = HSA_DEVICE_TYPE_CPU;
+  return FindAgent(agent, data, wanted_type);
+}
+
+// Locates one GPU agent and one CPU agent and stores them in bs->gpu_dev and
+// bs->cpu_dev. Returns HSA_STATUS_SUCCESS when both were found and
+// HSA_STATUS_ERROR otherwise.
+hsa_status_t FindDevices(BinarySearch* bs) {
+  // hsa_iterate_agents() keeps visiting agents for as long as the call-back
+  // returns HSA_STATUS_SUCCESS and hands back the first other value. Our
+  // call-backs return HSA_STATUS_INFO_BREAK on a match, so that is the value
+  // that signals "found".
+  bs->gpu_dev.handle = 0;
+  hsa_status_t status = hsa_iterate_agents(FindGPUDevice, &bs->gpu_dev);
+
+  if (status != HSA_STATUS_INFO_BREAK) {
+    return HSA_STATUS_ERROR;
+  }
+
+  bs->cpu_dev.handle = 0;
+  status = hsa_iterate_agents(FindCPUDevice, &bs->cpu_dev);
+
+  if (status != HSA_STATUS_INFO_BREAK) {
+    return HSA_STATUS_ERROR;
+  }
+
+  // Sanity check: a zero handle means the call-back never stored an agent
+  if (bs->gpu_dev.handle == 0) {
+    std::cout << "GPU Device is not Created properly!" << std::endl;
+    RET_IF_HSA_ERR(HSA_STATUS_ERROR);
+  }
+
+  if (bs->cpu_dev.handle == 0) {
+    std::cout << "CPU Device is not Created properly!" << std::endl;
+    RET_IF_HSA_ERR(HSA_STATUS_ERROR);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Tests whether "pool" is a HSA_AMD_SEGMENT_GLOBAL pool whose
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag state matches kern_arg:
+// with kern_arg == true the flag must be set, with kern_arg == false it must
+// be clear.
+//
+// Return values:
+//  HSA_STATUS_INFO_BREAK -- the pool qualifies; it is stored through "data"
+//  HSA_STATUS_SUCCESS -- the pool does not qualify
+//  Other -- "data" was null, or a pool query failed
+//
+// The extra kern_arg parameter means this function does not have the
+// call-back prototype of hsa_amd_agent_iterate_memory_pools(); it has to be
+// wrapped by a function that does.
+static hsa_status_t
+FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
+  hsa_status_t status;
+  hsa_amd_segment_t pool_segment;
+  uint32_t global_flags;
+
+  if (data == nullptr) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  status = hsa_amd_memory_pool_get_info(pool,
+                                        HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
+                                        &pool_segment);
+  RET_IF_HSA_ERR(status);
+
+  if (pool_segment != HSA_AMD_SEGMENT_GLOBAL) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  status = hsa_amd_memory_pool_get_info(pool,
+                                        HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
+                                        &global_flags);
+  RET_IF_HSA_ERR(status);
+
+  const bool is_kern_arg_pool =
+    (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
+
+  // Reject the pool when its kernarg capability is not the requested one
+  if (is_kern_arg_pool != kern_arg) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  *(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+// Call-back for hsa_amd_agent_iterate_memory_pools() that stops at the first
+// HSA_AMD_SEGMENT_GLOBAL pool that does NOT have
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT set.
+hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
+  const bool want_kern_arg_pool = false;
+  return FindGlobalPool(pool, data, want_kern_arg_pool);
+}
+
+// Call-back for hsa_amd_agent_iterate_memory_pools() that stops at the first
+// HSA_AMD_SEGMENT_GLOBAL pool that DOES have
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT set.
+hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
+  const bool want_kern_arg_pool = true;
+  return FindGlobalPool(pool, data, want_kern_arg_pool);
+}
+
+// Looks up the three memory pools the sample allocates from and stores them
+// in the BinarySearch structure: a standard global pool of the CPU agent
+// (cpu_pool), a standard global pool of the GPU agent (gpu_pool) and a
+// kernarg-capable pool of the CPU agent (kern_arg_pool).
+// Returns HSA_STATUS_ERROR as soon as one of the searches does not end with
+// HSA_STATUS_INFO_BREAK, HSA_STATUS_SUCCESS otherwise.
+hsa_status_t FindPools(BinarySearch* bs) {
+  hsa_status_t status = hsa_amd_agent_iterate_memory_pools(
+                          bs->cpu_dev, FindStandardPool, &bs->cpu_pool);
+
+  if (status != HSA_STATUS_INFO_BREAK) {
+    return HSA_STATUS_ERROR;
+  }
+
+  status = hsa_amd_agent_iterate_memory_pools(
+             bs->gpu_dev, FindStandardPool, &bs->gpu_pool);
+
+  if (status != HSA_STATUS_INFO_BREAK) {
+    return HSA_STATUS_ERROR;
+  }
+
+  status = hsa_amd_agent_iterate_memory_pools(
+             bs->cpu_dev, FindKernArgPool, &bs->kern_arg_pool);
+
+  if (status != HSA_STATUS_INFO_BREAK) {
+    return HSA_STATUS_ERROR;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Once the needed memory pools have been found and the BinarySearch structure
+// has been updated with these handles, this function allocates the data
+// buffers of the sample from the CPU pool, zeroes them, and fills the input
+// array with non-decreasing pseudo-random values.
+// Devices with which a pool is associated already have access to the pool.
+// However, other devices may also need to read or write to that memory. Below,
+// we see how we can grant access to other devices to address this issue.
+//
+// Returns HSA_STATUS_SUCCESS, or the status of the first failing HSA call.
+hsa_status_t AllocateAndInitBuffers(BinarySearch* bs) {
+  hsa_status_t err;
+  const size_t out_length = 4 * sizeof(uint32_t);
+
+  // The sorted input array holds bs->length elements (it is indexed from 0 to
+  // bs->length - 1 below), so it must be sized by bs->length.
+  const size_t in_length = static_cast<size_t>(bs->length) * sizeof(uint32_t);
+
+  // The boundary arrays hold two entries per sub-division. Run() may change
+  // num_sub_divisions after this allocation, but as Run() is written it only
+  // fills these arrays while a sub-division spans at least two elements, i.e.
+  // at most bs->length entries are used. Never allocate less than that.
+  size_t bounds_length =
+    static_cast<size_t>(bs->num_sub_divisions) * 2 * sizeof(uint32_t);
+
+  if (bounds_length < in_length) {
+    bounds_length = in_length;
+  }
+
+  // In all of these examples, we want both the cpu and gpu to have access to
+  // the buffer in question. We use the array of agents below in the subsequent
+  // calls to hsa_amd_agents_allow_access() for this purpose.
+  hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};
+
+  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
+                                     reinterpret_cast<void**>(&bs->input));
+  RET_IF_HSA_ERR(err);
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input);
+  RET_IF_HSA_ERR(err);
+  (void)memset(bs->input, 0, in_length);
+
+  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, out_length, 0,
+                                     reinterpret_cast<void**>(&bs->output));
+  RET_IF_HSA_ERR(err);
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->output);
+  RET_IF_HSA_ERR(err);
+  (void)memset(bs->output, 0, out_length);
+
+  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, bounds_length, 0,
+                                     reinterpret_cast<void**>(&bs->input_arr));
+  RET_IF_HSA_ERR(err);
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr);
+  RET_IF_HSA_ERR(err);
+  (void)memset(bs->input_arr, 0, bounds_length);
+
+  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, bounds_length, 0,
+                               reinterpret_cast<void**>(&bs->input_arr_local));
+  RET_IF_HSA_ERR(err);
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local);
+  RET_IF_HSA_ERR(err);
+
+  // Binary-search application specific code...
+  // Initialize input buffer with random values in an increasing order
+  uint32_t max = bs->length * 20;
+  bs->input[0] = 0;
+
+  uint32_t seed = (unsigned int)time(NULL);
+  srand(seed);
+
+  // NOTE(review): "max * rand_r(&seed)" is evaluated in 32-bit unsigned
+  // arithmetic and can wrap before the division, so the step between
+  // neighbouring elements is much smaller than "max". The values are still
+  // non-decreasing; confirm whether the small step is intended before
+  // changing the expression.
+  for (uint32_t i = 1; i < bs->length; ++i) {
+    bs->input[i] = bs->input[i - 1] +
+     static_cast<uint32_t>(max * rand_r(&seed) / static_cast<float>(RAND_MAX));
+  }
+
+// #define VERBOSE 1
+#ifdef VERBOSE
+  std::cout << "Input array values:" << std::endl;
+
+  for (uint32_t i = 0; i < bs->length; ++i) {
+    std::cout << "input[" << i << "] = " << bs->input[i] << " ";
+
+    if (i % 4 == 0) {
+      std::cout << std::endl;
+    }
+  }
+
+  std::cout << std::endl;
+#endif
+
+  return err;
+}
+
+// The code in this function illustrates how to load a kernel from
+// pre-compiled code. The goal is to get a handle that can be later
+// used in an AQL packet and also to extract information about the kernel
+// that we will need. The kernel object handle, the private and group segment
+// sizes and the kernarg size and alignment are saved to the BinarySearch
+// structure. They are used when we populate the AQL packet.
+//
+// Returns HSA_STATUS_SUCCESS, HSA_STATUS_ERROR if the code object file cannot
+// be opened, or the status of the first failing HSA call.
+hsa_status_t LoadKernelFromObjFile(BinarySearch* bs) {
+  hsa_status_t err;
+  hsa_code_object_reader_t code_obj_rdr = {0};
+  hsa_executable_t executable = {0};
+
+  hsa_file_t file_handle = open(bs->kernel_file_name.c_str(), O_RDONLY);
+
+  if (file_handle == -1) {
+    std::cout << "failed to open " << bs->kernel_file_name.c_str() <<
+              " at line " << __LINE__ << ", errno: " << errno << std::endl;
+    return HSA_STATUS_ERROR;
+  }
+
+  err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
+  // Close the descriptor before checking the status so that it is not leaked
+  // when the reader could not be created.
+  close(file_handle);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_create_alt(HSA_PROFILE_FULL,
+                HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev,
+        code_obj_rdr, NULL, NULL);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_freeze(executable, NULL);
+  RET_IF_HSA_ERR(err);
+
+  // Look up the kernel symbol and read the attributes needed for dispatch
+  hsa_executable_symbol_t kern_sym;
+  err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(),
+                                  bs->gpu_dev, 0, &kern_sym);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_symbol_get_info(kern_sym,
+                                    HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+                                                          &bs->kernel_object);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_symbol_get_info(kern_sym,
+                      HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+                                                   &bs->private_segment_size);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_symbol_get_info(kern_sym,
+                        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+                                                     &bs->group_segment_size);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_symbol_get_info(kern_sym,
+                      HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+                                                           &bs->kernarg_size);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_executable_symbol_get_info(kern_sym,
+                 HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
+                                                          &bs->kernarg_align);
+  RET_IF_HSA_ERR(err);
+
+  return err;
+}
+
+// This function shows how to do an asynchronous copy. We have to create a
+// signal and use the signal to notify us when the copy has completed.
+// Copies "size" bytes from "src" (owned by agent src_ag) to "dst" (owned by
+// agent dst_ag) and blocks until the copy has finished. The temporary signal
+// is destroyed on every path, including the error paths.
+//
+// Returns HSA_STATUS_SUCCESS, the status of the first failing HSA call, or
+// HSA_STATUS_ERROR if the completion signal does not reach zero.
+hsa_status_t AgentMemcpy(void* dst, const void* src,
+                         size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
+  hsa_signal_t s;
+  hsa_status_t err;
+
+  err = hsa_signal_create(1, 0, NULL, &s);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
+
+  if (err != HSA_STATUS_SUCCESS) {
+    // Do not leak the signal when the copy could not be started
+    (void)hsa_signal_destroy(s);
+    RET_IF_HSA_ERR(err);
+  }
+
+  if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
+                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
+    err = HSA_STATUS_ERROR;
+    std::cout << "Async copy signal error" << std::endl;
+
+    // Do not leak the signal when the wait failed
+    (void)hsa_signal_destroy(s);
+    RET_IF_HSA_ERR(err);
+  }
+
+  err = hsa_signal_destroy(s);
+
+  RET_IF_HSA_ERR(err);
+
+  return err;
+}
+
+// AlignDown and AlignUp are two utility functions that find an aligned
+// boundary at or below (AlignDown) or at or above (AlignUp) a given value
+// (address). AlignDown clears the low bits selected by "alignment - 1";
+// presumably "alignment" is expected to be a non-zero power of two.
+static intptr_t
+AlignDown(intptr_t value, size_t alignment) {
+  const size_t low_bits = alignment - 1;
+  return (intptr_t) (value & ~low_bits);
+}
+// Rounds the address "value" up to the next multiple of "alignment" by
+// adding "alignment - 1" and then aligning the sum down.
+static void*
+AlignUp(void* value, size_t alignment) {
+  const uintptr_t bumped =
+    reinterpret_cast<uintptr_t>(value) + alignment - 1;
+  return reinterpret_cast<void*>(AlignDown(bumped, alignment));
+}
+
+// Fills the caller's AQL dispatch packet from the values collected in the
+// BinarySearch structure. The dispatch is one-dimensional: the y and z
+// work-group and grid sizes are set to 1. The header is left at 0 here; the
+// real header is written right before the doorbell is rung.
+void PopulateAQLPacket(BinarySearch const* bs,
+                       hsa_kernel_dispatch_packet_t* aql) {
+  hsa_kernel_dispatch_packet_t& packet = *aql;
+
+  // Placeholder header and the setup field
+  packet.header = 0;
+  packet.setup = 1;
+
+  // Dispatch geometry
+  packet.workgroup_size_x = bs->work_group_size;
+  packet.workgroup_size_y = 1;
+  packet.workgroup_size_z = 1;
+  packet.grid_size_x = bs->work_grid_size;
+  packet.grid_size_y = 1;
+  packet.grid_size_z = 1;
+
+  // Kernel description, arguments and completion signal
+  packet.private_segment_size = bs->private_segment_size;
+  packet.group_segment_size = bs->group_segment_size;
+  packet.kernel_object = bs->kernel_object;
+  packet.kernarg_address = bs->kern_arg_address;
+  packet.completion_signal = bs->signal;
+}
+/*
+ * Reserves the next slot of queue "q" (the write index is advanced by one)
+ * and copies every field of the provided AQL packet into it except the first
+ * 32 bits, which hold the header and setup fields. Those must be written
+ * last.
+ */
+void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql,
+                     hsa_queue_t* q) {
+  hsa_kernel_dispatch_packet_t* const slots =
+    reinterpret_cast<hsa_kernel_dispatch_packet_t*>(q->base_address);
+  const uint32_t wrap_mask = q->size - 1;
+  const uint64_t write_idx = hsa_queue_add_write_index_relaxed(q, 1);
+
+  // The write index grows monotonically; masking maps it onto a queue slot
+  hsa_kernel_dispatch_packet_t* slot = slots + (write_idx & wrap_mask);
+
+  slot->workgroup_size_x = in_aql->workgroup_size_x;
+  slot->workgroup_size_y = in_aql->workgroup_size_y;
+  slot->workgroup_size_z = in_aql->workgroup_size_z;
+  slot->grid_size_x = in_aql->grid_size_x;
+  slot->grid_size_y = in_aql->grid_size_y;
+  slot->grid_size_z = in_aql->grid_size_z;
+  slot->private_segment_size = in_aql->private_segment_size;
+  slot->group_segment_size = in_aql->group_segment_size;
+  slot->kernel_object = in_aql->kernel_object;
+  slot->kernarg_address = in_aql->kernarg_address;
+  slot->completion_signal = in_aql->completion_signal;
+}
+
+// Allocates a buffer from the kern_arg pool found earlier, copies the
+// "arg_size" bytes at "args" to a suitably aligned address inside it, and
+// makes the buffer accessible to both the GPU and the CPU agent.
+// On success bs->kern_arg_buffer holds the start of the allocation (the
+// pointer to free), bs->kern_arg_address holds the aligned argument address,
+// and that same address is also stored through aql_buf_ptr.
+hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args,
+                                 size_t arg_size, void** aql_buf_ptr) {
+  // The kernel arguments must start at the alignment reported for the kernel
+  // (bs->kernarg_align). The pool allocation may not be aligned that way, so
+  // over-allocate by twice the alignment to leave room for the adjustment.
+  const size_t alignment = bs->kernarg_align;
+  const size_t alloc_size = arg_size + (alignment << 1);
+
+  void* raw_buf = nullptr;
+  hsa_status_t status =
+    hsa_amd_memory_pool_allocate(bs->kern_arg_pool, alloc_size, 0, &raw_buf);
+  RET_IF_HSA_ERR(status);
+
+  // Start of the allocation, and the aligned start of the arguments
+  bs->kern_arg_buffer = raw_buf;
+  bs->kern_arg_address = AlignUp(raw_buf, alignment);
+
+  assert(arg_size >= bs->kernarg_size);
+  assert(((uintptr_t)bs->kern_arg_address + arg_size) <
+         ((uintptr_t)bs->kern_arg_buffer + alloc_size));
+
+  (void)memcpy(bs->kern_arg_address, args, arg_size);
+
+  // Make sure both the CPU and GPU can access the kernel arguments
+  hsa_agent_t agents[2] = {bs->gpu_dev, bs->cpu_dev};
+  status = hsa_amd_agents_allow_access(2, agents, NULL, bs->kern_arg_buffer);
+  RET_IF_HSA_ERR(status);
+
+  // Hand the aligned address back to the caller
+  *aql_buf_ptr = bs->kern_arg_address;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// This wrapper atomically writes the provided header and setup to the
+// provided AQL packet with release ordering. The header occupies the low 16
+// bits and the setup the high 16 bits of the packet's first 32-bit word. The
+// provided AQL packet address should be in the queue memory space.
+inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
+                                  hsa_kernel_dispatch_packet_t* queue_packet) {
+  // Widen to unsigned before shifting: a uint16_t promotes to (signed) int,
+  // and shifting a value >= 0x8000 left by 16 would overflow it.
+  const uint32_t header_and_setup =
+    static_cast<uint32_t>(header) | (static_cast<uint32_t>(setup) << 16);
+  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
+                   header_and_setup, __ATOMIC_RELEASE);
+}
+
+// Once all the required data for kernel execution is collected (in this
+// application it is stored in the BinarySearch structure) we can put it in
+// an AQL packet and ring the queue door bell to tell the command processor to
+// execute it.
+//
+// The kernel is dispatched repeatedly, each pass narrowing the search range,
+// and the remaining range is finally scanned on the host. The result is
+// printed to stdout and left in bs->output.
+// Returns HSA_STATUS_SUCCESS, or the status of the first failing step.
+hsa_status_t Run(BinarySearch* bs) {
+  hsa_status_t err;
+
+  std::cout << "Executing kernel " << bs->kernel_name << std::endl;
+
+  // Adjust the size of workgroup
+  // This is mostly application specific.
+  if (bs->work_group_size > 64) {
+    bs->work_group_size = 64;
+    bs->num_sub_divisions = bs->length / bs->work_group_size;
+
+    if (bs->num_sub_divisions < bs->work_group_size) {
+      bs->num_sub_divisions = bs->work_group_size;
+    }
+  }
+
+  // One work-item per sub-division. This is set unconditionally so that the
+  // grid size is also defined when the work-group size needed no adjustment.
+  bs->work_grid_size = bs->num_sub_divisions;
+
+  // Explanation of BinarySearch algorithm.
+  /*
+   * Since a plain binary search on the GPU would not achieve much benefit
+   * over the CPU we are doing an N'ary search. We split the array into N
+   * segments every pass and therefore get log (base N) passes instead of log
+   * (base 2) passes.
+   *
+   * In every pass, only the thread that can potentially have the element we
+   * are looking for writes to the output array. For ex: if we are looking to
+   * find 4567 in the array and every thread is searching over a segment of
+   * 1000 values and the input array is 1, 2, 3, 4,... then the first thread
+   * is searching in 1 to 1000, the second one from 1001 to 2000, etc. The
+   * first one does not write to the output. The second one doesn't either.
+   * The fifth one however is from 4001 to 5000. So it can potentially have
+   * the element 4567 which lies between them.
+   *
+   * This particular thread writes to the output the lower bound, upper bound
+   * and whether the element equals the lower bound element. So, it would be
+   * 4001, 5000, 0
+   *
+   * The next pass would subdivide 4001 to 5000 into smaller segments and
+   * continue the same process from there.
+   *
+   * When a pass returns 1 in the third element, it means the element has been
+   * found and we can stop executing the kernel. If the element is not found,
+   * then the execution stops after looking at segment of size 1.
+   */
+
+  uint32_t global_lower_bound = 0;
+  uint32_t global_upper_bound = bs->length - 1;
+  uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) /
+                          bs->num_sub_divisions;
+
+  // Nothing to dispatch when the value lies outside the range of the input
+  if ((bs->input[0] > bs->find_me) ||
+      (bs->input[bs->length - 1] < bs->find_me)) {
+    bs->output[0] = 0;
+    bs->output[1] = bs->length - 1;
+    bs->output[2] = 0;
+    std::cout << "Returning too early" << std::endl;
+    return HSA_STATUS_SUCCESS;
+  }
+
+  bs->output[3] = 1;
+
+  // Setup the kernel args
+  // See the meta-data for the compiled OpenCL kernel code to ascertain
+  // the sizes, padding and alignment required for kernel arguments.
+  // This can be seen by executing
+  // $ amdgcn-amd-amdhsa-readelf -aw ./binary_search_kernels.hsaco
+  // The kernel code will expect the following arguments aligned as shown.
+  typedef uint32_t uint2[2];
+  typedef uint32_t uint4[4];
+  struct __attribute__((aligned(16))) local_args_t {
+    uint4* outputArray;
+    uint2*  sortedArray;
+    uint32_t findMe;
+    uint32_t pad;
+    uint64_t global_offset_x;
+    uint64_t global_offset_y;
+    uint64_t global_offset_z;
+  } local_args;
+
+  local_args.outputArray = reinterpret_cast<uint4*>(bs->output);
+  local_args.sortedArray = reinterpret_cast<uint2*>(bs->input_arr_local);
+  local_args.findMe = bs->find_me;
+  // Give the padding a defined value; the whole structure is copied below
+  local_args.pad = 0;
+  local_args.global_offset_x = 0;
+  local_args.global_offset_y = 0;
+  local_args.global_offset_z = 0;
+
+  // Copy the kernel args structure into kernel arg memory
+  err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args),
+                            &bs->kern_arg_address);
+  RET_IF_HSA_ERR(err);
+
+  // Populate an AQL packet with the info we've gathered
+  hsa_kernel_dispatch_packet_t aql;
+  PopulateAQLPacket(bs, &aql);
+
+  // Size in bytes of the boundary array: two values per sub-division
+  uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);
+
+  while ((sub_div_size > 1) && (bs->output[3] != 0)) {
+    // Record the first and last element of every sub-division
+    for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) {
+      int idx1 = i * sub_div_size;
+      int idx2 = ((i + 1) * sub_div_size) - 1;
+      bs->input_arr[2 * i] = bs->input[idx1];
+      bs->input_arr[2 * i + 1] = bs->input[idx2];
+    }
+
+    // Copy kernel parameter from system memory to local memory
+    err = AgentMemcpy(reinterpret_cast<uint8_t*>(bs->input_arr_local),
+                      reinterpret_cast<uint8_t*>(bs->input_arr),
+                                        in_length, bs->gpu_dev, bs->cpu_dev);
+
+    RET_IF_HSA_ERR(err);
+
+    // Reset output buffer to zero
+    bs->output[3] = 0;
+
+    // Dispatch kernel with global work size, work group size with ONE
+    // dimension and wait for kernel to complete
+
+    // Compute the write index of queue and copy Aql packet into it
+    uint64_t que_idx = hsa_queue_load_write_index_relaxed(bs->queue);
+
+    const uint32_t mask = bs->queue->size - 1;
+
+    // This function simply copies the data we've collected so far in our
+    // local AQL packet into the queue, except the setup and header fields.
+    WriteAQLToQueue(&aql, bs->queue);
+
+    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+    // Set the packet's type, acquire and release fences. This should be done
+    // atomically after all the other fields have been set, using release
+    // memory ordering to ensure all the fields are set when the door bell
+    // signal is activated.
+    void* q_base = bs->queue->base_address;
+
+    AtomicSetPacketHeader(aql_header, aql.setup,
+                      &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
+                                                   (q_base))[que_idx & mask]);
+
+    // Increment the write index and ring the doorbell to dispatch kernel.
+    hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1));
+    hsa_signal_store_relaxed(bs->queue->doorbell_signal, que_idx);
+
+    // Wait on the dispatch signal until the kernel is finished.
+    // Modify the wait condition to HSA_WAIT_STATE_ACTIVE (instead of
+    // HSA_WAIT_STATE_BLOCKED) if polling is needed instead of blocking, as we
+    // have below.
+    // The call below will block until the condition is met. Below we have said
+    // the condition is that the signal value (initialized to 1) associated with
+    // the queue is less than 1. When the kernel associated with the queued AQL
+    // packet has completed execution, the signal value is automatically
+    // decremented by the packet processor.
+    hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal,
+                               HSA_SIGNAL_CONDITION_LT, 1,
+                               UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+
+    // value should be 0, or we timed-out
+    if (value) {
+      std::cout << "Timed out waiting for kernel to complete?" << std::endl;
+      RET_IF_HSA_ERR(HSA_STATUS_ERROR);
+    }
+
+    // Reset the signal to its initial value for the next iteration
+    hsa_signal_store_screlease(bs->signal, 1);
+
+    // Binary search algorithm stuff...
+    global_lower_bound = bs->output[0] * sub_div_size;
+    global_upper_bound = global_lower_bound + sub_div_size - 1;
+    sub_div_size = (global_upper_bound - global_lower_bound + 1) /
+                   bs->num_sub_divisions;
+  }
+
+  uint32_t element_index = UINT_MAX;
+
+  // Scan the remaining range on the host
+  for (uint32_t i = global_lower_bound; i <= global_upper_bound; i++) {
+    if (bs->input[i] == bs->find_me) {
+      element_index = i;
+      bs->output[0] = i;
+      bs->output[1] = i + 1;
+      bs->output[2] = 1;
+      break;
+    }
+
+    // Element is not found in region specified
+    // by global lower bound to global upper bound
+    bs->output[2] = 0;
+  }
+
+  uint32_t is_elem_found = bs->output[2];
+
+  std::cout << "Lower bound = " << global_lower_bound << std::endl;
+  std::cout << "Upper bound = " << global_upper_bound << std::endl;
+  std::cout << "Element search for = " << bs->find_me << std::endl;
+
+
+  if (is_elem_found == 1) {
+    std::cout << "Element found at index " << element_index << std::endl;
+  } else {
+    std::cout << "Element value " << bs->find_me << " not found" << std::endl;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Release all the RocR resources we have acquired in this application: the
+// four data buffers, the kernel argument buffer, the queue and the completion
+// signal, and finally shut down the HSA runtime.
+// Returns HSA_STATUS_SUCCESS, or the status of the first failing call (in
+// which case the remaining resources are not released).
+hsa_status_t CleanUp(BinarySearch* bs) {
+  hsa_status_t err;
+
+  err = hsa_amd_memory_pool_free(bs->input);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_amd_memory_pool_free(bs->output);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_amd_memory_pool_free(bs->input_arr);
+  RET_IF_HSA_ERR(err);
+
+  // This buffer is allocated alongside input_arr and must be freed as well
+  err = hsa_amd_memory_pool_free(bs->input_arr_local);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_amd_memory_pool_free(bs->kern_arg_buffer);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_queue_destroy(bs->queue);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_signal_destroy(bs->signal);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_shut_down();
+  RET_IF_HSA_ERR(err);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Entry point of the sample. Runs the steps below in order and stops at the
+// first failure; the process exit status is 0 on success and the failing
+// HSA status value otherwise.
+int main(int argc, char* argv[]) {
+  // This BinarySearch structure (bs) below holds all of the appl. specific
+  // info we need to run the sample. This includes algorithm specific
+  // information as well as handles to RocR/HSA objects.
+
+  // The basic structure of this sample is to fill in this structure with the
+  // required RocR/HSA handles to RocR resources (e.g., agents, memory pools,
+  // queues, etc.) and then dispatch the packets to the queue, and examine the
+  // output.
+
+  BinarySearch bs;
+  hsa_status_t err;
+
+  // Set some working values specific to this application
+  InitializeBinarySearch(&bs);
+
+  // hsa_init() initializes internal data structures and causes devices
+  // (agents), memory pools and other resources to be discovered.
+  err = hsa_init();
+  RET_IF_HSA_ERR(err);
+
+  // Find the agents needed for the sample
+  err = FindDevices(&bs);
+  RET_IF_HSA_ERR(err);
+
+  // Create the completion signal used when dispatching a packet
+  err = hsa_signal_create(1, 0, NULL, &bs.signal);
+  RET_IF_HSA_ERR(err);
+
+  // Create a queue to submit our binary search AQL packets
+  err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
+                         UINT32_MAX, UINT32_MAX, &bs.queue);
+  RET_IF_HSA_ERR(err);
+
+  // Find the HSA memory pools we need to run this sample
+  err = FindPools(&bs);
+  RET_IF_HSA_ERR(err);
+
+  // Allocate memory from the correct memory pool, and initialize them as
+  // needed for the algorithm.
+  err = AllocateAndInitBuffers(&bs);
+  RET_IF_HSA_ERR(err);
+
+  // Create a kernel object from the pre-compiled kernel, and read some
+  // attributes associated with the kernel that we will need.
+  err = LoadKernelFromObjFile(&bs);
+  RET_IF_HSA_ERR(err);
+
+  // Fill in the AQL packet, assign the kernel arguments, enqueue the packet,
+  // "ring" the doorbell, and wait for completion.
+  err = Run(&bs);
+  RET_IF_HSA_ERR(err);
+
+  // Release all the RocR resources we've acquired and shutdown HSA. A failed
+  // clean-up is reported like any other failing step.
+  err = CleanUp(&bs);
+  RET_IF_HSA_ERR(err);
+
+  return 0;
+}
+
+
+#undef RET_IF_HSA_ERR
@@ -0,0 +1,127 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+/**
+ * One instance of this kernel call is a thread.
+ * Each thread finds out the segment in which it should look for the element.
+ * After that, it checks if the element is between the lower bound and upper
+ * bound of its segment. If yes, then this segment becomes the total
+ * searchspace for the next pass.
+ *
+ * To achieve this, it writes the lower bound and upper bound to the output
+ * array. In case the element at the left end (lower bound) matches the element
+ * we are looking for, that is marked in the output and we no longer need to
+ * look any further.
+ */
+ 
+__kernel void
+binarySearch(__global uint4 * outputArray,
+             __const __global uint2  * sortedArray,
+             const   unsigned int findMe) {
+  // Each work-item inspects exactly one segment of the search space.
+  const unsigned int workItem = get_global_id(0);
+
+  // .x holds the lower bound and .y the upper bound of this segment.
+  const uint2 bounds = sortedArray[workItem];
+
+  // Only the work-item whose segment brackets the key reports anything;
+  // every other work-item leaves the output untouched.
+  const bool inSegment = (bounds.x <= findMe) && (findMe <= bounds.y);
+  if (inSegment) {
+    // Record this segment as the search space for the next pass and
+    // raise the "found a candidate segment" marker in .w.
+    outputArray[0].x = workItem;
+    outputArray[0].w = 1;
+  }
+}
+
+
+__kernel void
+binarySearch_mulkeys(__global int *keys,
+                     __global uint *input,
+                     const unsigned int numKeys,
+                     __global int *output) {
+
+  // Every work-item owns a fixed window of 256 consecutive input elements.
+  const int segment = get_global_id(0);
+  const int first = segment * 256;
+  const int last = first + 255;
+
+  // For each key that falls inside this window, publish the window's first
+  // index as that key's result.
+  int k = 0;
+  while (k < numKeys) {
+    const int key = keys[k];
+    if (key >= input[first] && key <= input[last]) {
+      output[k] = first;
+    }
+    ++k;
+  }
+
+}
+
+
+/**
+ * Searches several keys concurrently. The input is split into
+ * numSubdivisions segments; work-item g searches segment
+ * (g % numSubdivisions) for key number (g / numSubdivisions) with a classic
+ * binary search and, on a match, stores the matching input index in
+ * output[g / numSubdivisions]. Nothing is written when the key is not found.
+ *
+ * The input is presumably sorted in ascending order (required for the
+ * binary search to be meaningful) -- confirm against the host code.
+ */
+__kernel void
+binarySearch_mulkeysConcurrent(__global uint *keys,
+                               __global uint *input,
+                               const unsigned int inputSize, // num. of inputs
+                               const unsigned int numSubdivisions,
+                               __global int *output) {
+
+  // Nothing to search; this also protects the division and modulo below
+  // from a zero divisor.
+  if (inputSize == 0 || numSubdivisions == 0) {
+    return;
+  }
+
+  const unsigned int segLen = inputSize / numSubdivisions;
+  const size_t keyIdx = get_global_id(0) / numSubdivisions;
+  const int lastIndex = (int)(inputSize - 1);
+
+  int lBound = (get_global_id(0) % numSubdivisions) * segLen;
+  int uBound = lBound + segLen;
+  // uBound is an inclusive bound. Without this clamp the last segment ends
+  // at inputSize and the search could read input[inputSize].
+  if (uBound > lastIndex) {
+    uBound = lastIndex;
+  }
+
+  int myKey = keys[keyIdx];
+  int mid;
+
+  while(uBound >= lBound) {
+    // Overflow-safe midpoint.
+    mid = lBound + (uBound - lBound) / 2;
+    if(input[mid] == myKey) {
+      output[keyIdx] = mid;
+      return;
+    } else if(input[mid] > myKey) {
+      uBound = mid - 1;
+    } else {
+      lBound = mid + 1;
+    }
+  }
+}
@@ -0,0 +1,15 @@
+#
+# Source files for Rocr Utils library
+#
+# The pattern is relative, so it is matched against the current source
+# directory, and the list is only computed when CMake configures: newly added
+# .cpp files are not picked up until CMake is re-run.
+#
+file( GLOB MODULE_SRC "*.cpp" )
+
+#
+# Header files include path(s).
+#
+# ROCR_INC_DIR is read from the environment at configure time. The include
+# path is directory-scoped, i.e. it applies to every target defined in this
+# directory scope, not only to the library below.
+#
+include_directories ( $ENV{ROCR_INC_DIR} )
+
+#
+# Build Utils as a Static Library object
+#
+# COMMON_LIB is not set in this file; presumably the including/parent
+# CMakeLists.txt defines it (TODO confirm). The system libraries are linked by
+# plain name using the keyword-less target_link_libraries signature.
+#
+add_library( ${COMMON_LIB} STATIC ${MODULE_SRC} )
+target_link_libraries( ${COMMON_LIB} c stdc++ dl pthread rt )
@@ -0,0 +1,45 @@
+#include "common.hpp"
+
+// Aborts the program when an HSA call did not succeed: prints a fixed message
+// to std::cerr and exits with EXIT_FAILURE. Returns normally on success.
+void ErrorCheck(hsa_status_t hsa_error_code) {
+  if (hsa_error_code == HSA_STATUS_SUCCESS) {
+    return;
+  }
+
+  std::cerr << "HSA reported error!" << std::endl;
+  exit(EXIT_FAILURE);
+}
+
+// Agent callback: when 'agent' is a GPU, stores it into the hsa_agent_t that
+// 'data' points to. Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL
+// 'data', the failing status if the device type cannot be queried, and
+// HSA_STATUS_SUCCESS otherwise (whether or not the agent was a GPU).
+hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
+  if (data == NULL) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  hsa_device_type_t device_kind;
+  const hsa_status_t query_status =
+      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_kind);
+  if (query_status != HSA_STATUS_SUCCESS) {
+    return query_status;
+  }
+
+  if (device_kind == HSA_DEVICE_TYPE_GPU) {
+    hsa_agent_t* result = (hsa_agent_t*)data;
+    *result = agent;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Region callback: when 'region' reports itself as host accessible, stores it
+// into the hsa_region_t that 'data' points to. Returns
+// HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL 'data', the failing status if
+// the attribute cannot be queried, and HSA_STATUS_SUCCESS otherwise.
+hsa_status_t FindHostRegion(hsa_region_t region, void* data) {
+  if (data == NULL) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  bool host_accessible = false;
+  const hsa_status_t query_status = hsa_region_get_info(
+      region, (hsa_region_info_t)HSA_AMD_REGION_INFO_HOST_ACCESSIBLE, &host_accessible);
+  if (query_status != HSA_STATUS_SUCCESS) {
+    return query_status;
+  }
+
+  if (host_accessible) {
+    hsa_region_t* result = (hsa_region_t*)data;
+    *result = region;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
@@ -0,0 +1,27 @@
+#ifndef COMMON_COMMON_HPP
+#define COMMON_COMMON_HPP
+
+#include <cstdlib>
+#include <iostream>
+
+#include "hsa.h"
+#include "hsa_ext_finalize.h"
+#include "hsa_ext_amd.h"
+
+// ALIGNED_(x): compiler-specific alignment attribute. It is defined only for
+// MSVC and for compilers that define __GNUC__; with any other compiler the
+// macro stays undefined.
+#if defined(_MSC_VER)
+#define ALIGNED_(x) __declspec(align(x))
+#else
+#if defined(__GNUC__)
+#define ALIGNED_(x) __attribute__((aligned(x)))
+#endif  // __GNUC__
+#endif  // _MSC_VER
+
+// MULTILINE(...): turns its (possibly multi-line) arguments into one string
+// literal via preprocessor stringification.
+#define MULTILINE(...) #__VA_ARGS__
+
+// Prints an error message to std::cerr and exits the process with
+// EXIT_FAILURE when hsa_error_code is not HSA_STATUS_SUCCESS.
+void ErrorCheck(hsa_status_t hsa_error_code);
+
+// Agent callback: if 'agent' is a GPU, writes it to the hsa_agent_t pointed
+// to by 'data'. Returns an error for a NULL 'data' or a failed device query.
+hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data);
+
+// Region callback: if 'region' is host accessible, writes it to the
+// hsa_region_t pointed to by 'data'. Returns an error for a NULL 'data' or a
+// failed attribute query.
+hsa_status_t FindHostRegion(hsa_region_t region, void* data);
+
+#endif  // COMMON_COMMON_HPP
@@ -0,0 +1,262 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted
+provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of
+conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of
+conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include "helper_funcs.hpp"
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+
+/*
+ * Prints the header followed by " :", then the array as 'height' rows of
+ * 'width' elements each. Every row starts with "> " and every element is
+ * followed by a single space. All width * height elements are printed.
+ */
+template <typename T>
+void printArray(const std::string header, const T* data, const int width, const int height) {
+  std::cout << header << " :\n";
+
+  for (int row = 0; row < height; ++row) {
+    const int rowBase = row * width;
+
+    std::cout << "> ";
+    for (int col = 0; col < width; ++col) {
+      std::cout << data[rowBase + col] << " ";
+    }
+    std::cout << "\n";
+  }
+}
+
+/*
+ * Fills a width x height array with pseudo-random values derived from rand()
+ * and offset by rangeMin. A seed of 0 means "seed from the current time";
+ * any other seed is passed to srand() as is.
+ * Returns HSA_SDK_FAILURE for a NULL array, HSA_SDK_SUCCESS otherwise.
+ */
+template <typename T>
+int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax,
+               unsigned int seed) {
+  if (arrayPtr == NULL) {
+    error("Cannot fill array. NULL pointer.");
+    return HSA_SDK_FAILURE;
+  }
+
+  if (seed == 0) {
+    seed = (unsigned int)time(NULL);
+  }
+  srand(seed);
+
+  const double span = double(rangeMax - rangeMin) + 1.0;
+
+  /* random initialisation of input, row by row */
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      arrayPtr[row * width + col] = rangeMin + T(span * rand() / (RAND_MAX + 1.0));
+    }
+  }
+
+  return HSA_SDK_SUCCESS;
+}
+
+/*
+ * Fills a width x height array so that every element holds its own linear
+ * position (row * width + column), converted to T.
+ * Returns HSA_SDK_FAILURE for a NULL array, HSA_SDK_SUCCESS otherwise.
+ */
+template <typename T> int fillPos(T* arrayPtr, const int width, const int height) {
+  if (!arrayPtr) {
+    error("Cannot fill array. NULL pointer.");
+    return HSA_SDK_FAILURE;
+  }
+
+  /* initialisation of input with positions*/
+  // The counters and the index are plain int (as in fillConstant) rather than
+  // T: with a narrow T such as short, a T-typed counter overflows before it
+  // reaches 'height'/'width', which breaks both the loop and the indexing.
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++) {
+      const int index = i * width + j;
+      arrayPtr[index] = static_cast<T>(index);
+    }
+
+  return HSA_SDK_SUCCESS;
+}
+
+/*
+ * Sets every element of a width x height array to 'val'.
+ * Returns HSA_SDK_FAILURE for a NULL array, HSA_SDK_SUCCESS otherwise.
+ */
+template <typename T>
+int fillConstant(T* arrayPtr, const int width, const int height, const T val) {
+  if (arrayPtr == NULL) {
+    error("Cannot fill array. NULL pointer.");
+    return HSA_SDK_FAILURE;
+  }
+
+  /* initialisation of input with constant value, row by row */
+  for (int row = 0; row < height; ++row) {
+    const int rowBase = row * width;
+    for (int col = 0; col < width; ++col) {
+      arrayPtr[rowBase + col] = val;
+    }
+  }
+
+  return HSA_SDK_SUCCESS;
+}
+
+/*
+ * Rounds 'val' up to the nearest power of two; a value that already is a
+ * power of two is returned unchanged. An input of 0 yields 0.
+ *
+ * Works by smearing the highest set bit of (val - 1) into every lower bit
+ * and adding one. The result must be representable in T.
+ */
+template <typename T> T roundToPowerOf2(T val) {
+  // Bit width of T, assuming 8-bit bytes.
+  const int bits = static_cast<int>(sizeof(T)) * 8;
+
+  val--;
+  // Shift amounts 1, 2, 4, ... up to bits / 2: log2(bits) steps are needed to
+  // propagate the top bit all the way down, and the shift stays below the
+  // width of T.
+  for (int shift = 1; shift < bits; shift <<= 1) val |= val >> shift;
+  val++;
+
+  return val;
+}
+
+/*
+ * Reports whether 'val' is a (positive) power of two.
+ * Note the return convention: HSA_SDK_SUCCESS when it is, HSA_SDK_FAILURE
+ * when it is not (including for 0).
+ */
+template <typename T> int isPowerOf2(T val) {
+  const long long wide = val;
+
+  // (wide & -wide) isolates the lowest set bit; a power of two consists of
+  // exactly that one bit.
+  const bool singleBit = (wide != 0) && ((wide & (-wide)) == wide);
+
+  return singleBit ? HSA_SDK_SUCCESS : HSA_SDK_FAILURE;
+}
+
+
+/*
+ * Compares 'input' with 'reference'. On a mismatch the message is reported
+ * through error() and false is returned; on a match nothing is printed and
+ * true is returned. 'isAPIerror' is accepted but not used by this function.
+ */
+template <typename T> bool checkVal(T input, T reference, std::string message, bool isAPIerror) {
+  const bool matches = (input == reference);
+  if (!matches) {
+    error(message);
+  }
+  return matches;
+}
+
+
+/*
+ * Formats 't' as a string after applying the stream manipulator 'r'
+ * (e.g. std::dec or std::hex) to a fresh output string stream.
+ */
+template <typename T> std::string toString(T t, std::ios_base& (*r)(std::ios_base&)) {
+  std::ostringstream formatted;
+  formatted << r;
+  formatted << t;
+  return formatted.str();
+}
+
+
+/*
+ * Compares 'data' against 'refData' over 'length' elements using the relative
+ * L2-norm error: sqrt(sum((ref - data)^2)) / sqrt(sum(ref^2)).
+ * Returns true when that error is below 'epsilon'.
+ * Returns false when the reference is (numerically) all zeros, because the
+ * relative error is undefined in that case.
+ */
+bool compare(const float* refData, const float* data, const int length, const float epsilon) {
+  float sumSqError = 0.0f;
+  float sumSqRef = 0.0f;
+
+  // Start at index 0: every element, including the first, takes part in the
+  // comparison.
+  for (int i = 0; i < length; ++i) {
+    const float diff = refData[i] - data[i];
+    sumSqError += diff * diff;
+    sumSqRef += refData[i] * refData[i];
+  }
+
+  if (::fabs((float)sumSqRef) < 1e-7f) {
+    return false;
+  }
+
+  const float normRef = ::sqrtf((float)sumSqRef);
+  const float normError = ::sqrtf((float)sumSqError);
+
+  return (normError / normRef) < epsilon;
+}
+
+/*
+ * Double-precision variant: compares 'data' against 'refData' over 'length'
+ * elements using the relative L2-norm error
+ * sqrt(sum((ref - data)^2)) / sqrt(sum(ref^2)).
+ * Returns true when that error is below 'epsilon'.
+ * Returns false when the reference is (numerically) all zeros, because the
+ * relative error is undefined in that case.
+ */
+bool compare(const double* refData, const double* data, const int length, const double epsilon) {
+  double sumSqError = 0.0;
+  double sumSqRef = 0.0;
+
+  // Start at index 0: every element, including the first, takes part in the
+  // comparison.
+  for (int i = 0; i < length; ++i) {
+    const double diff = refData[i] - data[i];
+    sumSqError += diff * diff;
+    sumSqRef += refData[i] * refData[i];
+  }
+
+  if (::fabs((double)sumSqRef) < 1e-7) {
+    return false;
+  }
+
+  const double normRef = ::sqrt((double)sumSqRef);
+  const double normError = ::sqrt((double)sumSqError);
+
+  return (normError / normRef) < epsilon;
+}
+
+void error(const char* errorMsg) { std::cout << "Error: " << errorMsg << std::endl; }
+
+void error(std::string errorMsg) { std::cout << "Error: " << errorMsg << std::endl; }
+
+void expectedError(const char* errorMsg) {
+  std::cout << "Expected Error: " << errorMsg << std::endl;
+}
+
+void expectedError(std::string errorMsg) {
+  std::cout << "Expected Error: " << errorMsg << std::endl;
+}
+
+
+/////////////////////////////////////////////////////////////////
+// Template Instantiations
+/////////////////////////////////////////////////////////////////
+template void printArray<short>(const std::string, const short*, int, int);
+template void printArray<unsigned char>(const std::string, const unsigned char*, int, int);
+template void printArray<unsigned int>(const std::string, const unsigned int*, int, int);
+template void printArray<int>(const std::string, const int*, int, int);
+template void printArray<long>(const std::string, const long*, int, int);
+template void printArray<float>(const std::string, const float*, int, int);
+template void printArray<double>(const std::string, const double*, int, int);
+
+template int fillRandom<unsigned char>(unsigned char* arrayPtr, const int width, const int height,
+                                       unsigned char rangeMin, unsigned char rangeMax,
+                                       unsigned int seed);
+template int fillRandom<unsigned int>(unsigned int* arrayPtr, const int width, const int height,
+                                      unsigned int rangeMin, unsigned int rangeMax,
+                                      unsigned int seed);
+template int fillRandom<int>(int* arrayPtr, const int width, const int height, int rangeMin,
+                             int rangeMax, unsigned int seed);
+template int fillRandom<long>(long* arrayPtr, const int width, const int height, long rangeMin,
+                              long rangeMax, unsigned int seed);
+template int fillRandom<float>(float* arrayPtr, const int width, const int height, float rangeMin,
+                               float rangeMax, unsigned int seed);
+template int fillRandom<double>(double* arrayPtr, const int width, const int height,
+                                double rangeMin, double rangeMax, unsigned int seed);
+
+template short roundToPowerOf2<short>(short val);
+template unsigned int roundToPowerOf2<unsigned int>(unsigned int val);
+template int roundToPowerOf2<int>(int val);
+template long roundToPowerOf2<long>(long val);
+
+template int isPowerOf2<short>(short val);
+template int isPowerOf2<unsigned int>(unsigned int val);
+template int isPowerOf2<int>(int val);
+template int isPowerOf2<long>(long val);
+
+// Explicit instantiation definitions. These must be spelled "template ...",
+// not "template <> ...": the latter only declares an explicit specialization
+// and emits no code, which leaves users of these functions in other
+// translation units with undefined references.
+template int fillPos<short>(short* arrayPtr, const int width, const int height);
+template int fillPos<unsigned int>(unsigned int* arrayPtr, const int width, const int height);
+template int fillPos<int>(int* arrayPtr, const int width, const int height);
+template int fillPos<long>(long* arrayPtr, const int width, const int height);
+
+template int fillConstant<short>(short* arrayPtr, const int width, const int height,
+                                 const short val);
+template int fillConstant<unsigned int>(unsigned int* arrayPtr, const int width,
+                                        const int height, const unsigned int val);
+template int fillConstant<int>(int* arrayPtr, const int width, const int height, const int val);
+// Instantiated once only: repeating an explicit instantiation definition for
+// the same type is an error.
+template int fillConstant<long>(long* arrayPtr, const int width, const int height,
+                                const long val);
+
+
+template bool checkVal<char>(char input, char reference, std::string message, bool isAPIerror);
+template bool checkVal<bool>(bool input, bool reference, std::string message, bool isAPIerror);
+template bool checkVal<std::string>(std::string input, std::string reference, std::string message,
+                                    bool isAPIerror);
+template bool checkVal<short>(short input, short reference, std::string message, bool isAPIerror);
+template bool checkVal<unsigned int>(unsigned int input, unsigned int reference,
+                                     std::string message, bool isAPIerror);
+template bool checkVal<int>(int input, int reference, std::string message, bool isAPIerror);
+template bool checkVal<long>(long input, long reference, std::string message, bool isAPIerror);
+
+
+template std::string toString<char>(char t, std::ios_base& (*r)(std::ios_base&));
+template std::string toString<short>(short t, std::ios_base& (*r)(std::ios_base&));
+template std::string toString<unsigned int>(unsigned int t, std::ios_base& (*r)(std::ios_base&));
+template std::string toString<int>(int t, std::ios_base& (*r)(std::ios_base&));
+template std::string toString<long>(long t, std::ios_base& (*r)(std::ios_base&));
+template std::string toString<float>(float t, std::ios_base& (*r)(std::ios_base&));
+template std::string toString<double>(double t, std::ios_base& (*r)(std::ios_base&));
@@ -0,0 +1,141 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted
+provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of
+conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of
+conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+#ifndef HELPER_FUNCS_HPP_
+#define HELPER_FUNCS_HPP_
+
+#define HSA_SDK_SUCCESS 0
+#define HSA_SDK_FAILURE 1
+#define HSA_SDK_EXPECTED_FAILURE 2
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <ctime>
+#include <cmath>
+#include <time.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include <malloc.h>
+
+/**
+ * error
+ * constant function, Prints error messages
+ * @param errorMsg char* message
+ */
+void error(const char* errorMsg);
+
+/**
+ * error
+ * constant function, Prints error messages
+ * @param errorMsg std::string message
+ */
+void error(std::string errorMsg);
+
+/**
+ * expectedError
+ * constant function, Prints error messages
+ * @param errorMsg char* message
+ */
+void expectedError(const char* errorMsg);
+
+/**
+ * expectedError
+ * constant function, Prints error messages
+ * @param errorMsg string message
+ */
+void expectedError(std::string errorMsg);
+
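+/**
+ * compare (overloaded for float and double)
+ * compares data against reference data to check the error
+ * @param refData reference values
+ * @param data values to check
+ * @param length number of values to compare
+ * @param epsilon error window (maximum allowed relative error)
+ * @return true if the relative error is below epsilon
+ */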
+bool compare(const float* refData, const float* data, const int length,
+             const float epsilon = 1e-6f);
+bool compare(const double* refData, const double* data, const int length,
+             const double epsilon = 1e-6);
+
+/**
+ * printArray
+ * displays an array on std::cout
+ */
+template <typename T>
+void printArray(const std::string header, const T* data, const int width, const int height);
+
+
+/**
+ * fillRandom
+ * fill array with random values
+ */
+template <typename T>
+int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax,
+               unsigned int seed = 123);
+
+/**
+ * fillPos
+ * fill the specified positions
+ */
+template <typename T> int fillPos(T* arrayPtr, const int width, const int height);
+
+/**
+ * fillConstant
+ * fill the array with constant value
+ */
+template <typename T> int fillConstant(T* arrayPtr, const int width, const int height, const T val);
+
+
+/**
+ * roundToPowerOf2
+ * rounds to a power of 2
+ */
+template <typename T> T roundToPowerOf2(T val);
+
+/**
+ * isPowerOf2
+ * checks if input is a power of 2
+ */
+template <typename T> int isPowerOf2(T val);
+
+/**
+ * checkVal
+ * Set the default (isAPIerror) parameter to false
+ * if checkVal is used to check something other than an OpenCL API error code
+ */
+template <typename T>
+bool checkVal(T input, T reference, std::string message, bool isAPIerror = true);
+
+/**
+ * toString
+ * convert a T type to string
+ */
+template <typename T> std::string toString(T t, std::ios_base& (*r)(std::ios_base&));
+
+
+#endif
@@ -0,0 +1,155 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <cassert>
+
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "hsa.h"
+#include "hsa_ext_profiler.h"
+#include "amd_hsa_tools_interfaces.h"
+
+#include "hsa_perf_cntrs.hpp"
+
+using namespace std;
+
+// Pre-dispatch hook: starts a perf counter session on the dispatching queue.
+// 'usrArg' is interpreted as a pointer to the hsa_ext_tools_pmu_t to use.
+// Failures are only detected through assert().
+void PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
+  assert((dispParam->pre_dispatch) && "Pre Dispatch Callback Param is Malformed");
+
+  hsa_ext_tools_pmu_t& pmu = *reinterpret_cast<hsa_ext_tools_pmu_t*>(usrArg);
+  hsa_status_t beginStatus =
+      hsa_ext_tools_pmu_begin(pmu, dispParam->queue, dispParam->aql_translation_handle, true);
+  assert((beginStatus == HSA_STATUS_SUCCESS) && "Error in beginning Perf Cntr Session");
+}
+
+// Post-dispatch hook: ends the perf counter session on the dispatching queue.
+// 'usrArg' is interpreted as a pointer to the hsa_ext_tools_pmu_t to use.
+// Failures are only detected through assert().
+void PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
+  assert((!dispParam->pre_dispatch) && "Post Dispatch Callback Param is Malformed");
+
+  hsa_ext_tools_pmu_t& pmu = *reinterpret_cast<hsa_ext_tools_pmu_t*>(usrArg);
+  hsa_status_t endStatus =
+      hsa_ext_tools_pmu_end(pmu, dispParam->queue, dispParam->aql_translation_handle);
+  assert((endStatus == HSA_STATUS_SUCCESS) && "Error in endning Perf Cntr Session");
+}
+
+// Constructor of the class. The counter list starts empty and the perf
+// manager handle is NULL until Init() is called.
+RocrPerfCntrApp::RocrPerfCntrApp() : perfMgr_(NULL) {}
+
+// Destructor of the class. Frees the CntrInfo descriptors that Init()
+// allocated with new. The PMU and the hardware counter handles are not
+// released here.
+// NOTE(review): this assumes the object is never copied (the class keeps the
+// implicit copy operations, which would share the raw pointers) -- confirm.
+RocrPerfCntrApp::~RocrPerfCntrApp() {
+  for (size_t idx = 0; idx < cntrList_.size(); idx++) {
+    delete cntrList_[idx];
+  }
+  cntrList_.clear();
+}
+
+// Number of perf counter descriptors currently held in the list.
+uint32_t RocrPerfCntrApp::GetNumPerfCntrs() {
+  return static_cast<uint32_t>(cntrList_.size());
+}
+
+// Descriptor of the perf counter at position 'idx'. The index is not range
+// checked; it must be below GetNumPerfCntrs().
+CntrInfo* RocrPerfCntrApp::GetPerfCntr(uint32_t idx) {
+  CntrInfo* const entry = cntrList_[idx];
+  return entry;
+}
+
+// Dumps id, name, block id, current value and validation type of every
+// counter in the list to std::cout, one blank line before and after each
+// counter. Always returns true.
+bool RocrPerfCntrApp::PrintCntrs() {
+  const int count = uint32_t(cntrList_.size());
+
+  int pos = 0;
+  while (pos < count) {
+    const CntrInfo* entry = cntrList_[pos];
+
+    std::cout << std::endl
+              << "Rocr Perf Cntr Id: " << entry->cntrId << std::endl
+              << "Rocr Perf Cntr Name: " << entry->cntrName << std::endl
+              << "Rocr Perf Cntr Blk Id: " << entry->blkId << std::endl
+              << "Rocr Perf Cntr Value: " << entry->cntrResult << std::endl
+              << "Rocr Perf Cntr Validation: " << entry->cnfType << std::endl
+              << std::endl;
+
+    ++pos;
+  }
+
+  return true;
+}
+
+// Initialize the list of perf counters
+// block id of kHsaAiCounterBlockSQ = 14 == 0x0E
+//
+// Builds the list of counters to collect (two SQ block events), creates the
+// perf manager (PMU) for 'agent', and then for every counter: looks up its
+// block handle, creates a counter in that block, programs the event index
+// and enables the counter.
+//
+// Every tools-API call is checked with assert() only, so failures go
+// unnoticed in builds that define NDEBUG. Returns the status of the last
+// tools-API call made.
+hsa_status_t RocrPerfCntrApp::Init(hsa_agent_t agent) {
+  // Initialize the list of Perf Cntrs
+  // Add SQ counter for number of waves
+  CntrInfo* info = NULL;
+  cntrList_.reserve(23);
+
+  // Event for number of Waves
+  // (event id 0x4 in block 0x0E; handles start as NULL and are filled in
+  // below; the initial result value is 0xFFFFFFFF)
+  info = new CntrInfo(0x4, "SQ_SQ_PERF_SEL_WAVES", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF,
+                      CntrValCnf_Exact);
+  cntrList_.push_back(info);
+
+  // Event for number of Threads
+  // (event id 0xE in block 0x0E)
+  info = new CntrInfo(0xE, "SQ_SQ_PERF_SEL_ITEMS", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF,
+                      CntrValCnf_Exact);
+  cntrList_.push_back(info);
+
+
+  // Create an instance of Perf Mgr
+  hsa_status_t status;
+  status = hsa_ext_tools_create_pmu(agent, &perfMgr_);
+  assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr Mgr");
+
+  // Process each counter from the list as necessary
+  // each counter descriptor with its perf block handle
+  // and create an instance of counter in that block
+  uint32_t size = GetNumPerfCntrs();
+  for (uint32_t idx = 0; idx < size; idx++) {
+    info = GetPerfCntr(idx);
+
+    // Obtain the handle of perf block
+    // (only when the descriptor does not carry one yet)
+    if (info->blkHndl == NULL) {
+      status = hsa_ext_tools_get_counter_block_by_id(perfMgr_, info->blkId, &info->blkHndl);
+      assert((status == HSA_STATUS_SUCCESS) && "Error in getting Perf Cntr Blk Hndl");
+    }
+
+    // Create an instance of counter in the perf block
+    status = hsa_ext_tools_create_counter(info->blkHndl, &info->cntrHndl);
+    assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr in Perf Blk");
+
+    // Update the Event Index property of counter
+    // (the counter's event id is passed by address as a uint32_t)
+    uint32_t cntrProp = HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX;
+    status = hsa_ext_tools_set_counter_parameter(info->cntrHndl, cntrProp, sizeof(uint32_t),
+                                                 (void*)&info->cntrId);
+    assert((status == HSA_STATUS_SUCCESS) && "Error in updating Perf Cntr Property Event Index");
+
+    // Enable the updated perf counter
+    status = hsa_ext_tools_set_counter_enabled(info->cntrHndl, true);
+    assert((status == HSA_STATUS_SUCCESS) && "Error in enabing Perf Cntr");
+  }
+
+  return status;
+}
+
+// Installs PreDispatchCallback / PostDispatchCallback on 'queue' and passes
+// the address of this object's perf manager handle as the user argument of
+// both callbacks. Failures are only detected through assert().
+void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) {
+  hsa_status_t hookStatus =
+      hsa_ext_tools_set_callback_functions(queue, PreDispatchCallback, PostDispatchCallback);
+  assert((hookStatus == HSA_STATUS_SUCCESS) && "Error in registering Pre & Post Dispatch Callbacks");
+
+  hookStatus = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_);
+  assert((hookStatus == HSA_STATUS_SUCCESS) &&
+         "Error in registering Pre & Post Dispatch Callback Params");
+}
+
+// Blocks until perf counter collection completes, passing a timeout argument
+// of 5000 to the tools API. Returns the status reported by the API (also
+// checked with assert()).
+hsa_status_t RocrPerfCntrApp::Wait() {
+  const hsa_status_t waitStatus = hsa_ext_tools_pmu_wait_for_completion(perfMgr_, 5000);
+  assert((waitStatus == HSA_STATUS_SUCCESS) && "Error in Waiting for Perf Cntr Completion");
+  return waitStatus;
+}
+
+// Validate perf counter values
+//
+// Reads back the result of every counter in the list into its descriptor and
+// prints it. All counters are visited even if one of them fails.
+// Returns HSA_STATUS_SUCCESS when every result could be read, otherwise the
+// status of the first counter that failed (a later success no longer hides an
+// earlier failure).
+hsa_status_t RocrPerfCntrApp::Validate() {
+  hsa_status_t firstError = HSA_STATUS_SUCCESS;
+
+  const uint32_t size = GetNumPerfCntrs();
+  for (uint32_t idx = 0; idx < size; idx++) {
+    CntrInfo* info = GetPerfCntr(idx);
+
+    const hsa_status_t status =
+        hsa_ext_tools_get_counter_result(info->cntrHndl, &info->cntrResult);
+    if (status != HSA_STATUS_SUCCESS) {
+      // Remember the first failure only, and do not print a value that was
+      // never read.
+      if (firstError == HSA_STATUS_SUCCESS) {
+        firstError = status;
+      }
+      std::cerr << "Error in reading result of Perf Cntr at index " << idx << std::endl;
+      continue;
+    }
+
+    std::cout << "Value of Perf Cntr is: " << info->cntrResult << std::endl;
+  }
+
+  return firstError;
+}
@@ -0,0 +1,110 @@
+#ifndef ROCR_PERF_CNTR_APP_H_
+#define ROCR_PERF_CNTR_APP_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "hsa.h"
+#include "hsa_ext_profiler.h"
+
+/// How a collected counter value is to be checked against the expected
+/// result of a counter.
+typedef enum CntrValCnfType {
+
+  /// no counter value validation should be performed
+  CntrValCnf_None,
+
+  /// counter value should be an exact match to expectedResult
+  CntrValCnf_Exact,
+
+  /// counter value should be greater than expectedResult
+  CntrValCnf_GreaterThan,
+
+  /// counter value should be less than expectedResult
+  CntrValCnf_LessThan
+
+} CntrValCnfType;
+
+/// Struct used to encapsulate Counter Info
+typedef struct CntrInfo {
+  /// Id of counter in hardware block
+  uint32_t cntrId;
+
+  /// Name of counter; always NUL-terminated, truncated to fit if necessary
+  char cntrName[72];
+
+  /// Handle of perf counter
+  hsa_ext_tools_counter_t cntrHndl;
+
+  /// Id of hardware block containing the counter
+  uint32_t blkId;
+
+  /// Handle of counter block
+  hsa_ext_tools_counter_block_t blkHndl;
+
+  /// Expected value of perf counter
+  uint64_t expectedResult;
+
+  /// Value of perf counter as collected
+  uint64_t cntrResult;
+
+  /// Type of validation upon completion of dispatch
+  CntrValCnfType cnfType;
+
+  /// Initializes every field from the corresponding argument.
+  /// @param cntrName counter name; copied into the fixed-size cntrName
+  ///        buffer. A NULL name yields an empty string and an over-long name
+  ///        is truncated. Taken as const char* so string literals can be
+  ///        passed.
+  CntrInfo(uint32_t cntrId, const char* cntrName, void* cntrHndl, uint32_t blkId, void* blkHndl,
+           uint64_t expResult, uint64_t result, CntrValCnfType cnfType) {
+    this->cntrId = cntrId;
+    this->cntrHndl = cntrHndl;
+    this->blkId = blkId;
+    this->blkHndl = blkHndl;
+    this->expectedResult = expResult;
+    this->cntrResult = result;
+    this->cnfType = cnfType;
+
+    // Zero the whole buffer first and copy at most capacity - 1 characters,
+    // so the stored name is terminated and never overruns the array.
+    const size_t capacity = sizeof(this->cntrName);
+    memset(this->cntrName, 0, capacity);
+    if (cntrName != NULL) {
+      strncpy(this->cntrName, cntrName, capacity - 1);
+    }
+  }
+
+} CntrInfo;
+
+// Small helper that programs a fixed set of perf counters through the HSA
+// tools perf manager (PMU) and reads their values back after a dispatch.
+class RocrPerfCntrApp {
+ public:
+  // Constructor of the class. Only resets the perf manager handle; the list
+  // of perf counters is built later by Init()
+  RocrPerfCntrApp();
+
+  // Destructor of the class
+  ~RocrPerfCntrApp();
+
+  // Return the number of perf counters
+  uint32_t GetNumPerfCntrs();
+
+  // Return the descriptor of the perf counter at the specified index
+  // (the index is not range checked)
+  CntrInfo* GetPerfCntr(uint32_t idx);
+
+  // Print the list of perf counters
+  bool PrintCntrs();
+
+  // Initialize the list of perf counters
+  hsa_status_t Init(hsa_agent_t agent);
+
+  // Register Pre and Post dispatch callbacks
+  void RegisterCallbacks(hsa_queue_t* queue);
+
+  // Wait for perf counter collection to complete
+  hsa_status_t Wait();
+
+  // Validate perf counter values
+  hsa_status_t Validate();
+
+ private:
+  // List of perf counter descriptors, populated by Init()
+  std::vector<CntrInfo*> cntrList_;
+
+  // Handle of Perf Cntr Manager
+  hsa_ext_tools_pmu_t perfMgr_;
+};
+
+#endif  //  ROCR_PERF_CNTR_APP_H_
@@ -0,0 +1,476 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <cassert>
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "hsa.h"
+#include "hsa_rsrc_factory.hpp"
+#include "hsa_ext_finalize.h"
+#include "hsa_ext_profiler.h"
+
+#include "common.hpp"
+
+using namespace std;
+
+// Provide access to command line arguments passed in by user.
+// HsaRsrcFactory::ProcessCmdline() reads both values and is called from the
+// HsaRsrcFactory constructor, so they must be set before an instance is built.
+uint32_t hsa_cmdline_arg_cnt;
+char** hsa_cmdline_arg_list;
+
+// Callback for hsa_agent_iterate_regions: records the coarse grained and
+// kernarg regions of an agent.
+//
+// @param region Region currently being visited
+//
+// @param data Pointer to the AgentInfo instance to update
+//
+// @return hsa_status_t Always HSA_STATUS_SUCCESS so that iteration continues;
+// regions that are not in the global segment, or whose attributes cannot be
+// queried, are skipped.
+//
+static hsa_status_t find_memregions(hsa_region_t region, void* data) {
+  hsa_status_t status;
+
+  // Only regions of the global segment are of interest
+  hsa_region_segment_t segment_id;
+  status = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
+  if (status != HSA_STATUS_SUCCESS || segment_id != HSA_REGION_SEGMENT_GLOBAL) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Do not inspect the flags unless the query filled them in
+  hsa_region_global_flag_t flags;
+  status = hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+  if (status != HSA_STATUS_SUCCESS) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  AgentInfo* agent_info = (AgentInfo*)data;
+  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
+    agent_info->coarse_region = region;
+  }
+
+  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
+    agent_info->kernarg_region = region;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Callback for hsa_iterate_agents: builds an AgentInfo for every Cpu and Gpu
+// agent and registers it with the HsaRsrcFactory passed in through `data`.
+// Dsp agents are ignored; any agent that is neither Cpu nor Dsp is treated
+// as a Gpu agent.
+//
+// @param agent Agent currently being visited
+//
+// @param data Pointer to the HsaRsrcFactory collecting the agents
+//
+// @return hsa_status_t HSA_STATUS_SUCCESS to continue iteration, or an error
+// status if the device type cannot be queried or memory cannot be allocated
+//
+static hsa_status_t get_hsa_agents(hsa_agent_t agent, void* data) {
+  HsaRsrcFactory* rsrcFactory = reinterpret_cast<HsaRsrcFactory*>(data);
+
+  // Determine the type of device; do not use `type` unless the query worked
+  hsa_device_type_t type;
+  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+  if (status != HSA_STATUS_SUCCESS) {
+    return status;
+  }
+
+  if (type == HSA_DEVICE_TYPE_DSP) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // calloc() leaves every field that is not set below at zero, so no field
+  // of a registered AgentInfo is ever indeterminate
+  AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(calloc(1, sizeof(AgentInfo)));
+  if (agent_info == NULL) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+  agent_info->dev_id = agent;
+
+  if (type == HSA_DEVICE_TYPE_CPU) {
+    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
+    rsrcFactory->AddAgentInfo(agent_info, false);
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Device is a Gpu agent, fill in the remaining attributes. A failed query
+  // leaves the corresponding default in place.
+  agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
+  hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
+  agent_info->max_wave_size = 0;
+  hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
+  agent_info->max_queue_size = 0;
+  hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
+  // 108 is a placeholder value that is overwritten when the query succeeds
+  agent_info->profile = hsa_profile_t(108);
+  hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
+
+  // A zero handle marks a memory region that has not been found
+  agent_info->kernarg_region.handle = 0;
+  agent_info->coarse_region.handle = 0;
+
+  // Find and Bind Memory regions of the Gpu agent
+  hsa_agent_iterate_regions(agent, find_memregions, agent_info);
+
+  // Save the instance of AgentInfo
+  rsrcFactory->AddAgentInfo(agent_info, true);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Definitions for Static Data members of the class
+char* HsaRsrcFactory::brig_path_ = NULL;
+uint32_t HsaRsrcFactory::num_cus_ = 4;
+// No explicit initializer: as objects of static storage duration these start
+// at zero until ProcessCmdline() assigns a value
+uint32_t HsaRsrcFactory::num_waves_;
+uint32_t HsaRsrcFactory::num_workitems_;
+uint32_t HsaRsrcFactory::kernel_loop_count_;
+bool HsaRsrcFactory::print_debug_info_ = false;
+
+// Key strings that GetArgIndex() compares against command line arguments.
+// NOTE(review): these bind string literals to non-const char*; making them
+// const char* would require changing the declarations in the header as well.
+char* HsaRsrcFactory::num_cus_key_ = "num_cus";
+char* HsaRsrcFactory::brig_path_key_ = "brig_path";
+char* HsaRsrcFactory::num_waves_key_ = "waves_per_cu";
+char* HsaRsrcFactory::num_workitems_key_ = "workitems_per_wave";
+char* HsaRsrcFactory::print_debug_key_ = "print_debug";
+char* HsaRsrcFactory::kernel_loop_count_key_ = "kernel_loop_count";
+
+// Constructor of the class. Initializes the Hsa Runtime, collects the Cpu
+// and Gpu agents of the platform and processes the command line arguments.
+// The process exits (through the check macro) if the runtime cannot be
+// initialized or the agents cannot be enumerated.
+HsaRsrcFactory::HsaRsrcFactory()
+    // The counters have no other initializer; start them at zero so they
+    // never hold indeterminate values
+    : num_queues_(0), num_signals_(0), num_agents_(0) {
+  // Initialize the Hsa Runtime
+  hsa_status_t status = hsa_init();
+  check("Error in hsa_init", status);
+
+  // Discover the set of Cpu and Gpu devices available on the platform
+  status = hsa_iterate_agents(get_hsa_agents, this);
+  check("Error Calling hsa_iterate_agents", status);
+
+  // Process command line arguments
+  ProcessCmdline();
+}
+
+// Destructor of the class. Currently releases nothing.
+// NOTE(review): the AgentInfo instances held in gpu_list_ / cpu_list_ are
+// heap-allocated in get_hsa_agents() and are not freed here, and the runtime
+// started by hsa_init() in the constructor is not shut down here -- confirm
+// whether that is intentional.
+HsaRsrcFactory::~HsaRsrcFactory() {}
+
+// Reports how many Gpu agents were registered with this factory
+//
+// @return uint32_t Number of Gpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfGpuAgents() {
+  const size_t gpu_count = gpu_list_.size();
+  return static_cast<uint32_t>(gpu_count);
+}
+
+// Reports how many Cpu agents were registered with this factory
+//
+// @return uint32_t Number of Cpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfCpuAgents() {
+  const size_t cpu_count = cpu_list_.size();
+  return static_cast<uint32_t>(cpu_count);
+}
+
+// Look up the AgentInfo of the Gpu agent stored at a given index
+//
+// @param idx Index of the Gpu agent
+//
+// @param agent_info Output parameter, written only when idx is valid
+//
+// @return bool true if idx refers to a registered Gpu agent, false otherwise
+//
+bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
+  const uint32_t gpu_count = static_cast<uint32_t>(gpu_list_.size());
+  const bool in_range = (idx < gpu_count);
+
+  // The output is left untouched for an out of range request
+  if (in_range) {
+    *agent_info = gpu_list_[idx];
+  }
+  return in_range;
+}
+
+// Look up the AgentInfo of the Cpu agent stored at a given index
+//
+// @param idx Index of the Cpu agent
+//
+// @param agent_info Output parameter, written only when idx is valid
+//
+// @return bool true if idx refers to a registered Cpu agent, false otherwise
+//
+bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
+  const uint32_t cpu_count = static_cast<uint32_t>(cpu_list_.size());
+  const bool in_range = (idx < cpu_count);
+
+  // The output is left untouched for an out of range request
+  if (in_range) {
+    *agent_info = cpu_list_[idx];
+  }
+  return in_range;
+}
+
+// Create a multi-producer Queue object on an agent and return its handle.
+// No callback or callback data is registered, and UINT32_MAX is passed for
+// both the private and the group segment size.
+//
+// @param agent_info Agent on which to create the queue object
+//
+// @param num_pkts Number of packets to be held by queue
+//
+// @param queue Output parameter updated with handle of queue object
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) {
+  const hsa_status_t create_status =
+      hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
+                       UINT32_MAX, UINT32_MAX, queue);
+  return HSA_STATUS_SUCCESS == create_status;
+}
+
+// Create a Signal object with no consumer list and return its handle.
+//
+// @param value Initial value of signal object
+//
+// @param signal Output parameter updated with handle of signal object
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
+  const hsa_status_t create_status = hsa_signal_create(value, 0, NULL, signal);
+  return HSA_STATUS_SUCCESS == create_status;
+}
+
+// Allocate memory for use by a kernel. The buffer comes from the agent's
+// coarse grained region when one was found (and is then assigned to the
+// agent with read/write access); otherwise it comes from the kernarg region.
+//
+// @param agent_info Agent from whose memory region to allocate
+//
+// @param size Size of memory in terms of bytes
+//
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+//
+uint8_t* HsaRsrcFactory::AllocateLocalMemory(AgentInfo* agent_info, size_t size) {
+  hsa_status_t status;
+  void* buffer = NULL;
+
+  // Allocate in system memory if local memory is not available
+  if (agent_info->coarse_region.handle == 0) {
+    status = hsa_memory_allocate(agent_info->kernarg_region, size, &buffer);
+    return (status == HSA_STATUS_SUCCESS) ? static_cast<uint8_t*>(buffer) : NULL;
+  }
+
+  // Allocate in local memory
+  status = hsa_memory_allocate(agent_info->coarse_region, size, &buffer);
+  if (status != HSA_STATUS_SUCCESS) {
+    return NULL;
+  }
+
+  status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW);
+  if (status != HSA_STATUS_SUCCESS) {
+    // The caller only sees null on failure, so the buffer has to be released
+    // here or it would be leaked
+    hsa_memory_free(buffer);
+    return NULL;
+  }
+
+  return static_cast<uint8_t*>(buffer);
+}
+
+// Allocate memory to pass kernel parameters, taken from the kernarg region
+// of the agent.
+//
+// @param agent_info Agent from whose memory region to allocate
+//
+// @param size Size of memory in terms of bytes
+//
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+//
+uint8_t* HsaRsrcFactory::AllocateSysMemory(AgentInfo* agent_info, size_t size) {
+  uint8_t* sys_buffer = NULL;
+  const hsa_status_t alloc_status =
+      hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&sys_buffer);
+  if (alloc_status != HSA_STATUS_SUCCESS) {
+    return NULL;
+  }
+  return sys_buffer;
+}
+
+// Copy a block of memory with hsa_memory_copy.
+//
+// @param dest_buff Destination of the copy
+//
+// @param src_buff Source of the copy
+//
+// @param length Number of bytes to copy
+//
+// @param host_to_dev Not used by this implementation
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length,
+                                  bool host_to_dev) {
+  (void)host_to_dev;
+  const hsa_status_t copy_status = hsa_memory_copy(dest_buff, src_buff, length);
+  return HSA_STATUS_SUCCESS == copy_status;
+}
+
+// Fake method for compilation steps only. Allocates from the kernarg region
+// of the agent.
+//
+// @param agent_info Agent from whose memory region to allocate
+//
+// @param size Size of memory in terms of bytes
+//
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+//
+uint8_t* HsaRsrcFactory::AllocateMemory(AgentInfo* agent_info, size_t size) {
+  uint8_t* allocated = NULL;
+  const hsa_status_t alloc_status =
+      hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&allocated);
+  if (alloc_status != HSA_STATUS_SUCCESS) {
+    return NULL;
+  }
+  return allocated;
+}
+
+// Loads a serialized code object from file, loads it into a new executable
+// for the given agent, freezes the executable and looks up a kernel symbol.
+//
+// @param agent_info Gpu device for which to load the code object
+//
+// @param brig_path File path of the code object file
+//
+// @param kernel_name Name of the kernel to look up
+//
+// @param code_desc Output parameter updated with the handle of the kernel
+// symbol that could be used to submit for execution
+//
+// @return bool true if successful, false if the file cannot be read or
+// de-serialized. Failures while creating, loading or freezing the
+// executable, or while looking up the symbol, terminate the process through
+// the check macro.
+//
+bool HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path,
+                                     char* kernel_name, hsa_executable_symbol_t* code_desc) {
+  hsa_status_t status;
+  hsa_code_object_t code_object;
+
+  // Constructing a std::string from a null pointer is undefined
+  if (brig_path == NULL) {
+    std::cout << "Error: no code object file path given" << std::endl;
+    assert(false);
+    return false;
+  }
+
+  // Build the code object filename
+  std::string filename(brig_path);
+  std::cout << "Code object filename: " << filename << std::endl;
+
+  // Open the file containing code object, positioned at its end
+  std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
+  if (!codeStream) {
+    std::cout << "Error: failed to load " << filename << std::endl;
+    assert(false);
+    return false;
+  }
+
+  // tellg() reports -1 on failure; converting that to size_t unchecked would
+  // request an enormous allocation
+  const std::streamoff fileSize = codeStream.tellg();
+  if (fileSize <= 0) {
+    std::cout << "Error: failed to determine size of " << filename << std::endl;
+    assert(false);
+    return false;
+  }
+
+  // Allocate memory to read in code object from file
+  size_t size = static_cast<size_t>(fileSize);
+  char* codeBuff = (char*)AllocateSysMemory(agent_info, size);
+  if (!codeBuff) {
+    std::cout << "Error: failed to allocate memory for code object." << std::endl;
+    assert(false);
+    return false;
+  }
+
+  // Read exactly `size` bytes so the buffer cannot be overrun, and verify
+  // that the whole file was read
+  codeStream.seekg(0, std::ios::beg);
+  codeStream.read(codeBuff, static_cast<std::streamsize>(size));
+  if (!codeStream || static_cast<size_t>(codeStream.gcount()) != size) {
+    std::cout << "Error: failed to read " << filename << std::endl;
+    hsa_memory_free(codeBuff);
+    assert(false);
+    return false;
+  }
+
+  // De-Serialize the code object that has been read into memory
+  status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object);
+  if (status != HSA_STATUS_SUCCESS) {
+    std::cout << "Failed to deserialize code object" << std::endl;
+    hsa_memory_free(codeBuff);
+    return false;
+  }
+  // NOTE(review): codeBuff is deliberately not released on the success path;
+  // confirm whether the runtime keeps referring to it after de-serialization.
+
+  // Create executable. The profile is fixed to HSA_PROFILE_FULL rather than
+  // taken from agent_info->profile.
+  hsa_executable_t hsaExecutable;
+  status =
+      hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable);
+  check("Error in creating executable object", status);
+
+  // Load code object.
+  status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, "");
+  check("Error in loading executable object", status);
+
+  // Freeze executable.
+  status = hsa_executable_freeze(hsaExecutable, "");
+  check("Error in freezing executable object", status);
+
+  // Get symbol handle.
+  hsa_executable_symbol_t kernelSymbol;
+  status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0,
+                                     &kernelSymbol);
+  check("Error in looking up kernel symbol", status);
+
+  // Update output parameter
+  *code_desc = kernelSymbol;
+  return true;
+}
+
+// Register an AgentInfo instance with the factory
+//
+// @param agent_info Instance to store
+//
+// @param gpu true to append to the Gpu list, false to append to the Cpu list
+//
+void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) {
+  std::vector<AgentInfo*>& target_list = gpu ? gpu_list_ : cpu_list_;
+  target_list.push_back(agent_info);
+}
+
+// Write a summary of every registered Gpu agent to standard output
+//
+// @param header Text printed before the list of agents
+//
+// @return bool Always true
+//
+bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
+  std::cout << header << " :" << std::endl;
+
+  const int gpu_count = uint32_t(gpu_list_.size());
+  int idx = 0;
+  while (idx < gpu_count) {
+    const AgentInfo* gpu = gpu_list_[idx];
+
+    std::cout << "> agent[" << idx << "] :" << std::endl;
+    std::cout << ">> Name : " << gpu->name << std::endl;
+    std::cout << ">> Max Wave Size : " << gpu->max_wave_size << std::endl;
+    std::cout << ">> Max Queue Size : " << gpu->max_queue_size << std::endl;
+    // NOTE(review): the label says Kernarg but the handle printed is that of
+    // the coarse grained region -- confirm which one is intended
+    std::cout << ">> Kernarg Region Id : " << gpu->coarse_region.handle << std::endl;
+
+    ++idx;
+  }
+  return true;
+}
+
+// Returns the brig file path captured from the command line, null if none
+// was given. Value is available only after an instance has been built.
+char* HsaRsrcFactory::GetBrigPath() {
+  return brig_path_;
+}
+
+// Returns the configured number of compute units (4 unless overridden on
+// the command line). Value is available only after an instance has been built.
+uint32_t HsaRsrcFactory::GetNumOfCUs() {
+  return num_cus_;
+}
+
+// Returns the configured maximum number of waves to launch per compute
+// unit. The actual number that can be launched is affected by resource
+// availability.
+//
+// Value is available only after an instance has been built.
+uint32_t HsaRsrcFactory::GetNumOfWavesPerCU() {
+  return num_waves_;
+}
+
+// Returns the configured number of work-items per wave.
+// Value is available only after an instance has been built.
+uint32_t HsaRsrcFactory::GetNumOfWorkItemsPerWave() {
+  return num_workitems_;
+}
+
+// Returns how many times the kernel loop body should execute.
+// Value is available only after an instance has been built.
+uint32_t HsaRsrcFactory::GetKernelLoopCount() {
+  return kernel_loop_count_;
+}
+
+// Returns 1 if debug info should be printed, 0 otherwise.
+// Value is available only after an instance has been built.
+uint32_t HsaRsrcFactory::GetPrintDebugInfo() {
+  return print_debug_info_ ? 1 : 0;
+}
+
+// Process command line arguments. The method will capture
+// various user command line parameters for tests to use.
+//
+// Arguments are consumed in pairs starting at index 1: a key (see
+// GetArgIndex) followed by its value. Unknown keys are ignored. A key that
+// needs a value but is the last argument is ignored as well; the print_debug
+// key takes effect whether or not a value follows it.
+void HsaRsrcFactory::ProcessCmdline() {
+  if (hsa_cmdline_arg_list == NULL) {
+    return;
+  }
+
+  const uint32_t arg_cnt = hsa_cmdline_arg_cnt;
+  for (uint32_t idx = 1; idx < arg_cnt; idx += 2) {
+    const uint32_t arg_idx = GetArgIndex((char*)hsa_cmdline_arg_list[idx]);
+
+    // Never read past the end of the argument list: the value is null when
+    // the key is the final argument
+    char* value = (idx + 1 < arg_cnt) ? hsa_cmdline_arg_list[idx + 1] : NULL;
+
+    switch (arg_idx) {
+      case 0:
+        if (value != NULL) HsaRsrcFactory::brig_path_ = value;
+        break;
+      case 1:
+        if (value != NULL) HsaRsrcFactory::num_cus_ = atoi(value);
+        break;
+      case 2:
+        if (value != NULL) HsaRsrcFactory::num_waves_ = atoi(value);
+        break;
+      case 3:
+        if (value != NULL) HsaRsrcFactory::num_workitems_ = atoi(value);
+        break;
+      case 4:
+        if (value != NULL) HsaRsrcFactory::kernel_loop_count_ = atoi(value);
+        break;
+      case 5:
+        HsaRsrcFactory::print_debug_info_ = true;
+        break;
+      default:
+        break;
+    }
+  }
+}
+
+// Maps a command line key to the index used by ProcessCmdline
+//
+// @param arg_value Key given on the command line
+//
+// @return uint32_t 0 for the brig file path, 1 for the number of compute
+// units, 2 for waves per compute unit, 3 for workitems per wave, 4 for the
+// kernel loop count, 5 for print debug info, 108 for an unknown key
+//
+uint32_t HsaRsrcFactory::GetArgIndex(char* arg_value) {
+  // Position in this table is the index reported for the key
+  const char* const known_keys[] = {
+      HsaRsrcFactory::brig_path_key_,         HsaRsrcFactory::num_cus_key_,
+      HsaRsrcFactory::num_waves_key_,         HsaRsrcFactory::num_workitems_key_,
+      HsaRsrcFactory::kernel_loop_count_key_, HsaRsrcFactory::print_debug_key_,
+  };
+  const uint32_t key_count = sizeof(known_keys) / sizeof(known_keys[0]);
+
+  for (uint32_t pos = 0; pos < key_count; ++pos) {
+    if (strcmp(known_keys[pos], arg_value) == 0) {
+      return pos;
+    }
+  }
+
+  return 108;
+}
+
+// Prints the help banner listing every key understood by ProcessCmdline
+void HsaRsrcFactory::PrintHelpMsg() {
+  std::cout << "Key for passing Brig filepath: " << HsaRsrcFactory::brig_path_key_ << std::endl;
+  std::cout << "Key for passing Number of Compute Units: " << HsaRsrcFactory::num_cus_key_
+            << std::endl;
+  std::cout << "Key for passing Number of Waves per CU: " << HsaRsrcFactory::num_waves_key_
+            << std::endl;
+  std::cout << "Key for passing Number of Workitems per Wave: "
+            << HsaRsrcFactory::num_workitems_key_ << std::endl;
+  std::cout << "Key for passing Kernel Loop Count: " << HsaRsrcFactory::kernel_loop_count_key_
+            << std::endl;
+  // This key is handled by ProcessCmdline as well, so it belongs in the banner
+  std::cout << "Key for enabling print of Debug Info: " << HsaRsrcFactory::print_debug_key_
+            << std::endl;
+}
@@ -0,0 +1,262 @@
+#ifndef HSA_RSRC_FACTORY_H_
+#define HSA_RSRC_FACTORY_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "hsatimer.h"
+#include "hsa.h"
+#include "hsa_ext_finalize.h"
+
+#define HSA_ARGUMENT_ALIGN_BYTES 16
+#define HSA_QUEUE_ALIGN_BYTES 64
+#define HSA_PACKET_ALIGN_BYTES 64
+
+#define check(msg, status)                                                                         \
+  if (status != HSA_STATUS_SUCCESS) {                                                              \
+    const char* emsg = 0;                                                                          \
+    hsa_status_string(status, &emsg);                                                              \
+    printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>");                                      \
+    exit(1);                                                                                       \
+  }
+
+#define check_build(msg, status)                                                                   \
+  if (status != STATUS_SUCCESS) {                                                                  \
+    printf("%s\n", msg);                                                                           \
+    exit(1);                                                                                       \
+  }
+
+// Provide access to command line arguments passed in by user
+extern uint32_t hsa_cmdline_arg_cnt;
+extern char** hsa_cmdline_arg_list;
+
+// Encapsulates information about a Hsa Agent such as its
+// handle, name, max queue size, max wavefront size, etc.
+// Instances are filled in by get_hsa_agents() in hsa_rsrc_factory.cpp; for
+// Cpu agents only dev_id and dev_type are set there.
+typedef struct {
+  // Handle of Agent
+  hsa_agent_t dev_id;
+
+  // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
+  uint32_t dev_type;
+
+  // Name of Agent whose length is less than 64
+  char name[64];
+
+  // Wavefront size reported by the agent
+  uint32_t max_wave_size;
+
+  // Max size of Queue buffer
+  uint32_t max_queue_size;
+
+  // Hsail profile supported by agent
+  hsa_profile_t profile;
+
+  // Global memory region flagged as coarse grained; a zero handle means no
+  // such region was found
+  hsa_region_t coarse_region;
+
+  // Global memory region flagged for kernel arguments; a zero handle means
+  // no such region was found
+  hsa_region_t kernarg_region;
+
+} AgentInfo;
+
+// Helper used by the tests to initialize the Hsa Runtime, enumerate the Cpu
+// and Gpu agents of the platform and create queues, signals, memory buffers
+// and kernel symbols. It also captures test parameters from the command line.
+class HsaRsrcFactory {
+ public:
+  // Constructor of the class. Will initialize the Hsa Runtime and
+  // query the system topology to get the list of Cpu and Gpu devices
+  HsaRsrcFactory();
+
+  // Destructor of the class
+  ~HsaRsrcFactory();
+
+  // Get the count of Hsa Gpu Agents available on the platform
+  //
+  // @return uint32_t Number of Gpu agents on platform
+  //
+  uint32_t GetCountOfGpuAgents();
+
+  // Get the count of Hsa Cpu Agents available on the platform
+  //
+  // @return uint32_t Number of Cpu agents on platform
+  //
+  uint32_t GetCountOfCpuAgents();
+
+  // Get the AgentInfo handle of a Gpu device
+  //
+  // @param idx Gpu Agent at specified index
+  //
+  // @param agent_info Output parameter updated with AgentInfo
+  //
+  // @return bool true if successful, false otherwise
+  //
+  bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
+
+  // Get the AgentInfo handle of a Cpu device
+  //
+  // @param idx Cpu Agent at specified index
+  //
+  // @param agent_info Output parameter updated with AgentInfo
+  //
+  // @return bool true if successful, false otherwise
+  //
+  bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
+
+  // Create a Queue object and return its handle. The queue object is expected
+  // to support user requested number of Aql dispatch packets.
+  //
+  // @param agent_info Gpu Agent on which to create a queue object
+  //
+  // @param num_pkts Number of packets to be held by queue
+  //
+  // @param queue Output parameter updated with handle of queue object
+  //
+  // @return bool true if successful, false otherwise
+  //
+  bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
+
+  // Create a Signal object and return its handle.
+  //
+  // @param value Initial value of signal object
+  //
+  // @param signal Output parameter updated with handle of signal object
+  //
+  // @return bool true if successful, false otherwise
+  //
+  bool CreateSignal(uint32_t value, hsa_signal_t* signal);
+
+  // Allocate memory for use by a kernel of specified size. The
+  // implementation uses the agent's coarse grained region when one is
+  // available and the kernarg region otherwise.
+  //
+  // @param agent_info Agent from whose memory region to allocate
+  //
+  // @param size Size of memory in terms of bytes
+  //
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  //
+  uint8_t* AllocateLocalMemory(AgentInfo* agent_info, size_t size);
+
+  // Allocate memory from the agent's kernarg region. The implementation is
+  // marked as a placeholder ("fake method for compilation steps only").
+  //
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  //
+  uint8_t* AllocateMemory(AgentInfo* agent_info, size_t size);
+
+  // Copy length bytes from src_buff to dest_buff. The implementation does
+  // not use host_to_dev.
+  //
+  // @return bool true if successful, false otherwise
+  //
+  bool TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length, bool host_to_dev);
+
+  // Allocate memory to pass kernel parameters.
+  //
+  // @param agent_info Agent from whose memory region to allocate
+  //
+  // @param size Size of memory in terms of bytes
+  //
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  //
+  uint8_t* AllocateSysMemory(AgentInfo* agent_info, size_t size);
+
+  // Loads a code object file for an agent and looks up a kernel symbol in it
+  //
+  // @param agent_info Gpu device for which to load the code object
+  //
+  // @param brig_path File path of the code object file
+  //
+  // @param kernel_name Name of the kernel to look up
+  //
+  // @param code_desc Handle of the kernel symbol that could
+  // be used to submit for execution
+  //
+  // @return bool true if successful, false otherwise
+  //
+  bool LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, char* kernel_name,
+                       hsa_executable_symbol_t* code_desc);
+
+  // Add an instance of AgentInfo to the Gpu list (gpu is true) or to the
+  // Cpu list (gpu is false)
+  void AddAgentInfo(AgentInfo* agent_info, bool gpu);
+
+  // Returns the file path where the brig file is located
+  static char* GetBrigPath();
+
+  // Returns the number of compute units present on platform
+  static uint32_t GetNumOfCUs();
+
+  // Returns the maximum number of waves that can be launched
+  // per compute unit. The actual number that can be launched
+  // is affected by resource availability
+  static uint32_t GetNumOfWavesPerCU();
+
+  // Returns the number of work-items that can execute per wave
+  static uint32_t GetNumOfWorkItemsPerWave();
+
+  // Returns the number of times kernel loop body should execute.
+  static uint32_t GetKernelLoopCount();
+
+  // Returns the flag indicating if debug info should be printed, converted
+  // to uint32_t
+  static uint32_t GetPrintDebugInfo();
+
+  // Print the various fields of Hsa Gpu Agents
+  bool PrintGpuAgents(const std::string& header);
+
+ private:
+  // Number of queues to create
+  uint32_t num_queues_;
+
+  // Used to maintain a list of Hsa Queue handles
+  std::vector<hsa_queue_t*> queue_list_;
+
+  // Number of Signals to create
+  uint32_t num_signals_;
+
+  // Used to maintain a list of Hsa Signal handles
+  std::vector<hsa_signal_t*> signal_list_;
+
+  // Number of agents reported by platform
+  uint32_t num_agents_;
+
+  // Used to maintain a list of Hsa Gpu Agent Info
+  std::vector<AgentInfo*> gpu_list_;
+
+  // Used to maintain a list of Hsa Cpu Agent Info
+  std::vector<AgentInfo*> cpu_list_;
+
+  // Records the file path where Brig file is located, and the command line
+  // key used to pass it.
+  // Value is available only after an instance has been built.
+  static char* brig_path_;
+  static char* brig_path_key_;
+
+  // Records the number of Compute units present on system, and the command
+  // line key used to pass it.
+  // Value is available only after an instance has been built.
+  static uint32_t num_cus_;
+  static char* num_cus_key_;
+
+  // Records the number of waves that can be launched per Compute unit, and
+  // the command line key used to pass it.
+  // Value is available only after an instance has been built.
+  static uint32_t num_waves_;
+  static char* num_waves_key_;
+
+  // Records the number of work-items that can be packed into a wave, and
+  // the command line key used to pass it.
+  // Value is available only after an instance has been built.
+  static uint32_t num_workitems_;
+  static char* num_workitems_key_;
+
+  // Records the number of times kernel loop body should run, and the command
+  // line key used to pass it. Value
+  // is available only after an instance has been built.
+  static uint32_t kernel_loop_count_;
+  static char* kernel_loop_count_key_;
+
+  // Records whether debug info should be printed, and the command line key
+  // used to enable it. Value
+  // is available only after an instance has been built.
+  static bool print_debug_info_;
+  static char* print_debug_key_;
+
+  // Process command line arguments. The method will capture
+  // various user command line parameters for tests to use
+  static void ProcessCmdline();
+
+  // Prints the help banner on user arg keys
+  static void PrintHelpMsg();
+
+  // Maps an index for the user argument
+  static uint32_t GetArgIndex(char* arg_value);
+};
+
+#endif  //  HSA_RSRC_FACTORY_H_
@@ -0,0 +1,168 @@
+#include "hsatimer.h"
+
+// Measures the TSC frequency once and keeps it for later conversions
+PerfTimer::PerfTimer() : freq_in_100mhz(MeasureTSCFreqHz()) {}
+
+// Destroys every timer created through CreateTimer, newest first
+PerfTimer::~PerfTimer() {
+  for (std::vector<Timer*>::reverse_iterator it = _timers.rbegin(); it != _timers.rend(); ++it) {
+    delete *it;
+  }
+  _timers.clear();
+}
+
+// Creates a new timer instance and returns its index, which serves as the
+// handle for StartTimer, StopTimer and ReadTimer
+int PerfTimer::CreateTimer() {
+  Timer* created = new Timer;
+  created->_clocks = 0;
+  created->_start = 0;
+
+#ifdef _WIN32
+  QueryPerformanceFrequency((LARGE_INTEGER*)&created->_freq);
+#else
+  created->_freq = (long long)1.0E3;
+#endif
+
+  /* The new instance goes to the end, so its index is the current size */
+  const int handle = (int)_timers.size();
+  _timers.push_back(created);
+  return handle;
+}
+
+// Records the current time as the start point of a timer.
+//
+// @param index Handle returned by CreateTimer
+//
+// @return int HSA_SUCCESS, or HSA_FAILURE if index is not a valid handle
+//
+int PerfTimer::StartTimer(int index) {
+  // Negative handles must be rejected too, they would index before the
+  // start of the list
+  if (index < 0 || index >= (int)_timers.size()) {
+    Error("Cannot start timer. Invalid handle.");
+    return HSA_FAILURE;
+  }
+
+#ifdef _WIN32
+// General Windows timing method
+#ifndef _AMD
+  long long tmpStart;
+  QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart));
+  _timers[index]->_start = (double)tmpStart;
+#else
+// AMD Windows timing method: nothing is recorded in this configuration
+
+#endif
+
+#else
+// General Linux timing method: wall clock time in milliseconds
+#ifndef _AMD
+  struct timeval s;
+  gettimeofday(&s, 0);
+  _timers[index]->_start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3));
+#else
+
+  // AMD timing method: raw TSC ticks
+
+  unsigned int unused;
+  _timers[index]->_start = __rdtscp(&unused);
+
+#endif
+
+#endif
+
+  return HSA_SUCCESS;
+}
+
+
+// Stops a timer and adds the time elapsed since the matching StartTimer call
+// to the total accumulated by the timer.
+//
+// @param index Handle returned by CreateTimer
+//
+// @return int HSA_SUCCESS, or HSA_FAILURE if index is not a valid handle
+//
+int PerfTimer::StopTimer(int index) {
+  double n = 0;
+  // Negative handles must be rejected too, they would index before the
+  // start of the list
+  if (index < 0 || index >= (int)_timers.size()) {
+    Error("Cannot stop timer. Invalid handle.");
+    return HSA_FAILURE;
+  }
+#ifdef _WIN32
+#ifndef _AMD
+  long long n1;
+  QueryPerformanceCounter((LARGE_INTEGER*)&(n1));
+  n = (double)n1;
+#else
+
+// AMD Window Timing: nothing is sampled in this configuration
+
+#endif
+
+#else
+// General Linux timing method: wall clock time in milliseconds
+#ifndef _AMD
+  struct timeval s;
+  gettimeofday(&s, 0);
+  n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3);
+#else
+  // AMD Linux timing: raw TSC ticks
+
+  unsigned int unused;
+  n = __rdtscp(&unused);
+#endif
+
+#endif
+
+  // Elapsed time since StartTimer; the start point is cleared afterwards
+  n -= _timers[index]->_start;
+  _timers[index]->_start = 0;
+
+#ifndef _AMD
+  _timers[index]->_clocks += n;
+#else
+  // Convert TSC ticks to ms using the frequency measured in the constructor
+  _timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz;
+  cout << "_AMD is enabled!!!" << endl;
+#endif
+
+  return HSA_SUCCESS;
+}
+
+void PerfTimer::Error(string str) { cout << str << endl; }
+
+
+// Reads the total accumulated by a timer, divided by the timer's frequency.
+//
+// @param index Handle returned by CreateTimer
+//
+// @return double Accumulated clocks divided by the timer frequency. For an
+// invalid handle HSA_FAILURE (1) is returned, which cannot be told apart
+// from a genuine reading of 1.
+//
+double PerfTimer::ReadTimer(int index) {
+  // Negative handles must be rejected too, they would index before the
+  // start of the list
+  if (index < 0 || index >= (int)_timers.size()) {
+    Error("Cannot read timer. Invalid handle.");
+    return HSA_FAILURE;
+  }
+
+  double reading = double(_timers[index]->_clocks);
+
+  reading = double(reading / _timers[index]->_freq);
+
+  return reading;
+}
+
+
+// Returns a timestamp in microseconds, taken from the performance counter
+// on Windows and from CLOCK_MONOTONIC_RAW elsewhere
+uint64_t PerfTimer::CoarseTimestampUs() {
+#ifdef _WIN32
+  uint64_t freqHz, ticks;
+  QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz);
+  QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
+
+  // Shrink both operands together until (ticks * 1000000) fits in uint64_t.
+  const uint64_t kTickLimit = 1ULL << 44;
+  while (ticks > kTickLimit) {
+    ticks /= 16;
+    freqHz /= 16;
+  }
+
+  const uint64_t scaledTicks = ticks * 1000000;
+  return scaledTicks / freqHz;
+#else
+  struct timespec now;
+  clock_gettime(CLOCK_MONOTONIC_RAW, &now);
+  const uint64_t wholeSecondsUs = uint64_t(now.tv_sec) * 1000000;
+  return wholeSecondsUs + now.tv_nsec / 1000;
+#endif
+}
+
+// Measures the TSC frequency by spinning for 1 gigacycle of TSC ticks and
+// timing the spin with CoarseTimestampUs.
+//
+// @return uint64_t TSC frequency in units of 100MHz (ticks per nanosecond
+// times ten, rounded to nearest), not in Hz despite the name
+//
+// NOTE(review): the final division assumes the coarse clock advanced during
+// the spin -- a zero interval would divide by zero.
+uint64_t PerfTimer::MeasureTSCFreqHz() {
+  const uint64_t kSpinTicks = 1000000000;
+  unsigned int unused;
+
+  const uint64_t coarseBeginUs = CoarseTimestampUs();
+  const uint64_t tscTicksBegin = __rdtscp(&unused);
+
+  // Sample the TSC at least once, then keep sampling until enough ticks
+  // have elapsed
+  uint64_t tscTicksEnd = __rdtscp(&unused);
+  while (tscTicksEnd - tscTicksBegin < kSpinTicks) {
+    tscTicksEnd = __rdtscp(&unused);
+  }
+
+  const uint64_t coarseEndUs = CoarseTimestampUs();
+
+  // Adding half the divisor before dividing rounds to nearest
+  const uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000;
+  const uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin;
+  const uint64_t roundedNumerator = tscIntervalTicks * 10 + (coarseIntervalNs / 2);
+  return roundedNumerator / coarseIntervalNs;
+}
@@ -0,0 +1,68 @@
+#ifndef __MYTIME__
+#define __MYTIME__
+
+// Will use AMD timer and general Linux timer based on users' need --> compilation flag
+// need to consider platform is Windows or Linux
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <iostream>
+#include <vector>
+#include <string>
+using namespace std;
+
+#if defined(_MSC_VER)
+#include <time.h>
+#include <windows.h>
+#include <intrin.h>
+#else
+#if defined(__GNUC__)
+#include <sys/time.h>
+#include <x86intrin.h>
+#endif  // __GNUC__
+#endif  //_MSC_VER
+
+#define HSA_FAILURE 1
+#define HSA_SUCCESS 0
+
+// Collection of timers addressed by integer handle. Each timer accumulates
+// the time between matching StartTimer / StopTimer calls; the time source
+// depends on the platform and on whether _AMD is defined (see hsatimer.cpp).
+class PerfTimer {
+ private:
+  struct Timer {
+    string name;     /* name of the timer object */
+    long long _freq; /* divisor applied to _clocks by ReadTimer */
+    double _clocks;  /* total accumulated by StopTimer */
+    double _start;   /* start point recorded by StartTimer */
+  };
+
+  std::vector<Timer*> _timers; /* timers created so far; a handle is an index into this list */
+  double freq_in_100mhz;       /* result of MeasureTSCFreqHz, set in the constructor */
+
+ public:
+  PerfTimer();
+  ~PerfTimer();
+
+ private:
+  // AMD timing method
+  // Timestamp in microseconds
+  uint64_t CoarseTimestampUs();
+  // TSC frequency; the implementation computes it in units of 100MHz
+  uint64_t MeasureTSCFreqHz();
+
+  // General Linux timing method
+
+ public:
+  // Creates a timer and returns its handle
+  int CreateTimer();
+  // Start / stop the timer with the given handle; both return HSA_SUCCESS
+  // or HSA_FAILURE
+  int StartTimer(int index);
+  int StopTimer(int index);
+
+ public:
+  // retrieve time
+  double ReadTimer(int index);
+  // write into a file
+  // NOTE(review): no definition of WriteTimer appears in hsatimer.cpp --
+  // confirm it is implemented before calling it
+  double WriteTimer(int index);
+
+ public:
+  // Prints the given message on standard output
+  void Error(string str);
+};
+
+#endif
@@ -0,0 +1,91 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#include <assert.h>
+#include "simple_convolution.h"
+#include "test_hsa.h"
+#include "test_pgen_pmc.h"
+#include "test_pgen_sqtt.h"
+
+// Test driver: builds the SimpleConvolution test object chain and runs it
+// through initialize -> setup -> run -> verify_results -> print_time -> cleanup.
+// The environment variables ROCR_ENABLE_PMC / ROCR_ENABLE_SQTT wrap the HSA test
+// into the PMC or SQTT packet generating test (PMC wins when both are set).
+// @return 0 when the results are verified, 1 on any failure
+int main(int argc, char* argv[]) {
+#if defined(NDEBUG)
+  clog.rdbuf(NULL);
+#endif
+
+  // Create SimpleConvolution test object and the HSA test running it
+  TestKernel* test_kernel = new SimpleConvolution();
+  TestAql* test_aql = new TestHSA(test_kernel);
+
+  // Optionally decorate the HSA test with a profiling test
+  const bool pmc_enable = (getenv("ROCR_ENABLE_PMC") != NULL);
+  const bool sqtt_enable = (getenv("ROCR_ENABLE_SQTT") != NULL);
+  if (pmc_enable) {
+    test_aql = new TestPGenPMC(test_aql);
+  } else if (sqtt_enable) {
+    test_aql = new TestPGenSQTT(test_aql);
+  }
+  assert(test_aql != NULL);
+  if (test_aql == NULL) return 1;
+
+  // Initialization of Hsa Runtime
+  bool ret_val = test_aql->initialize(argc, argv);
+  if (!ret_val) {
+    std::cout << "Error in the test initialization" << std::endl;
+    assert(ret_val);
+    return 1;
+  }
+
+  // Setup Hsa resources needed for execution
+  ret_val = test_aql->setup();
+  if (!ret_val) {
+    std::cout << "Error in creating hsa resources" << std::endl;
+    assert(ret_val);
+    return 1;
+  }
+
+  // Run SimpleConvolution kernel
+  ret_val = test_aql->run();
+  if (!ret_val) {
+    std::cout << "Error in running the test kernel" << std::endl;
+    assert(ret_val);
+    return 1;
+  }
+
+  // Verify the results of the execution and report the verdict
+  ret_val = test_aql->verify_results();
+  const char* const verdict = (ret_val) ? "Test : Passed" : "Test : Failed";
+  std::cout << verdict << std::endl;
+  const int exit_code = (ret_val) ? 0 : 1;
+
+  // Print time taken by sample, then release the resources
+  test_aql->print_time();
+  test_aql->cleanup();
+
+  return exit_code;
+}
@@ -0,0 +1,87 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TESTAQL_H_
+#define _TESTAQL_H_
+
+#include "hsa.h"
+#include "hsa_rsrc_factory.hpp"
+#include "hsa_ext_amd_aql_profile.h"
+
+// Abort the test with a diagnostic when 'cond' evaluates to false.
+// The text of the failed expression, the source file and the line number are
+// printed to std::cout before abort() is called. Nothing happens when 'cond'
+// holds. The argument is parenthesized so that compound expressions negate
+// as a whole.
+#define test_assert(cond)                                                                          \
+  {                                                                                                \
+    if (!(cond)) {                                                                                 \
+      std::cout << "ASSERT FAILED: " << #cond << " : " << __FILE__ << "(" << __LINE__ << ")"       \
+                << std::endl;                                                                      \
+      abort();                                                                                     \
+    }                                                                                              \
+  }
+
+// Test AQL interface.
+// A TestAql object may wrap another TestAql object. Every virtual method
+// forwards to the wrapped object when there is one and otherwise returns a
+// neutral value: 0 for the getters and true for the bool-returning test steps.
+class TestAql {
+  // Wrapped test object, 0 when this object is the end of the chain
+  TestAql* const test_aql;
+
+ public:
+  TestAql(TestAql* t = 0) : test_aql(t) {}
+  virtual ~TestAql() {}
+
+  // Return the wrapped test object
+  TestAql* testAql() { return test_aql; }
+
+  // Get methods for the agent info, the queue and the resources factory
+  virtual AgentInfo* getAgentInfo() {
+    if (test_aql) return test_aql->getAgentInfo();
+    return 0;
+  }
+  virtual hsa_queue_t* getQueue() {
+    if (test_aql) return test_aql->getQueue();
+    return 0;
+  }
+  virtual HsaRsrcFactory* getRsrcFactory() {
+    if (test_aql) return test_aql->getRsrcFactory();
+    return 0;
+  }
+
+  // Initialize application environment including setting
+  // up of various configuration parameters based on
+  // command line arguments
+  // @return bool true on success and false on failure
+  virtual bool initialize(int argc, char** argv) {
+    if (test_aql) return test_aql->initialize(argc, argv);
+    return true;
+  }
+
+  // Setup application parameters for execution
+  // @return bool true on success and false on failure
+  virtual bool setup() {
+    if (test_aql) return test_aql->setup();
+    return true;
+  }
+
+  // Run the kernel
+  // @return bool true on success and false on failure
+  virtual bool run() {
+    if (test_aql) return test_aql->run();
+    return true;
+  }
+
+  // Verify results
+  // @return bool true on success and false on failure
+  virtual bool verify_results() {
+    if (test_aql) return test_aql->verify_results();
+    return true;
+  }
+
+  // Print to console the time taken to execute kernel
+  virtual void print_time() {
+    if (test_aql) test_aql->print_time();
+  }
+
+  // Release resources e.g. memory allocations
+  // @return bool true on success and false on failure
+  virtual bool cleanup() {
+    if (test_aql) return test_aql->cleanup();
+    return true;
+  }
+};
+
+#endif  // _TESTAQL_H_
@@ -0,0 +1,234 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#include "os.h"
+#include "helper_funcs.hpp"
+#include "hsa_rsrc_factory.hpp"
+#include "test_hsa.h"
+
+// Prepare the HSA side of the test: timers, resources factory, GPU agent,
+// AQL queue, completion signal and the name of the code object file.
+// The agent index is taken from the ROCR_AGENT_IND environment variable
+// (0 when the variable is not set).
+// @param arg_cnt number of command line arguments
+// @param arg_list command line arguments
+// @return bool true on success; false when the agent is not found or when
+// its name starts with neither "gfx8" nor "gfx9" (asserts in debug builds)
+bool TestHSA::initialize(int arg_cnt, char** arg_list) {
+  std::cout << "TestHSA::initialize :" << std::endl;
+
+  // Remember the command line arguments
+  hsa_cmdline_arg_cnt = arg_cnt;
+  hsa_cmdline_arg_list = arg_list;
+
+  // One timer for the setup phase and one for the dispatch phase
+  setup_timer_idx_ = hsa_timer_.CreateTimer();
+  dispatch_timer_idx_ = hsa_timer_.CreateTimer();
+
+  // Instantiate the Hsa Resources Factory and print properties of the agents
+  hsa_rsrc_ = new HsaRsrcFactory();
+  hsa_rsrc_->PrintGpuAgents("> GPU agents");
+
+  // Pick the Gpu agent to run on
+  uint32_t agent_ind = 0;
+  const char* const env_ind = getenv("ROCR_AGENT_IND");
+  if (env_ind != NULL) agent_ind = atol(env_ind);
+  const bool agent_found = hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_);
+  if (!agent_found) {
+    std::cout << "> error: agent[" << agent_ind << "] is not found" << std::endl;
+    return false;
+  }
+  std::cout << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl;
+
+  // Create an instance of Aql Queue
+  const uint32_t num_pkts = 128;
+  hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_);
+
+  // Obtain handle of signal
+  hsa_rsrc_->CreateSignal(1, &hsa_signal_);
+
+  // The code object file name is "<gfx8|gfx9>_<kernel name>.hsaco"
+  const std::string gfx_prefix = std::string(agent_info_->name).substr(0, 4);
+  const bool supported = (gfx_prefix == "gfx8") || (gfx_prefix == "gfx9");
+  if (!supported) {
+    assert(false);
+    return false;
+  }
+  brig_path_obj_ += gfx_prefix;
+  brig_path_obj_ += "_";
+  brig_path_obj_ += name_;
+  brig_path_obj_ += ".hsaco";
+
+  return true;
+}
+
+// Allocate the kernel buffers, initialize the test data and load the kernel
+// code object. The elapsed time is recorded in setup_time_taken_ and becomes
+// the initial value of total_time_taken_.
+// @return bool true on success; false when a buffer allocation fails or the
+// kernel cannot be loaded (the setup timer is not stopped on these paths)
+bool TestHSA::setup() {
+  std::cout << "TestHSA::setup :" << std::endl;
+
+  // Start the timer object
+  hsa_timer_.StartTimer(setup_timer_idx_);
+
+  // Allocate every buffer described by the test, local or system memory
+  mem_map_t& mem_map = test_->get_mem_map();
+  mem_it_t it = mem_map.begin();
+  while (it != mem_map.end()) {
+    mem_descr_t& des = it->second;
+    void* ptr = NULL;
+    if (des.local) {
+      ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size);
+    } else {
+      ptr = hsa_rsrc_->AllocateSysMemory(agent_info_, des.size);
+    }
+    des.ptr = ptr;
+    assert(ptr != NULL);
+    if (ptr == NULL) return false;
+    ++it;
+  }
+  test_->init();
+
+  // Load and Finalize Kernel Code Descriptor
+  // NOTE(review): the strdup() copy of the kernel name is never freed here;
+  // confirm whether LoadAndFinalize keeps the pointer before releasing it.
+  char* brig_path = (char*)brig_path_obj_.c_str();
+  char* kernel_name = strdup(name_.c_str());
+  const bool ret_val =
+      hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, kernel_name, &kernel_code_desc_);
+  if (!ret_val) {
+    std::cout << "Error in loading and finalizing Kernel" << std::endl;
+    return ret_val;
+  }
+
+  // Stop the timer object and account the setup time
+  hsa_timer_.StopTimer(setup_timer_idx_);
+  setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_);
+  total_time_taken_ = setup_time_taken_;
+
+  return true;
+}
+
+// Dispatch the test kernel once and wait for its completion.
+// Steps: query the kernel symbol attributes, build a kernel dispatch AQL
+// packet, publish it into the queue, ring the doorbell, block on the
+// completion signal, then copy the results back and print them.
+// The dispatch time is recorded in dispatch_time_taken_ and added to
+// total_time_taken_.
+// @return bool true on success; false when the kernarg size reported for the
+// kernel symbol differs from the size declared by the test object
+bool TestHSA::run() {
+  std::cout << "TestHSA::run :" << std::endl;
+
+  const uint32_t work_group_size = 64;
+  const uint32_t work_grid_size = test_->get_elements_count();
+  uint32_t group_segment_size = 0;
+  uint32_t private_segment_size = 0;
+  const size_t kernarg_segment_size = test_->get_kernarg_size();
+  uint64_t code_handle = 0;
+
+  // Retrieve the amount of group memory needed
+  hsa_executable_symbol_get_info(
+      kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
+
+  // Retrieve the amount of private memory needed
+  hsa_executable_symbol_get_info(kernel_code_desc_,
+                                 HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+                                 &private_segment_size);
+
+  // Check the kernel args size
+  // NOTE(review): size_info is a zero-initialized size_t while the other size
+  // attributes above are read into uint32_t - confirm the attribute's
+  // documented type against the HSA runtime specification.
+  size_t size_info = 0;
+  hsa_executable_symbol_get_info(
+      kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info);
+  assert(kernarg_segment_size == size_info);
+  if (kernarg_segment_size != size_info) return false;
+
+  // Retrieve handle of the code block
+  hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+                                 &code_handle);
+
+  // Initialize the dispatch packet.
+  hsa_kernel_dispatch_packet_t aql;
+  memset(&aql, 0, sizeof(aql));
+  // Set the packet's type, acquire and release fences
+  aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+  aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+  // Populate Aql packet with default values: a one-dimensional grid of
+  // work_grid_size work-items split into work-groups of work_group_size
+  aql.setup = 1;
+  aql.grid_size_x = work_grid_size;
+  aql.grid_size_y = 1;
+  aql.grid_size_z = 1;
+  aql.workgroup_size_x = work_group_size;
+  aql.workgroup_size_y = 1;
+  aql.workgroup_size_z = 1;
+  // Bind the kernel code descriptor and arguments
+  aql.kernel_object = code_handle;
+  aql.kernarg_address = test_->get_kernarg_ptr();
+  aql.group_segment_size = group_segment_size;
+  aql.private_segment_size = private_segment_size;
+  // Initialize Aql packet with handle of signal
+  aql.completion_signal = hsa_signal_;
+
+  // Compute the write index of queue and copy Aql packet into it
+  // 'mask' wraps the write index into the queue buffer
+  // NOTE(review): this relies on the queue size being a power of two - confirm.
+  const uint64_t que_idx = hsa_queue_load_write_index_relaxed(hsa_queue_);
+  const uint32_t mask = hsa_queue_->size - 1;
+
+  std::cout << "> Executing kernel: \"" << name_ << "\"" << std::endl;
+
+  // Start the timer object
+  hsa_timer_.StartTimer(dispatch_timer_idx_);
+
+  // Mark the packet as invalid while it is being copied into the queue:
+  // the real header is saved, the type bits of aql.header are cleared and
+  // replaced by HSA_PACKET_TYPE_INVALID
+  const auto header = aql.header;
+  const uint8_t packet_type_mask = (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
+  aql.header &= (~packet_type_mask) << HSA_PACKET_HEADER_TYPE;
+  aql.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
+
+  // Copy Aql packet into queue buffer
+  ((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask] = aql;
+
+  // After AQL packet is fully copied into queue buffer
+  // update packet header from invalid state to valid state
+  // The release fence is issued before the saved header is stored into the slot
+  std::atomic_thread_fence(std::memory_order_release);
+  ((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask].header = header;
+
+  // Increment the write index and ring the doorbell to dispatch the kernel.
+  // The doorbell is written with the index of the packet just published.
+  hsa_queue_store_write_index_relaxed(hsa_queue_, (que_idx + 1));
+  hsa_signal_store_relaxed(hsa_queue_->doorbell_signal, que_idx);
+
+  std::cout << "> Waiting on kernel dispatch signal" << std::endl;
+
+  // Wait on the dispatch signal until the kernel is finished.
+  // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling
+  // The signal was created with value 1 in initialize(); the wait returns once
+  // it drops below 1. The returned 'value' is not used afterwards.
+  hsa_signal_value_t value = hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1,
+                                                     (uint64_t)-1, HSA_WAIT_STATE_BLOCKED);
+
+  // Stop the timer object
+  hsa_timer_.StopTimer(dispatch_timer_idx_);
+  dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_);
+  total_time_taken_ += dispatch_time_taken_;
+
+  // Copy kernel buffers from local memory into system memory
+  hsa_rsrc_->TransferData((uint8_t*)test_->get_output_ptr(), (uint8_t*)test_->get_local_ptr(),
+                          test_->get_output_size(), false);
+  test_->print_output();
+
+  return true;
+}
+
+// Compare the kernel output buffer with the reference output buffer.
+// @return bool true when both buffers are byte-for-byte identical over the
+// output size, false otherwise
+bool TestHSA::verify_results() {
+  const void* const actual = test_->get_output_ptr();
+  const void* const expected = test_->get_refout_ptr();
+  const uint32_t byte_count = test_->get_output_size();
+  return memcmp(actual, expected, byte_count) == 0;
+}
+
+// Print to console the recorded setup, dispatch and total times,
+// one line per phase, each tagged with the test kernel name.
+void TestHSA::print_time() {
+  const struct {
+    const char* label;
+    double value;
+  } rows[] = {
+      {"Time taken for Setup by ", setup_time_taken_},
+      {"Time taken for Dispatch by ", dispatch_time_taken_},
+      {"Time taken in Total by ", total_time_taken_},
+  };
+  const size_t row_count = sizeof(rows) / sizeof(rows[0]);
+  for (size_t i = 0; i < row_count; ++i) {
+    std::cout << rows[i].label << name_ << " : " << rows[i].value << std::endl;
+  }
+}
+
+// Shut down the Hsa Runtime system.
+// @return bool true when hsa_shut_down() reports HSA_STATUS_SUCCESS
+bool TestHSA::cleanup() {
+  const bool shut_down_ok = (hsa_shut_down() == HSA_STATUS_SUCCESS);
+  return shut_down_ok;
+}
@@ -0,0 +1,115 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_HSA_H_
+#define _TEST_HSA_H_
+
+#include "test_aql.h"
+#include "test_kernel.h"
+#include "hsa_rsrc_factory.hpp"
+
+// Class implements HSA test.
+// Runs a TestKernel on an HSA GPU agent: initialize() selects the agent and
+// creates the queue and signal, setup() allocates the buffers and loads the
+// kernel, run() dispatches it, verify_results() checks the output.
+class TestHSA : public TestAql {
+ public:
+  // Constructor
+  // Every pointer, handle and index member is given a defined value so that
+  // the get methods return NULL (not garbage) before initialize() is called.
+  // @param test kernel test object to run; must not be NULL (its Name() is read)
+  TestHSA(TestKernel* test)
+      : test_(test),
+        total_time_taken_(0),
+        setup_time_taken_(0),
+        dispatch_time_taken_(0),
+        agent_info_(NULL),
+        hsa_queue_(NULL),
+        hsa_signal_(),
+        kernel_code_desc_(),
+        setup_timer_idx_(0),
+        dispatch_timer_idx_(0),
+        hsa_rsrc_(NULL),
+        name_(test->Name()) {}
+
+  // Get methods for Agent Info, HSA queue, HSA Resources Manager
+  AgentInfo* getAgentInfo() { return agent_info_; }
+  hsa_queue_t* getQueue() { return hsa_queue_; }
+  HsaRsrcFactory* getRsrcFactory() { return hsa_rsrc_; }
+
+  // Initialize application environment including setting
+  // up of various configuration parameters based on
+  // command line arguments
+  // @return bool true on success and false on failure
+  bool initialize(int argc, char** argv);
+
+  // Setup application parameters for execution
+  // @return bool true on success and false on failure
+  bool setup();
+
+  // Run the test kernel
+  // @return bool true on success and false on failure
+  bool run();
+
+  // Verify against reference implementation
+  // @return bool true on success and false on failure
+  bool verify_results();
+
+  // Print to console the time taken to execute kernel
+  void print_time();
+
+  // Release resources e.g. memory allocations
+  // @return bool true on success and false on failure
+  bool cleanup();
+
+ private:
+  typedef TestKernel::mem_descr_t mem_descr_t;
+  typedef TestKernel::mem_map_t mem_map_t;
+  typedef TestKernel::mem_it_t mem_it_t;
+
+  // Test object
+  TestKernel* test_;
+
+  // Path of the kernel code object file
+  std::string brig_path_obj_;
+
+  // Used to track time taken to run the sample
+  double total_time_taken_;
+  double setup_time_taken_;
+  double dispatch_time_taken_;
+
+  // Handle to an Hsa Gpu Agent
+  AgentInfo* agent_info_;
+
+  // Handle to an Hsa Queue
+  hsa_queue_t* hsa_queue_;
+
+  // Handle of signal
+  hsa_signal_t hsa_signal_;
+
+  // Handle of Kernel Code Descriptor
+  hsa_executable_symbol_t kernel_code_desc_;
+
+  // Timer object and the indices of the setup and dispatch timers in it
+  uint32_t setup_timer_idx_;
+  uint32_t dispatch_timer_idx_;
+  PerfTimer hsa_timer_;
+
+  // Instance of Hsa Resources Factory
+  HsaRsrcFactory* hsa_rsrc_;
+
+  // Test kernel name
+  std::string name_;
+};
+
+#endif  // _TEST_HSA_H_
@@ -0,0 +1,105 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_KERNEL_H_
+#define _TEST_KERNEL_H_
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+// Class implements Kernel test.
+// Abstract base of a test kernel. It owns a map of memory descriptors keyed
+// by the descriptor IDs below; derived kernels register the buffers they need
+// and the test harness fills in the allocated pointers through get_mem_map().
+class TestKernel {
+ public:
+  // Memory descriptors IDs
+  enum { INPUT_DES_ID, OUTPUT_DES_ID, LOCAL_DES_ID, MASK_DES_ID, KERNARG_DES_ID, REFOUT_DES_ID };
+
+  // Memory descriptor declaration
+  struct mem_descr_t {
+    void* ptr;      // buffer address, NULL when the descriptor is registered
+    uint32_t size;  // buffer size as given at registration
+    bool local;     // true for a local memory buffer, false for system memory
+  };
+
+  // Memory map declaration
+  typedef std::map<uint32_t, mem_descr_t> mem_map_t;
+  typedef mem_map_t::iterator mem_it_t;
+  typedef mem_map_t::const_iterator mem_const_it_t;
+
+  // Virtual destructor: kernel objects are created with new and held through
+  // TestKernel pointers, so deleting through the base must reach the derived class
+  virtual ~TestKernel() {}
+
+  // Initialize method
+  virtual void init() = 0;
+
+  // Return kernel memory map
+  mem_map_t& get_mem_map() { return mem_map_; }
+
+  // Return NULL descriptor: NULL pointer, zero size, system memory
+  static mem_descr_t null_descriptor() {
+    const mem_descr_t none = {NULL, 0, false};
+    return none;
+  }
+
+  // Methods to get the kernel attributes
+  // A descriptor that was never registered reads as the NULL descriptor
+  void* get_kernarg_ptr() const { return get_descr(KERNARG_DES_ID).ptr; }
+  uint32_t get_kernarg_size() const { return get_descr(KERNARG_DES_ID).size; }
+  void* get_output_ptr() const { return get_descr(OUTPUT_DES_ID).ptr; }
+  uint32_t get_output_size() const { return get_descr(OUTPUT_DES_ID).size; }
+  void* get_local_ptr() const { return get_descr(LOCAL_DES_ID).ptr; }
+  void* get_refout_ptr() const { return get_descr(REFOUT_DES_ID).ptr; }
+  virtual uint32_t get_elements_count() const = 0;
+
+  // Print output
+  virtual void print_output() const = 0;
+
+  // Return name
+  virtual std::string Name() const = 0;
+
+ protected:
+  // Set system memory descriptor
+  // @return bool true when registered, false when the id is already in use
+  bool set_sys_descr(const uint32_t& id, const uint32_t& size) {
+    return set_mem_descr(id, size, false);
+  }
+
+  // Set local memory descriptor
+  // @return bool true when registered, false when the id is already in use
+  bool set_local_descr(const uint32_t& id, const uint32_t& size) {
+    return set_mem_descr(id, size, true);
+  }
+
+  // Get memory descriptor
+  // @return copy of the descriptor, or the NULL descriptor for an unknown id
+  mem_descr_t get_descr(const uint32_t& id) const {
+    mem_const_it_t it = mem_map_.find(id);
+    return (it != mem_map_.end()) ? it->second : null_descriptor();
+  }
+
+ private:
+  // Set memory descriptor
+  // The descriptor is inserted with a NULL pointer; an existing entry with
+  // the same id is left untouched and false is returned
+  bool set_mem_descr(const uint32_t& id, const uint32_t& size, const bool& local) {
+    const mem_descr_t des = {NULL, size, local};
+    auto ret = mem_map_.insert(mem_map_t::value_type(id, des));
+    return ret.second;
+  }
+
+  // Kernel memory map object
+  mem_map_t mem_map_;
+};
+
+#endif  // _TEST_KERNEL_H_
@@ -0,0 +1,46 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_PGEN_H_
+#define _TEST_PGEN_H_
+
+#include "test_pmgr.h"
+#include "hsa_ext_amd_aql_profile.h"
+
+// TestPGen: base of the packet generating tests.
+// Exposes the pre/post packets as AQL PM4 packets to the derived tests.
+// NOTE(review): prePacket / postPacket are not declared in this header;
+// presumably they are members inherited from TestPMgr - confirm.
+class TestPGen : public TestPMgr {
+  using packet_t = hsa_ext_amd_aql_pm4_packet_t;
+
+ public:
+  TestPGen(TestAql* t) : TestPMgr(t) {}
+
+ protected:
+  // Packet submitted before the kernel, viewed as a PM4 packet
+  packet_t* PrePacket() {
+    packet_t* const pre = reinterpret_cast<packet_t*>(&prePacket);
+    return pre;
+  }
+
+  // Packet submitted after the kernel, viewed as a PM4 packet
+  packet_t* PostPacket() {
+    packet_t* const post = reinterpret_cast<packet_t*>(&postPacket);
+    return post;
+  }
+};
+
+#endif  // _TEST_PGEN_H_
@@ -0,0 +1,142 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_PGEN_PMC_H_
+#define _TEST_PGEN_PMC_H_
+
+#include "test_pgen.h"
+
+// Data iteration callback: appends a copy of every reported sample to the
+// std::vector<hsa_ext_amd_aql_profile_info_data_t> passed as 'callback_data'.
+// Declared inline because it is defined in a header; without it, including
+// the header from a second translation unit gives multiple definition errors.
+// @param info_type type of the reported data (not used here)
+// @param info_data reported sample, copied into the vector
+// @param callback_data pointer to the vector collecting the samples
+// @return hsa_status_t always HSA_STATUS_SUCCESS
+inline hsa_status_t TestPGenPMC_Callback(hsa_ext_amd_aql_profile_info_type_t info_type,
+                                         hsa_ext_amd_aql_profile_info_data_t* info_data,
+                                         void* callback_data) {
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+  typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> passed_data_t;
+  reinterpret_cast<passed_data_t*>(callback_data)->push_back(*info_data);
+  return status;
+}
+
+// TestPGenPMC: perfcounters (PMC) flavour of the packet generating test.
+// initialize() describes two SQ counters in an AQL profile object, allocates
+// the command and output buffers and populates the start/stop packets;
+// dumpData() prints the collected counter values.
+class TestPGenPMC : public TestPGen {
+  // Alignment intended for the profile buffers
+  const static uint32_t buffer_alignment = 0x1000;  // 4K
+
+  // GPU agent the profile is bound to, set in initialize()
+  hsa_agent_t agent;
+  // AQL profile object and the events enabled in it
+  hsa_ext_amd_aql_profile_profile_t profile;
+  hsa_ext_amd_aql_profile_event_t events[2];
+
+  // Nothing to build here, the packets are populated in initialize()
+  bool buildPackets() { return true; }
+
+  // Iterate the profile data and print one line per sample:
+  // the sample id in decimal, the block name/index and the result in hex.
+  // @return bool always true
+  bool dumpData() {
+    std::cout << "TestPGenPMC::dumpData :" << std::endl;
+
+    typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> callback_data_t;
+
+    callback_data_t data;
+    hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenPMC_Callback, &data);
+
+    // std::dec / std::hex are sticky: save the stream formatting and restore
+    // it afterwards so that later output is not left in hex mode
+    const std::ios_base::fmtflags saved_flags = std::cout.flags();
+    for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) {
+      std::cout << "> sample(" << std::dec << it->sample_id << ") block("
+                << it->pmc_data.event.block_name << "_" << it->pmc_data.event.block_index
+                << ") result(" << std::hex << it->pmc_data.result << ")" << std::endl;
+    }
+    std::cout.flags(saved_flags);
+
+    return true;
+  }
+
+ public:
+  TestPGenPMC(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen PMC" << std::endl; }
+
+  // Initialize the wrapped test, then build the PMC profile and its packets.
+  // @param arg_cnt number of command line arguments
+  // @param arg_list command line arguments
+  // @return bool true on success; false when the wrapped initialization, a
+  // profile query, a buffer allocation or a packet population fails
+  // (the failures also assert in debug builds)
+  bool initialize(int arg_cnt, char** arg_list) {
+    if (!TestPMgr::initialize(arg_cnt, arg_list)) return false;
+
+    hsa_status_t status;
+    uint32_t command_buffer_size = 0;
+    uint32_t output_buffer_size = 0;
+
+    // GPU identificator, kept in the 'agent' member
+    agent = getAgentInfo()->dev_id;
+
+    // Instantiation of the profile object
+    // //////////////////////////////////////////////////////////////
+    // Set the event fields
+    events[0].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ;
+    events[0].block_index = 0;
+    events[0].counter_id = 0x4;  // SQ_SQ_PERF_SEL_WAVES
+    events[1].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ;
+    events[1].block_index = 0;
+    events[1].counter_id = 0xe;  // SQ_SQ_PERF_SEL_ITEMS
+
+    // Initialization the profile
+    memset(&profile, 0, sizeof(profile));
+    profile.agent = agent;
+    profile.type = HSA_EXT_AQL_PROFILE_EVENT_PMC;
+
+    // set enabled events list
+    profile.events = events;
+    profile.event_count = 2;
+
+    // Profile buffers attributes; a failed query must not be followed by an
+    // allocation with an undefined size, also in release builds
+    status = hsa_ext_amd_aql_profile_get_info(
+        &profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size);
+    assert(status == HSA_STATUS_SUCCESS);
+    if (status != HSA_STATUS_SUCCESS) return false;
+
+    status = hsa_ext_amd_aql_profile_get_info(&profile, HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE,
+                                              &output_buffer_size);
+    assert(status == HSA_STATUS_SUCCESS);
+    if (status != HSA_STATUS_SUCCESS) return false;
+
+    // Application is allocating the command buffer
+    // Allocate(buffer_alignment, command_buffer_size,
+    //          MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA)
+    profile.command_buffer.ptr =
+        getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size);
+    profile.command_buffer.size = command_buffer_size;
+    assert(profile.command_buffer.ptr != NULL);
+    if (profile.command_buffer.ptr == NULL) return false;
+
+    // Application is allocating the output buffer
+    // Allocate(buffer_alignment, output_buffer_size,
+    //          MODE_HOST_ACC|MODE_DEV_ACC)
+    profile.output_buffer.ptr =
+        getRsrcFactory()->AllocateSysMemory(getAgentInfo(), output_buffer_size);
+    profile.output_buffer.size = output_buffer_size;
+    assert(profile.output_buffer.ptr != NULL);
+    if (profile.output_buffer.ptr == NULL) return false;
+    // Fill the output buffer with a recognizable pattern
+    memset(profile.output_buffer.ptr, 0x77, output_buffer_size);
+
+    // Populating the AQL start packet
+    status = hsa_ext_amd_aql_profile_start(&profile, PrePacket());
+    assert(status == HSA_STATUS_SUCCESS);
+    if (status != HSA_STATUS_SUCCESS) return false;
+
+    // Populating the AQL stop packet
+    status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket());
+    assert(status == HSA_STATUS_SUCCESS);
+
+    return (status == HSA_STATUS_SUCCESS);
+  }
+};
+
+#endif  // _TEST_PGEN_PMC_H_
@@ -0,0 +1,160 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_PGEN_SQTT_H_
+#define _TEST_PGEN_SQTT_H_
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+
+#include "test_pgen.h"
+
+// Data-iteration callback used by TestPGenSQTT::dumpData().
+// Appends a copy of the reported record to the
+// std::vector<hsa_ext_amd_aql_profile_info_data_t> passed via callback_data.
+//
+// @param info_type     Kind of the reported record (not inspected here)
+// @param info_data     Record to store; it is copied by value
+// @param callback_data Address of the destination vector
+// @return HSA_STATUS_SUCCESS when the record was stored,
+//         HSA_STATUS_ERROR_INVALID_ARGUMENT when either pointer is NULL
+//
+// The function is 'inline' because it is defined in a header: a plain
+// definition would be emitted by every translation unit including this file
+// and break the link with multiple-definition errors.
+inline hsa_status_t TestPGenSQTT_Callback(hsa_ext_amd_aql_profile_info_type_t info_type,
+                                          hsa_ext_amd_aql_profile_info_data_t* info_data,
+                                          void* callback_data) {
+  typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> passed_data_t;
+
+  (void)info_type;  // every record kind is stored the same way
+  if ((info_data == NULL) || (callback_data == NULL)) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  reinterpret_cast<passed_data_t*>(callback_data)->push_back(*info_data);
+  return HSA_STATUS_SUCCESS;
+}
+
+// TestPGenSQTT: profiling test that wraps another test with the AQL
+// start/stop packets of an SQ thread trace (HSA_EXT_AQL_PROFILE_EVENT_SQTT)
+// profile and dumps the collected trace samples to text files.
+class TestPGenSQTT : public TestPGen {
+  const static uint32_t buffer_alignment = 0x1000;  // 4K
+  const static uint32_t buffer_size = 0x2000000;    // 32M
+
+  // GPU agent the profile is created for; set in initialize()
+  hsa_agent_t agent;
+  // Profile descriptor shared by the start/stop packets and the data iteration
+  hsa_ext_amd_aql_profile_profile_t profile;
+
+  // The pre/post packets are populated in initialize(), nothing left to build
+  bool buildPackets() { return true; }
+
+  // Iterates the collected SQTT samples, copies each one into system memory
+  // and writes it to 'sqtt_dump_<sample_id>.txt', one 16-bit hex word per line.
+  // @return true when every sample was dumped, false on the first failure
+  bool dumpData() {
+    std::cout << "TestPGenSQTT::dumpData :" << std::endl;
+
+    typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> callback_data_t;
+
+    callback_data_t data;
+    hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenSQTT_Callback, &data);
+    for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) {
+      std::cout << "> sample(" << std::dec << it->sample_id << ") ptr(" << std::hex
+                << it->sqtt_data.ptr << ") size(" << std::dec << it->sqtt_data.size << ")"
+                << std::endl;
+
+      // NOTE(review): this staging buffer is never released here; no matching
+      // free routine is visible in this file - confirm who owns it.
+      void* sys_buf = getRsrcFactory()->AllocateSysMemory(getAgentInfo(), it->sqtt_data.size);
+      assert(sys_buf != NULL);
+      // The method returns bool: report failures as 'false'. Returning an
+      // hsa_status_t error code here would convert to 'true'.
+      if (sys_buf == NULL) return false;
+
+      hsa_status_t status = hsa_memory_copy(sys_buf, it->sqtt_data.ptr, it->sqtt_data.size);
+      assert(status == HSA_STATUS_SUCCESS);
+      if (status != HSA_STATUS_SUCCESS) return false;
+
+      std::string file_name;
+      file_name.append("sqtt_dump_");
+      file_name.append(std::to_string(it->sample_id));
+      file_name.append(".txt");
+      std::ofstream out_file;
+      out_file.open(file_name);
+      if (!out_file.is_open()) {
+        std::cerr << "TestPGenSQTT::dumpData : failed to open " << file_name << std::endl;
+        return false;
+      }
+
+      // Write the buffer in terms of shorts (16 bits)
+      const short* sqtt_data = reinterpret_cast<const short*>(sys_buf);
+      const size_t word_count = it->sqtt_data.size / sizeof(short);
+      for (size_t i = 0; i < word_count; ++i) {
+        out_file << std::setw(4) << std::setfill('0') << std::hex << sqtt_data[i] << "\n";
+      }
+
+      out_file.close();
+    }
+
+    return true;
+  }
+
+ public:
+  TestPGenSQTT(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen SQTT" << std::endl; }
+
+  // Initializes the base test manager, fills the SQTT profile descriptor,
+  // allocates the command and output buffers and populates the start/stop
+  // AQL packets.
+  // @param arg_cnt  Number of command line arguments
+  // @param arg_list Command line arguments
+  // @return true on success, false when any step fails
+  bool initialize(int arg_cnt, char** arg_list) {
+    if (!TestPMgr::initialize(arg_cnt, arg_list)) return false;
+
+    hsa_status_t status;
+    uint32_t command_buffer_size = 0;
+    uint32_t output_buffer_size = 0;
+
+    // GPU identifier, stored in the class member
+    agent = getAgentInfo()->dev_id;
+
+    // Initialization of the profile; no SQTT parameters are set
+    memset(&profile, 0, sizeof(profile));
+    profile.agent = agent;
+    profile.type = HSA_EXT_AQL_PROFILE_EVENT_SQTT;
+
+    // Profile buffers attributes
+    status = hsa_ext_amd_aql_profile_get_info(
+        &profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size);
+    assert(status == HSA_STATUS_SUCCESS);
+    // Also checked at run time: with NDEBUG the assert disappears and the size
+    // would otherwise be used without having been filled in
+    if (status != HSA_STATUS_SUCCESS) return false;
+
+    output_buffer_size = buffer_size;
+
+    // Application is allocating the command buffer
+    // AllocateSystem(buffer_alignment, command_buffer_size,
+    //                MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA)
+    profile.command_buffer.ptr =
+        getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size);
+    profile.command_buffer.size = command_buffer_size;
+    assert(profile.command_buffer.ptr != NULL);
+    if (profile.command_buffer.ptr == NULL) return false;
+
+    // Application is allocating the output buffer
+    // AllocateLocal(buffer_alignment, output_buffer_size,
+    //               MODE_DEV_ACC)
+    profile.output_buffer.ptr =
+        getRsrcFactory()->AllocateLocalMemory(getAgentInfo(), output_buffer_size);
+    profile.output_buffer.size = output_buffer_size;
+    assert(profile.output_buffer.ptr != NULL);
+    if (profile.output_buffer.ptr == NULL) return false;
+
+    // Populating the AQL start packet
+    status = hsa_ext_amd_aql_profile_start(&profile, PrePacket());
+    assert(status == HSA_STATUS_SUCCESS);
+    if (status != HSA_STATUS_SUCCESS) return false;
+
+    // Populating the AQL stop packet
+    status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket());
+    assert(status == HSA_STATUS_SUCCESS);
+
+    return (status == HSA_STATUS_SUCCESS);
+  }
+};
+
+#endif  // _TEST_PGEN_SQTT_H_
@@ -0,0 +1,98 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#include <atomic>
+#include <assert.h>
+
+#include "test_pmgr.h"
+
+// Submits one vendor-specific AQL packet to the test queue.
+// The packet is first copied into its queue slot with the header type set to
+// INVALID; after a release fence the slot header is overwritten with the
+// VENDOR_SPECIFIC header, then the write index is advanced and the doorbell
+// signal is written with the index of the submitted packet.
+//
+// @param packet Packet to submit; it is copied, the caller's copy is untouched
+// @return always true
+bool TestPMgr::addPacket(const packet_t* packet) {
+  packet_t aql_packet = *packet;
+
+  // Compute the write index of queue and copy Aql packet into it
+  uint64_t que_idx = hsa_queue_load_write_index_relaxed(getQueue());
+  const uint32_t mask = getQueue()->size - 1;
+
+  // Disable packet so that submission to HW is complete
+  const auto header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
+  // Mask of the header type field: build the field mask at its bit position
+  // first and invert it afterwards. Inverting before shifting only yields the
+  // right mask when the field starts at bit 0 and shifts a negative value.
+  const uint32_t type_field =
+      ((1u << HSA_PACKET_HEADER_WIDTH_TYPE) - 1u) << HSA_PACKET_HEADER_TYPE;
+  aql_packet.header &= ~type_field;
+  aql_packet.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
+
+  // Copy Aql packet into queue buffer
+  ((packet_t*)(getQueue()->base_address))[que_idx & mask] = aql_packet;
+
+  // After AQL packet is fully copied into queue buffer
+  // update packet header from invalid state to valid state
+  std::atomic_thread_fence(std::memory_order_release);
+  ((packet_t*)(getQueue()->base_address))[que_idx & mask].header = header;
+
+  // Increment the write index and ring the doorbell to dispatch the kernel.
+  hsa_queue_store_write_index_relaxed(getQueue(), (que_idx + 1));
+  hsa_signal_store_relaxed(getQueue()->doorbell_signal, que_idx);
+
+  return true;
+}
+
+// Runs the wrapped test; when buildPackets() reports that profiling packets
+// are available, the test is bracketed by the pre/post packets and the
+// collected data is dumped once the post packet has completed.
+// @return always true
+bool TestPMgr::run() {
+  // Build Aql Pkts
+  const bool profiling = buildPackets();
+
+  // Submit Pre-Dispatch Aql packet
+  if (profiling) addPacket(&prePacket);
+
+  testAql()->run();
+
+  // Without profiling packets there is nothing else to do
+  if (!profiling) return true;
+
+  // Post-Dispatch Aql packet reports its completion through postSignal
+  postPacket.completion_signal = postSignal;
+  addPacket(&postPacket);
+
+  // Block until the signal value becomes less than 1
+  hsa_signal_wait_acquire(postSignal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
+                          HSA_WAIT_STATE_BLOCKED);
+
+  // Dumping profiling data
+  dumpData();
+
+  return true;
+}
+
+// Initializes the base test and creates the completion signal of the post
+// packet with the initial value 1.
+// @param argc Number of command line arguments
+// @param argv Command line arguments
+// @return true when the base initialization and the signal creation succeeded
+bool TestPMgr::initialize(int argc, char** argv) {
+  // Do not continue on top of a failed base initialization.
+  // NOTE(review): assumes TestAql::initialize returns bool like this
+  // override does - confirm against test_aql.h.
+  if (!TestAql::initialize(argc, argv)) return false;
+
+  hsa_status_t status = hsa_signal_create(1, 0, NULL, &postSignal);
+  assert(status == HSA_STATUS_SUCCESS);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Wraps test 't'; both signals start with a zero handle, postSignal gets its
+// real value in initialize().
+TestPMgr::TestPMgr(TestAql* t) : TestAql(t) {
+  postSignal.handle = 0;
+  dummySignal.handle = 0;
+}
@@ -0,0 +1,57 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_SMGR_H_
+#define _TEST_SMGR_H_
+
+#include "test_aql.h"
+#include "amd_aql_pm4_ib_packet.h"
+
+// TestPMgr: test decorator that brackets the wrapped test with a pre-dispatch
+// and a post-dispatch PM4 IB AQL packet. Derived classes provide the packets
+// (buildPackets) and consume the collected data (dumpData).
+class TestPMgr : public TestAql {
+ public:
+  // AQL packet type submitted by this manager
+  typedef amd_aql_pm4_ib_packet_t packet_t;
+
+ private:
+  // Copies 'packet' into the test queue and rings the doorbell
+  bool addPacket(const packet_t* packet);
+
+ protected:
+  // Packet submitted before the wrapped test runs
+  packet_t prePacket;
+  // Packet submitted after the wrapped test has run
+  packet_t postPacket;
+  // Signal with a zero handle, set up by the constructor
+  hsa_signal_t dummySignal;
+  // Completion signal of postPacket, created in initialize()
+  hsa_signal_t postSignal;
+
+  // Returns true when prePacket/postPacket should be submitted by run();
+  // the default implementation disables them
+  virtual bool buildPackets() { return false; }
+  // Called by run() after the post packet completed; default does nothing
+  virtual bool dumpData() { return false; }
+  // Initializes the base test and creates postSignal
+  virtual bool initialize(int argc, char** argv);
+
+ public:
+  // 't' is the test being wrapped
+  TestPMgr(TestAql* t);
+  // Runs the wrapped test, bracketed by the packets when buildPackets() is true
+  bool run();
+};
+
+#endif  // _TEST_SMGR_H_
@@ -0,0 +1,81 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+********************************************************************************/
+
+/**
+ * simpleConvolution: each work-item produces one output pixel, computed as
+ * the weighted sum of the input pixels in its neighbourhood. The size of the
+ * neighbourhood is given by the mask dimensions, the weights by the mask
+ * values; the neighbourhood is clipped at the image borders.
+ * @param output Output matrix after performing convolution
+ * @param input  Input  matrix on which convolution is to be performed
+ * @param mask   mask matrix used to perform the convolution
+ * @param inputDimensions dimensions (x = width, y = height) of the input matrix
+ * @param maskDimensions  dimensions (x = width, y = height) of the mask matrix
+ */
+__kernel void simpleConvolution(__global  uint  * output,
+                                __global  uint  * input,
+                                __global  float  * mask,
+                                const     uint2  inputDimensions,
+                                const     uint2  maskDimensions) {
+
+  const uint gid = get_global_id(0);
+
+  const uint imgWidth  = inputDimensions.x;
+  const uint imgHeight = inputDimensions.y;
+
+  // pixel handled by this work-item
+  const uint col = gid % imgWidth;
+  const uint row = gid / imgWidth;
+
+  const uint mWidth  = maskDimensions.x;
+  const uint mHeight = maskDimensions.y;
+
+  // half extents of the mask around its centre element
+  const uint halfW = (mWidth  - 1) / 2;
+  const uint halfH = (mHeight - 1) / 2;
+
+  // first/last column and row of the neighbourhood, clipped so that
+  // the indices do not go beyond the image boundaries
+  uint firstCol = 0;
+  if (col >= halfW) firstCol = col - halfW;
+  uint lastCol = col + halfW;
+  if (lastCol >= imgWidth) lastCol = imgWidth - 1;
+  uint firstRow = 0;
+  if (row >= halfH) firstRow = row - halfH;
+  uint lastRow = row + halfH;
+  if (lastRow >= imgHeight) lastRow = imgHeight - 1;
+
+  // weighted sum accumulator
+  float acc = 0;
+
+  for (uint i = firstCol; i <= lastCol; ++i) {
+    for (uint j = firstRow; j <= lastRow; ++j) {
+      // position of input pixel (i, j) inside the mask and inside the image
+      const uint maskPos = (j - (row - halfH)) * mWidth + (i - (col - halfW));
+      const uint pixPos  = j * imgWidth + i;
+      acc += ((float)input[pixPos] * mask[maskPos]);
+    }
+  }
+
+  // To round to the nearest integer
+  acc += 0.5f;
+  output[gid] = (uint)acc;
+}
@@ -0,0 +1,157 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#include "helper_funcs.hpp"
+#include "simple_convolution.h"
+
+// Sets the default problem size (64x64 input, 3x3 mask), normalizes the
+// dimensions and registers the memory descriptors of all test buffers.
+SimpleConvolution::SimpleConvolution() {
+  width_ = 64;
+  height_ = 64;
+  mask_width_ = 3;
+  mask_height_ = mask_width_;
+
+  // Input dimensions are rounded to a power of two
+  if (!isPowerOf2(width_)) width_ = roundToPowerOf2(width_);
+  if (!isPowerOf2(height_)) height_ = roundToPowerOf2(height_);
+
+  // Even mask dimensions are bumped to the next odd value
+  if ((mask_width_ % 2) == 0) mask_width_ += 1;
+  if ((mask_height_ % 2) == 0) mask_height_ += 1;
+
+  // Inputs with fewer than 256 elements fall back to 64x64
+  if (width_ * height_ < 256) {
+    width_ = 64;
+    height_ = 64;
+  }
+
+  const uint32_t image_bytes = width_ * height_ * sizeof(uint32_t);
+  const uint32_t mask_bytes = mask_width_ * mask_height_ * sizeof(float);
+
+  // Kernel arguments, images and mask; only the LOCAL descriptor is
+  // registered through set_local_descr
+  set_sys_descr(KERNARG_DES_ID, sizeof(kernel_args_t));
+  set_sys_descr(INPUT_DES_ID, image_bytes);
+  set_sys_descr(OUTPUT_DES_ID, image_bytes);
+  set_local_descr(LOCAL_DES_ID, image_bytes);
+  set_sys_descr(MASK_DES_ID, mask_bytes);
+  set_sys_descr(REFOUT_DES_ID, image_bytes);
+}
+
+// Fills the input image with random values, builds the mask, stores the
+// kernel arguments and computes the reference output on the CPU.
+void SimpleConvolution::init() {
+  std::cout << "SimpleConvolution::init :" << std::endl;
+
+  mem_descr_t input_des = get_descr(INPUT_DES_ID);
+  mem_descr_t local_des = get_descr(LOCAL_DES_ID);
+  mem_descr_t mask_des = get_descr(MASK_DES_ID);
+  mem_descr_t refout_des = get_descr(REFOUT_DES_ID);
+  mem_descr_t kernarg_des = get_descr(KERNARG_DES_ID);
+
+  uint32_t* const image = (uint32_t*)input_des.ptr;
+  uint32_t* const device_out = (uint32_t*)local_des.ptr;
+  float* const weights = (float*)mask_des.ptr;
+  kernel_args_t* const args = (kernel_args_t*)kernarg_des.ptr;
+
+  // random initialisation of input
+  fillRandom<uint32_t>(image, width_, height_, 0, 255);
+
+  // Mask: all zero except the middle row and the middle column, which
+  // carry the same weight
+  const float weight = 1.0f / (mask_width_ * 2.0f - 1.0f);
+  const uint32_t mask_elems = mask_width_ * mask_height_;
+  for (uint32_t idx = 0; idx != mask_elems; ++idx) {
+    weights[idx] = 0;
+  }
+  const uint32_t mid_row = mask_height_ / 2;
+  for (uint32_t col = 0; col < mask_width_; ++col) {
+    weights[mid_row * mask_width_ + col] = weight;
+  }
+  const uint32_t mid_col = mask_width_ / 2;
+  for (uint32_t row = 0; row < mask_height_; ++row) {
+    weights[row * mask_width_ + mid_col] = weight;
+  }
+
+  // Print the first input row and the whole mask
+  printArray<uint32_t>("> Input[0]", image, width_, 1);
+  printArray<float>("> Mask", weights, mask_width_, mask_height_);
+
+  // Fill the kernel args: output, input, mask, input dimensions, mask dimensions
+  args->arg1 = device_out;
+  args->arg2 = image;
+  args->arg3 = weights;
+  args->arg4 = width_;
+  args->arg41 = height_;
+  args->arg5 = mask_width_;
+  args->arg51 = mask_height_;
+
+  // Calculate the reference output into the zeroed REFOUT buffer
+  memset(refout_des.ptr, 0, refout_des.size);
+  reference_impl((uint32_t*)refout_des.ptr, image, weights, width_, height_, mask_width_,
+                 mask_height_);
+}
+
+// Prints the first row (width_ elements) of the output buffer.
+void SimpleConvolution::print_output() const {
+  uint32_t* const result = (uint32_t*)get_output_ptr();
+  printArray<uint32_t>("> Output[0]", result, width_, 1);
+}
+
+// Reference CPU implementation of the convolution.
+// Every output pixel is the weighted sum of the input pixels in its
+// neighbourhood (clipped at the image borders), rounded to the nearest integer.
+// @param output      Output matrix, width x height elements
+// @param input       Input matrix, width x height elements
+// @param mask        Mask matrix, mask_width x mask_height elements
+// @param width       Width of the input matrix
+// @param height      Height of the input matrix
+// @param mask_width  Width of the mask matrix
+// @param mask_height Height of the mask matrix
+// @return always true
+bool SimpleConvolution::reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
+                                       const uint32_t width, const uint32_t height,
+                                       const uint32_t mask_width, const uint32_t mask_height) {
+  // half extents of the mask around its centre element
+  const uint32_t half_w = (mask_width - 1) / 2;
+  const uint32_t half_h = (mask_height - 1) / 2;
+
+  for (uint32_t col = 0; col < width; ++col) {
+    // column range of the neighbourhood, kept inside the image
+    const uint32_t first_col = (col < half_w) ? 0 : (col - half_w);
+    const uint32_t last_col = ((col + half_w) >= width) ? width - 1 : (col + half_w);
+
+    for (uint32_t row = 0; row < height; ++row) {
+      // row range of the neighbourhood, kept inside the image
+      const uint32_t first_row = (row < half_h) ? 0 : (row - half_h);
+      const uint32_t last_row = ((row + half_h) >= height) ? height - 1 : (row + half_h);
+
+      // weighted sum over the clipped neighbourhood
+      float acc = 0;
+      for (uint32_t i = first_col; i <= last_col; ++i) {
+        for (uint32_t j = first_row; j <= last_row; ++j) {
+          const uint32_t mask_pos = (j - (row - half_h)) * mask_width + (i - (col - half_w));
+          const uint32_t pix_pos = j * width + i;
+          acc += ((float)input[pix_pos] * mask[mask_pos]);
+        }
+      }
+
+      // to round to the nearest integer
+      acc += 0.5f;
+      output[row * width + col] = uint32_t(acc);
+    }
+  }
+
+  return true;
+}
@@ -0,0 +1,90 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _SIMPLE_CONVOLUTION_H_
+#define _SIMPLE_CONVOLUTION_H_
+
+#include <vector>
+#include <map>
+
+#include "test_kernel.h"
+
+// SimpleConvolution: test kernel implementing the OpenCL SimpleConvolution
+// sample. It owns the problem dimensions, fills the test buffers and provides
+// a CPU reference implementation of the convolution.
+class SimpleConvolution : public TestKernel {
+ public:
+  // Constructor: sets the default dimensions and registers the test buffers
+  SimpleConvolution();
+
+  // Initialize method: fills input, mask and kernel arguments and computes
+  // the reference output
+  void init();
+
+  // Return number of compute elements (one per input pixel)
+  uint32_t get_elements_count() const { return width_ * height_; }
+
+  // Print the first row of the output
+  void print_output() const;
+
+  // Return the kernel name
+  std::string Name() const { return std::string("simpleConvolution"); }
+
+ private:
+  // Local kernel arguments declaration, filled in init():
+  // arg1 - output buffer, arg2 - input buffer, arg3 - mask buffer,
+  // arg4/arg41 - input width/height, arg5/arg51 - mask width/height
+  struct kernel_args_t {
+    void* arg1;
+    void* arg2;
+    void* arg3;
+    uint32_t arg4;
+    uint32_t arg41;
+    uint32_t arg5;
+    uint32_t arg51;
+  };
+
+  // Width of the Input array
+  uint32_t width_;
+
+  // Height of the Input array
+  uint32_t height_;
+
+  // Width of the mask
+  uint32_t mask_width_;
+
+  // Height of the mask
+  uint32_t mask_height_;
+
+  // Reference CPU implementation of Simple Convolution
+  // @param output     Output matrix after performing convolution
+  // @param input      Input  matrix on which convolution is to be performed
+  // @param mask       mask matrix used to perform the convolution
+  // @param width      width of the input matrix
+  // @param height     height of the input matrix
+  // @param maskWidth  width of the mask matrix
+  // @param maskHeight height of the mask matrix
+  // @return bool true on success and false on failure
+  bool reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
+                      const uint32_t width, const uint32_t height, const uint32_t maskWidth,
+                      const uint32_t maskHeight);
+};
+
+#endif  // _SIMPLE_CONVOLUTION_H_
@@ -0,0 +1,154 @@
+// HSAIL module with the single kernel &__OpenCL_SimpleConvolution.
+// NOTE(review): presumably the compiler output for the OpenCL
+// simpleConvolution kernel of this test - confirm against the build flow.
+module &m:1:0:$full:$large:$default;
+extension "amd:gcn";
+extension "IMAGE";
+
+decl prog function &abort()();
+
+// Kernel arguments: a 64-bit global offset, the output, input and mask
+// addresses and two 2-element u32 arrays with the input and mask dimensions.
+prog kernel &__OpenCL_SimpleConvolution(kernarg_u64 %__global_offset_0,
+                                        kernarg_u64 %output,
+                                        kernarg_u64 %input,
+                                        kernarg_u64 %mask,
+                                        kernarg_u32 %inputDimensions[2],
+                                        kernarg_u32 %maskDimensions[2]) {
+
+	pragma  "AMD RTI", "ARGSTART:__OpenCL_SimpleConvolution";
+	pragma  "AMD RTI", "version:3:1:104";
+	pragma  "AMD RTI", "device:generic";
+	pragma  "AMD RTI", "uniqueid:1024";
+	pragma  "AMD RTI", "memory:private:0";
+	pragma  "AMD RTI", "memory:region:0";
+	pragma  "AMD RTI", "memory:local:0";
+	pragma  "AMD RTI", "value:__global_offset_0:u64:1:1:0";
+	pragma  "AMD RTI", "pointer:output:u32:1:1:96:uav:7:4:RW:0:0:0";
+	pragma  "AMD RTI", "pointer:input:u32:1:1:112:uav:7:4:RW:0:0:0";
+	pragma  "AMD RTI", "pointer:mask:float:1:1:128:uav:7:4:RW:0:0:0";
+	pragma  "AMD RTI", "value:inputDimensions:u32:2:1:144";
+	pragma  "AMD RTI", "constarg:4:inputDimensions";
+	pragma  "AMD RTI", "value:maskDimensions:u32:2:1:160";
+	pragma  "AMD RTI", "constarg:5:maskDimensions";
+	pragma  "AMD RTI", "function:1:0";
+	pragma  "AMD RTI", "memory:64bitABI";
+	pragma  "AMD RTI", "privateid:8";
+	pragma  "AMD RTI", "enqueue_kernel:0";
+	pragma  "AMD RTI", "kernel_index:0";
+	pragma  "AMD RTI", "reflection:0:size_t";
+	pragma  "AMD RTI", "reflection:1:uint*";
+	pragma  "AMD RTI", "reflection:2:uint*";
+	pragma  "AMD RTI", "reflection:3:float*";
+	pragma  "AMD RTI", "reflection:4:uint2";
+	pragma  "AMD RTI", "reflection:5:uint2";
+	pragma  "AMD RTI", "ARGEND:__OpenCL_SimpleConvolution";
+
+  @__OpenCL_SimpleConvolution_Entry:
+	
+  // BB#0:    // %entry
+  // $d0 = work-item absolute id (dim 0) + global offset, $s5 = its low 32 bits.
+  // ($s0, $s4) = inputDimensions[0..1], ($s1, $s9) = maskDimensions[0..1].
+  // $s7 = $s5 % inputDimensions[0]; $s8 = (maskDimensions[0] - 1) >> 1
+  // (adding 4294967295 is a 32-bit subtraction of 1).
+  // $s2 = $s7 + $s8 clamped to inputDimensions[0] - 1,
+  // $s3 = $s7 - $s8 clamped to 0; these bound the outer loop.
+
+	workitemabsid_u32	$s6, 0;
+	cvt_u64_u32	$d0, $s6;
+	ld_kernarg_align(8)_width(all)_u64	$d4, [%__global_offset_0];
+	add_u64	$d0, $d0, $d4;
+	cvt_u32_u64	$s5, $d0;
+	ld_v2_kernarg_align(4)_width(all)_u32	($s0, $s4), [%inputDimensions];
+	ld_v2_kernarg_align(4)_width(all)_u32	($s1, $s9), [%maskDimensions];
+	rem_u32	$s7, $s5, $s0;
+	add_u32	$s2, $s1, 4294967295;
+	shr_u32	$s8, $s2, 1;
+	add_u32	$s2, $s7, $s8;
+	add_u32	$s3, $s0, 4294967295;
+	cmp_ge_b1_u32	$c0, $s2, $s0;
+	cmov_b32	$s2, $c0, $s3, $s2;
+	sub_u32	$s3, $s7, $s8;
+	cmp_lt_b1_u32	$c0, $s7, $s8;
+	cmov_b32	$s3, $c0, 0, $s3;
+	ld_kernarg_align(8)_width(all)_u64	$d1, [%output];
+	cmp_le_b1_u32	$c0, $s3, $s2;
+	cbr_b1	$c0, @BB0_2;
+
+	// BB#1:
+	// Empty outer range: the accumulator $s6 is zeroed and the loops are skipped
+	
+  mov_b32	$s6, 0;
+	br	@BB0_6;
+
+	// @BB0_2:    // %for.cond32.preheader.lr.ph
+	// Loop set-up. $s5 = $s5 / inputDimensions[0]; $s9 = (maskDimensions[1] - 1) >> 1.
+	// $s4 = upper bound of the inner loop ($s5 + $s9 clamped to inputDimensions[1] - 1),
+	// $s5 = lower bound ($s5 - $s9 clamped to 0).
+	// $d2 = mask address, $d3 = input address, $s6 = accumulator (reset to 0).
+	// $s7 and $s8 become the starting element indices into mask and input.
+  
+  @BB0_2:
+	
+  div_u32	$s5, $s5, $s0;
+	add_u32	$s9, $s9, 4294967295;
+	shr_u32	$s9, $s9, 1;
+	add_u32	$s10, $s5, $s9;
+	add_u32	$s11, $s4, 4294967295;
+	cmp_ge_b1_u32	$c0, $s10, $s4;
+	cmov_b32	$s4, $c0, $s11, $s10;
+	sub_u32	$s10, $s5, $s9;
+	cmp_lt_b1_u32	$c0, $s5, $s9;
+	cmov_b32	$s5, $c0, 0, $s10;
+	ld_kernarg_align(8)_width(all)_u64	$d2, [%mask];
+	ld_kernarg_align(8)_width(all)_u64	$d3, [%input];
+	cvt_u64_u32	$d5, $s6;
+	add_u64	$d4, $d4, $d5;
+	cvt_u32_u64	$s6, $d4;
+	div_u32	$s6, $s6, $s0;
+	max_u32	$s10, $s9, $s6;
+	sub_u32	$s12, $s10, $s6;
+	max_u32	$s11, $s7, $s8;
+	mov_b32	$s6, 0;
+	mad_u32	$s12, $s1, $s12, $s11;
+	sub_u32	$s7, $s12, $s7;
+	sub_u32	$s9, $s10, $s9;
+	mad_u32	$s9, $s0, $s9, $s11;
+	sub_u32	$s8, $s9, $s8;
+
+	// @BB0_3:    // %for.cond32.preheader
+	// Outer loop head: skips the inner loop when its range ($s5..$s4) is empty,
+	// otherwise seeds the inner loop with mask index $s9, input index $s10
+	// and counter $s11
+
+  @BB0_3:
+	
+  cmp_gt_b1_u32	$c0, $s5, $s4;
+	mov_b32	$s9, $s7;
+	mov_b32	$s10, $s8;
+	mov_b32	$s11, $s5;
+	cbr_b1	$c0, @BB0_5;
+
+  // @BB0_4:    // %for.body35
+  // Inner loop body: loads mask[$s9] (f32) and input[$s10] (u32, 4-byte
+  // elements), converts the input value to f32 and adds the product to $s6.
+  // The mask index advances by maskDimensions[0], the input index by
+  // inputDimensions[0]; the loop repeats while the counter $s11 <= $s4.
+
+  @BB0_4:
+	
+	cvt_u64_u32	$d4, $s9;
+	shl_u64	$d4, $d4, 2;
+	add_u64	$d4, $d2, $d4;
+	ld_global_align(4)_f32	$s12, [$d4];
+	cvt_u64_u32	$d4, $s10;
+	shl_u64	$d4, $d4, 2;
+	add_u64	$d4, $d3, $d4;
+	ld_global_align(4)_u32	$s13, [$d4];
+	cvt_f32_u32	$s13, $s13;
+	mul_ftz_f32	$s12, $s13, $s12;
+	add_u32	$s9, $s9, $s1;
+	add_u32	$s10, $s10, $s0;
+	add_u32	$s11, $s11, 1;
+	add_ftz_f32	$s6, $s6, $s12;
+	cmp_le_b1_u32	$c0, $s11, $s4;
+	cbr_b1	$c0, @BB0_4;
+
+	// @BB0_5:    // %for.inc48
+	// Outer loop increment: both starting indices and the outer counter $s3
+	// advance by one; the loop repeats while $s3 <= $s2
+  
+  @BB0_5:
+	
+  add_u32	$s7, $s7, 1;
+	add_u32	$s8, $s8, 1;
+	add_u32	$s3, $s3, 1;
+	cmp_le_b1_u32	$c0, $s3, $s2;
+	cbr_b1	$c0, @BB0_3;
+
+	// @BB0_6:    // %for.end50
+	// Store: the id in $d0 is truncated to 32 bits and scaled by 4 to address
+	// the output element; 0F3f000000 (0.5f) is added to the accumulator
+	// before it is converted to u32 and written.
+
+  @BB0_6:
+	
+  and_b64	$d0, $d0, 4294967295;
+	shl_u64	$d0, $d0, 2;
+	add_u64	$d0, $d1, $d0;
+	add_ftz_f32	$s0, $s6, 0F3f000000;
+	cvt_ftz_u32_f32	$s0, $s0;
+	st_global_align(4)_u32	$s0, [$d0];
+	ret;
+};