Adding HSA extension AMD AQL profile library, see Readme.txt

Change-Id: Icbc1e0fb0185642eabbab411a2138ea030d22be8
Этот коммит содержится в:
Evgeny
2017-06-06 14:59:08 -05:00
коммит произвёл Evgeny Shcherbakov
родитель da831502ab
Коммит 25035b8d09
84 изменённых файлов: 16322 добавлений и 0 удалений
+28
Просмотреть файл
@@ -0,0 +1,28 @@
#
# Top-level build script: configures the library sources and the test suite.
#
# Minimum version of cmake required
#
cmake_minimum_required ( VERSION 3.5.0 )
#
# Setup flag to be verbose or not
# (FORCE overrides any value already stored in the cache)
#
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
#
# Source tree layout used by the included/added build scripts
#
set ( ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
set ( PROJ_DIR ${ROOT_DIR}/src )
set ( TEST_DIR ${ROOT_DIR}/test )
#
# Build sources
# The script is include()d, not added as a sub-directory, so the variables
# and the project() it declares live in this directory scope.
#
include ( ${PROJ_DIR}/CMakeLists.txt )
#
# Build tests
#
add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test )
#
# Style format
# Runs clang-format in place over the sources at configure time.
# The name tests are grouped with '(' ... ')': without the grouping '-exec'
# binds only to the last '-name' test and just the '*.h' files are formatted.
#
execute_process ( COMMAND sh -xc "/usr/bin/find ${ROOT_DIR} '(' -name '*.cpp' -o -name '*.hpp' -o -name '*.h' ')' -exec /usr/bin/clang-format -i -style=file \{\} \;" )
+40
Просмотреть файл
@@ -0,0 +1,40 @@
HSA extension AMD AQL profile library.
Provides AQL packets helper methods for
perfcounters (PMC) and SQ threadtraces (SQTT).
Current library implementation supports only GFX9.
The library source tree:
- doc - Documentation, the API specification and the presentation
- inc - Public API
- hsa_ext_amd_aql_profile.h - AMD AQL profile library public API
- amd_aql_pm4_ib_packet.h - AQL PM4 IB packet type
- src - AMD AQL profile library sources
- aqlprofile - AMD AQL profile library
- commandwriter - PM4 command writer originated from 'hsa-runtime/tools'
- perfcounter - PM4 perfcounter manager originated from 'hsa-runtime/tools'
- threadtrace - PM4 threadtrace manager originated from 'hsa-runtime/tools'
- util - core/utils library build based on 'hsa-runtime/core/util'
- test - the library test suite
- ctrl - Test control
- common - Test common utils
- SimpleConvolution - Simple convolution test
To build the library:
$ cd ..../hsa-ext-aql-profile
$ mkdir build
$ cd build
$ cmake ..
$ make
To run the test:
$ cd ..../hsa-ext-aql-profile/build
$ cp ../test/SimpleConvolution/gfx9_SimpleConvolution.hsaco .
$ test/SimpleConvolution
to enable PMC profiling:
export ROCR_ENABLE_PMC=1
to enable SQTT profiling:
export ROCR_ENABLE_SQTT=1
+66
Просмотреть файл
@@ -0,0 +1,66 @@
#
# Tool chain flags module: preprocessor definitions, compiler options and
# a summary of the resulting configuration.
# Reads BUILD_TYPE, IS64BIT, PROJ_DIR, TEST_DIR, HSA_RUNTIME_DIR and
# CORE_UTIL_DIR, which are expected to be set by the including script.
#
# Compiler Preprocessor definitions.
# add_definitions() is directory-scoped: the definitions apply to the
# including directory and the sub-directories added from it.
#
add_definitions ( -D__linux__ )
add_definitions ( -DUNIX_OS )
add_definitions ( -DLINUX )
add_definitions ( -D__AMD64__ )
add_definitions ( -D__x86_64__ )
add_definitions ( -DAMD_INTERNAL_BUILD )
add_definitions ( -DLITTLEENDIAN_CPU=1 )
# Defined to an empty value on purpose
add_definitions ( -DHSA_LARGE_MODEL= )
add_definitions ( -DHSA_DEPRECATED= )
#
# Linux Compiler options
# NOTE: the first set() replaces any CMAKE_CXX_FLAGS provided by the user.
#
set ( CMAKE_CXX_FLAGS "-std=c++11")
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=sign-compare" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=enum-compare" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment " )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pointer-arith" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-comment" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pointer-arith" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-write-strings" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-conversion-null" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" )
#
# Extend Compiler flags based on build type
# The build type is taken from BUILD_TYPE; both operands of the comparison
# are quoted so that neither is re-interpreted as a variable name.
#
set ( CMAKE_BUILD_TYPE ${BUILD_TYPE} )
if ( "${CMAKE_BUILD_TYPE}" STREQUAL "Debug" )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" )
endif ()
#
# Extend Compiler flags based on Processor architecture
#
if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
elseif ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86" )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" )
endif ()
#
# Basic Tool Chain Information
# The TestDir line reports TEST_DIR (it used to repeat PROJ_DIR); the value
# is empty when the including script does not define TEST_DIR.
#
message ( "-------------IS64BIT: " ${IS64BIT} )
message ( "-----------BuildType: " ${BUILD_TYPE} )
message ( " -----------Compiler: " ${CMAKE_CXX_COMPILER} )
message ( " ------------Version: " ${CMAKE_CXX_COMPILER_VERSION} )
message ( " ------------ProjDir: " ${PROJ_DIR} )
message ( " ------------TestDir: " ${TEST_DIR} )
message ( "------HSA-RuntimeDir: " ${HSA_RUNTIME_DIR} )
message ( " -----------CoreUtil: " ${CORE_UTIL_DIR} )
+52
Просмотреть файл
@@ -0,0 +1,52 @@
#
# Build environment validation module.
# Checks the required ROCR_* environment variables and derives
# ISDEBUG / BUILD_TYPE and ONLY64STR / IS64BIT for the including script.
#
# Build is not supported on Windows platform
#
if ( WIN32 )
  message ( FATAL_ERROR "Windows build is not supported." )
endif ()
#
# External dependencies for Rocr Header files
# message ( FATAL_ERROR ) stops processing, so nothing needs to follow it.
#
if ( NOT DEFINED ENV{ROCR_INC_DIR} )
  message ( FATAL_ERROR "ERROR: Environment variable ROCR_INC_DIR is not set" )
endif ()
#
# External dependencies for Rocr Library files
#
if ( NOT DEFINED ENV{ROCR_LIB_DIR} )
  message ( FATAL_ERROR "ERROR: Environment variable ROCR_LIB_DIR is not set" )
endif ()
#
# Process Env to determine build type
# ROCR_BLD_TYPE is compared case-insensitively; anything other than
# "debug" (including an unset variable) selects a Release build.
# Both operands are quoted so that neither is treated as a variable name.
#
string ( TOLOWER "$ENV{ROCR_BLD_TYPE}" type )
if ( "${type}" STREQUAL "debug" )
  set ( ISDEBUG 1 )
  set ( BUILD_TYPE "Debug" )
else ()
  set ( ISDEBUG 0 )
  set ( BUILD_TYPE "Release" )
endif ()
#
# Determine build is 32-bit or 64-bit
# @note: By default ROCR_BLD_BITS is not set, which selects a 64-bit build
#
if ( "$ENV{ROCR_BLD_BITS}" STREQUAL "32" )
  set ( ONLY64STR "" )
  set ( IS64BIT 0 )
else ()
  set ( ONLY64STR "64" )
  set ( IS64BIT 1 )
endif ()
#
# Build information
#
message ( "---------ROCR-HdrDir: " $ENV{ROCR_INC_DIR} )
message ( "---------ROCR-LibDir: " $ENV{ROCR_LIB_DIR} )
Двоичный файл не отображается.
Двоичный файл не отображается.
+67
Просмотреть файл
@@ -0,0 +1,67 @@
////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2017 ADVANCED MICRO DEVICES, INC.
//
// AMD is granting you permission to use this software and documentation(if any)
// (collectively, the "Materials") pursuant to the terms and conditions of the
// Software License Agreement included with the Materials.If you do not have a
// copy of the Software License Agreement, contact your AMD representative for a
// copy.
//
// You agree that you will not reverse engineer or decompile the Materials, in
// whole or in part, except as allowed by applicable law.
//
// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY,
// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE
// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM
// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF
// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
// of implied warranties, so the above exclusion may not apply to You.
//
// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT,
// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT,
// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF
// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total
// liability to You for all damages, losses, and causes of action (whether in
// contract, tort (including negligence) or otherwise) exceed the amount of $100
// USD. You agree to defend, indemnify and hold harmless AMD and its licensors,
// and any of their directors, officers, employees, affiliates or agents from
// and against any and all loss, damage, liability and other expenses (including
// reasonable attorneys' fees), resulting from Your use of the Software or
// violation of the terms and conditions of this Agreement.
//
// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
// 7013, et seq., or its successor.Use of the Materials by the Government
// constitutes acknowledgement of AMD's proprietary rights in them.
//
// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
// stated in the Software License Agreement.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _AMD_AQL_PM4_IB_PACKET_H_
#define _AMD_AQL_PM4_IB_PACKET_H_

// Fixed-width integer types used by the declarations below
#include <stdint.h>

// NOTE: 'hsa_signal_t' is not declared by this header; it must already be
// declared by the including file (presumably via hsa.h, which the
// hsa_ext_amd_aql_profile.h public header includes - TODO confirm).

// Value of 'pm4_ib_format' field of amd_aql_pm4_ib_packet_t packet
const static uint32_t AMD_AQL_PM4_IB_FORMAT = 1;

// Value of 'dw_count_remain' field of amd_aql_pm4_ib_packet_t packet
const static uint32_t AMD_AQL_PM4_IB_DW_COUNT_REMAIN = 10;

// Size of 'reserved' array of amd_aql_pm4_ib_packet_t packet
const static uint32_t AMD_AQL_PM4_IB_RESERVED_COUNT = 8;

// AQL Vendor Specific Packet which carries a PM4 IB command
typedef struct {
  uint16_t header;                                   // AQL packet header
  uint16_t pm4_ib_format;                            // see AMD_AQL_PM4_IB_FORMAT
  uint32_t pm4_ib_command[4];                        // PM4 IB command, 4 dwords
  uint32_t dw_count_remain;                          // see AMD_AQL_PM4_IB_DW_COUNT_REMAIN
  uint32_t reserved[AMD_AQL_PM4_IB_RESERVED_COUNT];  // reserved dwords
  hsa_signal_t completion_signal;                    // packet completion signal
} amd_aql_pm4_ib_packet_t;

#endif  // _AMD_AQL_PM4_IB_PACKET_H_
+262
Просмотреть файл
@@ -0,0 +1,262 @@
////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2017 ADVANCED MICRO DEVICES, INC.
//
// AMD is granting you permission to use this software and documentation(if any)
// (collectively, the "Materials") pursuant to the terms and conditions of the
// Software License Agreement included with the Materials.If you do not have a
// copy of the Software License Agreement, contact your AMD representative for a
// copy.
//
// You agree that you will not reverse engineer or decompile the Materials, in
// whole or in part, except as allowed by applicable law.
//
// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY,
// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE
// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM
// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF
// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
// of implied warranties, so the above exclusion may not apply to You.
//
// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT,
// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT,
// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF
// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total
// liability to You for all damages, losses, and causes of action (whether in
// contract, tort (including negligence) or otherwise) exceed the amount of $100
// USD. You agree to defend, indemnify and hold harmless AMD and its licensors,
// and any of their directors, officers, employees, affiliates or agents from
// and against any and all loss, damage, liability and other expenses (including
// reasonable attorneys' fees), resulting from Your use of the Software or
// violation of the terms and conditions of this Agreement.
//
// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
// 7013, et seq., or its successor.Use of the Materials by the Government
// constitutes acknowledgement of AMD's proprietary rights in them.
//
// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
// stated in the Software License Agreement.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _HSA_EXT_AMD_AQL_PROFILE_H_
#define _HSA_EXT_AMD_AQL_PROFILE_H_
#include <stdint.h>
#include <hsa.h>
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
///////////////////////////////////////////////////////////////////////
// Library API:
// The library provides helper methods for instantiating the profile
// context object and for populating the start and stop AQL packets.
// The profile object contains the list of profiling events and the
// descriptors of the buffers needed for profiling: a command buffer
// and an output data buffer. The library methods return a status code
// which should be checked for errors. The library also provides methods
// for querying the required buffer attributes, for validating event
// attributes and for getting the profiling output data.
//
// Returned status:
// hsa_status_t - HSA status codes from the hsa.h header are used
//
// Supported profiling features:
//
// Supported profiling events
// Selects the kind of profiling described by a profile object,
// see the 'type' field of hsa_ext_amd_aql_profile_profile_t.
typedef enum {
HSA_EXT_AQL_PROFILE_EVENT_PMC,
HSA_EXT_AQL_PROFILE_EVENT_SQTT
} hsa_ext_amd_aql_profile_event_type_t;
// Supported performance counters (PMC) blocks
// The block ID is the same for all instances of a block, for example
// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN
// will have the same block ID HSA_EXT_AQL_PROFILE_BLOCK_TCC.
// The last enumerator, HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER, is not a block:
// it equals the number of block IDs declared before it.
typedef enum {
HSA_EXT_AQL_PROFILE_BLOCK_CB,
HSA_EXT_AQL_PROFILE_BLOCK_CPF,
HSA_EXT_AQL_PROFILE_BLOCK_DB,
HSA_EXT_AQL_PROFILE_BLOCK_GRBM,
HSA_EXT_AQL_PROFILE_BLOCK_GRBMSE,
HSA_EXT_AQL_PROFILE_BLOCK_PASU,
HSA_EXT_AQL_PROFILE_BLOCK_PASC,
HSA_EXT_AQL_PROFILE_BLOCK_SPI,
HSA_EXT_AQL_PROFILE_BLOCK_SQ,
HSA_EXT_AQL_PROFILE_BLOCK_SQGS,
HSA_EXT_AQL_PROFILE_BLOCK_SQVS,
HSA_EXT_AQL_PROFILE_BLOCK_SQPS,
HSA_EXT_AQL_PROFILE_BLOCK_SQHS,
HSA_EXT_AQL_PROFILE_BLOCK_SQCS,
HSA_EXT_AQL_PROFILE_BLOCK_SX,
HSA_EXT_AQL_PROFILE_BLOCK_TA,
HSA_EXT_AQL_PROFILE_BLOCK_TCA,
HSA_EXT_AQL_PROFILE_BLOCK_TCC,
HSA_EXT_AQL_PROFILE_BLOCK_TD,
HSA_EXT_AQL_PROFILE_BLOCK_TCP,
HSA_EXT_AQL_PROFILE_BLOCK_GDS,
HSA_EXT_AQL_PROFILE_BLOCK_VGT,
HSA_EXT_AQL_PROFILE_BLOCK_IA,
HSA_EXT_AQL_PROFILE_BLOCK_MC,
HSA_EXT_AQL_PROFILE_BLOCK_TCS,
HSA_EXT_AQL_PROFILE_BLOCK_WD,
HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER
} hsa_ext_amd_aql_profile_block_name_t;
// PMC event object structure
// block_name selects the block ID; block_index presumably selects one
// instance within that block set (TODO confirm against the implementation).
// counter_id is the counter select value specified in the GFXIP perfcounter
// user guides, "Performance Counters Selection" chapter.
typedef struct {
hsa_ext_amd_aql_profile_block_name_t block_name;
uint32_t block_index;
uint32_t counter_id;
} hsa_ext_amd_aql_profile_event_t;
// Check if event is valid for the specific GPU
hsa_status_t hsa_ext_amd_aql_profile_validate_event(
hsa_agent_t agent, // [in] HSA handle for the profiling GPU
const hsa_ext_amd_aql_profile_event_t* event, // [in] Pointer to the event to validate
bool* result); // [out] True if the event is valid, False otherwise
// Profiling parameters
// All parameters are generic; if a parameter is not applicable to a specific
// profile configuration then an error status will be returned.
typedef enum {
// SQTT applicable parameters
HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET,
HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK,
HSA_EXT_AQL_PROFILE_PARAM_MASK,
HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK,
HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2
} hsa_ext_amd_aql_profile_parameter_name_t;
// Profile parameter object
// A (name, value) pair; the profile object refers to an array of these.
typedef struct {
hsa_ext_amd_aql_profile_parameter_name_t parameter_name;
uint32_t value;
} hsa_ext_amd_aql_profile_parameters_t;
//
// Profile context object:
// The library provides a profile object structure which contains
// the events array, a buffer for the profiling start/stop commands
// and a buffer for the output data.
// The buffers are specified by the buffer descriptors and allocated
// by the application. The buffer allocation attributes, the command
// buffer size, the PMC output buffer size as well as the profiling output
// data can be obtained using the generic get profile info helper _get_info.
//
// Buffer descriptor
// A pointer to the buffer start together with the buffer size.
typedef struct {
void* ptr;
uint32_t size;
} hsa_ext_amd_aql_profile_descriptor_t;
// Profile context object structure. Contains the profiling events list,
// the profiling parameters and the descriptors of the buffers needed for
// profiling: a command buffer and an output data buffer.
typedef struct {
hsa_agent_t agent; // GFXIP handle
hsa_ext_amd_aql_profile_event_type_t type; // Events type
const hsa_ext_amd_aql_profile_event_t* events; // Events array
uint32_t event_count; // Events count
const hsa_ext_amd_aql_profile_parameters_t* parameters; // Parameters array
uint32_t parameter_count; // Parameters count
hsa_ext_amd_aql_profile_descriptor_t output_buffer; // Output buffer
hsa_ext_amd_aql_profile_descriptor_t command_buffer; // PM4 commands
} hsa_ext_amd_aql_profile_profile_t;
//
// AQL packets populating methods:
// The helper methods populate the START and STOP AQL packets provided by
// the application, which the application is required to submit before and
// after the profiled GPU task packets respectively.
//
// AQL Vendor Specific packet which carries a PM4 command
// Layout: a 16-bit header, 27 16-bit words of PM4 command data and
// a completion signal.
typedef struct {
uint16_t header;
uint16_t pm4_command[27];
hsa_signal_t completion_signal;
} hsa_ext_amd_aql_pm4_packet_t;
// Method to populate the provided AQL packet with profiling start commands
// Only 'pm4_command' fields of the packet are set and the application
// is responsible for setting the Vendor Specific header type and
// a completion signal
hsa_status_t hsa_ext_amd_aql_profile_start(
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet
// Method to populate the provided AQL packet with profiling stop commands
// Only 'pm4_command' fields of the packet are set and the application
// is responsible for setting the Vendor Specific header type and
// a completion signal
hsa_status_t hsa_ext_amd_aql_profile_stop(
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet
// Legacy PM4 profiling packet size
// Declared 'static' so that every translation unit including this header
// gets its own copy: in C a non-static 'const' object defined in a header
// has external linkage and is defined multiple times at link time.
// For C++ nothing changes, namespace-scope 'const' objects already have
// internal linkage.
static const unsigned HSA_EXT_AQL_PROFILE_LEGACY_PM4_PACKET_SIZE = 64;

// Converting of the profiling AQL packet to PM4 packet, GFX8 support
hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4(
    const hsa_ext_amd_aql_pm4_packet_t* aql_packet,  // [in] AQL packet
    void* pm4);                                       // [out] PM4 packet blob
//
// Get profile info:
// Generic method for getting various profile info including profile buffer
// attributes, like the command buffer size, and the profiling PMC results.
// It is implied that all counters are 64bit values.
//
// Profile generic output data:
// Either the 'pmc_data' or the 'sqtt_data' member of the anonymous union is
// meaningful, depending on the kind of data being reported.
typedef struct {
uint32_t sample_id; // PMC sample or SQTT buffer index (presumably; original comment read "of")
union {
struct {
hsa_ext_amd_aql_profile_event_t event; // PMC event
uint64_t result; // PMC result
} pmc_data;
hsa_ext_amd_aql_profile_descriptor_t sqtt_data; // SQTT output data descriptor
};
} hsa_ext_amd_aql_profile_info_data_t;
// Profile attributes
// Selects what hsa_ext_amd_aql_profile_get_info returns through its
// 'value' argument.
typedef enum {
HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, // get_info returns uint32_t value
HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE, // get_info returns uint32_t value
HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, // get_info returns PMC uint64_t value
// in info_data object
HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA // get_info returns SQTT buffer ptr/size
// in info_data object
} hsa_ext_amd_aql_profile_info_type_t;
// Definition of output data iterator callback
// This is the callback type accepted by hsa_ext_amd_aql_profile_iterate_data.
typedef hsa_status_t (*hsa_ext_amd_aql_profile_data_callback_t)(
hsa_ext_amd_aql_profile_info_type_t info_type, // [in] data type, PMC or SQTT data
hsa_ext_amd_aql_profile_info_data_t* info_data, // [in] info_data object
void* callback_data); // [in/out] data passed to the callback
// Method for getting the profile info
// The type of the object 'value' points to depends on 'attribute',
// see hsa_ext_amd_aql_profile_info_type_t.
hsa_status_t hsa_ext_amd_aql_profile_get_info(
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
hsa_ext_amd_aql_profile_info_type_t attribute, // [in] requested profile attribute
void* value); // [in/out] returned value
// Method for iterating the events output data
// 'data' is handed to 'callback' as its 'callback_data' argument.
hsa_status_t hsa_ext_amd_aql_profile_iterate_data(
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
hsa_ext_amd_aql_profile_data_callback_t callback, // [in] callback to iterate the output data
void* data); // [in/out] data passed to the callback
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // _HSA_EXT_AMD_AQL_PROFILE_H_
+72
Просмотреть файл
@@ -0,0 +1,72 @@
# Build script of the AQL profile library sources.
# Usable stand-alone or include()d from the top-level CMakeLists.txt.

# Oldest CMake release this script is written for.
cmake_minimum_required(VERSION 3.5.0)

# Always generate verbose makefiles (FORCE overrides a cached value).
set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE)

# Project declaration.
# @note: has to precede every add_subdirectory() call below.
set(TARGET_NAME "aqlprofile")
project(${TARGET_NAME})

# Stand-alone use: derive the tree layout from this directory when the
# including script has not provided PROJ_DIR.
if(NOT DEFINED PROJ_DIR)
  set(PROJ_DIR ${CMAKE_CURRENT_SOURCE_DIR})
  set(ROOT_DIR ${PROJ_DIR}/..)
endif()

# Locations of the public API headers and of the enclosing HSA runtime tree.
set(API_DIR ${ROOT_DIR}/inc)
set(HSA_RUNTIME_DIR ${PROJ_DIR}/../../..)
set(HSA_RUNTIME_OSC_DIR ${HSA_RUNTIME_DIR}/opensrc/hsa-runtime)
set(CORE_UTIL_DIR ${HSA_RUNTIME_OSC_DIR}/core/util)
include_directories(${ROOT_DIR})

# Check that the required build environment is in place.
include(${ROOT_DIR}/cmake_modules/validateBldEnv.cmake)

# Tool chain flags: preprocessor, compiler and linker.
include(${ROOT_DIR}/cmake_modules/exportToolFlags.cmake)

# Component libraries. Each name gets the ONLY64STR suffix and each component
# is configured by the CMakeLists.txt of its own sub-directory; the components
# are meant to be static libraries linked into the final library.

# Utils library
set(UTIL_LIB "util${ONLY64STR}")
add_subdirectory(${PROJ_DIR}/util "${PROJECT_BINARY_DIR}/util")

# Command writer library
set(CMDWRITER_LIB "commandwriter${ONLY64STR}")
add_subdirectory(${PROJ_DIR}/commandwriter "${PROJECT_BINARY_DIR}/commandwriter")

# Thread trace (SQTT) library
set(SQTT_LIB "sqtt${ONLY64STR}")
add_subdirectory(${PROJ_DIR}/threadtrace "${PROJECT_BINARY_DIR}/threadtrace")

# Perfcounter (PMC) library
set(PMC_LIB "pmc${ONLY64STR}")
add_subdirectory(${PROJ_DIR}/perfcounter "${PROJECT_BINARY_DIR}/perfcounter")

# The profile library itself, linked with the component libraries above.
set(TARGET_LIB "${TARGET_NAME}${ONLY64STR}")
add_subdirectory(${PROJ_DIR}/${TARGET_NAME} "${PROJECT_BINARY_DIR}/${TARGET_NAME}")
+20
Просмотреть файл
@@ -0,0 +1,20 @@
# Build script of the AQL profile shared library.
# Expects TARGET_LIB, PROJ_DIR, API_DIR and the *_LIB component library
# names to be set by the parent script.

# Source files of the library
set(LIB_SRC
  aql_profile.cpp
  populate_aql.cpp
  gfx8_factory.cpp
  gfx9_factory.cpp
)

# Header search paths: ROCR headers, the component libraries, the public API
include_directories(
  $ENV{ROCR_INC_DIR}
  ${PROJ_DIR}/perfcounter
  ${PROJ_DIR}/threadtrace
  ${PROJ_DIR}/commandwriter
  ${API_DIR}
)

# Component libraries linked into the shared object
set(LIB_LIST
  ${PMC_LIB}
  ${SQTT_LIB}
  ${CMDWRITER_LIB}
  ${UTIL_LIB}
)

# The library is built as a shared object and linked with the component
# libraries followed by the system libraries.
add_library(${TARGET_LIB} SHARED ${LIB_SRC})
target_link_libraries(${TARGET_LIB} ${LIB_LIST} c stdc++ dl pthread rt)
+398
Просмотреть файл
@@ -0,0 +1,398 @@
#include <string>
#include "aql_profile.h"
#include "pm4_factory.h"
#include "cmdwriter.h" // commandwriter
#include "hsa_perf.h" // perfcounter
#include "thread_trace.h" // threadtrace
#include "gpu_enum.h"
#include "gpu_blockinfo.h"
#define PUBLIC_API __attribute__((visibility("default")))
namespace aql_profile {
// Command buffer partitioning manager
// Supports Pre/Post commands partitioning
// and postfix control partition
//
// The application-provided command buffer is used as follows:
//  - pre (start) commands at offset 0;
//  - post (stop) commands at the pre commands size rounded up to 'align_size';
//  - a postfix area at the very end, which always holds the 'info_t' record
//    with the sizes of both partitions.
// 'buffer.size' is the space left for commands, i.e. the original size minus
// the postfix size. When the postfix does not fit, the size becomes 0 and
// 'info' stays NULL: callers have to check getSize() != 0 before calling any
// method that reads or writes the info record.
//
// NOTE(review): when setPostfix() is called again with a bigger size the
// postfix start moves down but 'info' is not re-derived, so the info record
// then lies inside the enlarged postfix area - confirm that this sharing is
// intended.
class CommandBufferMgr {
  const static uint32_t align_size = 0x100;
  const static uint32_t align_mask = align_size - 1;

  // Partition sizes record, stored in the postfix area of the buffer
  struct info_t {
    uint32_t precmds_size;
    uint32_t postcmds_size;
  };

  descriptor_t buffer;    // command buffer, 'size' excludes the postfix
  uint32_t postfix_size;  // current size of the postfix area
  info_t* info;           // info record inside the buffer, NULL if no room

  // Rounds 'size' up to the next multiple of 'align_size'
  uint32_t align(const uint32_t& size) { return (size + align_mask) & ~align_mask; }

  // Address of the byte at 'offset' from the buffer start. The arithmetic is
  // done on 'char*' as arithmetic on 'void*' is a compiler extension.
  void* addressAt(const uint32_t& offset) { return static_cast<char*>(buffer.ptr) + offset; }

 public:
  // Takes a copy of the profile command buffer descriptor and reserves
  // the postfix space for the info record
  CommandBufferMgr(const profile_t* profile)
      : buffer(profile->command_buffer), postfix_size(0), info(NULL) {
    info = (info_t*)setPostfix(sizeof(info_t));
  }

  // Space available for commands, 0 if the postfix did not fit
  uint32_t getSize() { return buffer.size; }

  // Grows the postfix area to at least 'size' bytes (it never shrinks) and
  // returns the postfix start, or NULL if no command space is left
  void* setPostfix(const uint32_t& size) {
    if (size > postfix_size) {
      const uint32_t delta = size - postfix_size;
      postfix_size = size;
      // Saturating subtraction, the size never wraps around
      buffer.size -= (delta < buffer.size) ? delta : buffer.size;
    }
    return (buffer.size != 0) ? addressAt(buffer.size) : NULL;
  }

  // Records the pre commands size, returns false if it exceeds the buffer
  bool setPreSize(const uint32_t& size) {
    bool suc = (size <= buffer.size);
    if (suc) info->precmds_size = size;
    return suc;
  }

  // Offset of the post commands partition
  uint32_t getPostOffset() { return align(info->precmds_size); }

  // Validates the total (pre + post) commands size and records the post
  // commands size; returns false if the commands do not fit into the buffer
  bool checkTotalSize(const uint32_t& size) {
    bool suc = (size <= buffer.size);
    if (suc) suc = (size >= info->precmds_size);
    if (suc) {
      info->postcmds_size = size - info->precmds_size;
      suc = ((getPostOffset() + info->postcmds_size) <= buffer.size);
    }
    return suc;
  }

  // Descriptor of the pre commands partition
  descriptor_t getPreDescr() {
    descriptor_t descr;
    descr.ptr = buffer.ptr;
    descr.size = info->precmds_size;
    return descr;
  }

  // Descriptor of the post commands partition
  descriptor_t getPostDescr() {
    descriptor_t descr;
    descr.ptr = addressAt(getPostOffset());
    descr.size = info->postcmds_size;
    return descr;
  }
};
// Two events match when the block name, the block index and the counter id
// are all equal
static inline bool is_event_match(const event_t& event1, const event_t& event2) {
  if (event1.block_name != event2.block_name) return false;
  if (event1.block_index != event2.block_index) return false;
  return event1.counter_id == event2.counter_id;
}
// Default PMC data callback, used for the PMC_DATA attribute of get_info.
// 'callback_data' points to the info data object of the request: its event
// selects the samples, and its sample_id selects either one sample or,
// when equal to UINT32_MAX, the sum of all samples of the event.
// Returns HSA_STATUS_INFO_BREAK once the single requested sample is found,
// HSA_STATUS_SUCCESS otherwise.
hsa_status_t default_pmcdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                      hsa_ext_amd_aql_profile_info_data_t* info_data,
                                      void* callback_data) {
  // Only PMC data is of interest
  if (info_type != HSA_EXT_AQL_PROFILE_INFO_PMC_DATA) return HSA_STATUS_SUCCESS;

  hsa_ext_amd_aql_profile_info_data_t* const request =
      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data);

  // Skip the samples of other events
  if (!is_event_match(info_data->pmc_data.event, request->pmc_data.event)) {
    return HSA_STATUS_SUCCESS;
  }

  // Accumulating mode: add up every sample of the event
  if (request->sample_id == UINT32_MAX) {
    request->pmc_data.result += info_data->pmc_data.result;
    return HSA_STATUS_SUCCESS;
  }

  // Single sample mode: keep looking until the requested sample shows up
  if (request->sample_id != info_data->sample_id) return HSA_STATUS_SUCCESS;

  request->pmc_data.result = info_data->pmc_data.result;
  return HSA_STATUS_INFO_BREAK;
}
// SQTT control record: three 32-bit words.
// NOTE(review): not referenced by the code visible here; presumably it
// mirrors the layout of the SQTT control/status buffer - TODO confirm.
struct sqtt_ctrl_t {
uint32_t status;
uint32_t counter;
uint32_t writePtr;
};
// Default SQTT data callback, used for the SQTT_DATA attribute of get_info.
// 'callback_data' points to the info data object of the request; when the
// reported sample has the requested sample_id its SQTT data descriptor is
// copied to the request and HSA_STATUS_INFO_BREAK is returned.
// In all other cases HSA_STATUS_SUCCESS is returned.
hsa_status_t default_sqttdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                       hsa_ext_amd_aql_profile_info_data_t* info_data,
                                       void* callback_data) {
  // Only SQTT data is of interest
  if (info_type != HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA) return HSA_STATUS_SUCCESS;

  hsa_ext_amd_aql_profile_info_data_t* const request =
      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data);

  // Not the requested sample yet
  if (info_data->sample_id != request->sample_id) return HSA_STATUS_SUCCESS;

  request->sqtt_data = info_data->sqtt_data;
  return HSA_STATUS_INFO_BREAK;
}
} // aql_profile
extern "C" {
// Check if event is valid for the specific GPU
// The verdict is always written to '*result' on success; it used to be left
// untouched, so the caller read an undefined value.
// Only the block name range is checked: an event is reported valid when its
// block name is one of the declared block IDs.
// NOTE(review): 'agent' is not consulted, per-GPU validation of the block
// index and of the counter id is not implemented here.
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT if 'event' or 'result' is NULL.
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_validate_event(
    hsa_agent_t agent, const hsa_ext_amd_aql_profile_event_t* event, bool* result) {
  if (event == NULL || result == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  // Compared as unsigned values so that out of range values of either sign
  // are rejected
  *result = static_cast<uint32_t>(event->block_name) <
            static_cast<uint32_t>(HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER);
  return HSA_STATUS_SUCCESS;
}
// Method to populate the provided AQL packet with profiling start commands
// Generates both the start and the stop commands into a local command
// buffer, copies them into the two partitions of the profile command buffer
// and populates the start packet from the start (pre) partition.
// Returns HSA_STATUS_ERROR on any failure, HSA_STATUS_SUCCESS otherwise.
// NOTE(review): 'pm4_factory' is not released here; whether the factory
// owns or caches the object is not visible from this function - confirm.
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_start(
const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_start_packet) {
aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile);
if (pm4_factory == NULL) return HSA_STATUS_ERROR;
pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter();
if (cmdWriter == NULL) return HSA_STATUS_ERROR;
// Local buffer collecting the start commands followed by the stop commands
pm4_profile::DefaultCmdBuf commands;
aql_profile::CommandBufferMgr cmdBufMgr(profile);
// Zero size means the command buffer cannot even hold the partitions info
if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;
if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) {
pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr();
if (pmcMgr == NULL) return HSA_STATUS_ERROR;
pmcMgr->setPmcDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size);
// Create and enable a counter for every event of the profile
for (const hsa_ext_amd_aql_profile_event_t* p = profile->events;
p < profile->events + profile->event_count; ++p) {
pm4_profile::CounterBlock* block =
pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p));
if (block == NULL) return HSA_STATUS_ERROR;
pm4_profile::Counter* counter = block->createCounter();
if (counter == NULL) return HSA_STATUS_ERROR;
counter->setParameter(HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX, sizeof(uint32_t),
&(p->counter_id));
counter->setEnable(true);
}
// Generate start commands
pmcMgr->begin(&commands, cmdWriter);
// The size so far is the size of the start commands.
// NOTE(review): the returned status is ignored; an oversized command
// stream is rejected by checkTotalSize() below.
cmdBufMgr.setPreSize(commands.Size());
// Generate stop commands
pmcMgr->end(&commands, cmdWriter);
} else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) {
pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr();
if (sqttMgr == NULL) return HSA_STATUS_ERROR;
// Start from the default configuration and apply the profile parameters
pm4_profile::ThreadTraceConfig sqtt_config;
sqttMgr->InitThreadTraceConfig(&sqtt_config);
if (profile->parameters) {
for (const hsa_ext_amd_aql_profile_parameters_t* p = profile->parameters;
p < (profile->parameters + profile->parameter_count); ++p) {
switch (p->parameter_name) {
case HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET:
sqtt_config.threadTraceTargetCu = p->value;
break;
case HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK:
sqtt_config.threadTraceVmIdMask = p->value;
break;
case HSA_EXT_AQL_PROFILE_PARAM_MASK:
sqtt_config.threadTraceMask = p->value;
break;
case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK:
sqtt_config.threadTraceTokenMask = p->value;
break;
case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2:
sqtt_config.threadTraceTokenMask2 = p->value;
break;
default:
// Unknown parameter name
return HSA_STATUS_ERROR;
}
}
}
sqttMgr->Init(&sqtt_config);
sqttMgr->setSqttDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size);
// The control buffer is placed in the postfix area of the command buffer
const uint32_t status_size = sqttMgr->StatusSizeInfo();
void* status_ptr = cmdBufMgr.setPostfix(status_size);
if (status_ptr == NULL) return HSA_STATUS_ERROR;
// Control buffer registering
sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr);
// Generate start commands
sqttMgr->BeginSession(&commands, cmdWriter);
// The size so far is the size of the start commands (status ignored, see above)
cmdBufMgr.setPreSize(commands.Size());
// Generate stop commands
sqttMgr->StopSession(&commands, cmdWriter);
} else
// Unsupported profile type
return HSA_STATUS_ERROR;
// Check that both partitions fit and record the stop commands size
if (!cmdBufMgr.checkTotalSize(commands.Size())) return HSA_STATUS_ERROR;
const aql_profile::descriptor_t pre_descr = cmdBufMgr.getPreDescr();
const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr();
// Copy the start commands and then the stop commands, which follow the
// start commands in the local buffer, into their partitions
memcpy(pre_descr.ptr, commands.Base(), pre_descr.size);
memcpy(post_descr.ptr, commands.Base() + pre_descr.size, post_descr.size);
// Populate start aql packet
aql_profile::populateAql(pre_descr.ptr, pre_descr.size, cmdWriter, aql_start_packet);
return HSA_STATUS_SUCCESS;
}
// Method to populate the provided AQL packet with profiling stop commands
// The stop commands are taken from the post partition of the profile
// command buffer, whose size is read from the info record stored in that
// buffer. Returns HSA_STATUS_ERROR if the factory or the command writer is
// not available or if the command buffer is too small.
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_stop(
    const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_stop_packet) {
  aql_profile::Pm4Factory* const factory = aql_profile::Pm4Factory::Create(profile);
  // A missing factory and a missing command writer are reported alike
  pm4_profile::CommandWriter* const writer =
      (factory != NULL) ? factory->getCommandWriter() : NULL;
  if (writer == NULL) return HSA_STATUS_ERROR;

  aql_profile::CommandBufferMgr buffer_mgr(profile);
  if (buffer_mgr.getSize() == 0) return HSA_STATUS_ERROR;

  // Populate stop aql packet
  const aql_profile::descriptor_t stop_cmds = buffer_mgr.getPostDescr();
  aql_profile::populateAql(stop_cmds.ptr, stop_cmds.size, writer, aql_stop_packet);
  return HSA_STATUS_SUCCESS;
}
// Converting of the profiling AQL packet to PM4 packet, GFX8 support
// Not implemented: neither argument is used and HSA_STATUS_ERROR is
// always returned.
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4(
const aql_profile::packet_t* aql_packet, void* pm4) {
return HSA_STATUS_ERROR;
}
// Method for getting the profile info
// 'value' points to the object receiving the result, its type depends on
// 'attribute':
//  - COMMAND_BUFFER_SIZE, PMC_DATA_SIZE: uint32_t;
//  - PMC_DATA, SQTT_DATA: hsa_ext_amd_aql_profile_info_data_t, which is also
//    the request: it is passed to the default data callback as callback data.
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT if 'value' is NULL or
// 'attribute' is unknown; for the data attributes the status of
// hsa_ext_amd_aql_profile_iterate_data is returned.
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_get_info(
    const hsa_ext_amd_aql_profile_profile_t* profile, hsa_ext_amd_aql_profile_info_type_t attribute,
    void* value) {
  // Every supported attribute writes through 'value'
  if (value == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  hsa_status_t status = HSA_STATUS_SUCCESS;

  switch (attribute) {
    case HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE:
      *(uint32_t*)value = 0x1000;  // a current approximation as 4K is big enough
      break;
    case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE:
      *(uint32_t*)value = 0x1000;  // a current approximation as 4K is big enough
      break;
    case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA:
      // The result is reset as the callback may accumulate into it
      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(value)->pmc_data.result = 0;
      status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_pmcdata_callback,
                                                    value);
      break;
    case HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA:
      status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_sqttdata_callback,
                                                    value);
      break;
    default:
      status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  return status;
}
// Iterates over the profiling output data and invokes 'callback' once per sample.
//
// PMC profiles: for every event, one callback per sample of its counter block
// (one sample per shader engine when the block is controlled per SE, a single
// sample otherwise). Sample values are read as consecutive 64-bit words of
// profile->output_buffer.
// SQTT profiles: one callback per shader engine; the output buffer is split
// into equal per-SE partitions and the sample size is derived from the write
// pointer stored in the SQTT control buffer.
//
// A callback result of HSA_STATUS_INFO_BREAK stops the whole iteration and is
// reported as HSA_STATUS_SUCCESS; any other non-success result stops the
// iteration and is returned unchanged.
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_iterate_data(
    const hsa_ext_amd_aql_profile_profile_t* profile,
    hsa_ext_amd_aql_profile_data_callback_t callback, void* data) {
  if (profile == NULL || callback == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;

  hsa_status_t status = HSA_STATUS_SUCCESS;
  // NOTE(review): the factory and the managers obtained from it are not
  // released by this function; their ownership and destructor contracts are
  // not visible here - confirm and release them accordingly.
  aql_profile::Pm4Factory* pm4_factory = aql_profile::Pm4Factory::Create(profile);
  if (pm4_factory == NULL) return HSA_STATUS_ERROR;

  if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) {
    uint32_t info_size = 0;
    void* info_data = NULL;
    uint64_t* samples = (uint64_t*)profile->output_buffer.ptr;
    const uint32_t sample_count = profile->output_buffer.size / sizeof(uint64_t);
    uint32_t sample_index = 0;

    pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr();
    if (pmcMgr == NULL) return HSA_STATUS_ERROR;

    // Set once the callback asks to stop, so that the event loop ends as well
    bool stop = false;
    const hsa_ext_amd_aql_profile_event_t* const events_end =
        profile->events + profile->event_count;
    for (const hsa_ext_amd_aql_profile_event_t* p = profile->events; !stop && p < events_end;
         ++p) {
      pm4_profile::CounterBlock* block =
          pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p));
      if (block == NULL) return HSA_STATUS_ERROR;
      if (!block->getInfo(pm4_profile::GPU_BLK_INFO_CONTROL_METHOD, info_size, &info_data) ||
          info_data == NULL) {
        return HSA_STATUS_ERROR;
      }
      const pm4_profile::CntlMethod method =
          static_cast<pm4_profile::CntlMethod>(*(static_cast<uint32_t*>(info_data)));
      // A perfcounter data sample per ShaderEngine
      const uint32_t block_samples_count = (method == pm4_profile::CntlMethodBySe ||
                                            method == pm4_profile::CntlMethodBySeAndInstance)
          ? pmcMgr->getNumSe()
          : 1;

      for (uint32_t i = 0; i < block_samples_count; ++i) {
        assert(sample_index < sample_count);
        if (sample_index >= sample_count) return HSA_STATUS_ERROR;

        hsa_ext_amd_aql_profile_info_data_t sample_info;
        sample_info.sample_id = i;
        sample_info.pmc_data.event = *p;
        sample_info.pmc_data.result = samples[sample_index];
        status = callback(HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, &sample_info, data);
        if (status != HSA_STATUS_SUCCESS) {
          // BREAK is a regular early-exit request, not an error
          if (status == HSA_STATUS_INFO_BREAK) status = HSA_STATUS_SUCCESS;
          stop = true;
          break;
        }
        ++sample_index;
      }
    }
  } else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) {
    pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr();
    if (sqttMgr == NULL) return HSA_STATUS_ERROR;

    aql_profile::CommandBufferMgr cmdBufMgr(profile);
    if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;

    const uint32_t status_size = sqttMgr->StatusSizeInfo();
    // Control buffer was allocated as the CmdBuffer postfix partition
    void* status_ptr = cmdBufMgr.setPostfix(status_size);
    if (status_ptr == NULL) return HSA_STATUS_ERROR;
    // Control buffer registering
    sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr);
    // Validate SQTT status and normalize WRPTR
    if (sqttMgr->Validate() == false) return HSA_STATUS_ERROR;

    const uint32_t se_number = sqttMgr->getNumSe();
    // Guards the per-SE capacity division below
    if (se_number == 0) return HSA_STATUS_ERROR;

    // Casting status pointer to SQTT control per ShaderEngine array
    aql_profile::sqtt_ctrl_t* sqtt_ctrl = (aql_profile::sqtt_ctrl_t*)status_ptr;
    assert(status_size == sizeof(aql_profile::sqtt_ctrl_t) * se_number);
    if (status_size != sizeof(aql_profile::sqtt_ctrl_t) * se_number) {
      return HSA_STATUS_ERROR;
    }

    // SQTT output buffer and capacity per ShaderEngine; a byte pointer is used
    // because arithmetic on void* is not standard C++
    char* sample_ptr = static_cast<char*>(profile->output_buffer.ptr);
    const uint32_t sample_capacity = profile->output_buffer.size / se_number;

    // The samples sizes are returned in the control buffer
    for (uint32_t i = 0; i < se_number; ++i) {
      // WPTR specifies the index in thread trace buffer where next token will be
      // written by hardware. The index is incremented by size of 32 bytes.
      uint32_t sample_size = sqtt_ctrl[i].writePtr * TT_WRITE_PTR_BLK;

      hsa_ext_amd_aql_profile_info_data_t sample_info;
      sample_info.sample_id = i;
      sample_info.sqtt_data.ptr = sample_ptr;
      sample_info.sqtt_data.size = sample_size;
      status = callback(HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA, &sample_info, data);
      if (status == HSA_STATUS_INFO_BREAK) {
        status = HSA_STATUS_SUCCESS;
        break;
      }
      if (status != HSA_STATUS_SUCCESS) break;
      sample_ptr += sample_capacity;
    }
  } else {
    status = HSA_STATUS_ERROR;
  }
  return status;
}
}
+23
Просмотреть файл
@@ -0,0 +1,23 @@
#ifndef _AQL_PROFILE_H_
#define _AQL_PROFILE_H_
#include "hsa_ext_amd_aql_profile.h"
// Forward declaration only: this header refers to the command writer by
// pointer and does not need its definition.
namespace pm4_profile {
class CommandWriter;
}
namespace aql_profile {
// Short aliases for the public HSA AQL profile extension types
typedef hsa_ext_amd_aql_profile_descriptor_t descriptor_t;
typedef hsa_ext_amd_aql_profile_profile_t profile_t;
typedef hsa_ext_amd_aql_profile_info_type_t info_type_t;
typedef hsa_ext_amd_aql_profile_data_callback_t data_callback_t;
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
typedef hsa_ext_amd_aql_profile_event_t event_t;

// Fills 'aqlPkt' with an indirect-buffer command, built by 'cmdWriter', that
// references the command buffer 'cmdBuffer' of 'cmdSz' bytes.
void populateAql(void* cmdBuffer, uint32_t cmdSz, pm4_profile::CommandWriter* cmdWriter,
                 packet_t* aqlPkt);
}
#endif // _AQL_PROFILE_H_
+43
Просмотреть файл
@@ -0,0 +1,43 @@
#include "pm4_factory.h"
// Commandwriter includes
#include "gfx8_cmdwriter.h"
// PMC includes
#include "vi_pmu.h"
// SQTT includes
#include "gfx8_thread_trace.h"
namespace aql_profile {

// GFX8 block ID mapping table: indexed by the event block name, each entry is
// the base kHsaViCounterBlockId* value to which the block index is added.
uint32_t Gfx8Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
    pm4_profile::kHsaViCounterBlockIdCb0,    pm4_profile::kHsaViCounterBlockIdCpf,
    pm4_profile::kHsaViCounterBlockIdDb0,    pm4_profile::kHsaViCounterBlockIdGrbm,
    pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
    pm4_profile::kHsaViCounterBlockIdPaSc,   pm4_profile::kHsaViCounterBlockIdSpi,
    pm4_profile::kHsaViCounterBlockIdSq,     pm4_profile::kHsaViCounterBlockIdSqGs,
    pm4_profile::kHsaViCounterBlockIdSqVs,   pm4_profile::kHsaViCounterBlockIdSqPs,
    pm4_profile::kHsaViCounterBlockIdSqHs,   pm4_profile::kHsaViCounterBlockIdSqCs,
    pm4_profile::kHsaViCounterBlockIdSx,     pm4_profile::kHsaViCounterBlockIdTa0,
    pm4_profile::kHsaViCounterBlockIdTca0,   pm4_profile::kHsaViCounterBlockIdTcc0,
    pm4_profile::kHsaViCounterBlockIdTd0,    pm4_profile::kHsaViCounterBlockIdTcp0,
    pm4_profile::kHsaViCounterBlockIdGds,    pm4_profile::kHsaViCounterBlockIdVgt,
    pm4_profile::kHsaViCounterBlockIdIa,     pm4_profile::kHsaViCounterBlockIdMc,
    pm4_profile::kHsaViCounterBlockIdTcs,    pm4_profile::kHsaViCounterBlockIdWd};

// Allocates a new GFX8 command writer; the caller receives the raw pointer
pm4_profile::CommandWriter* Gfx8Factory::getCommandWriter() {
  pm4_profile::CommandWriter* writer = new pm4_profile::gfx8::Gfx8CmdWriter(false, true);
  return writer;
}

// Allocates a new VI perfcounter manager; the caller receives the raw pointer
pm4_profile::Pmu* Gfx8Factory::getPmcMgr() {
  pm4_profile::Pmu* pmc_manager = new pm4_profile::ViPmu();
  return pmc_manager;
}

// Allocates a new GFX8 thread trace manager; the caller receives the raw pointer
pm4_profile::ThreadTrace* Gfx8Factory::getSqttMgr() {
  pm4_profile::ThreadTrace* sqtt_manager = new pm4_profile::Gfx8ThreadTrace();
  return sqtt_manager;
}

// Translates an event into a counter block id: table base for the event block
// name plus the event block index. The block name is not range-checked.
uint32_t Gfx8Factory::getBlockId(const event_t* event) {
  const uint32_t base_id = block_id_table[event->block_name];
  return base_id + event->block_index;
}

}  // aql_profile
+70
Просмотреть файл
@@ -0,0 +1,70 @@
#include "pm4_factory.h"
// Commandwriter includes
#include "gfx8_cmdwriter.h"
#include "gfx9_cmdwriter.h"
// PMC includes
#include "vi_pmu.h"
#include "ai_pmu.h"
// SQTT includes
#include "gfx8_thread_trace.h"
#include "gfx9_thread_trace.h"
namespace aql_profile {

// GFX8 block ID mapping table: indexed by the event block name, each entry is
// the base kHsaViCounterBlockId* value to which the block index is added.
uint32_t gfx8_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
    pm4_profile::kHsaViCounterBlockIdCb0,    pm4_profile::kHsaViCounterBlockIdCpf,
    pm4_profile::kHsaViCounterBlockIdDb0,    pm4_profile::kHsaViCounterBlockIdGrbm,
    pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
    pm4_profile::kHsaViCounterBlockIdPaSc,   pm4_profile::kHsaViCounterBlockIdSpi,
    pm4_profile::kHsaViCounterBlockIdSq,     pm4_profile::kHsaViCounterBlockIdSqGs,
    pm4_profile::kHsaViCounterBlockIdSqVs,   pm4_profile::kHsaViCounterBlockIdSqPs,
    pm4_profile::kHsaViCounterBlockIdSqHs,   pm4_profile::kHsaViCounterBlockIdSqCs,
    pm4_profile::kHsaViCounterBlockIdSx,     pm4_profile::kHsaViCounterBlockIdTa0,
    pm4_profile::kHsaViCounterBlockIdTca0,   pm4_profile::kHsaViCounterBlockIdTcc0,
    pm4_profile::kHsaViCounterBlockIdTd0,    pm4_profile::kHsaViCounterBlockIdTcp0,
    pm4_profile::kHsaViCounterBlockIdGds,    pm4_profile::kHsaViCounterBlockIdVgt,
    pm4_profile::kHsaViCounterBlockIdIa,     pm4_profile::kHsaViCounterBlockIdMc,
    pm4_profile::kHsaViCounterBlockIdTcs,    pm4_profile::kHsaViCounterBlockIdWd};

// GFX9 block ID mapping table: same layout, kHsaAiCounterBlockId* base values.
uint32_t gfx9_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
    pm4_profile::kHsaAiCounterBlockIdCb0,    pm4_profile::kHsaAiCounterBlockIdCpf,
    pm4_profile::kHsaAiCounterBlockIdDb0,    pm4_profile::kHsaAiCounterBlockIdGrbm,
    pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu,
    pm4_profile::kHsaAiCounterBlockIdPaSc,   pm4_profile::kHsaAiCounterBlockIdSpi,
    pm4_profile::kHsaAiCounterBlockIdSq,     pm4_profile::kHsaAiCounterBlockIdSqGs,
    pm4_profile::kHsaAiCounterBlockIdSqVs,   pm4_profile::kHsaAiCounterBlockIdSqPs,
    pm4_profile::kHsaAiCounterBlockIdSqHs,   pm4_profile::kHsaAiCounterBlockIdSqCs,
    pm4_profile::kHsaAiCounterBlockIdSx,     pm4_profile::kHsaAiCounterBlockIdTa0,
    pm4_profile::kHsaAiCounterBlockIdTca0,   pm4_profile::kHsaAiCounterBlockIdTcc0,
    pm4_profile::kHsaAiCounterBlockIdTd0,    pm4_profile::kHsaAiCounterBlockIdTcp0,
    pm4_profile::kHsaAiCounterBlockIdGds,    pm4_profile::kHsaAiCounterBlockIdVgt,
    pm4_profile::kHsaAiCounterBlockIdIa,     pm4_profile::kHsaAiCounterBlockIdMc,
    pm4_profile::kHsaAiCounterBlockIdTcs,    pm4_profile::kHsaAiCounterBlockIdWd};

// NOTE(review): 'is_gfx9' is used by every method below but is not declared in
// this file - confirm where it comes from and that this translation unit is
// still part of the build.
//
// The methods use if/else instead of the conditional operator so that the two
// allocated types only have to convert to the return type, not to each other.

// Allocates a new command writer for the selected GFX generation
pm4_profile::CommandWriter* Pm4Factory::getCommandWriter() {
  if (is_gfx9 == true) {
    return new pm4_profile::gfx9::Gfx9CmdWriter(false, true);
  }
  return new pm4_profile::gfx8::Gfx8CmdWriter(false, true);
}

// Allocates a new perfcounter manager for the selected GFX generation
pm4_profile::Pmu* Pm4Factory::getPmcMgr() {
  if (is_gfx9 == true) {
    return new pm4_profile::AiPmu();
  }
  return new pm4_profile::ViPmu();
}

// Allocates a new thread trace manager for the selected GFX generation
pm4_profile::ThreadTrace* Pm4Factory::getSqttMgr() {
  if (is_gfx9 == true) {
    return new pm4_profile::Gfx9ThreadTrace();
  }
  return new pm4_profile::Gfx8ThreadTrace();
}

// Translates an event into a counter block id: table base for the event block
// name plus the event block index. The block name is not range-checked.
uint32_t Pm4Factory::getBlockId(const event_t* event) {
  if (is_gfx9 == true) {
    return gfx9_block_id_table[event->block_name] + event->block_index;
  }
  return gfx8_block_id_table[event->block_name] + event->block_index;
}

}  // aql_profile
+43
Просмотреть файл
@@ -0,0 +1,43 @@
#include "pm4_factory.h"
// Commandwriter includes
#include "gfx9_cmdwriter.h"
// PMC includes
#include "ai_pmu.h"
// SQTT includes
#include "gfx9_thread_trace.h"
namespace aql_profile {

// GFX9 block ID mapping table: indexed by the event block name, each entry is
// the base kHsaAiCounterBlockId* value to which the block index is added.
uint32_t Gfx9Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
    pm4_profile::kHsaAiCounterBlockIdCb0,    pm4_profile::kHsaAiCounterBlockIdCpf,
    pm4_profile::kHsaAiCounterBlockIdDb0,    pm4_profile::kHsaAiCounterBlockIdGrbm,
    pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu,
    pm4_profile::kHsaAiCounterBlockIdPaSc,   pm4_profile::kHsaAiCounterBlockIdSpi,
    pm4_profile::kHsaAiCounterBlockIdSq,     pm4_profile::kHsaAiCounterBlockIdSqGs,
    pm4_profile::kHsaAiCounterBlockIdSqVs,   pm4_profile::kHsaAiCounterBlockIdSqPs,
    pm4_profile::kHsaAiCounterBlockIdSqHs,   pm4_profile::kHsaAiCounterBlockIdSqCs,
    pm4_profile::kHsaAiCounterBlockIdSx,     pm4_profile::kHsaAiCounterBlockIdTa0,
    pm4_profile::kHsaAiCounterBlockIdTca0,   pm4_profile::kHsaAiCounterBlockIdTcc0,
    pm4_profile::kHsaAiCounterBlockIdTd0,    pm4_profile::kHsaAiCounterBlockIdTcp0,
    pm4_profile::kHsaAiCounterBlockIdGds,    pm4_profile::kHsaAiCounterBlockIdVgt,
    pm4_profile::kHsaAiCounterBlockIdIa,     pm4_profile::kHsaAiCounterBlockIdMc,
    pm4_profile::kHsaAiCounterBlockIdTcs,    pm4_profile::kHsaAiCounterBlockIdWd};

// Allocates a new GFX9 command writer; the caller receives the raw pointer
pm4_profile::CommandWriter* Gfx9Factory::getCommandWriter() {
  pm4_profile::CommandWriter* writer = new pm4_profile::gfx9::Gfx9CmdWriter(false, true);
  return writer;
}

// Allocates a new AI perfcounter manager; the caller receives the raw pointer
pm4_profile::Pmu* Gfx9Factory::getPmcMgr() {
  pm4_profile::Pmu* pmc_manager = new pm4_profile::AiPmu();
  return pmc_manager;
}

// Allocates a new GFX9 thread trace manager; the caller receives the raw pointer
pm4_profile::ThreadTrace* Gfx9Factory::getSqttMgr() {
  pm4_profile::ThreadTrace* sqtt_manager = new pm4_profile::Gfx9ThreadTrace();
  return sqtt_manager;
}

// Translates an event into a counter block id: table base for the event block
// name plus the event block index. The block name is not range-checked.
uint32_t Gfx9Factory::getBlockId(const event_t* event) {
  const uint32_t base_id = block_id_table[event->block_name];
  return base_id + event->block_index;
}

}  // aql_profile
+62
Просмотреть файл
@@ -0,0 +1,62 @@
#ifndef _PM4_FACTORY_H_
#define _PM4_FACTORY_H_
#include <string.h>
#include <assert.h>
#include "aql_profile.h"
// Forward declarations only: the factory interface below refers to these
// classes by pointer and does not need their definitions.
namespace pm4_profile {
class CommandWriter;
class Pmu;
class ThreadTrace;
}
namespace aql_profile {
// Abstract factory of the GFX generation specific PM4 helpers (command writer,
// perfcounter manager, thread trace manager) and of the block id mapping.
class Pm4Factory {
 public:
  // Concrete factories are allocated with 'new' by Create() and handed out as
  // Pm4Factory*, so deleting through the base pointer must be well defined.
  virtual ~Pm4Factory() {}

  // Returns a factory matching the agent of 'profile'
  static Pm4Factory* Create(const hsa_ext_amd_aql_profile_profile_t* profile);

  // Returns the command writer of the factory GFX generation
  virtual pm4_profile::CommandWriter* getCommandWriter() = 0;
  // Returns the perfcounter (PMC) manager of the factory GFX generation
  virtual pm4_profile::Pmu* getPmcMgr() = 0;
  // Returns the thread trace (SQTT) manager of the factory GFX generation
  virtual pm4_profile::ThreadTrace* getSqttMgr() = 0;
  // Returns the counter block id corresponding to 'event'
  virtual uint32_t getBlockId(const event_t* event) = 0;
};
// Pm4Factory implementation for GFX8 agents.
class Gfx8Factory : public Pm4Factory {
 public:
  // Overrides of the Pm4Factory interface
  virtual pm4_profile::CommandWriter* getCommandWriter();
  virtual pm4_profile::Pmu* getPmcMgr();
  virtual pm4_profile::ThreadTrace* getSqttMgr();
  virtual uint32_t getBlockId(const event_t* event);

 private:
  // Block id mapping table, one entry per public block name
  static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER];
};
// Pm4Factory implementation for GFX9 agents.
class Gfx9Factory : public Pm4Factory {
 public:
  // Overrides of the Pm4Factory interface
  virtual pm4_profile::CommandWriter* getCommandWriter();
  virtual pm4_profile::Pmu* getPmcMgr();
  virtual pm4_profile::ThreadTrace* getSqttMgr();
  virtual uint32_t getBlockId(const event_t* event);

 private:
  // Block id mapping table, one entry per public block name
  static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER];
};
// Creates the factory matching the agent of 'profile'.
//
// The agent name is queried from the HSA runtime and matched by its "gfx8" /
// "gfx9" prefix. Returns a newly allocated factory, or NULL when 'profile' is
// NULL, the agent name query fails, or the agent is not supported.
inline Pm4Factory* Pm4Factory::Create(const hsa_ext_amd_aql_profile_profile_t* profile) {
  if (profile == NULL) return NULL;

  // Zero-initialised so that the name is a valid C string on every path
  char agent_name[64] = {0};
  const hsa_status_t status =
      hsa_agent_get_info(profile->agent, HSA_AGENT_INFO_NAME, agent_name);
  // Do not inspect the buffer if the runtime could not provide the name
  if (status != HSA_STATUS_SUCCESS) return NULL;

  Pm4Factory* instance = NULL;
  if (strncmp(agent_name, "gfx8", 4) == 0) {
    instance = new Gfx8Factory();
  } else if (strncmp(agent_name, "gfx9", 4) == 0) {
    instance = new Gfx9Factory();
  }
  return instance;
}
} // aql_profile
#endif // _PM4_FACTORY_H_
+41
Просмотреть файл
@@ -0,0 +1,41 @@
#include <iostream>
#include <iomanip>
#include "aql_profile.h"
#include "cmdwriter.h"
#include "amd_aql_pm4_ib_packet.h"
namespace aql_profile {
// Populates 'aql_packet' as an AQL PM4 IB packet from the four DWords of the
// indirect-buffer command 'ib_packet', then dumps the 16 DWords of the packet
// to std::clog. The packet header and completion signal are not set here.
void populateAql(uint32_t* ib_packet, packet_t* aql_packet) {
  // Populate relevant fields of Aql pkt
  // Size of IB pkt is four DWords
  amd_aql_pm4_ib_packet_t* aql_pm4_ib = reinterpret_cast<amd_aql_pm4_ib_packet_t*>(aql_packet);
  aql_pm4_ib->pm4_ib_format = AMD_AQL_PM4_IB_FORMAT;
  aql_pm4_ib->pm4_ib_command[0] = ib_packet[0];
  aql_pm4_ib->pm4_ib_command[1] = ib_packet[1];
  aql_pm4_ib->pm4_ib_command[2] = ib_packet[2];
  aql_pm4_ib->pm4_ib_command[3] = ib_packet[3];
  aql_pm4_ib->dw_count_remain = AMD_AQL_PM4_IB_DW_COUNT_REMAIN;
  for (int i = 0; i < AMD_AQL_PM4_IB_RESERVED_COUNT; ++i) {
    aql_pm4_ib->reserved[i] = 0;
  }

  // std::left and std::hex are sticky: remember the stream state so that the
  // dump below does not change how later std::clog output is formatted
  const std::ios_base::fmtflags saved_flags = std::clog.flags();
  const char saved_fill = std::clog.fill();

  const uint32_t* words = (const uint32_t*)aql_packet;
  std::clog << std::setw(40) << std::left << "AQL 'IB' size(16)"
            << ":";
  for (int idx = 0; idx < 16; idx++) {
    std::clog << " " << std::hex << std::setw(8) << std::setfill('0') << words[idx];
  }
  std::clog << std::endl;

  std::clog.fill(saved_fill);
  std::clog.flags(saved_flags);
}
void populateAql(void* cmd_buffer, uint32_t cmd_size,
pm4_profile::CommandWriter* cmd_writer, packet_t* ppt_packet) {
pm4_profile::DefaultCmdBuf ib_buffer;
cmd_writer->BuildIndirectBufferCmd(&ib_buffer, cmd_buffer, (size_t)cmd_size);
uint32_t* ib_cmds = (uint32_t*)ib_buffer.Base();
populateAql(ib_cmds, ppt_packet);
}
}
+15
Просмотреть файл
@@ -0,0 +1,15 @@
#
# Rocr Cmdwriter sources: the gfx8 and gfx9 command writer implementations
#
set ( CmdWriterSrcs
      gfx8_cmdwriter.cpp
      gfx9_cmdwriter.cpp )

#
# Header files include path(s); ROCR_INC_DIR is read from the environment
# when cmake is run
#
include_directories ( $ENV{ROCR_INC_DIR} )

#
# Build Cmdwriter as a static library; the target name is taken from
# CMDWRITER_LIB, which is not set in this file
#
add_library ( ${CMDWRITER_LIB} STATIC ${CmdWriterSrcs} )
+515
Просмотреть файл
@@ -0,0 +1,515 @@
// cmdwriter.h
// Header file for CommandWriter and CmdBuf interfaces
#ifndef _CMDWRITER_H_
#define _CMDWRITER_H_
#include <vector>
#include <string.h>
#include <stdint.h>
#include <assert.h>
namespace pm4_profile {
// User defined options for flushing caches; every option is off by default
typedef struct FlushCacheOptions_ {
  bool l1, l2;
  bool icache, kcache;
  bool l1_vol, l2_vol, kcache_vol;

  // All flags start cleared
  FlushCacheOptions_()
      : l1(false),
        l2(false),
        icache(false),
        kcache(false),
        l1_vol(false),
        l2_vol(false),
        kcache_vol(false) {}
} FlushCacheOptions;
/// @brief Interface to build a list of Gpu commands into a byte
/// buffer. Classes implementing this interface are used to translate
/// various Gpu commands as byte stream.
///
/// @note: The Api does not require implementations to be thread safe.
/// Users are therefore required to access it in a serialized manner.
class CmdBuf {
 public:
  /// Default destructor.
  virtual ~CmdBuf() {}

  /// @brief Resets the command buffer object. All of the commands
  /// previously packed into the buffer are lost i.e. the number of
  /// bytes in command stream is reset.
  ///
  /// @note: This convenience Api is provided to allow reuse of the
  /// command buffer object.
  ///
  /// @return bool true if successful, false otherwise.
  virtual bool Reset(void) = 0;

  /// @brief Appends input command into a buffer that could
  /// be queried for its size and other properties. The append
  /// does not verify the contents.
  ///
  /// @param cmd Buffer containing one or more instances of Gpu commands
  ///
  /// @param size size of the Gpu commands in bytes.
  ///
  /// @return void
  virtual void AppendCommand(const void* cmd, uint32_t size) = 0;

  /// @brief Returns the total size (in bytes) of the accumulated commands.
  ///
  /// @return size_t size of Gpu commands in bytes
  virtual size_t Size() const = 0;

 private:
  /// Indexes the command buffer by dwords. Allows accessing constants
  /// in an assembled command buffer.
  virtual uint32_t& operator[](size_t index) = 0;

  friend class CommandWriter;
};

/// @brief Implements the interface CmdBuf and thus can be used to
/// translate various Gpu commands as byte stream. Commands are stored
/// in a vector of 32-bit words.
///
/// @note: The Api does not require implementations to be thread safe.
/// Users are therefore required to access it in a serialized manner.
class DefaultCmdBuf : public CmdBuf {
 public:
  /// @brief Append the command into the underlying buffer. A size that
  /// is not a multiple of the 32-bit storage word is rounded up and the
  /// padding bytes are zero. A NULL command or a zero size is ignored.
  ///
  /// @param cmd Buffer containing one or more instances of Gpu commands
  ///
  /// @param size Size of Gpu command(s) in bytes
  ///
  /// @return void
  virtual void AppendCommand(const void* cmd, uint32_t size) {
    if (cmd == NULL || size == 0) return;
    memcpy(ReserveCmdbufSpace(size), cmd, size);
  }

  /// @brief Resets the Gpu command buffer
  bool Reset() {
    cmdbuf_.clear();
    return true;
  }

  /// Size of Gpu commands in bytes in the underlying buffer
  size_t Size() const { return cmdbuf_.size() * sizeof(StorageType); }

  /// Address of the start of accumulated commands. Must not be
  /// dereferenced while the buffer is empty.
  const void* Base() const { return cmdbuf_.data(); }

 private:
  /// @brief Returns reference to the value of Gpu command buffer
  /// at specified index
  ///
  /// @param index Specifies the buffer index whose value is needed
  ///
  /// @return uint32_t & Reference of the value being returned
  uint32_t& operator[](size_t index) { return cmdbuf_[index]; }

  /// @brief Increase Gpu command buffer by specified size
  ///
  /// @param size Size in bytes by which command buffer should
  /// be resized; rounded up to whole storage words so that the
  /// reserved space always holds @p size bytes.
  ///
  /// @return void * Pointer into the buffer where the next
  /// command can be written, NULL when @p size is zero
  void* ReserveCmdbufSpace(std::size_t size) {
    const size_t len = cmdbuf_.size();
    const size_t words = (size + sizeof(StorageType) - 1) / sizeof(StorageType);
    if (words == 0) return NULL;
    cmdbuf_.resize(len + words);
    return &cmdbuf_[len];
  }

  /// @brief Defines Gpu command buffer as a vector of StorageType
  typedef uint32_t StorageType;
  std::vector<StorageType> cmdbuf_;
};
/// @brief Specifies the public interface of CommandWriter for use by
/// clients to build Gpu command streams. Every Build* method appends the
/// command it builds to the command buffer passed as first argument.
class CommandWriter {
 public:
  /// @brief These enums specify the operation to perform in the packet
  /// generated by BuildAtomicPacket. The commenting for each enum uses
  /// the arguments to the function BuildAtomicPacket to express the
  /// resulting operation.
  enum AtomicType {
    /// *destination = *destination + 1;
    kAtomicTypeIncrement,
    /// *destination = *destination - 1;
    kAtomicTypeDecrement,
    /// if (*destination == compare) *destination = value;
    kAtomicTypeCompareAndSwap,
    /// while (*destination != compare);
    /// *destination = value;
    kAtomicTypeBlockingCompareAndSwap,
    /// *destination = *destination + value;
    kAtomicAdd,
    /// *destination = *destination - value;
    kAtomicSubtract,
    /// *destination = value;
    kAtomicSwap
  };

  /// @brief These enums specify the VGT EVENT TYPE to issue and wait for.
  /// Command Processor (CP) uses these events to communicate with SPI to
  /// learn about outstanding waves and determine kernel completion.
  enum VgtEventType {
    /// Enable Performance Counters
    kPerfCntrsStart,
    /// Disable Performance Counters
    kPerfCntrsStop,
    /// Read Performance Counters
    kPerfCntrsSample,
    /// Enable a Thread Trace session
    kThrdTraceStart,
    /// Disable a Thread Trace session
    kThrdTraceStop,
    /// Enable flushing of thread trace buffers
    kThrdTraceFlush,
    /// Enables resetting of BASE register to its last value
    /// including flushing of thread trace buffers. This could
    /// be used to toggle between two buffers so as to allow
    /// collection of large token data
    kThrdTraceFinish
  };

  /// @brief Returns the Dword that encodes a No-Op for the CP
  ///
  /// @return uint32_t Dword that can be used to populate a Pm4
  /// command queue.
  ///
  virtual uint32_t GetNoOpCmd() = 0;

  /// @brief Build an instance of Barrier command and copy it into
  /// the input command buffer
  ///
  /// @param cmdbuf Pointer to command buffer which is updated with
  /// an instance of Barrier command.
  ///
  /// @return void
  virtual void BuildBarrierCommand(CmdBuf* cmdbuf) = 0;

  /// @brief Builds the Gpu command to reference indirectly a stream
  /// of other Gpu commands. The launch command is then copied into
  /// the command buffer parameter.
  ///
  /// @param cmdbuf command buffer to be appended with launch command
  ///
  /// @param cmd_addr Address of command buffer carrying command stream
  ///
  /// @param cmd_size Size of dispatch command stream in bytes
  ///
  /// @return void
  virtual void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
                                      std::size_t cmd_size) = 0;

  /// @brief Build a Gpu command that triggers an event whose type
  /// is specified by input parameter. It then copies it into the input
  /// command buffer
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param event Id of Event to be triggered by Gpu
  ///
  /// @return void
  virtual void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) = 0;

  /// @brief Builds a Gpu command to wait until condition is realized
  ///
  /// @param cmdbuf command buffer to be appended with launch command
  ///
  /// @param mem_space if the address is in memory or is a register offset
  ///
  /// @param wait_addr address to wait on
  ///
  /// @param func_eq true means equal, false means not-equal
  ///
  /// @param mask_val Mask to apply on value from addr in comparison
  ///
  /// @param wait_val value to apply for the func given above
  virtual void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
                                      bool func_eq, uint32_t mask_val, uint32_t wait_val) = 0;

  /// NOTE(review): undocumented in the original header; presumably builds a
  /// command that updates the host location @p addr with @p value - confirm
  /// against the implementations.
  virtual void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) = 0;

  /// @brief Build CP command to program a Gpu register
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  /// @param addr Register to be programmed
  /// @param value Value to write into register
  ///
  /// @return void
  virtual void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;

  /// @brief Build and copy WriteShReg command
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param addr Offset of the register
  ///
  /// @param value Value to write into register
  ///
  /// @return void
  virtual void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;

  /// @brief Builds a Gpu command to flush Gpu caches and write a
  /// user defined value at a configurable location that is Gpu
  /// accessible.
  ///
  /// @param cmdbuf Command buffer to be appended with bottom of pipe
  /// notification command
  ///
  /// @param write_addr Address into which Gpu should write
  ///
  /// @param write_val Value to write into user provided address
  ///
  /// @param intrpt True if Gpu should raise an interrupt upon writing
  /// the user value
  ///
  /// @return void
  virtual void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
                                 bool intrpt) = 0;

  /// @brief Build a Gpu command that copies data from a specified
  /// source to destination
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param src_sel Source selector; the original documentation describes it
  /// as a flag indicating if values are being read from a Register or a
  /// memory location
  ///
  /// @param src_addr_lo Low 32-bit Source address of the data to read from
  ///
  /// @param src_addr_hi High 32-bit Source address of the data to read from
  ///
  /// @param dst_addr Destination address for the data to be written to
  ///
  /// @param size Size of the data to be written
  ///
  /// @param wait True if Gpu command should confirm the write operation
  /// has completed successfully
  ///
  /// @return void
  ///
  /// @NOTE Change interface to use void* for Src and void* for Dest
  virtual void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
                                   uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
                                   bool wait) = 0;

  /// @brief Build and copy a WaitIdle Gpu command into command buffer
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @return void
  virtual void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) = 0;

  // Will issue a VGT event including a cache flush later on
  virtual void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) = 0;

  /// @brief Build and copy a WriteRegister Gpu command into command buffer
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param addr Register into which to write
  ///
  /// @param value Value to write into register
  ///
  /// @return void
  virtual void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;

  /// @brief Build and copy a Gpu command to query the status of a
  /// WriteEvent into command buffer
  ///
  /// @param cmdBuf Pointer to command buffer to be appended
  ///
  /// @param event Id of Event whose status is to be queried
  ///
  /// @param addr Address to update the status of WriteEvent operation
  ///
  /// @return void
  virtual void BuildWriteEventQueryPacket(CmdBuf* cmdBuf, uint32_t event, uint32_t* addr) = 0;

  /// @brief Builds and copies a Gpu command to perform user specified
  /// operation atomically on a 32-bit value. The various atomic operations
  /// on integers that are supported include: increment, decrement, add,
  /// subtract, compare-and-swap and swap. The operation to perform is
  /// specified by the enum AtomicType.
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param atomic_op Id of the atomic operation to perform
  ///
  /// @param addr Pointer to the memory block where atomic operation
  /// would be performed
  ///
  /// @param value New value to write if atomic operation can be performed
  ///
  /// @param compare Value to compare if atomic operation is a compare-and-swap
  ///
  /// @return void
  virtual void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
                                 uint32_t value = 0, uint32_t compare = 0) = 0;

  /// @brief Builds and copies a Gpu command to perform user specified
  /// operation atomically on a 64-bit value. The various atomic operations
  /// on integers that are supported include: increment, decrement, add,
  /// subtract, compare-and-swap and swap. The operation to perform is
  /// specified by the enum AtomicType.
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param atomic_op Id of the atomic operation to perform
  ///
  /// @param addr Pointer to the memory block where atomic operation
  /// would be performed
  ///
  /// @param value New value to write if atomic operation can be performed
  ///
  /// @param compare Value to compare if atomic operation is a compare-and-swap
  ///
  /// @return void
  virtual void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
                                   uint64_t value = 0, uint64_t compare = 0) = 0;

  /// @brief Returns the size of an atomic packet
  ///
  /// @return size_t Size of atomic packet
  virtual size_t SizeOfAtomicPacket() const = 0;

  /// @brief Build and copy a Gpu command that will tell command processor
  /// to conditionally execute or skip the next sequence of packets.
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param signal Pointer to an integer that tells the command processor
  /// whether to skip or execute the next block of packets. If it is set
  /// to 0 the following packets will be skipped, else it will execute the
  /// following packets
  ///
  /// @param count The number of dwords in the following packet stream
  /// that will be conditionally executed
  ///
  /// @return void
  virtual void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) = 0;

  /// @brief Builds a CP command to write user specified 32-bit value
  /// at a user specified address. The command is then copied
  /// into the command buffer for submission to a device queue.
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param write_addr Address into which CP will write the user
  /// specified value
  ///
  /// @param write_value Value to write into the user specified address
  ///
  /// @return void
  virtual void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
                                     uint32_t write_value) = 0;

  /// @brief Builds a CP command to write user specified 64-bit value
  /// at a user specified address. The command is then copied
  /// into the command buffer for submission to a device queue.
  ///
  /// @param cmdbuf Pointer to command buffer to be appended
  ///
  /// @param write_addr Address into which CP will write the user
  /// specified value
  ///
  /// @param write_value Value to write into the user specified address
  ///
  /// @return void
  virtual void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
                                       uint64_t write_value) = 0;

  /// Writes into input buffer Gpu commands to flush its cache. It is
  /// necessary that the buffer provided for flush commands is large
  /// enough to accommodate the full set of commands. It should be at
  /// least 512 bytes.
  ///
  /// @param cmdbuf Buffer to write commands to.
  /// @param options Caches selected for the flush.
  /// @param writeAddr Registered address into which GPU should write
  /// a user provided value upon executing the flush commands.
  /// @param writeVal User provided value written by GPU at user provided
  /// address, upon executing the flush commands.
  ///
  /// @return void
  virtual void BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options, uint32_t* writeAddr,
                                  uint32_t writeVal) = 0;

  /// @brief Builds Gpu command to copy data from source to destination
  /// buffer using DMA engine.
  ///
  /// @param cmdbuf Buffer updated with Gpu copy command
  /// @param srcAddrLo Address of source buffer
  /// @param dstAddr Address of destination buffer
  /// @param copySize Size of data to copy in bytes
  /// @param waitForCompletion if command should wait for copying to complete
  virtual void BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddrLo, uint32_t* dstAddr,
                                  uint32_t copySize, bool waitForCompletion) = 0;

  /// @brief Release resources used by CommandWriter
  virtual ~CommandWriter(){};

 protected:
  /// @brief Return the reference to a value in the command buffer.
  /// Gives derived writers access to the private CmdBuf dword index
  /// operator, which is reachable here through the friend declaration.
  uint32_t& IndexBuffer(CmdBuf* cmdbuf, uint32_t index) { return (*cmdbuf)[index]; }
};
/// @brief Rounds @p u up to the next multiple of @p r. The mask arithmetic
/// gives a multiple of @p r only when @p r is a power of two.
inline uint32_t RoundUp(uint32_t u, uint32_t r) {
  const uint32_t mask = r - 1;
  const uint32_t bumped = u + mask;
  return bumped & ~mask;
}
/// @brief Returns bits [31:0] of a 64-bit value
inline uint32_t Low32(uint64_t u) {
  const uint64_t low_part = u & 0xFFFFFFFFULL;
  return static_cast<uint32_t>(low_part);
}
/// @brief Returns bits [63:32] of a 64-bit value
inline uint32_t High32(uint64_t u) {
  const uint64_t high_part = u >> 32;
  return static_cast<uint32_t>(high_part);
}
/// @brief Returns bits [39:8] of an address, i.e. the low 40 bits shifted
/// right by 8. The address is asserted to be 256-byte aligned and to fit
/// in 48 bits.
inline uint32_t Ptr48Low32(const void* p) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
  assert((addr & 0xFFFFFFFFFF00ULL) == addr);
  const unsigned long long low_40_bits = addr & 0xFFFFFFFFFFULL;
  return static_cast<uint32_t>(low_40_bits >> 8);
}
/// @brief Returns bits [47:40] of an address
inline uint8_t Ptr48High8(const void* p) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
  const unsigned long long top_byte = (addr & 0xFF0000000000ULL) >> 40;
  return static_cast<uint8_t>(top_byte);
}
/// @brief Returns bits [31:0] of an address
inline uint32_t PtrLow32(const void* p) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(addr);
}
/// @brief Returns bits [63:32] of an address when HSA_LARGE_MODEL is
/// defined, and 0 otherwise. A static_assert checks that the pointer
/// size matches the HSA_LARGE_MODEL setting.
inline uint32_t PtrHigh32(const void* p) {
#ifdef HSA_LARGE_MODEL
  static_assert(sizeof(void*) == 8, "HSA_LARGE_MODEL is not set properly here!");
  const uintptr_t addr = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(addr >> 32);
#else
  static_assert(sizeof(void*) == 4, "HSA_LARGE_MODEL is not set properly here!");
  return 0;
#endif
}
} // pm4_profile
#endif // _CMDWRITER_H_
+161
Просмотреть файл
@@ -0,0 +1,161 @@
#ifndef _GFX8_CMDS_H_
#define _GFX8_CMDS_H_
#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
#include "gfxip/gfx8/si_ci_vi_merged_mask.h"
#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
#include "gfxip/gfx8/si_ci_vi_merged_registers.h"
#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
#include "gfxip/gfx8/si_ci_vi_merged_pm4_it_opcodes.h"
#include "gfxip/gfx8/si_pm4defs.h"
namespace pm4_profile {
namespace gfx8 {
// Desc: Defines the Gpu command to dispatch a kernel. It embeds
// various Gpu hardware specific data structures for initialization
// and configuration before a dispatch begins to run. Each register
// group below starts with its own PM4CMDSETDATA header followed by
// the register values; the dispatch packet itself comes last.
// NOTE(review): member order presumably mirrors the packet layout
// consumed by the Gpu, so members should not be reordered - confirm.
struct DispatchTemplate {
// Desc: Structure used to initialize the group dimensions
// of a kernel dispatch and if performance counters are enabled
struct DispatchDimensionRegs {
PM4CMDSETDATA cmd_set_data;
regCOMPUTE_START_X compute_start_x;
regCOMPUTE_START_Y compute_start_y;
regCOMPUTE_START_Z compute_start_z;
regCOMPUTE_NUM_THREAD_X compute_num_thread_x;
regCOMPUTE_NUM_THREAD_Y compute_num_thread_y;
regCOMPUTE_NUM_THREAD_Z compute_num_thread_z;
regCOMPUTE_PIPELINESTAT_ENABLE__CI__VI compute_pipelinestat_enable;
} dimension_regs;
// Desc: Structure used to initialize kernel Isa, trap
// handler, trap handler buffer, number of SGPR and VGPR
// registers needed, amount of Group memory and LDS needed,
// Rounding mode for Floating point numbers, etc.
struct DispatchProgramRegs {
PM4CMDSETDATA cmd_set_data;
regCOMPUTE_PGM_LO compute_pgm_lo;
regCOMPUTE_PGM_HI compute_pgm_hi;
regCOMPUTE_TBA_LO compute_tba_lo;
regCOMPUTE_TBA_HI compute_tba_hi;
regCOMPUTE_TMA_LO compute_tma_lo;
regCOMPUTE_TMA_HI compute_tma_hi;
regCOMPUTE_PGM_RSRC1 compute_pgm_rsrc1;
regCOMPUTE_PGM_RSRC2 compute_pgm_rsrc2;
} program_regs;
// Desc: Structure used to initialize parameters related to
// thread management i.e. number of waves to issue and number
// of Compute Units to use
struct DispatchResourceRegs {
PM4CMDSETDATA cmd_set_data;
regCOMPUTE_RESOURCE_LIMITS compute_resource_limits;
regCOMPUTE_STATIC_THREAD_MGMT_SE0 compute_static_thread_mgmt_se0;
regCOMPUTE_STATIC_THREAD_MGMT_SE1 compute_static_thread_mgmt_se1;
regCOMPUTE_TMPRING_SIZE compute_tmpring_size;
regCOMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI compute_static_thread_mgmt_se2;
regCOMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI compute_static_thread_mgmt_se3;
regCOMPUTE_RESTART_X__CI__VI compute_restart_x;
regCOMPUTE_RESTART_Y__CI__VI compute_restart_y;
regCOMPUTE_RESTART_Z__CI__VI compute_restart_z;
regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI compute_thread_trace_enable;
} resource_regs;
// Desc: Structure used to pass handles of the Aql dispatch
// packet, Aql queue, Kernel argument address block, Scratch
// buffer. Holds sixteen 32-bit user data slots after the header.
struct DispatchComputeUserDataRegs {
PM4CMDSETDATA cmd_set_data;
uint32_t compute_user_data[16];
} compute_user_data_regs;
// Desc: Structure used to configure Cache flush policy
// and dimensions of total work size
PM4CMDDISPATCHDIRECT dispatch_direct;
};
// Desc: Structure used to issue a Gpu Barrier command.
// Wraps a single EVENT_WRITE packet.
struct BarrierTemplate {
PM4CMDEVENTWRITE event_write;
};
// Desc: Structure used to configure the flushing
// of various caches - instruction, constants, L1
// and L2. Wraps a single ACQUIRE_MEM packet.
struct AcquireMemTemplate {
PM4CMDACQUIREMEM acquire_mem;
};
// Desc: Structure used to reference another Gpu command
// indirectly. Generally used to reference a list of Gpu
// commands (dispatch cmds) indirectly. Wraps a single
// INDIRECT_BUFFER packet.
struct LaunchTemplate {
PM4CMDINDIRECTBUFFER indirect_buffer;
};
// Desc: Structure used to determine the end of
// a kernel including cache flushes and writing to
// a user configurable memory location. Wraps a single
// RELEASE_MEM packet.
struct EndofKernelNotifyTemplate {
PM4CMDRELEASEMEM release_mem;
};
// Desc: Structure used to perform various atomic
// operations - add, subtract, increment, etc.
// Wraps a single atomic packet.
struct AtomicTemplate {
PM4CMDATOMIC atomic;
};
// Desc: Structure used to make the execution of a Gpu
// command stream conditional. Wraps a single conditional
// execute packet.
struct ConditionalExecuteTemplate {
PM4CMDCONDEXEC_CI conditional;
};
// Desc: PM4 command to write a 32-bit value into a memory
// location accessible to Gpu. The value to write is stored
// directly after the WRITE_DATA packet fields.
struct WriteDataTemplate {
PM4CMDWRITEDATA write_data;
uint32_t write_data_value;
};
// Desc: PM4 command to write a 64-bit value into a memory
// location accessible to Gpu. The value to write is stored
// after the WRITE_DATA packet fields.
// NOTE(review): the uint64_t member typically raises the struct
// alignment to 8 bytes; this presumably relies on
// sizeof(PM4CMDWRITEDATA) being a multiple of 8 so that no padding
// is inserted before the value - confirm.
struct WriteData64Template {
PM4CMDWRITEDATA write_data;
uint64_t write_data_value;
};
// Desc: PM4 command to wait for a certain event before proceeding
// to process another command on the queue. Wraps a single
// WAIT_REG_MEM packet.
struct WaitRegMemTemplate {
PM4CMDWAITREGMEM wait_reg_mem;
};
// Desc: Fills in the SET_SH_REG header of a register block command.
// The packet size is taken from sizeof(T) in Dwords and the register
// offset is stored relative to PERSISTENT_SPACE_START.
template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
  const uint32_t dword_count = sizeof(T) / sizeof(uint32_t);
  pm4->cmd_set_data.header.u32All =
      PM4_TYPE_3_HDR(IT_SET_SH_REG, dword_count, ShaderCompute, 0);
  const uint32_t offset = reg_addr - PERSISTENT_SPACE_START;
  pm4->cmd_set_data.regOffset = offset;
}
// Desc: Fills in the type-3 header of a Gpu command for the given
// opcode. The packet size is taken from sizeof(T) in Dwords.
template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) {
  const uint32_t dword_count = sizeof(T) / sizeof(uint32_t);
  pm4->header.u32All = PM4_TYPE_3_HDR(op_code, dword_count, ShaderCompute, 0);
}
// Desc: Fills in the SET_CONFIG_REG header of a register block command.
// The packet size is taken from sizeof(T) in Dwords and the register
// offset is stored relative to CONFIG_SPACE_START.
template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) {
  const uint32_t dword_count = sizeof(T) / sizeof(uint32_t);
  pm4->cmd_set_data.header.u32All =
      PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, dword_count, ShaderCompute, 0);
  const uint32_t offset = reg_addr - CONFIG_SPACE_START;
  pm4->cmd_set_data.regOffset = offset;
}
} // gfx8
} // pm4_profile
#endif // _GFX8_CMDS_H_
+768
Просмотреть файл
@@ -0,0 +1,768 @@
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <sstream>
#include "gfx8_cmdwriter.h"
#include "gfxip/gfx8/gfx8_utils.h"
// RELEASE MEM DST SEL Definitions: values for the dstSel field of the
// RELEASE_MEM packet. Only MEMORY_CONTROLLER is used in this file.
#define RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER 0
#define RELEASE_MEM_DST_SEL_TC_L2 1
// RELEASE MEM CACHE POLICY Definitions: values for the cachePolicy field
// of the RELEASE_MEM packet. Only BYPASS is used in this file.
#define RELEASE_MEM_CACHE_POLICY_LRU 0
#define RELEASE_MEM_CACHE_POLICY_STREAM 1
#define RELEASE_MEM_CACHE_POLICY_BYPASS 2
// Debug helper: dumps a PM4 packet to std::clog as a sequence of 32-bit
// hexadecimal words, prefixed by the caller supplied name and the packet
// size in Dwords. Compiles to a no-op when NDEBUG is defined.
//
// @param command Packet to dump; it is read as sizeof(command) / 4 Dwords.
// @param name Label printed in front of the dump.
//
// The line is formatted in a private stream and written to std::clog in
// one piece, so no formatting flags (hex, left, fill) leak into std::clog.
template <class T>
static void PrintPm4Packet(const T& command, const char* name) {
#if !defined(NDEBUG)
  const uint32_t* words = reinterpret_cast<const uint32_t*>(&command);
  const uint32_t count = sizeof(command) / sizeof(uint32_t);

  std::ostringstream title;
  title << "'" << name << "' size(" << std::dec << count << ")";

  std::ostringstream line;
  // The title is left aligned and space padded to 40 columns.
  line << std::setw(40) << std::left << title.str() << ":";
  // The words must be right aligned: with left alignment the zero fill
  // would be appended after the digits and change the value shown.
  line << std::right << std::hex << std::setfill('0');
  for (uint32_t idx = 0; idx < count; ++idx) {
    line << " " << std::setw(8) << words[idx];
  }
  std::clog << line.str() << std::endl;
#else
  (void)command;
  (void)name;
#endif
}
// Dumps the packet through PrintPm4Packet, labelled with the name of the
// calling function, and then appends it to the command buffer.
// Wrapped in do { } while (0) so the two statements act as one statement:
// the macro stays correct under an unbraced if/else and the caller's
// trailing semicolon does not produce a stray empty statement.
#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \
  do {                                          \
    PrintPm4Packet(command, __FUNCTION__);      \
    AppendCommand(cmdbuf, command);             \
  } while (0)
namespace pm4_profile {
namespace gfx8 {
// Appends the raw bytes of a packet to the command buffer.
template <class T> void Gfx8CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) {
  const T* packet = &command;
  cmdbuf->AppendCommand(packet, sizeof(T));
}
// Zeroes the cached atomic packet, stamps its ATOMIC_MEM header and, when
// ATC is supported, sets bit 24 of the second ordinal.
void Gfx8CmdWriter::InitializeAtomicTemplate() {
  memset(&atomic_template_, 0, sizeof(atomic_template_));
  GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM__CI);
  if (!atc_support_) {
    return;
  }
  const uint32_t kAtcShift = 24;
  atomic_template_.atomic.ordinal2 |= 1 << kAtcShift;
}
// Zeroes the cached conditional execute packet, stamps its COND_EXEC
// header and, when ATC is supported, sets bit 24 of the fourth ordinal.
void Gfx8CmdWriter::InitializeConditionalTemplate() {
  memset(&conditional_template_, 0, sizeof(conditional_template_));
  gfx8::GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC);
  if (!atc_support_) {
    return;
  }
  const uint32_t kAtcShift = 24;
  conditional_template_.conditional.ordinal4 |= 1 << kAtcShift;
}
// Zeroes the cached indirect buffer packet, stamps its INDIRECT_BUFFER
// header and marks the buffer as valid.
void Gfx8CmdWriter::InitializeLaunchTemplate() {
  memset(&launch_template_, 0, sizeof(launch_template_));
  auto& ib = launch_template_.indirect_buffer;
  GenerateCmdHeader(&ib, IT_INDIRECT_BUFFER);
  ib.CI.valid = true;
}
void Gfx8CmdWriter::InitializeWriteDataTemplate() {
// Set the header of write data command
memset(&write_data_template_, 0, sizeof(write_data_template_));
// Initialize the header of command packet
PM4CMDWRITEDATA* command = &(write_data_template_.write_data);
uint32_t cmd_size = sizeof(write_data_template_) / sizeof(uint32_t);
command->ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, cmd_size, ShaderCompute, 0);
// Set the ATC bit of command template - specifies if the address
// belongs to system memory
write_data_template_.write_data.atc__CI = (atc_support_) ? 1 : 0;
// Set the bit to confirm the write operation and cache policy
write_data_template_.write_data.wrConfirm = 1;
write_data_template_.write_data.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS;
// Specify the module that will execute the write data command
write_data_template_.write_data.engineSel = WRITE_DATA_ENGINE_ME;
// Specify the class to which the write destination belongs
write_data_template_.write_data.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC;
}
void Gfx8CmdWriter::InitializeWriteData64Template() {
// Set the header of write data command
memset(&write_data64_template_, 0, sizeof(write_data64_template_));
// Initialize the header of command packet
PM4CMDWRITEDATA* command = &(write_data64_template_.write_data);
uint32_t cmd_size = sizeof(write_data64_template_) / sizeof(uint32_t);
command->ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, cmd_size, ShaderCompute, 0);
// Set the ATC bit of command template - specifies if the address
// belongs to system memory
write_data64_template_.write_data.atc__CI = (atc_support_) ? 1 : 0;
// Set the bit to confirm the write operation and cache policy
write_data64_template_.write_data.wrConfirm = 1;
write_data64_template_.write_data.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS;
// Specify the module that will execute the write data command
write_data64_template_.write_data.engineSel = WRITE_DATA_ENGINE_ME;
// Specify the class to which the write destination belongs
// write_data64_template_.write_data.dstSel = WRITE_DATA_DST_SEL_TCL2;
// TODO: For Hawaii bring up only.
write_data64_template_.write_data.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC;
}
// Prepares the cached barrier packet: an EVENT_WRITE carrying the
// CS_PARTIAL_FLUSH event and its matching event index.
void Gfx8CmdWriter::InitializeBarrierTemplate() {
  memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_));
  auto& event = pending_dispatch_template_.event_write;
  gfx8::GenerateCmdHeader(&event, IT_EVENT_WRITE);
  event.eventType = CS_PARTIAL_FLUSH;
  event.eventIndex = EventTypeToIndexTable[CS_PARTIAL_FLUSH];
}
// Prepares the cached ACQUIRE_MEM packet with a zero base address, an
// all-ones size and a zero poll interval.
void Gfx8CmdWriter::InitializeAcquireMemTemplate() {
  memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_));
  auto& acquire = invalidate_cache_template_.acquire_mem;
  gfx8::GenerateCmdHeader(&acquire, IT_ACQUIRE_MEM__CI__VI);

  // Base address of the synchronized range
  acquire.cpCoherBase.u32All = 0x00;
  acquire.cpCoherBaseHi.u32All = 0x00;

  // Size of the synchronized range
  acquire.cpCoherSize.u32All = 0xFFFFFFFF;
  acquire.cpCoherSizeHi.u32All = 0xFF;

  acquire.pollInterval = 0;
}
// Prepares the cached WAIT_REG_MEM packet: ATC bit, bypass cache policy,
// zero poll interval and the ME engine.
void Gfx8CmdWriter::InitializeWaitRegMemTemplate() {
  memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_));
  auto& wait = wait_reg_mem_template_.wait_reg_mem;
  gfx8::GenerateCmdHeader(&wait, IT_WAIT_REG_MEM);
  wait.atc__CI = atc_support_ ? 1 : 0;
  wait.cachePolicy__CI = 2;  // bypass
  wait.pollInterval = 0;
  wait.engine = WAIT_REG_MEM_ENGINE_ME;
}
// Stores the capability flags and builds all cached packet templates.
Gfx8CmdWriter::Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support) {
  // The flags are stored before any template is built because several
  // template initializers read atc_support_.
  atc_support_ = atc_support;
  pcie_atomic_support_ = pcie_atomic_support;

  InitializeLaunchTemplate();
  InitializeAtomicTemplate();
  InitializeConditionalTemplate();
  InitializeWriteDataTemplate();
  InitializeWriteData64Template();
  InitializeBarrierTemplate();
  InitializeAcquireMemTemplate();
  InitializeWaitRegMemTemplate();
}
// Appends a WAIT_REG_MEM packet that compares the masked value found at a
// memory address (mem_space == true) or register (mem_space == false)
// against wait_val, using an equal or not-equal test as selected by
// func_eq.
void Gfx8CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
                                           bool func_eq, uint32_t mask_val, uint32_t wait_val) {
  gfx8::WaitRegMemTemplate wait_cmd = wait_reg_mem_template_;
  auto& packet = wait_cmd.wait_reg_mem;

  // Space to which the address belongs and comparison requested by user
  packet.memSpace = mem_space ? WAIT_REG_MEM_SPACE_MEMORY : WAIT_REG_MEM_SPACE_REGISTER;
  packet.function = func_eq ? WAIT_REG_MEM_FUNC_EQUAL : WAIT_REG_MEM_FUNC_NOT_EQUAL;

  // Mask applied to the polled value and the value it is compared with
  packet.mask = mask_val;
  packet.reference = wait_val;

  packet.pollAddressLo = Low32(wait_addr);
  if (mem_space) {
    // Only a memory address has an upper half and an alignment constraint
    assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned");
    packet.pollAddressHi = High32(wait_addr);
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, wait_cmd);
}
// Appends a command that stores a 64-bit value at a host address: an
// atomic swap when PCIe atomics are supported, a WRITE_DATA otherwise.
void Gfx8CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) {
  if (!pcie_atomic_support_) {
    BuildWriteData64Command(cmdbuf, addr, value);
    return;
  }
  BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr,
                      value);
}
// Appends an INDIRECT_BUFFER packet referencing cmd_size bytes of
// commands located at cmd_addr. The size is encoded in Dwords.
void Gfx8CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
                                           std::size_t cmd_size) {
  gfx8::LaunchTemplate launch = launch_template_;
  auto& ib = launch.indirect_buffer;
  ib.ibBaseLo = PtrLow32(cmd_addr);
  ib.ibBaseHi = PtrHigh32(cmd_addr);
  ib.CI.ibSize = cmd_size / sizeof(uint32_t);
  APPEND_COMMAND_WRAPPER(cmdbuf, launch);
}
// Appends a RELEASE_MEM packet for the BOTTOM_OF_PIPE_TS event that
// writes a 32-bit value to write_addr, optionally raising an interrupt.
void Gfx8CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
                                      bool interrupt) {
  gfx8::EndofKernelNotifyTemplate eopCmd;
  memset(&eopCmd, 0, sizeof(eopCmd));
  auto& release = eopCmd.release_mem;
  gfx8::GenerateCmdHeader(&release, IT_RELEASE_MEM__CI__VI);

  // Event the CP waits for
  release.eventType = BOTTOM_OF_PIPE_TS;
  release.eventIndex = EventTypeToIndexTable[BOTTOM_OF_PIPE_TS];

  // Cache operations which complete before the write commences
  release.atc = atc_support_;
  release.l2Invlidate = true;
  release.l2WriteBack = true;

  // Destination is memory, with the write bypassing the cache
  release.cachePolicy = RELEASE_MEM_CACHE_POLICY_BYPASS;
  release.dstSel = RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER;

  // User specified address and value
  const uint64_t target = uint64_t(write_addr);
  release.ordinal4 = Low32(target);
  release.addrHi = High32(target);
  release.dataLo = Low32(write_val);
  release.dataHi = High32(write_val);
  release.dataSel = EVENTWRITEEOP_DATA_SEL_SEND_DATA32;

  // Host either polls or waits for an interrupt
  release.intSel =
      interrupt ? EVENTWRITEEOP_INT_SEL_SEND_INT_ON_CONFIRM : EVENTWRITEEOP_INT_SEL_NONE;

  APPEND_COMMAND_WRAPPER(cmdbuf, eopCmd);
}
// Appends an ACQUIRE_MEM packet with the TC action and TC write-back
// action bits enabled.
void Gfx8CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) {
  gfx8::AcquireMemTemplate fence = invalidate_cache_template_;
  // wbINVL2 by default writes-back and invalidates both L1 and L2
  const uint32_t coher_cntl =
      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI;
  fence.acquire_mem.coherCntl = coher_cntl;
  APPEND_COMMAND_WRAPPER(cmdbuf, fence);
}
// PM4 packet for profilers.
// PACKET3(cmd, count) builds a header Dword from the 0xC0000000 marker,
// (count - 1) shifted to bit 16 and the opcode shifted to bit 8.
// Neither argument is masked, so callers must pass values in range.
#define PM4_PACKET3 (0xC0000000)
#define PM4_PACKET3_CMD_SHIFT 8
#define PM4_PACKET3_COUNT_SHIFT 16
#define PACKET3(cmd, count) \
(PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | ((cmd) << PM4_PACKET3_CMD_SHIFT))
// Raw three-Dword packet used by the register write builders:
// header, register offset, value.
typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket;
// Raw seven-Dword packet used by BuildCacheFlushPacket.
typedef struct WriteEventPacket_ { uint32_t item[7]; } WriteEventPacket;
// Appends an EVENT_WRITE packet for a perf counter start, stop or sample
// event. Any other event id trips an assert in debug builds and is
// emitted as Reserved_0x00 otherwise.
void Gfx8CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) {
  VGT_EVENT_TYPE vgt_event = Reserved_0x00;
  switch (event) {
    case kPerfCntrsStart: vgt_event = PERFCOUNTER_START; break;
    case kPerfCntrsStop: vgt_event = PERFCOUNTER_STOP; break;
    case kPerfCntrsSample: vgt_event = PERFCOUNTER_SAMPLE; break;
    default: assert(false && "Illegal VGT Event Id");
  }

  PM4CMDEVENTWRITE cp_event_initiator;
  cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 1);
  cp_event_initiator.ordinal2 = 0;
  cp_event_initiator.eventType = vgt_event;
  cp_event_initiator.eventIndex = EventTypeToIndexTable[vgt_event];
  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
}
// Appends a SET_UCONFIG_REG packet (graphics shader type) writing value
// to the register at addr, encoded relative to the UCONFIG space start.
void Gfx8CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  const uint32_t header = (PM4_TYPE_3_HDR(
      IT_SET_UCONFIG_REG__CI__VI, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0));
  const uint32_t offset = (addr - UCONFIG_SPACE_START__CI__VI);
  WriteRegPacket packet = {{header, offset, value}};
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends a SET_UCONFIG_REG packet (compute shader type) writing value
// to the register at addr, encoded relative to the UCONFIG space start.
void Gfx8CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  const uint32_t header = (PM4_TYPE_3_HDR(
      IT_SET_UCONFIG_REG__CI__VI, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderCompute, 0));
  const uint32_t offset = (addr - UCONFIG_SPACE_START__CI__VI);
  WriteRegPacket packet = {{header, offset, value}};
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends a SET_SH_REG packet writing value to the register at addr,
// encoded relative to PERSISTENT_SPACE_START.
void Gfx8CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  const uint32_t header =
      (PM4_TYPE_3_HDR(IT_SET_SH_REG, 1 + PM4_CMD_SET_SH_REG_DWORDS, ShaderCompute, 0));
  const uint32_t offset = (addr - PERSISTENT_SPACE_START);
  WriteRegPacket packet = {{header, offset, value}};
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends a COPY_DATA packet that copies from the given source selector
// and address to the memory location dst_addr. The size argument is
// stored in the countSel field and wait in the wrConfirm field.
void Gfx8CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
                                        uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
                                        bool wait) {
  PM4CMDCOPYDATA copy;
  memset(&copy, 0, sizeof(PM4CMDCOPYDATA));
  copy.header.u32All = PACKET3(IT_COPY_DATA, 5);

  // Source attributes
  copy.srcAtc__CI = atc_support_;
  copy.srcCachePolicy__CI = COPY_DATA_SRC_CACHE_POLICY_BYPASS;
  copy.srcSel = src_sel;

  // Destination attributes
  copy.dstAtc__CI = atc_support_;
  copy.dstSel = COPY_DATA_SEL_DST_ASYNC_MEMORY;
  copy.dstCachePolicy__CI = COPY_DATA_DST_CACHE_POLICY_BYPASS;

  // Addresses
  copy.srcAddressLo = src_addr_lo;
  copy.srcAddressHi = src_addr_hi;
  copy.dstAddressLo = PtrLow32(dst_addr);
  copy.dstAddressHi = PtrHigh32(dst_addr);

  copy.countSel = size;
  copy.wrConfirm = wait;
  copy.engineSel = COPY_DATA_ENGINE_ME;

  APPEND_COMMAND_WRAPPER(cmdbuf, copy);
}
// Appends a fixed seven-Dword ACQUIRE_MEM packet used as a cache flush.
void Gfx8CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) {
  const uint32_t header = PACKET3(IT_ACQUIRE_MEM__CI__VI, 6);
  WriteEventPacket packet = {{
      header,
      0x28C00000,
      0xFFFFFFFF,
      0,
      0,
      0,
      0x00000004,
  }};
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends the barrier packet followed by the cache flush packet.
void Gfx8CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) {
  this->BuildBarrierCommand(cmdbuf);
  this->BuildCacheFlushPacket(cmdbuf);
}
// Appends an EVENT_WRITE packet for a perf counter or thread trace event,
// followed by a cache flush packet. An unknown event id trips an assert
// in debug builds and is emitted as Reserved_0x00 otherwise.
void Gfx8CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) {
  VGT_EVENT_TYPE vgt_type = Reserved_0x00;
  switch (vgtEvent) {
    case kPerfCntrsStart: vgt_type = PERFCOUNTER_START; break;
    case kPerfCntrsStop: vgt_type = PERFCOUNTER_STOP; break;
    case kPerfCntrsSample: vgt_type = PERFCOUNTER_SAMPLE; break;
    case kThrdTraceStart: vgt_type = THREAD_TRACE_START; break;
    case kThrdTraceStop: vgt_type = THREAD_TRACE_STOP; break;
    case kThrdTraceFlush: vgt_type = THREAD_TRACE_FLUSH; break;
    case kThrdTraceFinish: vgt_type = THREAD_TRACE_FINISH; break;
    default: assert(false && "Illegal VGT Event Id");
  }

  PM4CMDEVENTWRITE cp_event_initiator;
  cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 1);
  cp_event_initiator.ordinal2 = 0;
  cp_event_initiator.eventType = vgt_type;
  cp_event_initiator.eventIndex = EventTypeToIndexTable[vgt_type];
  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);

  // Check If I should be issuing a cache flush operation as well
  // test and remove it
  BuildCacheFlushPacket(cmdbuf);
}
// Appends a SET_CONFIG_REG packet (graphics shader type) writing value
// to the register at addr, encoded relative to CONFIG_SPACE_START.
void Gfx8CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  const uint32_t header =
      (PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0));
  const uint32_t offset = addr - CONFIG_SPACE_START;
  WriteRegPacket packet = {{header, offset, value}};
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends an EVENT_WRITE query packet carrying a result address.
// No event id is supported yet: every call trips the assert in debug
// builds and emits a Reserved_0x00 event otherwise.
void Gfx8CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) {
  PM4CMDEVENTWRITEQUERY cp_event_initiator;
  cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 3);
  cp_event_initiator.ordinal2 = 0;

  // Add a mapping from event ids to VGT event types here once some
  // events are supported
  (void)event;
  const VGT_EVENT_TYPE vgt_type = Reserved_0x00;
  assert(false && "Illegal VGT Event Id");

  cp_event_initiator.eventType = vgt_type;
  cp_event_initiator.eventIndex = EventTypeToIndexTable[vgt_type];

  // The result address must be 8 byte aligned
  const uint32_t addr_lo = PtrLow32(addr);
  const uint32_t addr_hi = PtrHigh32(addr);
  assert((addr_lo & 0x7) == 0);

  cp_event_initiator.ordinal3 = 0;
  cp_event_initiator.ordinal4 = 0;
  cp_event_initiator.addressLo = addr_lo;
  cp_event_initiator.addressHi = addr_hi;
  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
}
// Appends the cached barrier packet (an EVENT_WRITE with the
// CS_PARTIAL_FLUSH event, see InitializeBarrierTemplate) to cmdBuf.
void Gfx8CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) {
APPEND_COMMAND_WRAPPER(cmdBuf, pending_dispatch_template_);
}
// Copies count Dwords from src_addr to dst_addr.
void Gfx8CmdWriter::WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr) {
  const size_t byte_count = count * sizeof(uint32_t);
  memcpy(dst_addr, src_addr, byte_count);
}
// Appends a 32-bit atomic packet operating on addr.
// Increment and decrement use an operand of 1; add, subtract and swap use
// value; the compare-and-swap variants also use compare. The blocking
// variant additionally sets command to 1 and the loop interval to 128.
void Gfx8CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op,
                                      volatile uint32_t* addr, uint32_t value,
                                      uint32_t compare) {
  gfx8::AtomicTemplate packet = atomic_template_;
  auto& cmd = packet.atomic;

  // make sure the destination address is aligned
  const uint32_t addr_lo = PtrLow32((void*)addr);
  const uint32_t addr_hi = PtrHigh32((void*)addr);
  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");
  cmd.addressLo = addr_lo;
  cmd.addressHi = addr_hi;

  switch (atomic_op) {
    case CommandWriter::kAtomicTypeIncrement:
      cmd.atomOp = TC_OP_ATOMIC_ADD_RTN_32;
      cmd.srcDataLo = 1;
      break;
    case CommandWriter::kAtomicTypeDecrement:
      cmd.atomOp = TC_OP_ATOMIC_SUB_RTN_32;
      cmd.srcDataLo = 1;
      break;
    case CommandWriter::kAtomicTypeCompareAndSwap:
      cmd.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32;
      cmd.srcDataLo = value;
      cmd.cmpDataLo = compare;
      break;
    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      cmd.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32;
      cmd.srcDataLo = value;
      cmd.cmpDataLo = compare;
      cmd.command = 1;
      cmd.loopInterval = 128;
      break;
    case CommandWriter::kAtomicAdd:
      cmd.atomOp = TC_OP_ATOMIC_ADD_RTN_32;
      cmd.srcDataLo = value;
      break;
    case CommandWriter::kAtomicSubtract:
      cmd.atomOp = TC_OP_ATOMIC_SUB_RTN_32;
      cmd.srcDataLo = value;
      break;
    case CommandWriter::kAtomicSwap:
      cmd.atomOp = TC_OP_ATOMIC_SWAP_RTN_32;
      cmd.srcDataLo = value;
      break;
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends a 64-bit atomic packet operating on addr.
// Increment and decrement use an operand of 1 (low Dword only); add,
// subtract and swap use value; the compare-and-swap variants also use
// compare. The blocking variant additionally sets command to 1 and the
// loop interval to 128.
void Gfx8CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op,
                                        volatile uint64_t* addr, uint64_t value,
                                        uint64_t compare) {
  AtomicTemplate packet = atomic_template_;
  auto& cmd = packet.atomic;

  // make sure the destination address is aligned
  const uint32_t addr_lo = PtrLow32((void*)addr);
  const uint32_t addr_hi = PtrHigh32((void*)addr);
  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");
  cmd.addressLo = addr_lo;
  cmd.addressHi = addr_hi;
  cmd.atc = atc_support_ ? 1 : 0;
  cmd.cachePolicy = 2;

  // Split the 64-bit operands once; each case picks what it needs
  const uint32_t value_lo = Low32(value);
  const uint32_t value_hi = High32(value);
  const uint32_t compare_lo = Low32(compare);
  const uint32_t compare_hi = High32(compare);

  switch (atomic_op) {
    case CommandWriter::kAtomicTypeIncrement:
      cmd.atomOp = TC_OP_ATOMIC_ADD_RTN_64;
      cmd.srcDataLo = 1;
      break;
    case CommandWriter::kAtomicTypeDecrement:
      cmd.atomOp = TC_OP_ATOMIC_SUB_RTN_64;
      cmd.srcDataLo = 1;
      break;
    case CommandWriter::kAtomicTypeCompareAndSwap:
      cmd.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64;
      cmd.srcDataLo = value_lo;
      cmd.srcDataHi = value_hi;
      cmd.cmpDataLo = compare_lo;
      cmd.cmpDataHi = compare_hi;
      break;
    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      cmd.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64;
      cmd.srcDataLo = value_lo;
      cmd.srcDataHi = value_hi;
      cmd.cmpDataLo = compare_lo;
      cmd.cmpDataHi = compare_hi;
      cmd.command = 1;
      cmd.loopInterval = 128;
      break;
    case CommandWriter::kAtomicAdd:
      cmd.atomOp = TC_OP_ATOMIC_ADD_RTN_64;
      cmd.srcDataLo = value_lo;
      cmd.srcDataHi = value_hi;
      break;
    case CommandWriter::kAtomicSubtract:
      cmd.atomOp = TC_OP_ATOMIC_SUB_RTN_64;
      cmd.srcDataLo = value_lo;
      cmd.srcDataHi = value_hi;
      break;
    case CommandWriter::kAtomicSwap:
      cmd.atomOp = TC_OP_ATOMIC_SWAP_RTN_64;
      cmd.srcDataLo = value_lo;
      cmd.srcDataHi = value_hi;
      break;
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Returns the size of the atomic packet in Dwords.
size_t Gfx8CmdWriter::SizeOfAtomicPacket() const {
  const size_t packet_bytes = sizeof(AtomicTemplate);
  return packet_bytes / sizeof(uint32_t);
}
// Appends a conditional execute packet whose boolean lives at signal and
// whose execCount field is set to count.
void Gfx8CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) {
  ConditionalExecuteTemplate packet = conditional_template_;
  auto& cond = packet.conditional;

  const uint32_t addr_lo = PtrLow32(signal);
  const uint32_t addr_hi = PtrHigh32(signal);
  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");

  cond.boolAddrLo = addr_lo;
  cond.boolAddrHi = addr_hi;
  cond.execCount = count;
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends a WRITE_DATA packet that stores the 32-bit write_value at
// write_addr.
void Gfx8CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
                                          uint32_t write_value) {
  // Start from the pre-initialized packet
  gfx8::WriteDataTemplate packet = write_data_template_;

  // Destination address, split in two Dwords
  packet.write_data.dstAddrLo = PtrLow32(write_addr);
  packet.write_data.dstAddrHi = PtrHigh32(write_addr);

  // Value to write
  packet.write_data_value = write_value;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends a WRITE_DATA packet that stores the 64-bit write_value at
// write_addr.
void Gfx8CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
                                            uint64_t write_value) {
  // Start from the pre-initialized packet
  gfx8::WriteData64Template packet = write_data64_template_;

  // Destination address, split in two Dwords
  packet.write_data.dstAddrLo = PtrLow32(write_addr);
  packet.write_data.dstAddrHi = PtrHigh32(write_addr);

  // Value to write
  packet.write_data_value = write_value;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
// Appends an ACQUIRE_MEM packet that flushes the caches selected in
// options (l1, l2, icache, kcache).
// writeAddr and writeVal are not used here: per the original note they
// are kept only so the interface matches the one used on SI.
void Gfx8CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options,
                                       uint32_t* writeAddr, uint32_t writeVal) {
  // Any write back address, NULL included, is accepted
  (void)writeAddr;
  (void)writeVal;

  PM4CMDACQUIREMEM flush;
  memset(&flush, 0, sizeof(flush));
  gfx8::GenerateCmdHeader(&flush, IT_ACQUIRE_MEM__CI__VI);

  // Base address of the memory being synchronized
  flush.cpCoherBase.u32All = 0;
  flush.cpCoherBaseHi.u32All = 0;

  // Size of the memory being synchronized:
  // COHER_SIZE_256B_MASK = 0xffffffffL
  // COHER_SIZE_HI_256B_MASK__CI__VI = 0x000000ffL
  flush.cpCoherSize.u32All = CP_COHER_SIZE__COHER_SIZE_256B_MASK;
  flush.cpCoherSizeHi.u32All = CP_COHER_SIZE_HI__COHER_SIZE_HI_256B_MASK__CI__VI;

  // Interval to wait between an unsuccessful poll and the next one
  flush.pollInterval = 0x04;

  // Coherence control bits, one group per requested cache
  uint32_t coher_cntl = 0;
  if (options->l1) {
    coher_cntl |= CP_COHER_CNTL__TCL1_ACTION_ENA_MASK;
  }
  if (options->l2) {
    coher_cntl |=
        (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI);
  }
  if (options->icache) {
    coher_cntl |= CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK;
  }
  if (options->kcache) {
    coher_cntl |= CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK;
  }
  flush.coherCntl = coher_cntl;

  APPEND_COMMAND_WRAPPER(cmdbuf, flush);
}
// Appends a DMA_DATA packet copying copySize bytes from srcAddr to
// dstAddr. waitForConfirm is stored in the rawWait field of the packet.
void Gfx8CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr,
                                       uint32_t copySize, bool waitForConfirm) {
  PM4CMDDMADATA dma;
  memset(&dma, 0, sizeof(PM4CMDDMADATA));
  dma.header.u32All =
      (PM4_TYPE_3_HDR(IT_DMA_DATA__CI__VI, PM4_CMD_DMA_DATA_DWORDS, ShaderCompute, 0));

  // Id of Micro Engine
  dma.engine = 0;

  // Source buffer: location, ATC property, cache policy and volatile.
  // A cache policy of 1 means to Stream
  dma.srcSel = 0;
  dma.srcATC = atc_support_;
  dma.srcCachePolicy = 1;
  dma.srcVolatile = 0;
  dma.srcAddrLoOrData = PtrLow32(srcAddr);
  dma.srcAddrHi = PtrHigh32(srcAddr);

  // Destination buffer: location, ATC property, cache policy and
  // volatile. A cache policy of 1 means to Stream
  dma.dstSel = 0;
  dma.dstATC = atc_support_;
  dma.dstCachePolicy = 1;
  dma.dstVolatile = 0;
  dma.dstAddrLo = PtrLow32(dstAddr);
  dma.dstAddrHi = PtrHigh32(dstAddr);

  // Number of bytes to copy. The command restricts
  // the size to be (2 MB - 1) - 21 Bits
  assert(copySize < 0x1FFFFF);
  dma.command.byteCount = copySize;

  // Indicate that DMA Cmd should wait if its source
  // is the destination of a previous DMA Cmd
  dma.command.rawWait = waitForConfirm;

  APPEND_COMMAND_WRAPPER(cmdbuf, dma);
}
} // gfx8
} // pm4_profile
+201
Просмотреть файл
@@ -0,0 +1,201 @@
#ifndef _GFX8_CMDWRITER_H_
#define _GFX8_CMDWRITER_H_
#include "cmdwriter.h"
#include "gfx8_cmds.h"
namespace pm4_profile {
namespace gfx8 {
/// @brief class Gfx8CmdWriter implements the virtual class CommandWriter
/// for Sea Islands (CI) and VI chipset
class Gfx8CmdWriter : public CommandWriter {
public:
Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support);
/// @brief Returns the 32-bit NOOP command Dword for SI/CI/VI chipsets.
/// The second macro argument is the COUNT field of the NOOP command, which
/// specifies the number of Dwords to skip. To skip ZERO Dwords the field
/// should hold 0x3FFF. Since the macro decrements its second parameter by
/// TWO, an artifact of its definition, 0x4001 (0x3FFF + 2) is passed.
///
/// @return NOOP command Dword
inline uint32_t GetNoOpCmd() {
  return PM4_TYPE_3_HDR(IT_NOP, 0x4001, ShaderCompute, 0);
}
void BuildBarrierCommand(CmdBuf* cmdBuf);
void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size);
void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
bool interrupt);
void BuildBarrierFenceCommands(CmdBuf* cmdbuf);
void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event);
void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq,
uint32_t mask_val, uint32_t wait_val);
void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
/// @brief Build CP command to program a Gpu register
///
/// @param cmdbuf Pointer to command buffer to be appended
/// @param addr Register to be programmed
/// @param value Value to write into register
///
/// @return void
void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait);
void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf);
// Will issue a VGT event including a cache flush later on
void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent);
void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr);
void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
uint32_t value, uint32_t compare);
void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
uint64_t value = 0, uint64_t compare = 0);
size_t SizeOfAtomicPacket() const;
void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count);
void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value);
void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value);
void BuildCacheFlushPacket(CmdBuf* cmdbuf);
/// Writes into input buffer Gpu commands to flush its cache. It is
/// necessary that the buffer provided for flush commands is large
/// enough to accommodate the full set of commands. It should be at
/// least 512 bytes.
///
/// @param tsCmdBuf Buffer to write commands to.
/// @param writeAddr Registered address into which GPU should write
/// a user provided value upon executing the flush commands.
/// @param writeVal User provided value written by GPU at user provided
/// address, upon executing the flush commands.
///
/// @return void
void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr,
uint32_t writeVal);
/// Builds Gpu command to copy data from source to destination buffer
/// using DMA engine.
///
/// @param cmdbuf Buffer updated with Gpu copy command
/// @param srcAddr Address of source buffer address
/// @param dstAddr Address of destination buffer address
/// @param copySize Size of data to copy in bytes
/// @param waitForCompletion if command should wait for copying to complete
void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize,
bool waitForCompletion);
protected:
/// @brief Copies data from source buffer to destination buffer
///
/// @param dst_addr Address of destination buffer data
///
/// @count Size of data to copy in 32-bit words
///
/// @param src_addr Address of buffer containing source data
///
/// @return void
virtual void WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr);
/// @brief Append an instance of Gpu command into input command buffer stream.
///
/// @param cmdbuf CommandWriter object appended with anohter Gpu command
///
/// @param cmd Gpu command to be appended into command buffer
///
/// @return void
template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd);
private:
/// @brief Initializes a Gpu command which can be used to
/// reference a Gpu command stream indirectly
void InitializeLaunchTemplate();
/// @brief Initializes a Gpu command to perform atomic operations
////
void InitializeAtomicTemplate();
/// @brief Initializes a Gpu command to allow conditional execution
/// of a Gpu command stream
void InitializeConditionalTemplate();
/// @brief Initializes a Gpu command to let command processor
/// wait for some update before letting other commands to be
/// processed
void InitializeWaitRegMemTemplate();
/// @brief Initializes the template for Barrier command.
/// Applications can use Barrier command to ensure their
/// command is executed only after all other commands have
/// completed their execution.
void InitializeBarrierTemplate();
void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value);
/// @brief Initializes Acquire Memory command template. Users
/// can submit this command to invalidate Gpu caches - L1 and
/// or L2.
void InitializeAcquireMemTemplate();
/// @brief Initializes an instance of Write Data command
/// for use by an application
void InitializeWriteDataTemplate();
void InitializeWriteData64Template();
/// @brief Instance of Gpu command to reference dispatch commands
LaunchTemplate launch_template_;
/// @brief Instance of Gpu command to use in performing atomic operations
AtomicTemplate atomic_template_;
/// @brief Instance of Gpu command to use in conditional execution
/// of a command stream
ConditionalExecuteTemplate conditional_template_;
/// @brief Instance of Pm4 command WRITE_DATA
WriteDataTemplate write_data_template_;
WriteData64Template write_data64_template_;
/// @brief Instance of Pm4 command EVENT_WRITE
BarrierTemplate pending_dispatch_template_;
/// @brief Instance of Pm4 command ACQUIRE_MEM
AcquireMemTemplate invalidate_cache_template_;
/// @brief Instance of Pm4 command WAIT_REG_MEM
WaitRegMemTemplate wait_reg_mem_template_;
/// @brief ATC support.
bool atc_support_;
/// @brief PCIe atomic support.
bool pcie_atomic_support_;
};
} // gfx8
} // pm4_profile
#endif // _GFX8_CMDWRITER_H_
+90
Просмотреть файл
@@ -0,0 +1,90 @@
#ifndef _GFX9_CMDS_H_
#define _GFX9_CMDS_H_
#include "gfxip/gfx9/gfx9_utils.h"
#include "gfxip/gfx9/gfx9_enum.h"
#include "gfxip/gfx9/gfx9_mask.h"
#include "gfxip/gfx9/gfx9_offset.h"
#include "gfxip/gfx9/gfx9_typedef.h"
#include "gfxip/gfx9/gfx9_registers.h"
#include "gfxip/gfx9/gfx9_pm4_it_opcodes.h"
#include "gfxip/gfx9/f32_mec_pm4_packets_vg10.h"
#include "gfxip/gfx9/f32_pfp_pm4_packets_vg10.h"
namespace pm4_profile {
namespace gfx9 {
/// @brief Fills in the header of a command that programs a shader (SH) register.
///
/// @param pm4 Command whose 'cmd_set_data' header and register offset are written
/// @param reg_addr Register address; stored relative to PERSISTENT_SPACE_START
template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
  // The register is encoded as an offset from the start of its address space.
  const uint32_t sh_reg_offset = reg_addr - PERSISTENT_SPACE_START;
  // Packet length is the size of the whole command expressed in dwords.
  pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, sizeof(T) / sizeof(uint32_t));
  pm4->cmd_set_data.bitfields2.reg_offset = sh_reg_offset;
}
/// @brief Fills in the type-3 header of an arbitrary Gpu command.
///
/// @param pm4 Command whose 'header' field is written
/// @param op_code PM4 opcode to encode into the header
template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) {
  // The dword count passed to the header macro is derived from the command type itself.
  T& packet = *pm4;
  packet.header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t));
}
/// @brief Fills in the header of a command that programs a configuration register.
///
/// @param pm4 Command whose 'cmd_set_data' header and register offset are written
/// @param reg_addr Register address; stored relative to CONFIG_SPACE_START
template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) {
  // The register is encoded as an offset from the start of its address space.
  const uint32_t config_reg_offset = reg_addr - CONFIG_SPACE_START;
  // Packet length is the size of the whole command expressed in dwords.
  pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_CONFIG_REG, sizeof(T) / sizeof(uint32_t));
  pm4->cmd_set_data.bitfields2.reg_offset = config_reg_offset;
}
/// @brief Structure used to issue a Gpu Barrier command.
/// Holds a single EVENT_WRITE packet.
struct BarrierTemplate {
PM4MEC_EVENT_WRITE event_write;
};
/// @brief Structure used to configure the flushing of
/// various caches - instruction, constants, L1 and L2.
/// Holds a single ACQUIRE_MEM packet.
struct AcquireMemTemplate {
PM4MEC_ACQUIRE_MEM acquire_mem;
};
/// @brief Structure used to reference another Gpu command
/// indirectly. Generally used to reference a list of Gpu
/// commands (dispatch cmds) indirectly.
/// Holds a single INDIRECT_BUFFER packet.
struct LaunchTemplate {
PM4MEC_INDIRECT_BUFFER indirect_buffer;
};
/// @brief Structure used to determine the end of
/// a kernel including cache flushes and writing to
/// a user configurable memory location.
/// Holds a single RELEASE_MEM packet.
struct EndofKernelNotifyTemplate {
PM4MEC_RELEASE_MEM release_mem;
};
/// @brief Structure used to perform various atomic
/// operations - add, subtract, increment, etc.
/// Holds a single ATOMIC_MEM packet.
struct AtomicTemplate {
PM4MEC_ATOMIC_MEM atomic;
};
/// @brief PM4 command to write a 32-bit value into a memory
/// location accessible to Gpu. The WRITE_DATA packet is
/// immediately followed by its one-dword payload.
struct WriteDataTemplate {
PM4MEC_WRITE_DATA write_data;
// Payload dword that follows the packet.
uint32_t write_data_value;
};
/// @brief PM4 command to write a 64-bit value into a memory
/// location accessible to Gpu. The WRITE_DATA packet is
/// followed by its two-dword payload.
struct WriteData64Template {
PM4MEC_WRITE_DATA write_data;
// NOTE(review): assumes sizeof(PM4MEC_WRITE_DATA) is a multiple of 8 so the
// compiler inserts no padding before this 64-bit payload - TODO confirm.
uint64_t write_data_value;
};
/// @brief PM4 command to wait for a certain event before proceeding
/// to process another command on the queue.
/// Holds a single WAIT_REG_MEM packet.
struct WaitRegMemTemplate {
PM4MEC_WAIT_REG_MEM wait_reg_mem;
};
} // gfx9
} // pm4_profile
#endif // _GFX9_CMDS_H_
+743
Просмотреть файл
@@ -0,0 +1,743 @@
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <sstream>
#include "gfx9_cmdwriter.h"
/// @brief Debug helper that dumps a command to std::clog as a row of hex dwords.
///
/// The output is a left-aligned label "'<name>' size(<dwords>)" padded to 40
/// columns, a colon, and then every dword of the command as eight zero-padded
/// hex digits. Compiles to a no-op when NDEBUG is defined.
///
/// @param command Command to dump; read as sizeof(command) / 4 dwords
/// @param name Label printed in front of the dump
template <class T>
static void PrintPm4Packet(const T& command, const char* name) {
#if ! defined(NDEBUG)
  const uint32_t* cmd = reinterpret_cast<const uint32_t*>(&command);
  const uint32_t size = sizeof(command) / sizeof(uint32_t);
  std::ostringstream oss;
  oss << "'" << name << "' size(" << std::dec << size << ")";
  // std::left, std::hex and the fill character are sticky: remember the
  // stream state so this helper does not alter how later output is formatted.
  const std::ios_base::fmtflags saved_flags = std::clog.flags();
  const char saved_fill = std::clog.fill();
  std::clog << std::setw(40) << std::left << oss.str() << ":";
  // Dwords must be right-aligned, otherwise the '0' fill lands on the right
  // and 0x1 is rendered as "10000000".
  std::clog << std::right << std::hex << std::setfill('0');
  for (uint32_t idx = 0; idx < size; idx++) {
    std::clog << " " << std::setw(8) << cmd[idx];
  }
  std::clog << std::setfill(' ') << std::endl;
  std::clog.flags(saved_flags);
  std::clog.fill(saved_fill);
#else
  (void)command;
  (void)name;
#endif
}
// Dumps 'command' via PrintPm4Packet (labelled with the calling function's
// name) and then appends it to 'cmdbuf'. The do/while(0) wrapper makes the
// two calls behave as one statement, so the macro is safe inside an unbraced
// if/else; call sites supply the trailing semicolon.
#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \
  do {                                          \
    PrintPm4Packet(command, __FUNCTION__);      \
    AppendCommand(cmdbuf, command);             \
  } while (0)
namespace pm4_profile {
namespace gfx9 {
/// @brief Appends the bytes of a fully built command to the command buffer.
///
/// @param cmdbuf Command buffer receiving the command
/// @param command Command to append; sizeof(T) bytes are handed over
template <class T> void Gfx9CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) {
  const size_t byte_count = sizeof(T);
  cmdbuf->AppendCommand(&command, byte_count);
}
void Gfx9CmdWriter::InitializeLaunchTemplate() {
memset(&launch_template_, 0, sizeof(launch_template_));
GenerateCmdHeader(&launch_template_.indirect_buffer, IT_INDIRECT_BUFFER);
}
void Gfx9CmdWriter::InitializeAtomicTemplate() {
memset(&atomic_template_.atomic, 0, sizeof(atomic_template_));
GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM);
// Specify the micro engine and cache policies
PM4MEC_ATOMIC_MEM* atomicCmd = &atomic_template_.atomic;
atomicCmd->bitfields2.cache_policy = cache_policy__mec_atomic_mem__stream;
}
void Gfx9CmdWriter::InitializeBarrierTemplate() {
memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_));
GenerateCmdHeader(&pending_dispatch_template_.event_write, IT_EVENT_WRITE);
MEC_EVENT_WRITE_event_index_enum index;
index = event_index__mec_event_write__cs_partial_flush;
pending_dispatch_template_.event_write.bitfields2.event_index = index;
pending_dispatch_template_.event_write.bitfields2.event_type = CS_PARTIAL_FLUSH;
}
void Gfx9CmdWriter::InitializeAcquireMemTemplate() {
memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_));
GenerateCmdHeader(&invalidate_cache_template_.acquire_mem, IT_ACQUIRE_MEM);
// Specify the CP module which will process this packet
PM4MEC_ACQUIRE_MEM* acquire_mem = &invalidate_cache_template_.acquire_mem;
// Specify the size of memory to invalidate. Size is
// specified in terms of 256 byte chunks. A coher_size
// of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits)
// of memory. The field coher_size_hi specifies memory from
// bits 40-64 for a total of 256 TB.
acquire_mem->coher_size = 0xFFFFFFFF;
acquire_mem->bitfields4.coher_size_hi = 0xFFFFFF;
// Specify the address of memory to invalidate. The
// address must be 256 byte aligned.
acquire_mem->coher_base_lo = 0x00;
acquire_mem->bitfields6.coher_base_hi = 0x00;
// Specify the poll interval for determing if operation is complete
acquire_mem->bitfields7.poll_interval = 0x04;
}
void Gfx9CmdWriter::InitializeWaitRegMemTemplate() {
memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_));
GenerateCmdHeader(&wait_reg_mem_template_.wait_reg_mem, IT_WAIT_REG_MEM);
PM4MEC_WAIT_REG_MEM* wait_reg_mem = &wait_reg_mem_template_.wait_reg_mem;
wait_reg_mem->bitfields7.poll_interval = 0x04;
wait_reg_mem->bitfields2.operation = operation__mec_wait_reg_mem__wait_reg_mem;
}
/// @brief Initializes a WRITE_DATA packet whose payload follows it in memory.
///
/// @param write_data Packet to initialize; it is zeroed first
/// @param bit32 true for a one-dword payload, false for a two-dword payload
void Gfx9CmdWriter::InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32) {
  memset(write_data, 0, sizeof(PM4MEC_WRITE_DATA));

  // The header size covers the packet itself plus the payload dwords.
  uint32_t payload_dwords;
  if (bit32) {
    payload_dwords = 1;
  } else {
    payload_dwords = 2;
  }
  const uint32_t packet_dwords =
      payload_dwords + (sizeof(PM4MEC_WRITE_DATA) / sizeof(uint32_t));
  write_data->ordinal1 = PM4_TYPE3_HDR(IT_WRITE_DATA, packet_dwords);

  // Wait for write confirmation, use the 'stream' cache policy, advance the
  // address between payload dwords, and target memory as the destination.
  write_data->bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
  write_data->bitfields2.cache_policy = cache_policy__mec_write_data__stream;
  write_data->bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;
  write_data->bitfields2.dst_sel = dst_sel__mec_write_data__memory;
}
/// @brief Initializes the WRITE_DATA template that carries a 32-bit payload.
void Gfx9CmdWriter::InitializeWriteDataTemplate() {
  const bool payload_is_32bit = true;
  InitializeWriteDataTemplate(&write_data_template_.write_data, payload_is_32bit);
}
/// @brief Initializes the WRITE_DATA template that carries a 64-bit payload.
void Gfx9CmdWriter::InitializeWriteData64Template() {
  const bool payload_is_32bit = false;
  InitializeWriteDataTemplate(&write_data64_template_.write_data, payload_is_32bit);
}
// Currently a no-op: the conditional-execute template is not set up on this
// writer. The disabled code below is kept as a sketch of the intended
// initialization (COND_EXEC header plus an ATC bit when atc_support_ is set).
void Gfx9CmdWriter::InitializeConditionalTemplate() {
/*
memset(&conditional_template_.conditional, 0, sizeof(conditional_template_));
GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC);
if (atc_support_) {
const uint32_t kAtcShift = 24;
conditional_template_.conditional.ordinal4 |= 1 << kAtcShift;
}
*/
}
void Gfx9CmdWriter::InitializeEndOfKernelNotifyTemplate() {
memset(&notify_template_, 0, sizeof(notify_template_));
GenerateCmdHeader(&notify_template_.release_mem, IT_RELEASE_MEM);
// Set the event type to be bottom of pipe and cache policy
PM4MEC_RELEASE_MEM* rel_mem;
rel_mem = &notify_template_.release_mem;
rel_mem->bitfields2.event_type = BOTTOM_OF_PIPE_TS;
rel_mem->bitfields2.cache_policy = cache_policy__mec_release_mem__stream;
rel_mem->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
// Specify the attributes of source and destinations of data
rel_mem->bitfields3.int_sel = int_sel__mec_release_mem__none;
rel_mem->bitfields3.data_sel = data_sel__mec_release_mem__none;
rel_mem->bitfields3.dst_sel = dst_sel__mec_release_mem__memory_controller;
}
/// @brief Constructs the writer and pre-builds every command template.
///
/// @param atc_support Stored in atc_support_
/// @param pcie_atomic_support Stored in pcie_atomic_support_; consulted by
/// BuildUpdateHostAddress to choose between an atomic swap and a plain write
Gfx9CmdWriter::Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support) {
  // Capability flags are recorded before any template is built.
  atc_support_ = atc_support;
  pcie_atomic_support_ = pcie_atomic_support;

  // Command templates, built once and copied by the Build* methods.
  InitializeLaunchTemplate();
  InitializeAtomicTemplate();
  InitializeBarrierTemplate();
  InitializeAcquireMemTemplate();
  InitializeWaitRegMemTemplate();
  InitializeWriteDataTemplate();
  InitializeWriteData64Template();
  InitializeConditionalTemplate();
  InitializeEndOfKernelNotifyTemplate();
}
/// @brief Appends an INDIRECT_BUFFER command that references another command stream.
///
/// @param cmdbuf Command buffer receiving the command
/// @param cmd_addr Address of the referenced command stream; must be 4 byte aligned
/// @param cmd_size Size of the referenced command stream in bytes
void Gfx9CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
                                           std::size_t cmd_size) {
  // The low address is stored shifted by two, so the bottom two bits must be clear.
  uint64_t addr = uintptr_t(cmd_addr);
  assert(!(addr & 0x3) && "IndirectBuffer address must be 4 byte aligned");

  // Start from the pre-built template and fill in the per-call fields.
  LaunchTemplate launch = launch_template_;
  PM4MEC_INDIRECT_BUFFER& ib = launch.indirect_buffer;
  ib.bitfields2.ib_base_lo = (PtrLow32(cmd_addr) >> 2);
  ib.ib_base_hi = PtrHigh32(cmd_addr);

  // Non-privileged, valid buffer; size is given in dwords.
  ib.bitfields4.priv = 0;
  ib.bitfields4.valid = 1;
  ib.bitfields4.ib_size = cmd_size / sizeof(uint32_t);
  ib.bitfields4.cache_policy = cache_policy__mec_indirect_buffer__stream;

  APPEND_COMMAND_WRAPPER(cmdbuf, launch);
}
/// @brief Appends an ATOMIC_MEM command operating on a 32-bit value.
///
/// @param cmdbuf Command buffer receiving the command
/// @param atomic_op Operation to perform
/// @param addr Destination address; asserted to be 8 byte aligned
/// @param value Operand of the operation (must be 1 for increment/decrement)
/// @param compare Comparison value, used by the compare-and-swap operations only
void Gfx9CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
                                      uint32_t value, uint32_t compare) {
  // Work on a copy so the pre-built template is left untouched.
  AtomicTemplate packet = atomic_template_;
  PM4MEC_ATOMIC_MEM& cmd = packet.atomic;

  const uint32_t address_low = PtrLow32((void*)addr);
  const uint32_t address_high = PtrHigh32((void*)addr);
  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
  cmd.addr_lo = address_low;
  cmd.addr_hi = address_high;

  switch (atomic_op) {
    case CommandWriter::kAtomicTypeIncrement:
      assert(!(value != 0x01) && "Atomic Increment value should be 1");
      // Intentional fall-through: increment is an add of one.
    case CommandWriter::kAtomicAdd:
      cmd.src_data_lo = value;
      cmd.bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_32;
      break;

    case CommandWriter::kAtomicTypeDecrement:
      assert(!(value != 0x01) && "Atomic Decrement value should be 1");
      // Intentional fall-through: decrement is a subtract of one.
    case CommandWriter::kAtomicSubtract:
      cmd.src_data_lo = value;
      cmd.bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_32;
      break;

    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      // The blocking variant loops until the compare is satisfied ...
      cmd.bitfields9.loop_interval = 128;
      cmd.bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied;
      // ... and otherwise shares the compare-and-swap setup below.
    case CommandWriter::kAtomicTypeCompareAndSwap:
      cmd.src_data_lo = value;
      cmd.cmp_data_lo = compare;
      cmd.bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_32;
      break;

    case CommandWriter::kAtomicSwap:
      cmd.src_data_lo = value;
      cmd.bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_32;
      break;

    default:
      assert((false) && "Atomic operation id is invalid");
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
/// @brief Appends an ATOMIC_MEM command operating on a 64-bit value.
///
/// @param cmdbuf Command buffer receiving the command
/// @param atomic_op Operation to perform
/// @param addr Destination address; asserted to be 8 byte aligned
/// @param value Operand of the operation (must be 1 for increment/decrement)
/// @param compare Comparison value, used by the compare-and-swap operations only
void Gfx9CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op,
                                        volatile uint64_t* addr, uint64_t value, uint64_t compare) {
  // Work on a copy so the pre-built template is left untouched.
  AtomicTemplate packet = atomic_template_;
  PM4MEC_ATOMIC_MEM& cmd = packet.atomic;

  const uint32_t address_low = PtrLow32((void*)addr);
  const uint32_t address_high = PtrHigh32((void*)addr);
  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
  cmd.addr_lo = address_low;
  cmd.addr_hi = address_high;

  switch (atomic_op) {
    case CommandWriter::kAtomicTypeIncrement:
      assert(!(value != 0x01) && "Atomic Increment value should be 1");
      // Intentional fall-through: increment is an add of one.
    case CommandWriter::kAtomicAdd:
      cmd.src_data_lo = Low32(value);
      cmd.src_data_hi = High32(value);
      cmd.bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_64;
      break;

    case CommandWriter::kAtomicTypeDecrement:
      assert(!(value != 0x01) && "Atomic Decrement value should be 1");
      // Intentional fall-through: decrement is a subtract of one.
    case CommandWriter::kAtomicSubtract:
      cmd.src_data_lo = Low32(value);
      cmd.src_data_hi = High32(value);
      cmd.bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_64;
      break;

    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      // The blocking variant loops until the compare is satisfied ...
      cmd.bitfields9.loop_interval = 128;
      cmd.bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied;
      // ... and otherwise shares the compare-and-swap setup below.
    case CommandWriter::kAtomicTypeCompareAndSwap:
      cmd.src_data_lo = Low32(value);
      cmd.src_data_hi = High32(value);
      cmd.cmp_data_lo = Low32(compare);
      cmd.cmp_data_hi = High32(compare);
      cmd.bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_64;
      break;

    case CommandWriter::kAtomicSwap:
      cmd.src_data_lo = Low32(value);
      cmd.src_data_hi = High32(value);
      cmd.bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_64;
      break;

    default:
      assert((false) && "Atomic operation id is invalid");
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
/// @brief Appends the pre-built barrier (CS_PARTIAL_FLUSH event) command.
///
/// @param cmdBuf Command buffer receiving the command
void Gfx9CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) {
  // The barrier needs no per-call fields, so a plain copy of the template is appended.
  BarrierTemplate barrier = pending_dispatch_template_;
  APPEND_COMMAND_WRAPPER(cmdBuf, barrier);
}
/// @brief Appends a WRITE_DATA command that stores a 32-bit value in memory.
///
/// @param cmdbuf Command buffer receiving the command
/// @param write_addr Destination address; must be 4 byte aligned
/// @param write_value Value to store
void Gfx9CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
                                          uint32_t write_value) {
  // The low address is stored shifted by two, so the bottom two bits must be clear.
  uint64_t addr = uintptr_t(write_addr);
  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");

  // Start from the pre-built packet + payload and fill in address and value.
  WriteDataTemplate command = write_data_template_;
  PM4MEC_WRITE_DATA& packet = command.write_data;
  packet.dst_mem_addr_hi = PtrHigh32(write_addr);
  packet.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
  command.write_data_value = write_value;

  APPEND_COMMAND_WRAPPER(cmdbuf, command);
}
/// @brief Appends a WRITE_DATA command that stores a 64-bit value in memory.
///
/// @param cmdbuf Command buffer receiving the command
/// @param write_addr Destination address; must be 4 byte aligned
/// @param write_value Value to store
void Gfx9CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
                                            uint64_t write_value) {
  // The low address is stored shifted by two, so the bottom two bits must be clear.
  uint64_t addr = uintptr_t(write_addr);
  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");

  // Start from the pre-built packet + payload and fill in address and value.
  WriteData64Template command = write_data64_template_;
  PM4MEC_WRITE_DATA& packet = command.write_data;
  packet.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
  packet.dst_mem_addr_hi = PtrHigh32(write_addr);
  command.write_data_value = write_value;

  APPEND_COMMAND_WRAPPER(cmdbuf, command);
}
/// @brief Appends a WAIT_REG_MEM command that polls a register or memory location.
///
/// @param cmdbuf Command buffer receiving the command
/// @param mem_space true to poll memory, false to poll a register
/// @param wait_addr Address (or register) to poll; 4 byte aligned when in memory
/// @param func_eq true to wait until equal, false to wait until not equal
/// @param mask_val Mask applied to the polled value
/// @param wait_val Reference value the masked value is compared against
void Gfx9CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
                                           bool func_eq, uint32_t mask_val, uint32_t wait_val) {
  WaitRegMemTemplate wait_cmd = wait_reg_mem_template_;
  PM4MEC_WAIT_REG_MEM& poll = wait_cmd.wait_reg_mem;

  // Address space the polled location belongs to.
  poll.bitfields2.mem_space = mem_space ? mem_space__mec_wait_reg_mem__memory_space
                                        : mem_space__mec_wait_reg_mem__register_space;

  // Comparison function requested by the caller.
  poll.bitfields2.function = func_eq ? function__mec_wait_reg_mem__equal_to_the_reference_value
                                     : function__mec_wait_reg_mem__not_equal_reference_value;

  // Reference value and the mask applied before comparing.
  poll.reference = wait_val;
  poll.mask = mask_val;

  // A memory location has to be dword aligned because the low address is
  // stored shifted by two.
  if (mem_space) {
    assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned");
  }
  poll.bitfields3a.mem_poll_addr_lo = (Low32(wait_addr) >> 2);

  // Only a memory location has an upper 32-bit address part.
  if (mem_space) {
    poll.mem_poll_addr_hi = High32(wait_addr);
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, wait_cmd);
}
// Not implemented on this writer: asserts in builds with assertions enabled
// and appends nothing otherwise. The disabled code below is kept as a sketch
// of the intended packet setup.
void Gfx9CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) {
assert(false && "BuildConditionalExecute method is not implemented");
/*
ConditionalExecuteTemplate conditional = conditional_template_;
uint32_t address_low = PtrLow32(signal);
uint32_t address_high = PtrHigh32(signal);
assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
conditional.conditional.boolAddrLo = address_low;
conditional.conditional.boolAddrHi = address_high;
conditional.conditional.execCount = count;
APPEND_COMMAND_WRAPPER(cmdbuf, conditional);
*/
}
/// @brief Appends a command that stores a 64-bit value at a host address.
/// An atomic swap is used when PCIe atomics are supported, a plain
/// WRITE_DATA otherwise.
///
/// @param cmdbuf Command buffer receiving the command
/// @param addr Address to update
/// @param value Value to store
void Gfx9CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) {
  if (pcie_atomic_support_) {
    BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr,
                        value);
  } else {
    BuildWriteData64Command(cmdbuf, addr, value);
  }
}
/// @brief Appends a RELEASE_MEM command that writes a 32-bit value once the
/// bottom-of-pipe event is reached, optionally raising an interrupt.
///
/// @param cmdbuf Command buffer receiving the command
/// @param write_addr Address the value is written to
/// @param write_value 32-bit value to write
/// @param interrupt true to send an interrupt after the write is confirmed
void Gfx9CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_value,
                                      bool interrupt) {
  // Start from the pre-built template (header, event and defaults already set).
  EndofKernelNotifyTemplate eop = notify_template_;
  PM4MEC_RELEASE_MEM& release = eop.release_mem;

  // Request the TC and TC write-back cache actions before the write is issued.
  release.bitfields2.tc_action_ena = true;
  release.bitfields2.tc_wb_action_ena = true;

  // Send the low 32 bits of the data field.
  release.data_lo = write_value;
  release.bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;

  // NOTE(review): the address is encoded with the 64-bit form (shifted by
  // three) although a 32-bit value is written, and no alignment is asserted
  // here - confirm that callers always pass an 8 byte aligned address.
  release.address_hi = High32(uint64_t(write_addr));
  release.bitfields4b.address_lo_64b = (Low32(uint64_t(write_addr) >> 3));

  if (interrupt) {
    release.bitfields3.int_sel = int_sel__mec_release_mem__send_interrupt_after_write_confirm;
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, eop);
}
/// @brief Appends an ACQUIRE_MEM command with the TC action and TC
/// write-back action bits set.
///
/// @param cmdbuf Command buffer receiving the command
void Gfx9CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) {
  // TODO: temporarily remove the check because some OpenCL tests
  // (test_buffers, test_relationals) are failing.
  // if (using_cc_memory_policy_)
  //   return;
  AcquireMemTemplate invalidate_src_caches = invalidate_cache_template_;

  // Both action bits are enabled in a single assignment.
  invalidate_src_caches.acquire_mem.bitfields2.coher_cntl =
      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK;

  APPEND_COMMAND_WRAPPER(cmdbuf, invalidate_src_caches);
}
/*
// PM4 packet for profilers
#define PM4_PACKET3 (0xC0000000)
#define PM4_PACKET3_CMD_SHIFT 8
#define PM4_PACKET3_COUNT_SHIFT 16
#define PACKET3(cmd, count) \
(PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | \
((cmd) << PM4_PACKET3_CMD_SHIFT))
*/
// Three-dword packet used by the register-write builders:
// item[0] is the packet header, item[1] the register offset and
// item[2] the value to write.
typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket;
/// @brief Appends an EVENT_WRITE command for one of the perf-counter events.
///
/// @param cmdbuf Command buffer receiving the command
/// @param event One of kPerfCntrsStart, kPerfCntrsStop or kPerfCntrsSample;
/// anything else asserts
void Gfx9CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) {
  // Translate the writer-level event id into the hardware event type.
  VGT_EVENT_TYPE eventType = Reserved_0x00;
  switch (event) {
    case kPerfCntrsStart:
      eventType = PERFCOUNTER_START;
      break;
    case kPerfCntrsStop:
      eventType = PERFCOUNTER_STOP;
      break;
    case kPerfCntrsSample:
      eventType = PERFCOUNTER_SAMPLE;
      break;
    default:
      assert(false && "Illegal VGT Event Id");
  }

  // Build the packet from scratch: zeroed body plus header.
  PM4MEC_EVENT_WRITE cp_event_initiator;
  memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE));
  cp_event_initiator.ordinal1 =
      PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE) / sizeof(uint32_t)));
  cp_event_initiator.ordinal2 = 0;

  // These events use the 'other' event index.
  const MEC_EVENT_WRITE_event_index_enum index = event_index__mec_event_write__other;
  cp_event_initiator.bitfields2.event_index = index;
  cp_event_initiator.bitfields2.event_type = eventType;

  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
}
/// @brief Appends a SET_UCONFIG_REG command that writes one register.
///
/// @param cmdbuf Command buffer receiving the command
/// @param addr Register address; encoded relative to UCONFIG_SPACE_START
/// @param value Value to write into the register
void Gfx9CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // Header length covers the set-register packet plus one value dword.
  const uint32_t header =
      PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));
  const uint32_t reg_offset = (addr - UCONFIG_SPACE_START);

  WriteRegPacket packet;
  packet.item[0] = header;
  packet.item[1] = reg_offset;
  packet.item[2] = value;
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
/// @brief Appends a SET_UCONFIG_REG command that writes one register.
///
/// @param cmdbuf Command buffer receiving the command
/// @param addr Register address; encoded relative to UCONFIG_SPACE_START
/// @param value Value to write into the register
void Gfx9CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // Header length covers the set-register packet plus one value dword.
  const uint32_t header =
      PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));
  const uint32_t reg_offset = (addr - UCONFIG_SPACE_START);

  WriteRegPacket packet;
  packet.item[0] = header;
  packet.item[1] = reg_offset;
  packet.item[2] = value;
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
/// @brief Appends a SET_SH_REG command that writes one shader register.
///
/// @param cmdbuf Command buffer receiving the command
/// @param addr Register address; encoded relative to PERSISTENT_SPACE_START
/// @param value Value to write into the register
void Gfx9CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // Header length covers the set-register packet plus one value dword.
  const uint32_t header =
      PM4_TYPE3_HDR(IT_SET_SH_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));
  const uint32_t reg_offset = (addr - PERSISTENT_SPACE_START);

  WriteRegPacket packet;
  packet.item[0] = header;
  packet.item[1] = reg_offset;
  packet.item[2] = value;
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
/// @brief Appends a COPY_DATA command that copies a register or perf-counter
/// value into memory.
///
/// @param cmdbuf Command buffer receiving the command
/// @param src_sel Source selector: 0 = memory mapped register, 4 = perf counters;
/// any other value asserts
/// @param src_addr_lo Source register offset
/// @param src_addr_hi Not used by this implementation
/// @param dst_addr Destination memory address
/// @param size 0 copies 32 bits of data, any other value copies 64 bits
/// @param wait Written to the packet's write-confirm field
void Gfx9CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
                                        uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
                                        bool wait) {
  // Translate the caller's selector into the packet's source enumeration.
  MEC_COPY_DATA_src_sel_enum data_src = src_sel__mec_copy_data__memory;
  switch (src_sel) {
    case 0:
      data_src = src_sel__mec_copy_data__mem_mapped_register;
      break;
    case 4:
      data_src = src_sel__mec_copy_data__perfcounters;
      break;
    default:
      assert(false && "CopyData Illegal value for source of data");
      break;
  }

  // Build the packet from scratch: zeroed body plus header.
  PM4MEC_COPY_DATA cmd_data;
  memset(&cmd_data, 0, sizeof(PM4MEC_COPY_DATA));
  cmd_data.ordinal1 = PM4_TYPE3_HDR(IT_COPY_DATA, (sizeof(PM4MEC_COPY_DATA) / sizeof(uint32_t)));

  const bool copy_32_bits = (size == 0);

  // Source / destination selection, cache policies, confirmation and width.
  cmd_data.bitfields2.src_sel = data_src;
  cmd_data.bitfields2.src_cache_policy = src_cache_policy__mec_copy_data__stream;
  cmd_data.bitfields2.dst_sel = dst_sel__mec_copy_data__memory;
  cmd_data.bitfields2.dst_cache_policy = dst_cache_policy__mec_copy_data__stream;
  cmd_data.bitfields2.wr_confirm = (MEC_COPY_DATA_wr_confirm_enum)wait;
  cmd_data.bitfields2.count_sel = copy_32_bits ? count_sel__mec_copy_data__32_bits_of_data
                                               : count_sel__mec_copy_data__64_bits_of_data;

  // Source register offset.
  cmd_data.bitfields3a.src_reg_offset = src_addr_lo;

  // Destination address; the low part is stored in dword or qword units
  // depending on the copy width.
  cmd_data.dst_addr_hi = PtrHigh32(dst_addr);
  if (copy_32_bits) {
    cmd_data.bitfields5b.dst_32b_addr_lo = (PtrLow32(dst_addr) >> 2);
  } else {
    cmd_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(dst_addr) >> 3);
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, cmd_data);
}
/// @brief Appends an ACQUIRE_MEM command with the TC, TCL1, TC write-back,
/// shader instruction cache and shader constant cache actions enabled.
///
/// @param cmdbuf Command buffer receiving the command
void Gfx9CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) {
  // Start from the pre-built ACQUIRE_MEM packet (header and range already set).
  PM4MEC_ACQUIRE_MEM cache_flush = invalidate_cache_template_.acquire_mem;

  // Coherence control: every cache action this writer knows about.
  const uint32_t coher_cntl = CP_COHER_CNTL__TC_ACTION_ENA_MASK |
                              CP_COHER_CNTL__TCL1_ACTION_ENA_MASK |
                              CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK |
                              CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK |
                              CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK;
  cache_flush.bitfields2.coher_cntl = coher_cntl;

  APPEND_COMMAND_WRAPPER(cmdbuf, cache_flush);
}
// Appends the barrier command followed by the cache flush command.
void Gfx9CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) {
// Order matters: the barrier is appended first, then the cache flush.
BuildBarrierCommand(cmdbuf);
BuildCacheFlushPacket(cmdbuf);
}
/// @brief Appends an EVENT_WRITE command for a perf-counter or thread-trace
/// event, followed by a cache flush command.
///
/// @param cmdbuf Command buffer receiving the commands
/// @param vgtEvent One of the kPerfCntrs* / kThrdTrace* event ids; anything
/// else asserts
void Gfx9CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) {
  // Translate the writer-level event id into the hardware event type.
  VGT_EVENT_TYPE eventType = Reserved_0x00;
  switch (vgtEvent) {
    case kPerfCntrsStart:
      eventType = PERFCOUNTER_START;
      break;
    case kPerfCntrsStop:
      eventType = PERFCOUNTER_STOP;
      break;
    case kPerfCntrsSample:
      eventType = PERFCOUNTER_SAMPLE;
      break;
    case kThrdTraceStart:
      eventType = THREAD_TRACE_START;
      break;
    case kThrdTraceStop:
      eventType = THREAD_TRACE_STOP;
      break;
    case kThrdTraceFlush:
      eventType = THREAD_TRACE_FLUSH;
      break;
    case kThrdTraceFinish:
      eventType = THREAD_TRACE_FINISH;
      break;
    default:
      assert(false && "Illegal VGT Event Id");
  }

  // Build the packet from scratch: zeroed body plus header.
  PM4MEC_EVENT_WRITE cp_event_initiator;
  memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE));
  cp_event_initiator.ordinal1 =
      PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE) / sizeof(uint32_t)));
  cp_event_initiator.ordinal2 = 0;

  // These events use the 'other' event index.
  const MEC_EVENT_WRITE_event_index_enum index = event_index__mec_event_write__other;
  cp_event_initiator.bitfields2.event_index = index;
  cp_event_initiator.bitfields2.event_type = eventType;

  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);

  // Check If I should be issuing a cache flush operation as well
  // test and remove it
  BuildCacheFlushPacket(cmdbuf);
}
// Currently a no-op: nothing is appended to the command buffer. The disabled
// code below is kept as a sketch of a SET_CONFIG_REG write.
// NOTE(review): the sketch calls PM4_TYPE3_HDR with four arguments, unlike
// every enabled call in this file - it presumably predates this writer.
void Gfx9CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
/*
WriteRegPacket packet;
packet.item[0] = (PM4_TYPE3_HDR(
IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0));
packet.item[1] = addr - CONFIG_SPACE_START;
packet.item[2] = value;
APPEND_COMMAND_WRAPPER(cmdbuf, packet);
return;
*/
}
/// @brief Appends an EVENT_WRITE command in its query form, which carries a
/// destination address.
///
/// No event id is supported yet: the switch below has only a default case, so
/// every call asserts when assertions are enabled. With assertions disabled
/// the packet is built with event type Reserved_0x00.
///
/// @param cmdbuf Command buffer receiving the command
/// @param event Event id; currently always rejected
/// @param addr Destination address; must be 8 byte aligned
void Gfx9CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) {
  PM4MEC_EVENT_WRITE_QUERY cp_event_initiator;
  memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE_QUERY));
  cp_event_initiator.ordinal1 =
      PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE_QUERY) / sizeof(uint32_t)));
  cp_event_initiator.ordinal2 = 0;

  // Update switch statements you want to support
  VGT_EVENT_TYPE eventType = Reserved_0x00;
  switch (event) {
    default:
      assert(false && "Illegal VGT Event Id");
  }

  // The event index is looked up from the event type.
  MEC_EVENT_WRITE_event_index_enum index;
  cp_event_initiator.bitfields2.event_type = eventType;
  index = (MEC_EVENT_WRITE_event_index_enum)EventTypeToIndexTable[eventType];
  cp_event_initiator.bitfields2.event_index = index;

  // The low address is stored shifted by three, so the bottom three bits
  // must be clear.
  uint32_t addrLo = PtrLow32(addr);
  uint32_t addrHi = PtrHigh32(addr);
  assert(!(addrLo & 0x7) && "EventWriteQuery address must be 8 byte aligned");
  cp_event_initiator.address_hi = addrHi;
  cp_event_initiator.bitfields3.address_lo = (addrLo >> 3);

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
}
/// @brief Reports the size of the atomic command template.
///
/// @return Size of AtomicTemplate expressed in 32-bit words
size_t Gfx9CmdWriter::SizeOfAtomicPacket() const {
  const size_t dword_count = sizeof(AtomicTemplate) / sizeof(uint32_t);
  return dword_count;
}
/// @brief Appends an ACQUIRE_MEM packet, copied from the invalidate-cache
/// template, whose coherence control field selects the caches named in
/// @p options.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param options Flags (l1, l2, icache, kcache) choosing the cache actions
/// @param writeAddr Not used by this implementation; any value, including
/// NULL, is accepted
/// @param writeVal Not used by this implementation
void Gfx9CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options,
                                       uint32_t* writeAddr, uint32_t writeVal) {
  // The write-back address and value exist only to keep the interface the
  // same as on other chip generations; nothing here consumes them.
  (void)writeAddr;
  (void)writeVal;

  // Start from the pre-initialized ACQUIRE_MEM template
  PM4MEC_ACQUIRE_MEM acquire_mem = invalidate_cache_template_.acquire_mem;

  // Collect the coherence-control action bits requested by the caller
  uint32_t action_mask = 0;
  if (options->l1) {
    action_mask |= CP_COHER_CNTL__TCL1_ACTION_ENA_MASK;
  }
  if (options->l2) {
    // L2 requests both the action and the write-back action bits
    action_mask |= (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK);
  }
  if (options->icache) {
    action_mask |= CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK;
  }
  if (options->kcache) {
    action_mask |= CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK;
  }
  acquire_mem.bitfields2.coher_cntl = action_mask;

  // Hand the finished packet to the output command buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, acquire_mem);
}
/// @brief Appends a DMA_DATA packet that copies @p copySize bytes from
/// @p srcAddr to @p dstAddr.
///
/// Both source and destination are programmed as addresses (src_sel / dst_sel
/// "addr_using_sas" / "addr_using_das") with the "stream" cache policy.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param srcAddr Address of the source buffer
/// @param dstAddr Address of the destination buffer
/// @param copySize Number of bytes to copy; must fit in the packet's
/// byte_count field (checked by assert in debug builds)
/// @param waitForConfirm Value programmed into the packet's raw_wait bit
void Gfx9CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr,
                                       uint32_t copySize, bool waitForConfirm) {
  PM4MEC_DMA_DATA cmdDmaData;
  memset(&cmdDmaData, 0, sizeof(PM4MEC_DMA_DATA));
  cmdDmaData.header.u32All =
      PM4_TYPE3_HDR(IT_DMA_DATA, (sizeof(PM4MEC_DMA_DATA) / sizeof(uint32_t)));

  // Specify attributes of source buffer such as its
  // location and Cache policy
  cmdDmaData.bitfields2.src_sel = src_sel__mec_dma_data__src_addr_using_sas;
  cmdDmaData.bitfields2.src_cache_policy = src_cache_policy__mec_dma_data__stream;

  // Specify attributes of destination buffer such as its
  // location and Cache policy
  cmdDmaData.bitfields2.dst_sel = dst_sel__mec_dma_data__dst_addr_using_das;
  cmdDmaData.bitfields2.dst_cache_policy = dst_cache_policy__mec_dma_data__stream;

  // Specify the source and destination addr
  cmdDmaData.src_addr_lo_or_data = PtrLow32(srcAddr);
  cmdDmaData.src_addr_hi = PtrHigh32(srcAddr);
  cmdDmaData.dst_addr_lo = PtrLow32(dstAddr);
  cmdDmaData.dst_addr_hi = PtrHigh32(dstAddr);

  // Number of bytes to copy. The command restricts the size to what the
  // byte_count field can hold (documented as 26 bits, i.e. 64 MB - 1).
  // The previous check, copySize < 0x1FFFFF, was a 21-bit limit that
  // contradicted this and rejected valid sizes in debug builds. Verifying
  // that the value survives the round trip through the bitfield enforces
  // the real limit whatever the field's width is.
  cmdDmaData.bitfields7.byte_count = copySize;
  assert(cmdDmaData.bitfields7.byte_count == copySize &&
         "DMA copy size does not fit in the byte_count field");

  // Indicate that DMA Cmd should wait if its source
  // is the destination of a previous DMA Cmd
  cmdDmaData.bitfields7.raw_wait = waitForConfirm;

  APPEND_COMMAND_WRAPPER(cmdbuf, cmdDmaData);
}
} // gfx9 namespace
} // pm4_profile
+199
Просмотреть файл
@@ -0,0 +1,199 @@
#ifndef _GFX9_CMDWRITER_H_
#define _GFX9_CMDWRITER_H_
#include "cmdwriter.h"
#include "gfx9_cmds.h"
namespace pm4_profile {
namespace gfx9 {
/// @brief Class Gfx9CmdWriter implements the CommandWriter interface
/// for GFX9 chipsets
class Gfx9CmdWriter : public CommandWriter {
public:
/// @brief Constructs a command writer for GFX9.
///
/// @param atc_support Presumably stored in atc_support_ - TODO confirm
/// against the constructor definition
/// @param pcie_atomic_support Presumably stored in pcie_atomic_support_ -
/// TODO confirm against the constructor definition
Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support);
/// @brief Dword specifying NOOP command for GFX9 chipsets. The macro
/// populates the NOOP command which is 32-bits wide. The second parameter,
/// the COUNT field of NOOP command, specifies the number of Dwords to skip.
/// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro
/// decrements the second parameter by TWO, an artifact of its definition,
/// the value is incremented by TWO to 0x4001 (0x3FFF + 2).
///
/// @return The 32-bit NOP command header
inline uint32_t GetNoOpCmd() {
static const uint32_t nopCmd = PM4_TYPE3_HDR(IT_NOP, 0x4001);
return nopCmd;
}
void BuildBarrierCommand(CmdBuf* cmdBuf);
void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size);
void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
bool interrupt);
void BuildBarrierFenceCommands(CmdBuf* cmdbuf);
void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event);
void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq,
uint32_t mask_val, uint32_t wait_val);
void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
/// @brief Build CP command to program a Gpu register
///
/// @param cmdbuf Pointer to command buffer to be appended
/// @param addr Register to be programmed
/// @param value Value to write into register
///
/// @return void
void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait);
void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf);
/// @brief Issues a VGT event; the implementation follows the event with a
/// cache flush packet.
void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent);
/// @brief Placeholder for writing a config register. The implementation's
/// body is currently commented out, so the call appends nothing.
void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
/// @brief Builds an EVENT_WRITE packet that carries a destination address.
///
/// @param cmdbuf Pointer to command buffer to be appended
/// @param event Event id; the implementation currently accepts none and
/// asserts on every value
/// @param addr Destination address; its low three bits must be zero
void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr);
void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
uint32_t value, uint32_t compare);
void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
uint64_t value = 0, uint64_t compare = 0);
/// @brief Returns the size of the atomic command template in 32-bit words
size_t SizeOfAtomicPacket() const;
void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count);
void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value);
void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value);
void BuildCacheFlushPacket(CmdBuf* cmdbuf);
/// Writes into input buffer Gpu commands to flush its cache. It is
/// necessary that the buffer provided for flush commands is large
/// enough to accommodate the full set of commands. It should be at
/// least 512 bytes.
///
/// @param cmdBuf Buffer to write commands to.
/// @param options Flags (l1, l2, icache, kcache) selecting which caches
/// the command acts on.
/// @param writeAddr Registered address into which GPU should write
/// a user provided value upon executing the flush commands.
/// NOTE(review): the GFX9 implementation does not use this argument.
/// @param writeVal User provided value written by GPU at user provided
/// address, upon executing the flush commands.
/// NOTE(review): the GFX9 implementation does not use this argument.
///
/// @return void
void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr,
uint32_t writeVal);
/// Builds Gpu command to copy data from source to destination buffer
/// using DMA engine.
///
/// @param cmdBuf Buffer updated with Gpu copy command
/// @param srcAddr Address of source buffer
/// @param dstAddr Address of destination buffer
/// @param copySize Size of data to copy in bytes
/// @param waitForCompletion Programmed into the command's raw_wait bit:
/// the command waits if its source is the destination of a previous
/// DMA command
void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize,
bool waitForCompletion);
protected:
/// @brief Append an instance of Gpu command into input command buffer stream.
///
/// @param cmdbuf Command buffer appended with another Gpu command
///
/// @param cmd Gpu command to be appended into command buffer
///
/// @return void
template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd);
private:
/// @brief Initializes a Gpu command which can be used to
/// reference a Gpu command stream indirectly
void InitializeLaunchTemplate();
/// @brief Initializes a Gpu command which can be used to
/// flush Gpu caches and write to a user configurable address
/// to indicate an end of kernel
void InitializeEndOfKernelNotifyTemplate();
/// @brief Initializes a Gpu command to perform atomic operations
///
void InitializeAtomicTemplate();
/// @brief Initializes a Gpu command to allow conditional execution
/// of a Gpu command stream
void InitializeConditionalTemplate();
/// @brief Initializes a Gpu command to let command processor
/// wait for some update before letting other commands to be
/// processed
void InitializeWaitRegMemTemplate();
/// @brief Initializes the template for Barrier command.
/// Applications can use Barrier command to ensure their
/// command is executed only after all other commands have
/// completed their execution.
void InitializeBarrierTemplate();
void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value);
/// @brief Initializes Acquire Memory command template. Users
/// can submit this command to invalidate Gpu caches - L1 and
/// or L2.
void InitializeAcquireMemTemplate();
/// @brief Initializes an instance of Write Data command
/// for use by an application
void InitializeWriteDataTemplate();
void InitializeWriteData64Template();
void InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32);
/// @brief Builds wait_reg_mem with EQUALS condition
void BuildWaitRegMemCommand(CmdBuf* cmdbuf, uint64_t wait_addr, uint32_t wait_value);
/// @brief Instance of Gpu command to reference dispatch commands
LaunchTemplate launch_template_;
/// @brief Instance of Gpu command to use in determining end of kernel
EndofKernelNotifyTemplate notify_template_;
/// @brief Instance of Gpu command to use in performing atomic operations
AtomicTemplate atomic_template_;
/// @brief Instances of Pm4 command WRITE_DATA
WriteDataTemplate write_data_template_;
WriteData64Template write_data64_template_;
/// @brief Instance of Pm4 command EVENT_WRITE
BarrierTemplate pending_dispatch_template_;
/// @brief Instance of Pm4 command ACQUIRE_MEM
AcquireMemTemplate invalidate_cache_template_;
/// @brief Instance of Pm4 command WAIT_REG_MEM
WaitRegMemTemplate wait_reg_mem_template_;
/// @brief ATC support.
bool atc_support_;
/// @brief PCIe atomic support.
bool pcie_atomic_support_;
};
} // gfx9
} // pm4_profile
#endif // _GFX9_CMDWRITER_H_
+24
Просмотреть файл
@@ -0,0 +1,24 @@
#
# Sources compiled into the ROCr performance-counter (PerfCntr) library
#
set ( LIB_SRC
  var_data.cpp
  info_set.cpp
  parameter_set.cpp
  gpu_counter.cpp
  gpu_countergroup.cpp
  vi_blockinfo.cpp
  vi_pmu.cpp
  ai_blockinfo.cpp
  ai_pmu.cpp
)

#
# Header search paths: the ROCr include directory (taken from the
# ROCR_INC_DIR environment variable at configure time), the command
# writer sources and the core utility directory.
#
include_directories (
  $ENV{ROCR_INC_DIR}
  ${PROJ_DIR}/commandwriter
  ${CORE_UTIL_DIR}
)

#
# PerfCntr is built as a static library
#
add_library ( ${PMC_LIB} STATIC ${LIB_SRC} )
+555
Просмотреть файл
@@ -0,0 +1,555 @@
#include "ai_blockinfo.h"
#include "gfxip/gfx9/gfx9_offset.h"
#include "gfxip/gfx9/gfx9_typedef.h"
namespace pm4_profile {
/**
 * Table of CounterGroups which represent the AI hardware blocks, one
 * initializer per block as defined by the \ref GpuBlockInfo structure.
 * The table ends with a sentinel entry whose name is the empty string.
 * Entries for CPF, SRBM, CPG and CPC are commented out ("Temp commented
 * for Vega10").
 */
GpuBlockInfo AiPmuHwBlocks[] = {
// Counter block CB
{"AI_CB0", kHsaAiCounterBlockIdCb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
{"AI_CB1", kHsaAiCounterBlockIdCb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
{"AI_CB2", kHsaAiCounterBlockIdCb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
{"AI_CB3", kHsaAiCounterBlockIdCb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
// Temp commented for Vega10
// Counter block CPF
/*
{"AI_CPF", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
AI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0},
*/
// NOTE(review): the entry below carries the CPF block id but reuses the
// "AI_CB3" name and the CB parameters. It looks like a stand-in for the
// disabled CPF entry above - confirm that this is intentional.
{"AI_CB3", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
// Counter block DB
{"AI_DB0", kHsaAiCounterBlockIdDb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
{"AI_DB1", kHsaAiCounterBlockIdDb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
{"AI_DB2", kHsaAiCounterBlockIdDb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
{"AI_DB3", kHsaAiCounterBlockIdDb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
// Counter block GRBM
{"AI_GRBM", kHsaAiCounterBlockIdGrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33,
AI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0},
// Counter block GRBMSE
{"AI_GRBMSE", kHsaAiCounterBlockIdGrbmSe, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14,
AI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0},
// Counter block PA_SU
{"AI_PA_SU", kHsaAiCounterBlockIdPaSu, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152,
AI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0},
// Counter block PA_SC
{"AI_PA_SC", kHsaAiCounterBlockIdPaSc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396,
AI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SPI
{"AI_SPI", kHsaAiCounterBlockIdSpi, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196,
AI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SQ
{"AI_SQ", kHsaAiCounterBlockIdSq, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"AI_SQ_GS", kHsaAiCounterBlockIdSqGs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"AI_SQ_VS", kHsaAiCounterBlockIdSqVs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"AI_SQ_PS", kHsaAiCounterBlockIdSqPs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"AI_SQ_HS", kHsaAiCounterBlockIdSqHs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"AI_SQ_CS", kHsaAiCounterBlockIdSqCs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SX
{"AI_SX", kHsaAiCounterBlockIdSx, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33,
AI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TA
{"AI_TA0", kHsaAiCounterBlockIdTa0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA1", kHsaAiCounterBlockIdTa1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA2", kHsaAiCounterBlockIdTa2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA3", kHsaAiCounterBlockIdTa3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA4", kHsaAiCounterBlockIdTa4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA5", kHsaAiCounterBlockIdTa5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA6", kHsaAiCounterBlockIdTa6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA7", kHsaAiCounterBlockIdTa7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA8", kHsaAiCounterBlockIdTa8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA9", kHsaAiCounterBlockIdTa9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA10", kHsaAiCounterBlockIdTa10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA11", kHsaAiCounterBlockIdTa11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA12", kHsaAiCounterBlockIdTa12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA13", kHsaAiCounterBlockIdTa13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA14", kHsaAiCounterBlockIdTa14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TA15", kHsaAiCounterBlockIdTa15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TCA
{"AI_TCA0", kHsaAiCounterBlockIdTca0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA,
CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCA1", kHsaAiCounterBlockIdTca1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA,
CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TCC
{"AI_TCC0", kHsaAiCounterBlockIdTcc0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC1", kHsaAiCounterBlockIdTcc1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC2", kHsaAiCounterBlockIdTcc2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC3", kHsaAiCounterBlockIdTcc3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC4", kHsaAiCounterBlockIdTcc4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC5", kHsaAiCounterBlockIdTcc5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC6", kHsaAiCounterBlockIdTcc6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC7", kHsaAiCounterBlockIdTcc7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC8", kHsaAiCounterBlockIdTcc8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC9", kHsaAiCounterBlockIdTcc9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC10", kHsaAiCounterBlockIdTcc10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC11", kHsaAiCounterBlockIdTcc11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC12", kHsaAiCounterBlockIdTcc12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC13", kHsaAiCounterBlockIdTcc13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC14", kHsaAiCounterBlockIdTcc14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCC15", kHsaAiCounterBlockIdTcc15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TD
{"AI_TD0", kHsaAiCounterBlockIdTd0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD1", kHsaAiCounterBlockIdTd1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD2", kHsaAiCounterBlockIdTd2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD3", kHsaAiCounterBlockIdTd3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD4", kHsaAiCounterBlockIdTd4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD5", kHsaAiCounterBlockIdTd5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD6", kHsaAiCounterBlockIdTd6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD7", kHsaAiCounterBlockIdTd7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD8", kHsaAiCounterBlockIdTd8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD9", kHsaAiCounterBlockIdTd9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD10", kHsaAiCounterBlockIdTd10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD11", kHsaAiCounterBlockIdTd11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD12", kHsaAiCounterBlockIdTd12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD13", kHsaAiCounterBlockIdTd13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD14", kHsaAiCounterBlockIdTd14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TD15", kHsaAiCounterBlockIdTd15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TCP
{"AI_TCP0", kHsaAiCounterBlockIdTcp0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP1", kHsaAiCounterBlockIdTcp1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP2", kHsaAiCounterBlockIdTcp2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP3", kHsaAiCounterBlockIdTcp3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP4", kHsaAiCounterBlockIdTcp4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP5", kHsaAiCounterBlockIdTcp5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP6", kHsaAiCounterBlockIdTcp6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP7", kHsaAiCounterBlockIdTcp7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP8", kHsaAiCounterBlockIdTcp8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP9", kHsaAiCounterBlockIdTcp9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP10", kHsaAiCounterBlockIdTcp10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP11", kHsaAiCounterBlockIdTcp11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP12", kHsaAiCounterBlockIdTcp12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP13", kHsaAiCounterBlockIdTcp13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP14", kHsaAiCounterBlockIdTcp14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"AI_TCP15", kHsaAiCounterBlockIdTcp15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
// Counter block GDS
{"AI_GDS", kHsaAiCounterBlockIdGds, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120,
AI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
// Counter block VGT
{"AI_VGT", kHsaAiCounterBlockIdVgt, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
AI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
// Counter block IA
{"AI_IA", kHsaAiCounterBlockIdIa, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
AI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
// Counter block MC
{"AI_MC", kHsaAiCounterBlockIdMc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
AI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
// Temp commented out for Vega10
// Counter block SRBM
/*
{"AI_SRBM", kHsaAiCounterBlockIdSrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
AI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
*/
// Counter block WD
{"AI_WD", kHsaAiCounterBlockIdWd, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
AI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
// Counter block CPG
// Temp commented for Vega10
/*
{"AI_CPG", kHsaAiCounterBlockIdCpg, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
AI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
*/
// Counter block CPC
// Temp commented for Vega10
/*
{"AI_CPC", kHsaAiCounterBlockIdCpc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
AI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
*/
// Counter block IOMMUV2
{"AI_IOMMUV2", kHsaAiCounterBlockIdIommuV2, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
8, 0, 0, true, 0, 0, false, 0, 0},
// Counter block KernelDriver
{"AI_KD", kHsaAiCounterBlockIdKernelDriver, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
0, 0, 0, true, 0, 0, false, 0, 0},
// Sentinel: the name of the last entry is empty to mark the end of all counter groups
{"", kHsaAiCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
0}};
/*
* The following tables contain the register addresses of the performance
* counter registers, one table per hardware block
*/
/*
* SQ: 16 entries, one per SQ counter (SQ_PERFCOUNTER0..15).
* Entry layout follows GpuCounterRegInfo: {select, control, read-lo, read-hi}.
* Every entry names the same control register, mmSQ_PERFCOUNTER_CTRL.
*/
GpuCounterRegInfo AiSqCounterRegAddr[] = {
{mmSQ_PERFCOUNTER0_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER0_LO, mmSQ_PERFCOUNTER0_HI},
{mmSQ_PERFCOUNTER1_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER1_LO, mmSQ_PERFCOUNTER1_HI},
{mmSQ_PERFCOUNTER2_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER2_LO, mmSQ_PERFCOUNTER2_HI},
{mmSQ_PERFCOUNTER3_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER3_LO, mmSQ_PERFCOUNTER3_HI},
{mmSQ_PERFCOUNTER4_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER4_LO, mmSQ_PERFCOUNTER4_HI},
{mmSQ_PERFCOUNTER5_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER5_LO, mmSQ_PERFCOUNTER5_HI},
{mmSQ_PERFCOUNTER6_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER6_LO, mmSQ_PERFCOUNTER6_HI},
{mmSQ_PERFCOUNTER7_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER7_LO, mmSQ_PERFCOUNTER7_HI},
{mmSQ_PERFCOUNTER8_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER8_LO, mmSQ_PERFCOUNTER8_HI},
{mmSQ_PERFCOUNTER9_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER9_LO, mmSQ_PERFCOUNTER9_HI},
{mmSQ_PERFCOUNTER10_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER10_LO,
mmSQ_PERFCOUNTER10_HI},
{mmSQ_PERFCOUNTER11_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER11_LO,
mmSQ_PERFCOUNTER11_HI},
{mmSQ_PERFCOUNTER12_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER12_LO,
mmSQ_PERFCOUNTER12_HI},
{mmSQ_PERFCOUNTER13_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER13_LO,
mmSQ_PERFCOUNTER13_HI},
{mmSQ_PERFCOUNTER14_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER14_LO,
mmSQ_PERFCOUNTER14_HI},
{mmSQ_PERFCOUNTER15_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER15_LO,
mmSQ_PERFCOUNTER15_HI}};
/*
* DRMDMA: 4 entries, two for SDMA0 followed by two for SDMA1.
* The select slot holds the engine's PERFMON_CNTL register and the read-lo
* slot holds the PERFCOUNTERn_RESULT register; control and read-hi are 0.
*/
GpuCounterRegInfo AiDrmdmaCounterRegAddr[] = {
{mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER0_RESULT, 0},
{mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER1_RESULT, 0},
{mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER0_RESULT, 0},
{mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER1_RESULT, 0},
};
/*
* IH: 2 entries. The select slot holds mmIH_PERFMON_CNTL and the read-lo
* slot holds the PERFCOUNTERn_RESULT register; control and read-hi are 0.
*/
GpuCounterRegInfo AiIhCounterRegAddr[] = {{mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER0_RESULT, 0},
{mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER1_RESULT, 0}};
/*
* CPF: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiCpfCounterRegAddr[] = {
{mmCPF_PERFCOUNTER0_SELECT, 0, mmCPF_PERFCOUNTER0_LO, mmCPF_PERFCOUNTER0_HI},
{mmCPF_PERFCOUNTER1_SELECT, 0, mmCPF_PERFCOUNTER1_LO, mmCPF_PERFCOUNTER1_HI}};
/*
* DRM: intentionally empty, all entries are commented out.
* NOTE(review): with every entry commented out this is an array of unknown
* bound with an empty initializer, which ISO C++ does not allow; it builds
* only where the compiler accepts zero-length arrays as an extension.
* Consumers must not index this table - confirm none do.
*/
GpuCounterRegInfo AiDrmCounterRegAddr[] = {
/*
{mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
{mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}
*/
};
/*
* GRBM: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiGrbmCounterRegAddr[] = {
{mmGRBM_PERFCOUNTER0_SELECT, 0, mmGRBM_PERFCOUNTER0_LO, mmGRBM_PERFCOUNTER0_HI},
{mmGRBM_PERFCOUNTER1_SELECT, 0, mmGRBM_PERFCOUNTER1_LO, mmGRBM_PERFCOUNTER1_HI}};
/*
* GRBM_SE: 4 entries, one per GRBM_SE0..GRBM_SE3 register set, each
* {select, control, read-lo, read-hi}; the control address is 0.
*/
GpuCounterRegInfo AiGrbmSeCounterRegAddr[] = {
{mmGRBM_SE0_PERFCOUNTER_SELECT, 0, mmGRBM_SE0_PERFCOUNTER_LO, mmGRBM_SE0_PERFCOUNTER_HI},
{mmGRBM_SE1_PERFCOUNTER_SELECT, 0, mmGRBM_SE1_PERFCOUNTER_LO, mmGRBM_SE1_PERFCOUNTER_HI},
{mmGRBM_SE2_PERFCOUNTER_SELECT, 0, mmGRBM_SE2_PERFCOUNTER_LO, mmGRBM_SE2_PERFCOUNTER_HI},
{mmGRBM_SE3_PERFCOUNTER_SELECT, 0, mmGRBM_SE3_PERFCOUNTER_LO, mmGRBM_SE3_PERFCOUNTER_HI}};
/*
* PA_SU: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiPaSuCounterRegAddr[] = {
{mmPA_SU_PERFCOUNTER0_SELECT, 0, mmPA_SU_PERFCOUNTER0_LO, mmPA_SU_PERFCOUNTER0_HI},
{mmPA_SU_PERFCOUNTER1_SELECT, 0, mmPA_SU_PERFCOUNTER1_LO, mmPA_SU_PERFCOUNTER1_HI},
{mmPA_SU_PERFCOUNTER2_SELECT, 0, mmPA_SU_PERFCOUNTER2_LO, mmPA_SU_PERFCOUNTER2_HI},
{mmPA_SU_PERFCOUNTER3_SELECT, 0, mmPA_SU_PERFCOUNTER3_LO, mmPA_SU_PERFCOUNTER3_HI}};
/*
* PA_SC: 4 entries (PA_SC_PERFCOUNTER0..3) of {select, control, read-lo,
* read-hi}; the control address is 0 in every entry.
* NOTE(review): AI_COUNTER_NUM_PER_PA_SC is defined as 8 and
* AiScCounterRegAddr lists PA_SC_PERFCOUNTER0..7. If this table is indexed
* up to AI_COUNTER_NUM_PER_PA_SC it would be read out of bounds - confirm
* which table the PA_SC block actually uses.
*/
GpuCounterRegInfo AiPaScCounterRegAddr[] = {
{mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
{mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
{mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
{mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI}};
/*
* SPI: 6 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiSpiCounterRegAddr[] = {
{mmSPI_PERFCOUNTER0_SELECT, 0, mmSPI_PERFCOUNTER0_LO, mmSPI_PERFCOUNTER0_HI},
{mmSPI_PERFCOUNTER1_SELECT, 0, mmSPI_PERFCOUNTER1_LO, mmSPI_PERFCOUNTER1_HI},
{mmSPI_PERFCOUNTER2_SELECT, 0, mmSPI_PERFCOUNTER2_LO, mmSPI_PERFCOUNTER2_HI},
{mmSPI_PERFCOUNTER3_SELECT, 0, mmSPI_PERFCOUNTER3_LO, mmSPI_PERFCOUNTER3_HI},
{mmSPI_PERFCOUNTER4_SELECT, 0, mmSPI_PERFCOUNTER4_LO, mmSPI_PERFCOUNTER4_HI},
{mmSPI_PERFCOUNTER5_SELECT, 0, mmSPI_PERFCOUNTER5_LO, mmSPI_PERFCOUNTER5_HI}};
/*
* TCA: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiTcaCounterRegAddr[] = {
{mmTCA_PERFCOUNTER0_SELECT, 0, mmTCA_PERFCOUNTER0_LO, mmTCA_PERFCOUNTER0_HI},
{mmTCA_PERFCOUNTER1_SELECT, 0, mmTCA_PERFCOUNTER1_LO, mmTCA_PERFCOUNTER1_HI},
{mmTCA_PERFCOUNTER2_SELECT, 0, mmTCA_PERFCOUNTER2_LO, mmTCA_PERFCOUNTER2_HI},
{mmTCA_PERFCOUNTER3_SELECT, 0, mmTCA_PERFCOUNTER3_LO, mmTCA_PERFCOUNTER3_HI}};
/*
* TCC: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiTccCounterRegAddr[] = {
{mmTCC_PERFCOUNTER0_SELECT, 0, mmTCC_PERFCOUNTER0_LO, mmTCC_PERFCOUNTER0_HI},
{mmTCC_PERFCOUNTER1_SELECT, 0, mmTCC_PERFCOUNTER1_LO, mmTCC_PERFCOUNTER1_HI},
{mmTCC_PERFCOUNTER2_SELECT, 0, mmTCC_PERFCOUNTER2_LO, mmTCC_PERFCOUNTER2_HI},
{mmTCC_PERFCOUNTER3_SELECT, 0, mmTCC_PERFCOUNTER3_LO, mmTCC_PERFCOUNTER3_HI}};
/*
* TCP: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiTcpCounterRegAddr[] = {
{mmTCP_PERFCOUNTER0_SELECT, 0, mmTCP_PERFCOUNTER0_LO, mmTCP_PERFCOUNTER0_HI},
{mmTCP_PERFCOUNTER1_SELECT, 0, mmTCP_PERFCOUNTER1_LO, mmTCP_PERFCOUNTER1_HI},
{mmTCP_PERFCOUNTER2_SELECT, 0, mmTCP_PERFCOUNTER2_LO, mmTCP_PERFCOUNTER2_HI},
{mmTCP_PERFCOUNTER3_SELECT, 0, mmTCP_PERFCOUNTER3_LO, mmTCP_PERFCOUNTER3_HI}};
/*
* CB: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiCbCounterRegAddr[] = {
{mmCB_PERFCOUNTER0_SELECT, 0, mmCB_PERFCOUNTER0_LO, mmCB_PERFCOUNTER0_HI},
{mmCB_PERFCOUNTER1_SELECT, 0, mmCB_PERFCOUNTER1_LO, mmCB_PERFCOUNTER1_HI},
{mmCB_PERFCOUNTER2_SELECT, 0, mmCB_PERFCOUNTER2_LO, mmCB_PERFCOUNTER2_HI},
{mmCB_PERFCOUNTER3_SELECT, 0, mmCB_PERFCOUNTER3_LO, mmCB_PERFCOUNTER3_HI}};
/*
* DB: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiDbCounterRegAddr[] = {
{mmDB_PERFCOUNTER0_SELECT, 0, mmDB_PERFCOUNTER0_LO, mmDB_PERFCOUNTER0_HI},
{mmDB_PERFCOUNTER1_SELECT, 0, mmDB_PERFCOUNTER1_LO, mmDB_PERFCOUNTER1_HI},
{mmDB_PERFCOUNTER2_SELECT, 0, mmDB_PERFCOUNTER2_LO, mmDB_PERFCOUNTER2_HI},
{mmDB_PERFCOUNTER3_SELECT, 0, mmDB_PERFCOUNTER3_LO, mmDB_PERFCOUNTER3_HI}};
/*
* RLC: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiRlcCounterRegAddr[] = {
{mmRLC_PERFCOUNTER0_SELECT, 0, mmRLC_PERFCOUNTER0_LO, mmRLC_PERFCOUNTER0_HI},
{mmRLC_PERFCOUNTER1_SELECT, 0, mmRLC_PERFCOUNTER1_LO, mmRLC_PERFCOUNTER1_HI}};
/*
* SC: 8 entries naming the PA_SC_PERFCOUNTER0..7 registers, each
* {select, control, read-lo, read-hi}; the control address is 0.
* The first four entries are identical to AiPaScCounterRegAddr.
*/
GpuCounterRegInfo AiScCounterRegAddr[] = {
{mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
{mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
{mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
{mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI},
{mmPA_SC_PERFCOUNTER4_SELECT, 0, mmPA_SC_PERFCOUNTER4_LO, mmPA_SC_PERFCOUNTER4_HI},
{mmPA_SC_PERFCOUNTER5_SELECT, 0, mmPA_SC_PERFCOUNTER5_LO, mmPA_SC_PERFCOUNTER5_HI},
{mmPA_SC_PERFCOUNTER6_SELECT, 0, mmPA_SC_PERFCOUNTER6_LO, mmPA_SC_PERFCOUNTER6_HI},
{mmPA_SC_PERFCOUNTER7_SELECT, 0, mmPA_SC_PERFCOUNTER7_LO, mmPA_SC_PERFCOUNTER7_HI}};
/*
* SX: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiSxCounterRegAddr[] = {
{mmSX_PERFCOUNTER0_SELECT, 0, mmSX_PERFCOUNTER0_LO, mmSX_PERFCOUNTER0_HI},
{mmSX_PERFCOUNTER1_SELECT, 0, mmSX_PERFCOUNTER1_LO, mmSX_PERFCOUNTER1_HI},
{mmSX_PERFCOUNTER2_SELECT, 0, mmSX_PERFCOUNTER2_LO, mmSX_PERFCOUNTER2_HI},
{mmSX_PERFCOUNTER3_SELECT, 0, mmSX_PERFCOUNTER3_LO, mmSX_PERFCOUNTER3_HI}};
/*
* TA: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiTaCounterRegAddr[] = {
{mmTA_PERFCOUNTER0_SELECT, 0, mmTA_PERFCOUNTER0_LO, mmTA_PERFCOUNTER0_HI},
{mmTA_PERFCOUNTER1_SELECT, 0, mmTA_PERFCOUNTER1_LO, mmTA_PERFCOUNTER1_HI}};
/*
* TD: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiTdCounterRegAddr[] = {
{mmTD_PERFCOUNTER0_SELECT, 0, mmTD_PERFCOUNTER0_LO, mmTD_PERFCOUNTER0_HI},
{mmTD_PERFCOUNTER1_SELECT, 0, mmTD_PERFCOUNTER1_LO, mmTD_PERFCOUNTER1_HI}};
/*
* GDS: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiGdsCounterRegAddr[] = {
{mmGDS_PERFCOUNTER0_SELECT, 0, mmGDS_PERFCOUNTER0_LO, mmGDS_PERFCOUNTER0_HI},
{mmGDS_PERFCOUNTER1_SELECT, 0, mmGDS_PERFCOUNTER1_LO, mmGDS_PERFCOUNTER1_HI},
{mmGDS_PERFCOUNTER2_SELECT, 0, mmGDS_PERFCOUNTER2_LO, mmGDS_PERFCOUNTER2_HI},
{mmGDS_PERFCOUNTER3_SELECT, 0, mmGDS_PERFCOUNTER3_LO, mmGDS_PERFCOUNTER3_HI}};
/*
* VGT: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiVgtCounterRegAddr[] = {
{mmVGT_PERFCOUNTER0_SELECT, 0, mmVGT_PERFCOUNTER0_LO, mmVGT_PERFCOUNTER0_HI},
{mmVGT_PERFCOUNTER1_SELECT, 0, mmVGT_PERFCOUNTER1_LO, mmVGT_PERFCOUNTER1_HI},
{mmVGT_PERFCOUNTER2_SELECT, 0, mmVGT_PERFCOUNTER2_LO, mmVGT_PERFCOUNTER2_HI},
{mmVGT_PERFCOUNTER3_SELECT, 0, mmVGT_PERFCOUNTER3_LO, mmVGT_PERFCOUNTER3_HI}};
/*
* IA: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiIaCounterRegAddr[] = {
{mmIA_PERFCOUNTER0_SELECT, 0, mmIA_PERFCOUNTER0_LO, mmIA_PERFCOUNTER0_HI},
{mmIA_PERFCOUNTER1_SELECT, 0, mmIA_PERFCOUNTER1_LO, mmIA_PERFCOUNTER1_HI},
{mmIA_PERFCOUNTER2_SELECT, 0, mmIA_PERFCOUNTER2_LO, mmIA_PERFCOUNTER2_HI},
{mmIA_PERFCOUNTER3_SELECT, 0, mmIA_PERFCOUNTER3_LO, mmIA_PERFCOUNTER3_HI}};
/*
* MC: intentionally empty, all entries are commented out.
* NOTE(review): the "AI_MC" entry in AiPmuHwBlocks is still active and
* advertises AI_COUNTER_NUM_PER_MC (4) simultaneous counters, but this table
* has no entries to back them - confirm MC counters are never programmed
* through this table. The empty initializer also relies on the compiler
* accepting zero-length arrays.
*/
GpuCounterRegInfo AiMcCounterRegAddr[] = {
/*
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}
*/
};
/*
* SRBM: intentionally empty, all entries are commented out (the matching
* "AI_SRBM" block descriptor is commented out as well).
* NOTE(review): the empty initializer relies on the compiler accepting
* zero-length arrays; consumers must not index this table.
*/
GpuCounterRegInfo AiSrbmCounterRegAddr[] = {
/*
{mmSRBM_PERFCOUNTER0_SELECT, 0, mmSRBM_PERFCOUNTER0_LO,
mmSRBM_PERFCOUNTER0_HI},
{mmSRBM_PERFCOUNTER1_SELECT, 0, mmSRBM_PERFCOUNTER1_LO,
mmSRBM_PERFCOUNTER1_HI}
*/
};
/*
* WD: 4 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry.
*/
GpuCounterRegInfo AiWdCounterRegAddr[] = {
{mmWD_PERFCOUNTER0_SELECT, 0, mmWD_PERFCOUNTER0_LO, mmWD_PERFCOUNTER0_HI},
{mmWD_PERFCOUNTER1_SELECT, 0, mmWD_PERFCOUNTER1_LO, mmWD_PERFCOUNTER1_HI},
{mmWD_PERFCOUNTER2_SELECT, 0, mmWD_PERFCOUNTER2_LO, mmWD_PERFCOUNTER2_HI},
{mmWD_PERFCOUNTER3_SELECT, 0, mmWD_PERFCOUNTER3_LO, mmWD_PERFCOUNTER3_HI}};
/*
* CPG: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry. The matching "AI_CPG" block descriptor in
* AiPmuHwBlocks is currently commented out.
*/
GpuCounterRegInfo AiCpgCounterRegAddr[] = {
{mmCPG_PERFCOUNTER0_SELECT, 0, mmCPG_PERFCOUNTER0_LO, mmCPG_PERFCOUNTER0_HI},
{mmCPG_PERFCOUNTER1_SELECT, 0, mmCPG_PERFCOUNTER1_LO, mmCPG_PERFCOUNTER1_HI}};
/*
* CPC: 2 entries of {select, control, read-lo, read-hi}; the control
* address is 0 in every entry. The matching "AI_CPC" block descriptor in
* AiPmuHwBlocks is currently commented out.
*/
GpuCounterRegInfo AiCpcCounterRegAddr[] = {
{mmCPC_PERFCOUNTER0_SELECT, 0, mmCPC_PERFCOUNTER0_LO, mmCPC_PERFCOUNTER0_HI},
{mmCPC_PERFCOUNTER1_SELECT, 0, mmCPC_PERFCOUNTER1_LO, mmCPC_PERFCOUNTER1_HI}};
/*
* Privileged counter block IDs, four dwords each (see GpuPrivCounterBlockId).
* Per the GpuPrivCounterBlockId declaration these values should be the same
* as those defined in KFD, so they must not be edited independently.
*/
GpuPrivCounterBlockId AiBlockIdSq = {{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}};
GpuPrivCounterBlockId AiBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}};
GpuPrivCounterBlockId AiBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}};
GpuPrivCounterBlockId AiBlockIdKernelDriver = {{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}};
} // pm4_profile
+252
Просмотреть файл
@@ -0,0 +1,252 @@
#ifndef _AI_BLOCKINFO_H_
#define _AI_BLOCKINFO_H_
#include <stdint.h>
#include "rocr_profiler.h"
#include "gpu_enum.h"
#include "gpu_blockinfo.h"
namespace pm4_profile {
// MAX Number of block instances for ARCTIC ISLANDS (From Vega10)
// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
// @brief Number of block instances.
// Number of CB block instances per SE
// and number of Perf Cntrs per CB block
#define AI_NUM_CB 4
#define AI_COUNTER_NUM_PER_CB 4
// Number of DB block instances per SE
// and number of Perf Cntrs per DB block
#define AI_NUM_DB 4
#define AI_COUNTER_NUM_PER_DB 4
// Number of TA block instances per SE
// and number of Perf Cntrs per TA block
#define AI_NUM_TA 16
#define AI_COUNTER_NUM_PER_TA 2
// Number of TD block instances per SE
// and number of Perf Cntrs per TD block
#define AI_NUM_TD 16
#define AI_COUNTER_NUM_PER_TD 2
// Number of TCP block instances per SE
// and number of Perf Cntrs per TCP block
#define AI_NUM_TCP 16
#define AI_COUNTER_NUM_PER_TCP 4
// Number of TCA block instances per chip
// and number of Perf Cntrs per TCA block
#define AI_NUM_TCA 2
#define AI_COUNTER_NUM_PER_TCA 4
// Number of TCC block instances per chip
// and number of Perf Cntrs per TCC block
#define AI_NUM_TCC 16
#define AI_COUNTER_NUM_PER_TCC 4
// Number of SDMA block instances per chip
// and number of Perf Cntrs per SDMA block
#define AI_NUM_SDMA 2
// Number of counter registers per block for arctic islands
#define AI_COUNTER_NUM_PER_DRM 2
#define AI_COUNTER_NUM_PER_DRMDMA 2
#define AI_COUNTER_NUM_PER_IH 2
#define AI_COUNTER_NUM_PER_SRBM 2
#define AI_COUNTER_NUM_PER_CPF 2
#define AI_COUNTER_NUM_PER_GRBM 2
#define AI_COUNTER_NUM_PER_GRBMSE 4
#define AI_COUNTER_NUM_PER_PA_SU 4
#define AI_COUNTER_NUM_PER_RLC 2
#define AI_COUNTER_NUM_PER_PA_SC 8
#define AI_COUNTER_NUM_PER_SPI 6 // [Shucai: To do: double check the value]
#define AI_COUNTER_NUM_PER_SQ 16
#define AI_COUNTER_NUM_PER_SX 4
#define AI_COUNTER_NUM_PER_GDS 4
#define AI_COUNTER_NUM_PER_VGT 4
#define AI_COUNTER_NUM_PER_IA 4
#define AI_COUNTER_NUM_PER_MC 4
#define AI_COUNTER_NUM_PER_TCS 4
#define AI_COUNTER_NUM_PER_WD 4
#define AI_COUNTER_NUM_PER_CPG 2
#define AI_COUNTER_NUM_PER_CPC 2
#define AI_COUNTER_NUM_PER_VM 1
#define AI_COUNTER_NUM_PER_VM_MD 1
#define AI_COUNTER_NUM_PER_PIPESTATS 12
#define AI_MAX_NUM_SHADER_ENGINES 1
// Enumeration of AI hardware counter blocks.
// Values are assigned sequentially starting at 0, so commenting an enumerator
// in or out shifts the numeric value of every enumerator that follows it.
// These IDs are stored in the counterGroupId field of the AiPmuHwBlocks
// entries.
typedef enum HsaAiCounterBlockId {
kHsaAiCounterBlockIdCb0 = 0,
kHsaAiCounterBlockIdCb1,
kHsaAiCounterBlockIdCb2,
kHsaAiCounterBlockIdCb3,
// NOTE(review): the original comment here read "Temp commented for Vega10",
// but kHsaAiCounterBlockIdCpf below is active - confirm which is intended.
kHsaAiCounterBlockIdCpf,
kHsaAiCounterBlockIdDb0,
kHsaAiCounterBlockIdDb1,
kHsaAiCounterBlockIdDb2,
kHsaAiCounterBlockIdDb3,
kHsaAiCounterBlockIdGrbm,
kHsaAiCounterBlockIdGrbmSe,
kHsaAiCounterBlockIdPaSu,
kHsaAiCounterBlockIdPaSc,
kHsaAiCounterBlockIdSpi,
kHsaAiCounterBlockIdSq,
kHsaAiCounterBlockIdSqGs,
kHsaAiCounterBlockIdSqVs,
kHsaAiCounterBlockIdSqPs,
kHsaAiCounterBlockIdSqHs,
kHsaAiCounterBlockIdSqCs,
kHsaAiCounterBlockIdSx,
kHsaAiCounterBlockIdTa0,
kHsaAiCounterBlockIdTa1,
kHsaAiCounterBlockIdTa2,
kHsaAiCounterBlockIdTa3,
kHsaAiCounterBlockIdTa4,
kHsaAiCounterBlockIdTa5,
kHsaAiCounterBlockIdTa6,
kHsaAiCounterBlockIdTa7,
kHsaAiCounterBlockIdTa8,
kHsaAiCounterBlockIdTa9,
kHsaAiCounterBlockIdTa10,
kHsaAiCounterBlockIdTa11,
kHsaAiCounterBlockIdTa12,
kHsaAiCounterBlockIdTa13,
kHsaAiCounterBlockIdTa14,
kHsaAiCounterBlockIdTa15,
kHsaAiCounterBlockIdTca0,
kHsaAiCounterBlockIdTca1,
kHsaAiCounterBlockIdTcc0,
kHsaAiCounterBlockIdTcc1,
kHsaAiCounterBlockIdTcc2,
kHsaAiCounterBlockIdTcc3,
kHsaAiCounterBlockIdTcc4,
kHsaAiCounterBlockIdTcc5,
kHsaAiCounterBlockIdTcc6,
kHsaAiCounterBlockIdTcc7,
kHsaAiCounterBlockIdTcc8,
kHsaAiCounterBlockIdTcc9,
kHsaAiCounterBlockIdTcc10,
kHsaAiCounterBlockIdTcc11,
kHsaAiCounterBlockIdTcc12,
kHsaAiCounterBlockIdTcc13,
kHsaAiCounterBlockIdTcc14,
kHsaAiCounterBlockIdTcc15,
kHsaAiCounterBlockIdTd0,
kHsaAiCounterBlockIdTd1,
kHsaAiCounterBlockIdTd2,
kHsaAiCounterBlockIdTd3,
kHsaAiCounterBlockIdTd4,
kHsaAiCounterBlockIdTd5,
kHsaAiCounterBlockIdTd6,
kHsaAiCounterBlockIdTd7,
kHsaAiCounterBlockIdTd8,
kHsaAiCounterBlockIdTd9,
kHsaAiCounterBlockIdTd10,
kHsaAiCounterBlockIdTd11,
kHsaAiCounterBlockIdTd12,
kHsaAiCounterBlockIdTd13,
kHsaAiCounterBlockIdTd14,
kHsaAiCounterBlockIdTd15,
kHsaAiCounterBlockIdTcp0,
kHsaAiCounterBlockIdTcp1,
kHsaAiCounterBlockIdTcp2,
kHsaAiCounterBlockIdTcp3,
kHsaAiCounterBlockIdTcp4,
kHsaAiCounterBlockIdTcp5,
kHsaAiCounterBlockIdTcp6,
kHsaAiCounterBlockIdTcp7,
kHsaAiCounterBlockIdTcp8,
kHsaAiCounterBlockIdTcp9,
kHsaAiCounterBlockIdTcp10,
kHsaAiCounterBlockIdTcp11,
kHsaAiCounterBlockIdTcp12,
kHsaAiCounterBlockIdTcp13,
kHsaAiCounterBlockIdTcp14,
kHsaAiCounterBlockIdTcp15,
kHsaAiCounterBlockIdGds,
kHsaAiCounterBlockIdVgt,
kHsaAiCounterBlockIdIa,
kHsaAiCounterBlockIdMc,
// SRBM is temporarily disabled for Vega10
// kHsaAiCounterBlockIdSrbm,
kHsaAiCounterBlockIdTcs,
kHsaAiCounterBlockIdWd,
// CPG is temporarily disabled for Vega10
// kHsaAiCounterBlockIdCpg,
// NOTE(review): the original comment here read "Temp commented for Vega10",
// but kHsaAiCounterBlockIdCpc below is active (only its AiPmuHwBlocks
// descriptor is commented out) - confirm which is intended.
kHsaAiCounterBlockIdCpc,
// Counters retrieved by KFD
kHsaAiCounterBlockIdIommuV2,
kHsaAiCounterBlockIdKernelDriver,
kHsaAiCounterBlockIdCpPipeStats,
kHsaAiCounterBlockIdHwInfo,
// Aliases for the first and last enumerators above. BlocksLast equals
// HwInfo, i.e. it is the last valid ID, not one past the end.
kHsaAiCounterBlockIdBlocksFirst = kHsaAiCounterBlockIdCb0,
kHsaAiCounterBlockIdBlocksLast = kHsaAiCounterBlockIdHwInfo
} HsaAiCounterBlockId;
// Table of hardware block descriptors for AI. The last entry has an empty
// block name and marks the end of the table.
extern GpuBlockInfo AiPmuHwBlocks[];

// Per-block counter register address tables. Each is declared here as an
// array of unknown bound, so its element count is not available through
// sizeof in code that only sees this header.
extern GpuCounterRegInfo AiSqCounterRegAddr[];
extern GpuCounterRegInfo AiCbCounterRegAddr[];
extern GpuCounterRegInfo AiDrmdmaCounterRegAddr[];
extern GpuCounterRegInfo AiIhCounterRegAddr[];
extern GpuCounterRegInfo AiCpfCounterRegAddr[];
extern GpuCounterRegInfo AiCpgCounterRegAddr[];
extern GpuCounterRegInfo AiCpcCounterRegAddr[];
extern GpuCounterRegInfo AiDrmCounterRegAddr[];
extern GpuCounterRegInfo AiGrbmCounterRegAddr[];
extern GpuCounterRegInfo AiGrbmSeCounterRegAddr[];
extern GpuCounterRegInfo AiPaSuCounterRegAddr[];
extern GpuCounterRegInfo AiPaScCounterRegAddr[];
extern GpuCounterRegInfo AiSpiCounterRegAddr[];
extern GpuCounterRegInfo AiTcaCounterRegAddr[];
extern GpuCounterRegInfo AiTccCounterRegAddr[];
extern GpuCounterRegInfo AiTcpCounterRegAddr[];
extern GpuCounterRegInfo AiDbCounterRegAddr[];
extern GpuCounterRegInfo AiRlcCounterRegAddr[];
extern GpuCounterRegInfo AiScCounterRegAddr[];
extern GpuCounterRegInfo AiSxCounterRegAddr[];
extern GpuCounterRegInfo AiTaCounterRegAddr[];
extern GpuCounterRegInfo AiTdCounterRegAddr[];
extern GpuCounterRegInfo AiGdsCounterRegAddr[];
extern GpuCounterRegInfo AiVgtCounterRegAddr[];
extern GpuCounterRegInfo AiIaCounterRegAddr[];
extern GpuCounterRegInfo AiMcCounterRegAddr[];
extern GpuCounterRegInfo AiSrbmCounterRegAddr[];
// No Tcs Counter block on AI
// extern GpuCounterRegInfo AiTcsCounterRegAddr[];
extern GpuCounterRegInfo AiWdCounterRegAddr[];

// Privileged counter block IDs (four dwords each).
extern GpuPrivCounterBlockId AiBlockIdSq;
extern GpuPrivCounterBlockId AiBlockIdMc;
extern GpuPrivCounterBlockId AiBlockIdIommuV2;
extern GpuPrivCounterBlockId AiBlockIdKernelDriver;
}
#endif // _AI_BLOCKINFO_H_
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+137
Просмотреть файл
@@ -0,0 +1,137 @@
#ifndef _AI_PMU_H_
#define _AI_PMU_H_
#include "hsa.h"
#include "cmdwriter.h"
#include "hsa_perf.h"
#include "info_set.h"
#include "parameter_set.h"
#include "ai_blockinfo.h"
#include "rocr_profiler.h"
#include <stdlib.h>
#include <stdint.h>
#include <map>
namespace pm4_profile {
// Maps an AI counter block ID to the CounterBlock object representing it.
typedef std::map<HsaAiCounterBlockId, pm4_profile::CounterBlock*> AiCounterBlockMap;
// This class implements the AI PMU. It is responsible for setting up
// CounterBlocks to represent each AI hardware block which exposes performance
// counters.
// NOTE(review): this header uses std::string but does not include <string>
// itself; it presumably arrives through one of the project headers - confirm.
class AiPmu : public pm4_profile::Pmu {
public:
AiPmu();
~AiPmu();
// Returns number of shader engines per block
// for the blocks that are instanced per shader engine
uint32_t getNumSe() { return num_se_; }
// Initializes the handle of buffer used to collect PMC data
bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz);
// Returns the most recent error code recorded by this PMU
int getLastError();
// Returns a human-readable description of the given error code
std::string getErrorString(int error);
// Emits the commands that start counter collection into cmdBuff.
// NOTE(review): 'reset' presumably requests a counter reset before
// starting - confirm against the implementation.
virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true);
// Emits the commands that stop counter collection into cmdBuff
virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter);
// The PMU interface inherits the parameter-set and info-set interfaces,
// so we implement them through composition and function forwarding
bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
// Looks up a counter block by its block ID
pm4_profile::CounterBlock* getCounterBlockById(uint32_t id);
// Returns the current profiling state
rocr_pmu_state_t getCurrentState() { return profiler_state_; }
// Returns the list of all counter blocks; the count is written to num_groups
pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups);
private:
// Addr of Counter Data Buffer
uint32_t* pmcData_;
// Size of Counter Data Buffer
uint32_t pmcDataSz_;
void Init();
bool initCounterBlock();
bool isResultReady();
// Clear CounterBlockMap
void clearCounterBlockMap();
// Reset SQ and CB counters
void ResetCounterBlocks(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter);
// Program SQ block related counters
uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TA block related counters
uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TCA block related counters
uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TCC block related counters
uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TCP block related counters
uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TD block related counters
uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Build counter selection register, return how many registers are built
uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal,
uint32_t blkId, pm4_profile::Counter* blkCntr);
// Build counter read registers
// NOTE(review): the original comment was a copy of the one above; the return
// value is presumably the number of registers built - confirm.
uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr,
uint32_t* reg_val);
private:
// Delete counter blocks in the PMU
hsa_status_t RemoveCounterBlocks();
private:
// This contains the available counter blocks, keyed by block ID.
AiCounterBlockMap blk_map_;
// This stores the current profiling state.
rocr_pmu_state_t profiler_state_;
pm4_profile::ParameterSet* parameter_set_;
pm4_profile::InfoSet* info_set_;
int error_code_;
// Pointer used to store counter block list internally
uint32_t blk_list_size_;
pm4_profile::CounterBlock** blk_list_;
// Indicates the number of Shader Engines Present
uint32_t num_se_;
// Used to reset GRBM to its default state
uint32_t reset_grbm_;
};
}
#endif // _AI_PMU_H_
+101
Просмотреть файл
@@ -0,0 +1,101 @@
#ifndef _GPU_BLOCKINFO_H_
#define _GPU_BLOCKINFO_H_
#include "rocr_profiler.h"
#include "gpu_enum.h"
#include <stdint.h>
namespace pm4_profile {
// Counter control method of a hardware block, stored in GpuBlockInfo::method.
// NOTE(review): presumably selects whether a block's counters are addressed
// globally, per block instance, per shader engine, or both - confirm against
// the PMU implementation.
typedef enum CntlMethod {
CntlMethodNone = 0,
CntlMethodByInstance = 1,
CntlMethodBySe = 2,
CntlMethodBySeAndInstance = 3
} CntlMethod;
// Structure which contains information about a specific hardware block.
// Instances are written as positional aggregate initializers (see the
// per-ASIC block tables), so the field order here must not change.
// GPU_BLOCK_NAME_SIZE is the size of the name buffer including the
// terminating NUL, so block names can be at most 14 characters long.
#define GPU_BLOCK_NAME_SIZE 15
typedef struct GpuBlockInfo_ {
// Unique string identifier of the block.
const char blockName[GPU_BLOCK_NAME_SIZE];
// Numeric identifier of the block (a per-ASIC block ID enumerator).
uint32_t counterGroupId;
// Maximum number of shader engines
uint32_t maxShaderEngineCount;
// Maximum number of shader arrays
uint32_t maxShaderArrayCount;
// Maximum number of block instances in the group per shader array
uint32_t maxInstanceCount;
// Counter control method
CntlMethod method;
// Maximum counter event ID
uint32_t maxEventId;
// Maximum number of counters that can be enabled at once
uint32_t maxSimultaneousCounters;
// Maximum number of streaming counters that can be enabled at once
uint32_t maxStreamingCounters;
// The number of hardware counters that are shared
// between regular and streaming counters.
// This is important so that resources are not double-booked
// between the two types of counters.
uint32_t sharedHWCounters;
// Block counters can be configured with additional filters
bool hasFilters;
//------------------------------------------
// Trace specific stuff regarding when they get locked
// Buffer size in bytes
uint32_t bufferSize;
// Current write pointer offset from beginning of the buffer
uint32_t wptrOffset;
// Flag that buffer might have wrapped
bool wrapped;
// If buffer has wrapped, this could indicate approximate
// total amount of data that was dumped in the trace buffer
uint32_t dataSizeEstimate;
// Buffer data pointer
void* pData;
} GpuBlockInfo;
// Register addresses corresponding to one hardware counter.
// Instances are written as positional initializers in the order
// {select, control, read-lo, read-hi}; tables store 0 in slots they do not
// provide a register for.
typedef struct GpuCounterRegInfo_ {
// counter select register address
uint32_t counterSelRegAddr;
// counter control register address
uint32_t counterCntlRegAddr;
// counter read register address low
uint32_t counterReadRegAddrLo;
// counter read register address high
uint32_t counterReadRegAddrHi;
} GpuCounterRegInfo;
// GPU privileged counter block ID. The value should be the same as the one
// defined in KFD for the corresponding block.
typedef struct GpuPrivCounterBlockId_ {
// Block ID consists of 4 dwords
uint32_t items[4];
} GpuPrivCounterBlockId;
} // pm4_profile
#endif
+73
Просмотреть файл
@@ -0,0 +1,73 @@
#include "gpu_counter.h"
using namespace pm4_profile;
namespace pm4_profile {
// Human-readable messages for GpuCounter error codes, indexed by error code
// (4 entries, each at most 63 characters plus NUL).
// NOTE(review): GpuCounter::getErrorString bounds its index by
// kHsaCounterErrorCodeMax, not by the size of this table - confirm the enum
// has exactly as many codes as there are messages here.
static char error_string[][64] = {
{"No error"}, {"Counter generic error"}, {"Counter is already set"}, {"Counter not ready"},
};
// Constructs a disabled counter with no result, no owning block and no
// pending error, and allocates its parameter set.
//
// Every scalar member is given a defined value here: isResultReady(),
// getResult(), getCounterBlock() and getLastError() all return members
// directly, so leaving them unset would hand indeterminate values to callers.
GpuCounter::GpuCounter() : Counter() {
  counter_enabled_ = false;
  is_result_ready_ = false;
  result_ = 0;
  counter_block_ = NULL;
  error_code_ = kHsaCounterErrorCodeNoError;
  parameter_set_ = new ParameterSet();
}
// Releases the parameter set allocated by the constructor. The owning
// counter block (counter_block_) is not owned by the counter and is left
// untouched.
GpuCounter::~GpuCounter() {
  delete parameter_set_;
}
// Copies the stored counter value into *p_result.
// Returns false (and writes nothing) when p_result is NULL, true otherwise.
bool GpuCounter::getResult(uint64_t* p_result) {
  const bool has_destination = (p_result != NULL);
  if (has_destination) {
    *p_result = result_;
  }
  return has_destination;
}
// Records the counter block this counter belongs to.
// A NULL block is rejected: the function returns false and the previously
// stored block is kept. Returns true once the block has been stored.
bool GpuCounter::setCounterBlock(pm4_profile::CounterBlock* p_cntr_group) {
  if (p_cntr_group == NULL) {
    return false;
  }

  counter_block_ = p_cntr_group;
  return true;
}
// Returns the counter block last stored through setCounterBlock().
pm4_profile::CounterBlock* GpuCounter::getCounterBlock() {
  pm4_profile::CounterBlock* owner = counter_block_;
  return owner;
}
// Enables (b == true) or disables (b == false) this counter.
// Always returns true; no validation of the counter is performed yet.
bool GpuCounter::setEnable(bool b) {
  // TODO: Validate counter
  const bool requested_state = b;
  counter_enabled_ = requested_state;
  return true;
}
void GpuCounter::setResult(uint64_t result) { result_ = result; }
// Returns the error code recorded by the most recent operation that updates
// it (setParameter in this file).
int GpuCounter::getLastError() {
  const int last_error = error_code_;
  return last_error;
}
std::string GpuCounter::getErrorString(int error) {
if ((error >= 0) && (error < kHsaCounterErrorCodeMax)) {
std::string err_string(error_string[error]);
return err_string;
}
return "Incorrect error index";
}
// Forwards the query to the counter's parameter set and returns its result
// unchanged; ret_size and *pp_data are filled in by the parameter set.
bool GpuCounter::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
  const bool found = parameter_set_->getParameter(param, ret_size, pp_data);
  return found;
}
// Stores a parameter in the counter's parameter set.
//
// The last-error code is reset to "no error" before the attempt. If the
// parameter set rejects the value, the error code becomes
// kHsaCounterErrorCodeAlreadySet (whatever the actual reason) and false is
// returned; otherwise true is returned.
bool GpuCounter::setParameter(uint32_t param, uint32_t param_size, const void* p_data) {
  error_code_ = kHsaCounterErrorCodeNoError;

  if (!parameter_set_->setParameter(param, param_size, p_data)) {
    error_code_ = kHsaCounterErrorCodeAlreadySet;
    return false;
  }

  return true;
}
}
+52
Просмотреть файл
@@ -0,0 +1,52 @@
#ifndef _GPU_COUNTER_H_
#define _GPU_COUNTER_H_
#include "hsa_perf.h"
#include "parameter_set.h"
#include <stdlib.h>
#include <stdint.h>
#include <list>
namespace pm4_profile {
// @brief This class represents a single GPU performance counter.
// A counter belongs to a counter block, can be enabled or disabled, carries
// its own parameter set, and holds the 64-bit result stored via setResult().
class GpuCounter : public pm4_profile::Counter {
public:
GpuCounter();
virtual ~GpuCounter();
// Returns the error code recorded by the last setParameter() call
virtual int getLastError();
// Returns a human-readable message for the given error code
virtual std::string getErrorString(int error);
// Copies the stored result into *p_result; fails if p_result is NULL
virtual bool getResult(uint64_t* p_result);
// Returns the block stored through setCounterBlock()
virtual pm4_profile::CounterBlock* getCounterBlock();
// Enables or disables the counter
virtual bool setEnable(bool b);
virtual bool isEnabled() { return counter_enabled_; }
virtual bool isResultReady() { return is_result_ready_; }
// Parameter access is forwarded to parameter_set_
virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
// Records the owning counter block; a NULL block is rejected
bool setCounterBlock(pm4_profile::CounterBlock* p_cntr_group);
// Stores the collected counter value
void setResult(uint64_t result);
private:
// True when the counter is enabled
bool counter_enabled_;
// Value reported by isResultReady()
bool is_result_ready_;
// Collected counter value
uint64_t result_;
// Parameter storage, allocated in the constructor and freed in the destructor
pm4_profile::ParameterSet* parameter_set_;
// Block this counter belongs to (not owned)
pm4_profile::CounterBlock* counter_block_;
// Last error code
uint32_t error_code_;
};
// List of counter pointers, used by GpuCounterBlock to hold the counters it
// has created.
typedef std::list<GpuCounter*> GpuCounterList;
}
#endif // _GPU_COUNTER_H_
+215
Просмотреть файл
@@ -0,0 +1,215 @@
#include "gpu_countergroup.h"
#include "gpu_counter.h"
#include "gpu_enum.h"
using namespace pm4_profile;
namespace pm4_profile {
// Human-readable messages for counter block error codes, indexed by error
// code (4 entries, each at most 63 characters plus NUL). The last message
// previously read "Unkown counter"; the spelling is corrected here.
static char error_string[][64] = {
    {"No error"}, {"Counter block error"}, {"Max counter reached"}, {"Unknown counter"}};
// Constructs an empty counter block: no counters, freshly allocated
// parameter and info sets, no cached counter array, and the block type set
// by _initCounterBlockType().
//
// error_code_ is given a defined value as well; 0 is the index of the
// "No error" entry in this file's error_string table.
GpuCounterBlock::GpuCounterBlock() : CounterBlock() {
  cntr_list_.clear();
  parameter_set_ = new ParameterSet();
  info_set_ = new InfoSet();
  error_code_ = 0;

  // No counter array has been handed out yet
  pp_cntrs_ = NULL;

  _initCounterBlockType();
}
// Destroys every counter created by this block, then releases the parameter
// set, the info set and the cached counter array.
GpuCounterBlock::~GpuCounterBlock() {
  for (GpuCounterList::iterator cur = cntr_list_.begin(); cur != cntr_list_.end(); ++cur) {
    // delete on a NULL pointer is a no-op, so no explicit check is needed
    delete *cur;
  }
  cntr_list_.clear();

  delete parameter_set_;
  delete info_set_;

  if (pp_cntrs_ != NULL) {
    free(pp_cntrs_);
    pp_cntrs_ = NULL;
  }
}
// Sets the block type to HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC.
// Called from the constructor.
void GpuCounterBlock::_initCounterBlockType() {
block_type_ = HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC;
}
// Creates a new counter owned by this block and returns it.
//
// Returns NULL when _checkMaxNumOfCounters() refuses another counter (the
// block's simultaneous-counter limit is reached or cannot be read). The new
// counter is linked back to this block, so Counter::getCounterBlock() returns
// a valid pointer without the caller having to call setCounterBlock() first.
// The counter is owned by the block and is released by destroyCounter(),
// destroyAllCounters() or the destructor.
Counter* GpuCounterBlock::createCounter() {
  if (!_checkMaxNumOfCounters()) {
    return NULL;
  }

  GpuCounter* p_cntr = new GpuCounter();
  // Plain 'new' reports failure by throwing; this check only matters for
  // builds where allocation failure yields a null pointer instead.
  if (!p_cntr) {
    return NULL;
  }

  p_cntr->setCounterBlock(this);
  cntr_list_.push_back(p_cntr);
  return (Counter*)p_cntr;
}
// Destroys one counter previously created by this block.
// Returns true when p_cntr was found in the block's list and deleted, false
// when p_cntr is NULL or does not belong to this block.
bool GpuCounterBlock::destroyCounter(Counter* p_cntr) {
  if (p_cntr == NULL) {
    return false;
  }

  // Locate the first list entry that refers to the requested counter
  GpuCounterList::iterator pos = cntr_list_.begin();
  while (pos != cntr_list_.end() && !(*pos == p_cntr)) {
    ++pos;
  }

  if (pos == cntr_list_.end()) {
    return false;
  }

  delete *pos;
  cntr_list_.erase(pos);
  return true;
}
bool GpuCounterBlock::destroyAllCounters() {
GpuCounterList::iterator it = cntr_list_.begin();
GpuCounterList::iterator it_end = cntr_list_.end();
for (; it != it_end; it++) {
if (*it) {
delete (*it);
}
}
cntr_list_.clear();
return true;
}
// Returns an array holding the currently enabled counters of this block.
//
// num receives the number of entries in the returned array. It is always
// written, including on the failure paths, so callers never read an unset
// count. Returns NULL (with num == 0) when the block has no counters, none
// are enabled, or the array cannot be allocated.
//
// The returned array is owned by the block: it is freed by the next call to
// getEnabledCounters()/getAllCounters() and by the destructor, so callers
// must not free it or keep it across those calls.
Counter** GpuCounterBlock::getEnabledCounters(uint32_t& num) {
  num = 0;

  if (pp_cntrs_) {
    free(pp_cntrs_);
    pp_cntrs_ = NULL;
  }

  // Nothing to report; also avoids malloc(0), whose result is
  // implementation-defined.
  if (cntr_list_.empty()) {
    return NULL;
  }

  pp_cntrs_ = (Counter**)malloc(sizeof(Counter*) * cntr_list_.size());
  if (!pp_cntrs_) {
    return NULL;
  }

  uint32_t cnt = 0;
  GpuCounterList::iterator it = cntr_list_.begin();
  GpuCounterList::iterator it_end = cntr_list_.end();
  for (; it != it_end; ++it) {
    GpuCounter* p_cntr = (*it);
    if (p_cntr->isEnabled()) {
      pp_cntrs_[cnt] = (Counter*)p_cntr;
      cnt++;
    }
  }

  num = cnt;
  if (0 == cnt) {
    // No enabled counters: release the unused array right away
    free(pp_cntrs_);
    pp_cntrs_ = NULL;
    return NULL;
  }

  return pp_cntrs_;
}
// Returns an array holding every counter of this block, enabled or not.
//
// num receives the number of entries in the returned array. It is always
// written, including on the failure paths, so callers never read an unset
// count. Returns NULL (with num == 0) when the block has no counters or the
// array cannot be allocated.
//
// The returned array is owned by the block: it is freed by the next call to
// getEnabledCounters()/getAllCounters() and by the destructor, so callers
// must not free it or keep it across those calls.
Counter** GpuCounterBlock::getAllCounters(uint32_t& num) {
  num = 0;

  if (pp_cntrs_) {
    free(pp_cntrs_);
    pp_cntrs_ = NULL;
  }

  // Nothing to report; also avoids malloc(0), whose result is
  // implementation-defined.
  if (cntr_list_.empty()) {
    return NULL;
  }

  pp_cntrs_ = (Counter**)malloc(sizeof(Counter*) * cntr_list_.size());
  if (!pp_cntrs_) {
    return NULL;
  }

  uint32_t cnt = 0;
  GpuCounterList::iterator it = cntr_list_.begin();
  GpuCounterList::iterator it_end = cntr_list_.end();
  for (; it != it_end; ++it, ++cnt) {
    pp_cntrs_[cnt] = (Counter*)*it;
  }

  num = cnt;
  return pp_cntrs_;
}
// Stores one piece of block information in the block's info set and returns
// the info set's result unchanged.
bool GpuCounterBlock::setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data) {
  const bool stored = info_set_->setInfo(blk_info, size, data);
  return stored;
}
// Reports whether another counter may be created in this block.
//
// Reads GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS from the info set and returns
// true only while the number of currently *enabled* counters is below that
// limit. Returns false when the limit is not available - either the lookup
// fails or it succeeds without providing a data pointer - instead of
// dereferencing a NULL pointer.
bool GpuCounterBlock::_checkMaxNumOfCounters() {
  uint32_t* p_num_max = NULL;
  uint32_t size = 0;

  if (!getInfo(GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, size, (void**)&p_num_max)) {
    return false;
  }

  if (p_num_max == NULL) {
    return false;
  }

  return _getNumOfEnabledCounters() < *p_num_max;
}
uint32_t GpuCounterBlock::_getNumOfEnabledCounters() {
uint32_t cnt = 0;
GpuCounterList::iterator it = cntr_list_.begin();
GpuCounterList::iterator it_end = cntr_list_.end();
for (; it != it_end; it++) {
GpuCounter* p_cntr = (*it);
bool is_enabled;
is_enabled = p_cntr->isEnabled();
if (is_enabled) {
cnt++;
}
}
return cnt;
}
std::string GpuCounterBlock::getErrorString(int error) {
if ((error >= 0) && (error < kHsaCounterBlockErrorCodeMaxError)) {
std::string err_string(error_string[error]);
return err_string;
}
return "incorrect error code";
}
// Forwards the query to the block's parameter set and returns its result
// unchanged; ret_size and *pp_data are filled in by the parameter set.
bool GpuCounterBlock::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
  const bool found = parameter_set_->getParameter(param, ret_size, pp_data);
  return found;
}
// Forwards the value to the block's parameter set and returns its result
// unchanged.
bool GpuCounterBlock::setParameter(uint32_t param, uint32_t param_size, const void* pData) {
  const bool stored = parameter_set_->setParameter(param, param_size, pData);
  return stored;
}
// Forwards the query to the block's info set and returns its result
// unchanged; ret_size and *pp_data are filled in by the info set.
bool GpuCounterBlock::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) {
  const bool found = info_set_->getInfo(info, ret_size, pp_data);
  return found;
}
}
+70
Просмотреть файл
@@ -0,0 +1,70 @@
#ifndef _GPU_COUNTER_GROUP_H_
#define _GPU_COUNTER_GROUP_H_
// This file contains the declaration of the GpuCounterBlock class.
#include "hsa_perf.h"
#include "gpu_counter.h"
#include "parameter_set.h"
#include "info_set.h"
#include "gpu_enum.h"
#include <stdlib.h>
#include <stdint.h>
namespace pm4_profile {
// GPU implementation of the pm4_profile::CounterBlock interface. One object
// represents one hardware block, which holds multiple performance counters.
//
// Parameter and info queries are forwarded to the ParameterSet / InfoSet
// objects referenced by parameter_set_ and info_set_.
//
// NOTE(review): the class holds raw pointers (parameter_set_, info_set_,
// pp_cntrs_) and declares a destructor but no copy constructor or copy
// assignment. Presumably instances are never copied - confirm, since a copy
// would share those pointers.
class GpuCounterBlock : public pm4_profile::CounterBlock {
public:
GpuCounterBlock();
~GpuCounterBlock();
// NOTE [Suravee] : The Pmu classes below are friends because they need to be
// able to set up the info of the counter block (setInfo is protected).
friend class CiPmu;
friend class ViPmu;
friend class AiPmu;
// Returns the text for a CounterBlock error code, or "incorrect error code"
// when the code is out of range.
std::string getErrorString(int error);
// CounterBlock interface: counter life-cycle management.
pm4_profile::Counter* createCounter();
virtual bool destroyCounter(pm4_profile::Counter* p_cntr);
virtual bool destroyAllCounters();
// CounterBlock interface: counter enumeration. num is an output.
virtual pm4_profile::Counter** getEnabledCounters(uint32_t& num);
virtual pm4_profile::Counter** getAllCounters(uint32_t& num);
// CounterBlock interface: forwarded to parameter_set_ / info_set_.
virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
virtual bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
protected:
void _initCounterBlockType();
// Stores one info entry for this block; forwarded to info_set_.
bool setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data);
// Type of this counter block.
hsa_ext_tools_counter_block_type_t block_type_;
private:
// True while fewer counters are enabled than
// GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS allows.
bool _checkMaxNumOfCounters();
// Number of counters in cntr_list_ that report isEnabled().
uint32_t _getNumOfEnabledCounters();
// Backing store for getParameter / setParameter.
pm4_profile::ParameterSet* parameter_set_;
// Backing store for getInfo / setInfo.
pm4_profile::InfoSet* info_set_;
// Counters that belong to this block.
GpuCounterList cntr_list_;
uint32_t error_code_;
// Pointer to the buffer used to hand a counter-pointer array back to callers
pm4_profile::Counter** pp_cntrs_;
};
} // pm4_profile
#endif // _GPU_COUNTER_GROUP_H_
+65
Просмотреть файл
@@ -0,0 +1,65 @@
#ifndef _GPU_ENUM_H_
#define _GPU_ENUM_H_
namespace pm4_profile {
// Identifiers of the information entries kept for a GPU hardware block.
// The numeric values are spelled out so that they stay stable if an entry
// is ever inserted or moved.
enum GPU_BLK_INFOS {
  GPU_BLK_INFO_BLOCK_NAME = 0,
  GPU_BLK_INFO_ID = 1,
  GPU_BLK_INFO_MAX_SHADER_ENGINE_COUNT = 2,
  GPU_BLK_INFO_MAX_SHADER_ARRAY_COUNT = 3,
  GPU_BLK_INFO_MAX_INSTANCE_COUNT = 4,
  GPU_BLK_INFO_CONTROL_METHOD = 5,
  GPU_BLK_INFO_MAX_EVENT_ID = 6,
  GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS = 7,
  GPU_BLK_INFO_MAX_STREAMING_COUNTERS = 8,
  GPU_BLK_INFO_SHARED_HW_COUNTERS = 9,
  GPU_BLK_INFO_HAS_FILTERS = 10,

  // Trace-specific entries
  GPU_TRC_BLK_INFO_BUFFER_SIZE = 11,
  GPU_TRC_BLK_INFO_BUFFER_WRITE_POINTER_OFFSET = 12,
  GPU_TRC_BLK_INFO_BUFFER_WRAPPED = 13,
  GPU_TRC_BLK_INFO_DATA_SIZE_ESTIMATE = 14,
  GPU_TRC_BLK_INFO_DATA_POINTER = 15,
};
/**
 * Trace buffer parameters of a GPU block.
 */
enum GPU_BLK_PARAMS {
  // Lets the user specify the size of the trace buffers.
  GPU_BLK_PARAM_TRACE_BUFFER_SIZE = 0,

  // If this functionality gets implemented, it will let the user specify
  // the number of trace buffers to create.
  GPU_BLK_PARAM_TRACE_BUFFER_ARRAY = 1,

  // Specifies whether a new trace buffer should be used for each cmd buffer,
  // which allows better correlation of the data back to the host application.
  // - Enabled without an explicit TRACE_BUFFER_ARRAY: the driver should
  //   automatically allocate additional buffers as needed, so that as much
  //   of the application as possible is traced until the PerfExperiment ends.
  // - Enabled with a TRACE_BUFFER_ARRAY: only as many buffers as specified
  //   should be created. If more cmd buffers are submitted than there are
  //   trace buffers, the later cmd buffers should not be traced.
  GPU_BLK_PARAM_TRACE_NEW_BUFFER_ON_SUBMIT = 2,
};
// Identifiers of the parameters that can be attached to a GPU counter.
// The numeric values are spelled out so that they stay stable if an entry
// is ever inserted or moved.
enum GPU_CNTR_PARAMS {
  GPU_CNTR_PARAM_SHADERENGINE_ID = 0,
  GPU_CNTR_PARAM_SHADERARRAY_ID = 1,
  GPU_CNTR_PARAM_INSTANCE_ID = 2,
  GPU_CNTR_PARAM_EVENT_SELECT_ID = 3,
  GPU_CNTR_PARAM_SIMD_MASK = 4,
  GPU_CNTR_PARAM_PERF_MODE = 5,
  GPU_CNTR_PARAM_TRACE_TYPE = 6,
};
}
#endif
+436
Просмотреть файл
@@ -0,0 +1,436 @@
#ifndef _HSA_PERF_H_
#define _HSA_PERF_H_
#include "rocr_profiler.h"
#if !defined(AMD_AMP_HSA_INCLUDES)
#include <map>
#include <string>
#include <stdlib.h>
#include <stdint.h>
#endif
namespace pm4_profile {
class Pmu;
class Counter;
class CounterBlock;
class TraceGroup;
class CommandWriter;
class DefaultCmdBuf;
// @brief Abstract interface of a CounterBlock. Each CounterBlock contains a
// set of Counters that often belong to the same functional unit.
//
// For AMD GPU, this can represent the Counters of one HW block
// (e.g. SQ, SQI, CP, etc.).
// For AMD CPU, this can represent blocks of core PMCs, NB PMCs, L2I PMCs
// on each CPU device.
//
// Generally, CounterBlocks are created and initialized by the \ref Pmu class.
// Users can query them by calling \ref Pmu::getAllCounterBlocks() or
// \ref Pmu::getCounterBlockById(). A CounterBlock is enabled if it contains
// enabled Counters.
//
// Users can manage the Counters of each CounterBlock (create, destroy,
// enable and disable). To obtain a Counter, call \ref createCounter(). It
// can then be enabled or disabled using \ref Counter::setEnable(). When a
// Counter is enabled, the CounterBlock checks that the newly enabled Counter
// is valid and does not conflict with the Counters already in the block.
class CounterBlock {
public:
typedef enum HsaCounterBlockErrorCode {
// No error
kHsaCounterBlockErrorCodeNoError = 0x0,
// Generic CounterBlock error
kHsaCounterBlockErrorCodeGenericError,
// The maximum number of Counters in the block is reached.
kHsaCounterBlockErrorCodeMaxNumCounterReached,
// The counter does not belong to this block.
kHsaCounterBlockErrorCodeUnknownCounter,
// Number of error codes (sentinel, not an error itself)
kHsaCounterBlockErrorCodeMaxError
} HsaCounterBlockErrorCode;
// Destructor of CounterBlock.
virtual ~CounterBlock() {}
// Given an error number returned from a function call, retrieve the
// corresponding STL string.
// @param[in] error The error code to translate.
// Return An STL string containing the text for the error number.
// NOTE(review): this comment used to promise an empty string for an invalid
// code, but GpuCounterBlock::getErrorString returns "incorrect error code"
// in that case - reconcile the contract with the implementation.
virtual std::string getErrorString(int error) = 0;
// Create a Counter object and return a pointer to it to the caller.
// Return On success, this function returns a pointer to the Counter.
// On failure, this function returns NULL.
// Possible error codes are:
// kHSAPerfErrorCodesUnmodifiableState
// kHsaCounterBlockErrorCodeMaxNumCounterReached
virtual Counter* createCounter() = 0;
// Destroy the Counter. The CounterBlock which owns the Counter must be in
// the disabled state.
// @param[in] p_counter The Counter to destroy.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidAargs
// kHSAPerfErrorCodesUnmodifiableState
// kHsaCounterBlockErrorCodeUnknownCounter
virtual bool destroyCounter(Counter* p_counter) = 0;
// Destroy all counters in the block. The CounterBlock must be in the
// disabled state.
// Return true or false.
// Possible error codes are:
// kHSAPerfErrorCodesUnmodifiableState
virtual bool destroyAllCounters() = 0;
// Get a list of pointers to the enabled Counters in this CounterBlock.
// note The Counters must have been created by the same CounterBlock object
// using createCounter().
// @param[out] num The number of Counter pointers returned.
// Return
// a list of pointers to the enabled Counters, or
// NULL if no counter is enabled.
virtual Counter** getEnabledCounters(uint32_t& num) = 0;
// Get a list of pointers to all Counters in this CounterBlock.
// note The Counters must have been created by the same CounterBlock object
// using createCounter().
// @param[out] num The number of Counter pointers returned.
// Return
// a list of pointers to the Counters in the CounterBlock, or NULL.
// NOTE(review): the previous text said "NULL if no counter is enabled",
// which looks copied from getEnabledCounters(); presumably NULL means the
// block has no counters at all - confirm against the implementations.
virtual Counter** getAllCounters(uint32_t& num) = 0;
// Query the value of the parameter specified by param.
// @param[in] param The enumeration of the parameter to be queried
// @param[out] return_size The returned size of data
// @param[out] pp_data The pointer to the returned data. The API is
// responsible for managing the memory to store the information as specified
// by return_size.
//
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidParam
// kHSAPerfErrorCodesInvalidParamSize
// kHSAPerfErrorCodesInvalidParamData
virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
// Set the value of the parameter specified by param.
// @param[in] param The enumeration of the parameter to be set
// @param[in] param_size The size of data
// @param[in] p_data The pointer to the data to be set. Users are responsible
// for deallocating the memory of p_data after calling the API.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesUnmodifiableState
// kHSAPerfErrorCodesInvalidParam
// kHSAPerfErrorCodesInvalidParamSize
// kHSAPerfErrorCodesInvalidParamData
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
// Query the value of the information specified by info.
// @param[in] info The enumeration of the information to be queried
// @param[out] return_size The returned size of data
// @param[out] pp_data The pointer to the returned data
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidInfo
// kHSAPerfErrorCodesInvalidInfoSize
// kHSAPerfErrorCodesInvalidInfoData
virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0;
}; // class CounterBlock
// Abstract interface of a TraceGroup. TraceGroup inherits from CounterBlock
// and adds an interface for managing trace buffers. It also supports
// user-data insertion into the trace. This allows users to insert arbitrary
// data (e.g. markers) into the trace, which can be used to correlate specific
// events with the collected trace data.
class TraceGroup : public CounterBlock {
public:
typedef enum HsaTraceGroupErrorCode {
// Generic TraceGroup error
HsaTraceGroupErrorCodeGenericError = 0x100,
} HsaTraceGroupErrorCode;
// Destructor of TraceGroup.
virtual ~TraceGroup() {}
// Obtains the number of buffers which were collected as part of
// the trace.
// Return The number of collected buffers.
virtual uint32_t getCollectedBufferCount() = 0;
// Locks a trace buffer for host access.
// @param[in] buffer_id The index of the buffer to be locked.
// Return true or false
virtual bool lock(uint32_t buffer_id) = 0;
// Unlocks a trace buffer that was previously locked.
// @param[in] buffer_id The index of the buffer to be unlocked.
// Return true or false
virtual bool unlock(uint32_t buffer_id) = 0;
// Inserts data (e.g. a trace marker) into the trace.
// @param[in] type The type of data to be inserted.
// @param[in] p_data The data to be inserted.
// @param[in] data_size The size of the data to be inserted.
// Return true or false
virtual bool insertUserData(uint32_t type, void* p_data, uint32_t data_size) = 0;
}; // class TraceGroup
// Abstract interface of a performance Counter.
// Users can obtain a Counter from \ref CounterBlock::createCounter().
// Once obtained, users can set up the Counter parameters and enable it using
// \ref Counter::setEnable().
//
// There are several types of Counter as defined in \ref
// HsaCounterBlockTypeMask.
// Only a supported Counter type can be added to the CounterBlock.
//
// Each Counter can store Counter-specific parameters. The Counter is used to
// specify the type of event to be counted.
class Counter {
public:
typedef enum HsaCounterErrorCode {
// No error
kHsaCounterErrorCodeNoError = 0x0,
// Generic Counter error
kHsaCounterErrorCodeGenericError = 0x1,
// NOTE(review): the original comment read "Counter already error";
// presumably the value being set was already set - confirm.
kHsaCounterErrorCodeAlreadySet = 0x2,
// Counter result is not ready.
kHsaCounterErrorCodeResultNotReady = 0x3,
// Number of Counter error codes (sentinel, not an error itself)
kHsaCounterErrorCodeMax,
} HsaCounterErrorCode;
// Destructor of Counter
virtual ~Counter() {}
// Retrieve the last error code generated. This should be checked when
// values returned are NULL or void.
// Return an integer corresponding to the last error reported.
virtual int getLastError() = 0;
// Given an error number reported by getLastError or returned from a
// function call, retrieve the corresponding STL string.
// @param[in] error The error code obtained from getLastError or from the
// return code of a function call.
// Return An STL string containing the text for the error number.
// Documented contract: an invalid error code yields an empty string
// (implementations are not visible here to confirm this).
virtual std::string getErrorString(int error) = 0;
// Get the \ref CounterBlock which owns this counter.
// Return
// On success, it returns a pointer to the CounterBlock.
// On failure, it returns NULL.
virtual CounterBlock* getCounterBlock() = 0;
// Enable or disable the Counter.
// @param[in] b Set to true to enable the Counter, false to disable it.
// Return
// true when the state was set successfully, false otherwise.
// If the current state already equals the requested value,
// the API returns true.
// Possible error codes are:
// kHSAPerfErrorCodesUnmodifiableState
virtual bool setEnable(bool b) = 0;
// Return the current enabled state of the Counter.
// Return true or false
virtual bool isEnabled() = 0;
// Return whether the result of this Counter is available.
// Return true or false
virtual bool isResultReady() = 0;
// Query the Counter result.
// note Must be implemented by derived classes
// @param[out] p_result The pointer receiving the returned result.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidAargs
// kHsaCounterErrorCodeResultNotReady
virtual bool getResult(uint64_t* p_result) = 0;
// Query the value of the parameter specified by param.
// @param[in] param The enumeration of the parameter to be queried
// @param[out] return_size The returned size of data
// @param[out] pp_data The pointer to the returned data. The API is
// responsible for managing the memory to store the information as
// specified by return_size.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidParam
// kHSAPerfErrorCodesInvalidParamSize
// kHSAPerfErrorCodesInvalidParamData
virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
// Set the value of the parameter specified by param.
// @param[in] param The enumeration of the parameter to be set
// @param[in] param_size The size of data
// @param[in] p_data The pointer to the data to be set. Users are responsible
// for deallocating the memory of p_data after calling the API.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesUnmodifiableState
// kHSAPerfErrorCodesInvalidParam
// kHSAPerfErrorCodesInvalidParamSize
// kHSAPerfErrorCodesInvalidParamData
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
}; // class Counter
// Abstract interface of a PMU. It gives access to the PMU's CounterBlocks,
// to its profiling state, and to the begin()/end() calls that start and stop
// profiling.
class Pmu {
public:
// Enumeration of Pmu error codes
typedef enum HsaPmuErrorCode {
// No error
kHsaPmuErrorCodeNoError = 0x0,
// Unknown CounterBlock ID
kHsaPmuErrorCodeUnknownCounterBlockId,
// No CounterBlock exists
kHsaPmuErrorCodeNoCounterBlock,
// The previous operation is not valid. This could be due to an
// invalid transition from the current state.
kHsaPmuErrorCodeInvalidOperation,
// PMU is not currently available (e.g. PMU is currently
// in use by others)
kHsaPmuErrorCodeNotAvailable,
// NOTE(review): the original comment duplicated the NotAvailable text;
// presumably this code means the PMU is in an error state - confirm.
kHsaPmuErrorCodeErrorState,
// Waiting for the PMU result timed out
kHsaPmuErrorCodeTimeOut,
// Number of Pmu error codes (sentinel, not an error itself)
kHsaPmuErrorCodeMax
} HsaPmuErrorCode;
// Destructor of PMU.
// note This stops the performance counters if running and releases
// any resources used by the PMU.
virtual ~Pmu() {}
// Retrieve the last error code generated. This should be checked when
// values returned are NULL or void.
// Return an integer corresponding to the last error reported.
virtual int getLastError() = 0;
// Given an error number reported by getLastError or returned from a
// function call, retrieve the corresponding STL string.
// @param[in] error The error code obtained from getLastError or from the
// return code of a function call.
// Return An STL string containing the text for the error number.
// Documented contract: an invalid error code yields an empty string
// (implementations are not visible here to confirm this).
virtual std::string getErrorString(int error) = 0;
// Get a CounterBlock from its ID.
// @param[in] id ID of the target CounterBlock
// Return
// On success, it returns a pointer to the specified CounterBlock.
// On failure, it returns NULL.
// Possible error codes are:
// kHsaPmuErrorCodeUnknownCounterBlockId.
virtual CounterBlock* getCounterBlockById(uint32_t id) = 0;
// Get all available CounterBlocks.
// @param[out] num_block The returned number of CounterBlocks
// Return On success, it returns an array of CounterBlock pointers.
// On failure, it returns NULL.
virtual CounterBlock** getAllCounterBlocks(uint32_t& num_block) = 0;
// Get the current PMU profiling state.
// Return The PMU profiling state as a rocr_pmu_state_t value.
virtual rocr_pmu_state_t getCurrentState() = 0;
// Start profiling on the PMU.
// @param[in] cmdBuff Command buffer passed to the implementation
// (presumably where the start commands are recorded - confirm).
// @param[in] cmdWriter Command writer passed to the implementation
// (presumably used to build those commands - confirm).
// @param[in] reset Indicates whether to reset the counters before
// recording. The default is to reset them.
// note This function must be implemented by children classes.
// Return true or false
// Possible error codes are:
// kHsaPmuErrorCodeInvalidOperation
// kHsaPmuErrorCodeNotAvailable
virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true) = 0;
// Stop profiling on the PMU.
// @param[in] cmdBuff Command buffer passed to the implementation.
// @param[in] cmdWriter Command writer passed to the implementation.
// note This function must be called after \ref begin().
// note This function must be implemented by children classes.
// Return true or false
// Possible error codes are:
// kHsaPmuErrorCodeInvalidOperation
virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) = 0;
// Sets the buffer used to collect PMC data.
// @param[in] pmcBuffer The buffer pointer
// @param[in] pmcBuffSz Size of the buffer in bytes
// Return true or false
virtual bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz) = 0;
// Query the value of the parameter specified by param.
// @param[in] param The enumeration of the parameter to be queried
// @param[out] return_size The returned size of data
// @param[out] pp_data The pointer to the returned data. The API is
// responsible for managing the memory to store the information as
// specified by return_size.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidParam
// kHSAPerfErrorCodesInvalidParamSize
// kHSAPerfErrorCodesInvalidParamData
virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
// Set the value of the parameter specified by param.
// @param[in] param The enumeration of the parameter to be set
// @param[in] param_size The size of data
// @param[in] p_data The pointer to the data to be set. Users are responsible
// for deallocating the memory of p_data after calling the API.
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesUnmodifiableState
// kHSAPerfErrorCodesInvalidParam
// kHSAPerfErrorCodesInvalidParamSize
// kHSAPerfErrorCodesInvalidParamData
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
// Query the value of the information specified by info.
// @param[in] info The enumeration of the information to be queried
// @param[out] return_size The returned size of data
// @param[out] pp_data The pointer to the returned data
// Return true or false
// Possible error codes are:
// kHSAPerfErrorCodesInvalidInfo
// kHSAPerfErrorCodesInvalidInfoSize
// kHSAPerfErrorCodesInvalidInfoData
virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0;
// Returns the number of shader engines per block,
// for the blocks that feature shader-engine instancing.
virtual uint32_t getNumSe() = 0;
}; // class Pmu
} // pm4_profile
#endif // _HSA_PERF_H_
+74
Просмотреть файл
@@ -0,0 +1,74 @@
#include "info_set.h"
#include "var_data.h"
using namespace std;
namespace pm4_profile {
// Builds an empty InfoSet: no info entries and no query buffer yet.
InfoSet::InfoSet() : p_data_(NULL) {
  releaseParameters();
  info_table_.clear();
}
// Clears every stored entry, empties the table and releases the buffer that
// the last getInfo() call handed out.
InfoSet::~InfoSet() {
  releaseParameters();
  info_table_.clear();
  if (p_data_ != NULL) {
    free(p_data_);
    p_data_ = NULL;
  }
}
bool InfoSet::setInfo(uint32_t info, uint32_t info_size, void* p_data) {
if (info_table_.end() != info_table_.find(info)) {
return false;
}
VarData data;
if (!data.set(info_size, p_data)) {
return false;
}
info_table_.insert(VarDataMap::value_type(info, data));
return true;
}
// Copies the value stored for info into a buffer owned by this InfoSet and
// hands that buffer to the caller.
// @param[in] info Key of the entry to query.
// @param[out] ret_size Receives the value returned by VarData::get().
// @param[out] pp_data Receives the address of the internal buffer. The
// buffer stays owned by the InfoSet: it is freed by the next successful
// lookup and by the destructor, so callers must not free it or keep the
// pointer across calls.
// Returns false when pp_data is NULL, the table is empty, the key is
// unknown, the stored size is not positive, or the allocation fails.
bool InfoSet::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) {
  if (!pp_data || (0 == info_table_.size())) {
    return false;
  }

  VarDataMap::iterator it = info_table_.find(info);
  if (it == info_table_.end()) {
    return false;
  }

  // A negative value would be converted to a huge size_t by malloc, so it is
  // rejected together with zero.
  int size = it->second.getSize();
  if (size <= 0) {
    return false;
  }

  // free(NULL) is a no-op, so no guard is needed for the first call.
  free(p_data_);
  p_data_ = malloc(size);
  if (!p_data_) {
    return false;
  }

  *pp_data = p_data_;
  // Reuse the iterator instead of looking the key up again via operator[].
  ret_size = it->second.get(size, *pp_data);
  return true;
}
void InfoSet::releaseParameters() {
VarDataMap::iterator it = info_table_.begin();
VarDataMap::iterator table_end = info_table_.end();
for (; it != table_end; it++) {
it->second.clear();
}
return;
}
} // pm4_profile
+48
Просмотреть файл
@@ -0,0 +1,48 @@
#ifndef _INFO_SET_H_
#define _INFO_SET_H_
// This file contains the declaration of the InfoSet class.
#include "hsa_perf.h"
#include "var_data.h"
#include <stdlib.h>
#include <stdint.h>
namespace pm4_profile {
// A container holding an information data set (e.g. PMU info, CounterGroup
// info, etc.), keyed by a 32-bit info identifier.
// NOTE(review): the original comment called this class abstract and said
// that only children of the class may set information, but the class has no
// pure virtual members and setInfo() is declared public - confirm the
// intended access rules.
// NOTE(review): the class owns p_data_ (freed in the destructor) but
// declares no copy constructor or copy assignment; presumably instances are
// never copied - confirm.
class InfoSet {
public:
// InfoSet constructor
InfoSet();
// InfoSet destructor; frees the buffer handed out by getInfo()
virtual ~InfoSet();
// Query the value of the information specified by info.
// @param[in] info The enumeration of the information to be queried
// @param[out] ret_size The returned size of data
// @param[out] pp_data The pointer to the returned data. The buffer is owned
// by the InfoSet and is freed by the next successful getInfo() call and by
// the destructor.
// Return true or false
bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
// Set the value of the information specified by info. Fails if a value for
// info has already been set.
// @param[in] info The enumeration of the information to be set
// @param[in] info_size The size of data
// @param[in] p_data The pointer to the data to be set
// Return true or false
bool setInfo(uint32_t info, uint32_t info_size, void* p_data);
private:
// Clear the data of every entry in the info table
void releaseParameters();
// InfoSet property: The info table
VarDataMap info_table_;
// Pointer to the buffer used in getInfo
void* p_data_;
};
}
#endif
+74
Просмотреть файл
@@ -0,0 +1,74 @@
#include "parameter_set.h"
using namespace std;
namespace pm4_profile {
// Builds an empty ParameterSet: no parameters and no query buffer yet.
ParameterSet::ParameterSet() : p_data_(NULL) {
  releaseParameters();
  param_table_.clear();
}
// Clears every stored parameter, empties the table and releases the buffer
// that the last getParameter() call handed out.
ParameterSet::~ParameterSet() {
  releaseParameters();
  param_table_.clear();
  if (p_data_ != NULL) {
    free(p_data_);
    p_data_ = NULL;
  }
}
bool ParameterSet::setParameter(uint32_t param, uint32_t param_size, const void* p_data) {
if (param_table_.end() != param_table_.find(param)) {
return false;
}
VarData data;
if (!data.set(param_size, p_data)) {
return false;
}
param_table_.insert(VarDataMap::value_type(param, data));
return true;
}
// Copies the value stored for param into a buffer owned by this ParameterSet
// and hands that buffer to the caller.
// @param[in] param Key of the parameter to query.
// @param[out] ret_size Receives the value returned by VarData::get().
// @param[out] pp_data Receives the address of the internal buffer. The
// buffer stays owned by the ParameterSet: it is freed by the next successful
// lookup and by the destructor, so callers must not free it or keep the
// pointer across calls.
// Returns false when pp_data is NULL, the table is empty, the key is
// unknown, the stored size is not positive, or the allocation fails.
bool ParameterSet::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
  if (!pp_data || (0 == param_table_.size())) {
    return false;
  }

  VarDataMap::iterator it = param_table_.find(param);
  if (it == param_table_.end()) {
    return false;
  }

  // A negative value would be converted to a huge size_t by malloc, so it is
  // rejected together with zero.
  int size = it->second.getSize();
  if (size <= 0) {
    return false;
  }

  // for NULL pointer, free does nothing
  free(p_data_);
  p_data_ = malloc(size);
  if (!p_data_) {
    return false;
  }

  // store the pointer to be freed
  *pp_data = p_data_;
  // Reuse the iterator instead of looking the key up again via operator[].
  ret_size = it->second.get(size, *pp_data);
  return true;
}
bool ParameterSet::releaseParameters() {
VarDataMap::iterator it = param_table_.begin();
VarDataMap::iterator table_end = param_table_.end();
for (; it != table_end; it++) {
it->second.clear();
}
return true;
}
} // pm4_profile
+75
Просмотреть файл
@@ -0,0 +1,75 @@
#ifndef _PARAMETER_SET_H_
#define _PARAMETER_SET_H_
/*!
\note This file contains the declaration of the ParameterSet class.
*/
#include "hsa_perf.h"
#include "var_data.h"
#include <stdlib.h>
#include <stdint.h>
namespace pm4_profile {
/*!
A container holding a parameter data set
(e.g. PMU parameters, CounterGroup parameters, etc.), keyed by a 32-bit
parameter identifier.
NOTE(review): the class owns p_data_ (freed in the destructor) but declares
no copy constructor or copy assignment; presumably instances are never
copied - confirm.
*/
class ParameterSet {
public:
/*!
Enumeration containing types of parameters
*/
enum parameter {
PARAM_MAX,
};
/*! ParameterSet constructor */
ParameterSet();
/*! ParameterSet destructor; frees the buffer handed out by getParameter() */
virtual ~ParameterSet();
/*!
Query the value of the parameter specified by param
@param[in] param The enumeration of the parameter to be queried
@param[out] ret_size The returned size of data
@param[out] pp_data The pointer to the returned data. The buffer is owned
by the ParameterSet and is freed by the next successful getParameter()
call and by the destructor.
\return true or false
*/
bool getParameter(
/*in*/ uint32_t param,
/*out*/ uint32_t& ret_size,
/*out*/ void** pp_data);
/*!
Set the value of the parameter specified by param. Fails if a value for
param has already been set.
@param[in] param The enumeration of the parameter to be set
@param[in] param_size The size of data
@param[in] p_data The pointer to the data to be set
\return true or false
*/
bool setParameter(
/*in*/ uint32_t param,
/*in*/ uint32_t param_size,
/*in*/ const void* p_data);
private:
/*!
Clear the data of every entry in the parameter table. Always returns true.
*/
bool releaseParameters();
/*!
ParameterSet property: The parameter table
*/
VarDataMap param_table_;
/*!
Pointer to the buffer used in getParameter
*/
void* p_data_;
};
}
#endif // _PARAMETER_SET_H_
+254
Просмотреть файл
@@ -0,0 +1,254 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _ROCR_PROFILER_H_
#define _ROCR_PROFILER_H_
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
// Symbol-visibility macros.
// HSA_TOOLS_API marks a symbol as exported from the library:
//   - Windows / Cygwin: dllexport (attribute syntax for GCC, __declspec
//     otherwise);
//   - other platforms with GCC >= 4: default visibility;
//   - anything else: expands to nothing.
// DLL_LOCAL marks a symbol as hidden. It only expands to something for
// GCC >= 4 on non-Windows platforms, and is never redefined if the includer
// has already defined it.
#if defined _WIN32 || defined __CYGWIN__
#ifdef __GNUC__
#define HSA_TOOLS_API __attribute__((dllexport))
#else
#define HSA_TOOLS_API __declspec(dllexport) // Note: GCC appears to
// support this syntax
// as well.
#endif
#ifndef DLL_LOCAL
#define DLL_LOCAL
#endif
#else // defined _WIN32 || defined __CYGWIN__
#if __GNUC__ >= 4
#define HSA_TOOLS_API __attribute__((visibility("default")))
#ifndef DLL_LOCAL
#define DLL_LOCAL __attribute__((visibility("hidden")))
#endif
#else
#define HSA_TOOLS_API
#ifndef DLL_LOCAL
#define DLL_LOCAL
#endif
#endif
#endif // defined _WIN32 || defined __CYGWIN__
//---------------------------------------------------------------------------//
// @brief Enumeration of the parameters that can be set on a counter.        //
// @details A counter object uses these identifiers to specify its type and  //
//          the other conditions needed to retrieve a counter value.         //
//---------------------------------------------------------------------------//
typedef enum hsa_ext_tools_counter_parameter_s {
  // Event index of a counter
  HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX = 0,

  // Simd mask of a counter
  HSA_EXT_TOOLS_COUNTER_PARAMETER_SIMD_MASK = 1,

  // Shader engine mask of a counter
  HSA_EXT_TOOLS_COUNTER_PARAMETER_SHADER_MASK = 2,

  // Number of counter parameters (sentinel)
  HSA_EXT_TOOLS_COUNTER_PARAMETER_INFO_MAX = 3
} hsa_ext_tools_counter_parameter_t;
//---------------------------------------------------------------------------//
// @brief Enumeration of counter block types.                                //
// @details These values identify the types of counter block supported by    //
// HSA. A counter block object uses them to specify its type.                //
//---------------------------------------------------------------------------//
typedef enum hsa_ext_tools_counter_block_type_s {
  // Unknown counter block type
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_UNKNOWN = 0,

  // A CounterBlock of this type can be accessed at any time.
  // note Examples are software Counters and CPU Counters.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_SYNC = 1,

  // A CounterBlock of this type can be accessed asynchronously.
  // The Counter must be stopped before it is accessed.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC = 2,

  // A CounterBlock of this type is used for generating a trace.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_TRACE = 3,

  // Number of CounterBlock types (sentinel)
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_MAX = 4
} hsa_ext_tools_counter_block_type_t;
//---------------------------------------------------------------------------//
// @brief Enumeration of various information that is set for a counter block.//
// @detail This enumeration defines the various info that could be used //
// in a counter block. This is used by a counter object to specify its type //
// and other conditions that are needed for a counter block. //
//---------------------------------------------------------------------------//
/*
typedef enum hsa_ext_tools_counter_block_info_s {
// Index of a counter block
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_EVENT_INDEX = 0,
// Shader bits of a counter block
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_SHADER_BITS = 1,
// Simd mask of a counter
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_CONTROL_METHOD = 2,
// Max index of counter block info
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_MAX
} hsa_ext_tools_counter_block_info_t;
*/
//---------------------------------------------------------------------------//
// Enumeration for the methods used to index into the correct registers. //
//---------------------------------------------------------------------------//
/*
typedef enum hsa_ext_tools_counter_index_method_s {
// No index
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_NONE = 0,
// Index by block instance
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_INSTANCE = 1,
// Index by shader engine
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE = 2,
// Index by shader and instance
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE_ANDINSTANCE = 3
} hsa_ext_tools_counter_index_method_t;
*/
//---------------------------------------------------------------------------//
// Enumeration for the HSAPerf generic error codes //
//---------------------------------------------------------------------------//
/*
typedef enum hsa_ext_tools_error_codes_s {
// Successful
HSA_EXT_TOOLS_ERROR_CODE_OK = 0,
// Generic error code
HSA_EXT_TOOLS_ERROR_CODE_ERROR,
// Generic invalid HSAPerf API arguments
HSA_EXT_TOOLS_ERROR_CODE_INVALID_ARGS,
// The operation is not permit due to currently in the unmodifiable
// HSAPerf state .
HSA_EXT_TOOLS_ERROR_CODE_UNMODIFIABLE_STATE,
// The hsa_ext_tools_set_pmu_parameter() or
// hsa_ext_tools_get_pmu_parameter() API contains invalid parameter value.
HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM,
// The hsa_ext_tools_set_pmu_parameter() or
// hsa_ext_tools_get_pmu_parameter() API contains invalid parameter size
// or return size.
HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_SIZE,
// The hsa_ext_tools_set_pmu_parameter() or
// hsa_ext_tools_get_pmu_parameter() API contains invalid
// pointer (e.g. NULL).
HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_DATA,
// The hsa_ext_tools_get_pmu_info() API contains invalid info value.
HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO,
// The hsa_ext_tools_get_pmu_info() API contains invalid info
// size (e.g. zero).
HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_SIZE,
// The hsa_ext_tools_get_pmu_info() API contains invalid
// data (e.g. NULL).
HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_DATA
} hsa_ext_tools_error_codes_t;
*/
//---------------------------------------------------------------------------//
// Enumeration for Pmu profiling state //
//---------------------------------------------------------------------------//
typedef enum rocr_pmu_state_s {
  // IDLE: no profiling session is active, so the PMU, its counter blocks and
  // its counters may be reconfigured. Covers the time before begin is called
  // and the time after hsa_ext_tools_pmu_wait_for_completion().
  ROCR_PMU_STATE_IDLE = 0,
  // START: performance counter data is being collected; the PMU, counter
  // blocks and counters must not be changed. Covers the time between
  // hsa_ext_tools_pmu_begin() and hsa_ext_tools_pmu_end().
  ROCR_PMU_STATE_START = 1,
  // STOP: collection has ended but the result might not be available yet; the
  // PMU, counter blocks and counters still must not be changed. Covers the
  // time between hsa_ext_tools_pmu_end() and a successful return from
  // hsa_ext_tools_pmu_wait_for_completion().
  ROCR_PMU_STATE_STOP = 2
} rocr_pmu_state_t;
//---------------------------------------------------------------------------//
// Opaque pointer to HSA performance monitor unit (PMU) //
//---------------------------------------------------------------------------//
// typedef void * hsa_ext_tools_pmu_t;
//---------------------------------------------------------------------------//
// Opaque pointer to HSA counter block //
//---------------------------------------------------------------------------//
// typedef void * hsa_ext_tools_counter_block_t;
//---------------------------------------------------------------------------//
// Opaque pointer to HSA counter //
//---------------------------------------------------------------------------//
// typedef void * hsa_ext_tools_counter_t;
#ifdef __cplusplus
}
#endif // __cplusplus
#endif // _ROCR_PROFILER_H_
+48
Просмотреть файл
@@ -0,0 +1,48 @@
#include <string.h>
#include "var_data.h"
namespace pm4_profile {
// Construct an empty container: no buffer is held and the stored size is zero.
VarData::VarData() : size_(0), p_data_(NULL) {}
// NOTE(review): the destructor body is empty, so a buffer allocated by set()
// is released only when clear() is called explicitly. VarData declares no copy
// constructor or assignment operator, so copies share p_data_; freeing the
// buffer here without also giving the class deep-copy semantics would risk a
// double free.
VarData::~VarData() {}
// Release the held buffer, if there is one, and return to the empty state.
void VarData::clear() {
  if (p_data_ != NULL) {
    free(p_data_);
  }
  p_data_ = NULL;
  size_ = 0;
}
// Replace the stored value with a private copy of `size` bytes read from
// `p_data`.
//
// Returns false, leaving the currently stored value untouched, when `p_data`
// is NULL, `size` is zero, or the allocation fails. Returns true once the copy
// has been stored.
bool VarData::set(uint32_t size, const void* p_data) {
  if (p_data == NULL || size == 0) {
    return false;
  }

  // Allocate and fill the new buffer before releasing the old one, so that an
  // allocation failure does not destroy the value already held.
  void* fresh = malloc(size);
  if (fresh == NULL) {
    return false;
  }
  memcpy(fresh, p_data, size);

  clear();
  p_data_ = fresh;
  size_ = size;
  return true;
}
// Copy the stored value into the caller's buffer.
//
// `size` is the capacity of `p_data` in bytes. At most that many bytes are
// copied, and never more than are stored. Returns the number of bytes copied,
// or 0 when the destination is NULL or zero-sized, or when nothing is stored.
uint32_t VarData::get(uint32_t size, void* p_data) {
  const bool have_dest = (p_data != NULL) && (size != 0);
  const bool have_src = (p_data_ != NULL) && (size_ != 0);
  if (!have_dest || !have_src) {
    return 0;
  }

  uint32_t copy_len = size_;
  if (size < size_) {
    copy_len = size;
  }
  memcpy(p_data, p_data_, copy_len);
  return copy_len;
}
} // pm4_profile
+65
Просмотреть файл
@@ -0,0 +1,65 @@
#ifndef _VAR_DATA_H_
#define _VAR_DATA_H_
/*!
\note This file contains the declaration of the VarData class.
*/
#include "hsa_perf.h"
#include <map>
#include <stdlib.h>
#include <stdint.h>
namespace pm4_profile {
/*!
This class implements variable-size storage for information and parameter
sets. It holds a private, malloc-allocated copy of the bytes handed to set().

NOTE(review): no copy constructor or assignment operator is declared, so
copies of a VarData share the same p_data_ pointer, and the destructor
defined in var_data.cpp does not free it. Call clear() to release the buffer.
*/
class VarData {
 public:
  /*! Constructor for VarData; the object starts out empty. */
  VarData();

  /*! Destructor for VarData */
  ~VarData();

  /*! Deallocate the memory and clean up */
  void clear();

  /*!
  Set the data to be stored.
  @param[in] size Size (in bytes) of the data to be stored.
  @param[in] p_data Pointer to the data to be stored.
  \return true if the data was stored, false otherwise.
  */
  bool set(uint32_t size, const void* p_data);

  /*!
  Query the data that was stored.
  @param[in] size Size (in bytes) of the memory pointed to by p_data.
  This determines the maximum size of the returned data.
  @param[in,out] p_data Pointer to the result buffer.
  \return Size (in bytes) of the returned result which is copied into
  the buffer pointed to by p_data.
  */
  uint32_t get(uint32_t size, void* p_data);

  /*!
  Get the size of the data currently stored.
  \return Size (in bytes) of the data stored.
  */
  uint32_t getSize() const { return size_; }

 private:
  /*! Size of data being stored */
  uint32_t size_;
  /*! Pointer to the stored data */
  void* p_data_;
};
// Map from a 32-bit key to the VarData stored under it.
typedef std::map<uint32_t, VarData> VarDataMap;
}
#endif
+622
Просмотреть файл
@@ -0,0 +1,622 @@
#include "vi_blockinfo.h"
#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
namespace pm4_profile {
/**
* Table containing CounterGroups which represent VI hardware blocks
* as defined by \ref GpuBlockInfo structure.
*
* There is one row per named block instance (VI_CB0..VI_CB3,
* VI_TA0..VI_TA15, ...). The table ends with a sentinel row whose name is the
* empty string and whose id is kHsaViCounterBlockIdBlocksLast.
* NOTE(review): the meaning of each positional field comes from the
* GpuBlockInfo declaration, which is not visible in this file - confirm there
* before editing a row.
*/
GpuBlockInfo ViPmuHwBlocks[] = {
// Counter block CB
{"VI_CB0", kHsaViCounterBlockIdCb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
{"VI_CB1", kHsaViCounterBlockIdCb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
{"VI_CB2", kHsaViCounterBlockIdCb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
{"VI_CB3", kHsaViCounterBlockIdCb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
// Counter block CPF
{"VI_CPF", kHsaViCounterBlockIdCpf, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
VI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0},
// Counter block DB
{"VI_DB0", kHsaViCounterBlockIdDb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
{"VI_DB1", kHsaViCounterBlockIdDb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
{"VI_DB2", kHsaViCounterBlockIdDb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
{"VI_DB3", kHsaViCounterBlockIdDb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
// Counter block GRBM
{"VI_GRBM", kHsaViCounterBlockIdGrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33,
VI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0},
// Counter block GRBMSE
{"VI_GRBMSE", kHsaViCounterBlockIdGrbmSe, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14,
VI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0},
// Counter block PA_SU
{"VI_PA_SU", kHsaViCounterBlockIdPaSu, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152,
VI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0},
// Counter block PA_SC
{"VI_PA_SC", kHsaViCounterBlockIdPaSc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396,
VI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SPI
{"VI_SPI", kHsaViCounterBlockIdSpi, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196,
VI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SQ
{"VI_SQ", kHsaViCounterBlockIdSq, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_ES", kHsaViCounterBlockIdSqEs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_GS", kHsaViCounterBlockIdSqGs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_VS", kHsaViCounterBlockIdSqVs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_PS", kHsaViCounterBlockIdSqPs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_LS", kHsaViCounterBlockIdSqLs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_HS", kHsaViCounterBlockIdSqHs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
{"VI_SQ_CS", kHsaViCounterBlockIdSqCs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SX
{"VI_SX", kHsaViCounterBlockIdSx, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33,
VI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TA
{"VI_TA0", kHsaViCounterBlockIdTa0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA1", kHsaViCounterBlockIdTa1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA2", kHsaViCounterBlockIdTa2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA3", kHsaViCounterBlockIdTa3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA4", kHsaViCounterBlockIdTa4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA5", kHsaViCounterBlockIdTa5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA6", kHsaViCounterBlockIdTa6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA7", kHsaViCounterBlockIdTa7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA8", kHsaViCounterBlockIdTa8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA9", kHsaViCounterBlockIdTa9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA10", kHsaViCounterBlockIdTa10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA11", kHsaViCounterBlockIdTa11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA12", kHsaViCounterBlockIdTa12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA13", kHsaViCounterBlockIdTa13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA14", kHsaViCounterBlockIdTa14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TA15", kHsaViCounterBlockIdTa15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TCA
{"VI_TCA0", kHsaViCounterBlockIdTca0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA,
CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCA1", kHsaViCounterBlockIdTca1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA,
CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TCC
{"VI_TCC0", kHsaViCounterBlockIdTcc0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC1", kHsaViCounterBlockIdTcc1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC2", kHsaViCounterBlockIdTcc2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC3", kHsaViCounterBlockIdTcc3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC4", kHsaViCounterBlockIdTcc4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC5", kHsaViCounterBlockIdTcc5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC6", kHsaViCounterBlockIdTcc6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC7", kHsaViCounterBlockIdTcc7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC8", kHsaViCounterBlockIdTcc8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC9", kHsaViCounterBlockIdTcc9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC10", kHsaViCounterBlockIdTcc10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC11", kHsaViCounterBlockIdTcc11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC12", kHsaViCounterBlockIdTcc12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC13", kHsaViCounterBlockIdTcc13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC14", kHsaViCounterBlockIdTcc14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCC15", kHsaViCounterBlockIdTcc15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TD
{"VI_TD0", kHsaViCounterBlockIdTd0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD1", kHsaViCounterBlockIdTd1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD2", kHsaViCounterBlockIdTd2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD3", kHsaViCounterBlockIdTd3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD4", kHsaViCounterBlockIdTd4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD5", kHsaViCounterBlockIdTd5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD6", kHsaViCounterBlockIdTd6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD7", kHsaViCounterBlockIdTd7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD8", kHsaViCounterBlockIdTd8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD9", kHsaViCounterBlockIdTd9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD10", kHsaViCounterBlockIdTd10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD11", kHsaViCounterBlockIdTd11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD12", kHsaViCounterBlockIdTd12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD13", kHsaViCounterBlockIdTd13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD14", kHsaViCounterBlockIdTd14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TD15", kHsaViCounterBlockIdTd15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
// Counter block TCP
{"VI_TCP0", kHsaViCounterBlockIdTcp0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP1", kHsaViCounterBlockIdTcp1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP2", kHsaViCounterBlockIdTcp2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP3", kHsaViCounterBlockIdTcp3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP4", kHsaViCounterBlockIdTcp4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP5", kHsaViCounterBlockIdTcp5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP6", kHsaViCounterBlockIdTcp6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP7", kHsaViCounterBlockIdTcp7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP8", kHsaViCounterBlockIdTcp8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP9", kHsaViCounterBlockIdTcp9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP10", kHsaViCounterBlockIdTcp10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP11", kHsaViCounterBlockIdTcp11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP12", kHsaViCounterBlockIdTcp12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP13", kHsaViCounterBlockIdTcp13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP14", kHsaViCounterBlockIdTcp14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
{"VI_TCP15", kHsaViCounterBlockIdTcp15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
// Counter block GDS
{"VI_GDS", kHsaViCounterBlockIdGds, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120,
VI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
// Counter block VGT
{"VI_VGT", kHsaViCounterBlockIdVgt, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
VI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
// Counter block IA
{"VI_IA", kHsaViCounterBlockIdIa, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
VI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
// Counter block MC
{"VI_MC", kHsaViCounterBlockIdMc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
VI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
// Counter block SRBM
{"VI_SRBM", kHsaViCounterBlockIdSrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
VI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
// Counter block WD
{"VI_WD", kHsaViCounterBlockIdWd, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
VI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
// Counter block CPG
{"VI_CPG", kHsaViCounterBlockIdCpg, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
VI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
// Counter block CPC
{"VI_CPC", kHsaViCounterBlockIdCpc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
VI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
// Counter block IOMMUV2
{"VI_IOMMUV2", kHsaViCounterBlockIdIommuV2, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
8, 0, 0, true, 0, 0, false, 0, 0},
// Counter block KernelDriver
{"VI_KD", kHsaViCounterBlockIdKernelDriver, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
0, 0, 0, true, 0, 0, false, 0, 0},
// Name of the last line should be empty to indicate end of all counter groups
{"", kHsaViCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
0}};
/*
* The following tables contain the register addresses of the performance
* counter registers, one table per hardware block.
*/
/*
* SQ: one row per counter (PERFCOUNTER0..15). Each row lists the counter's
* SELECT register, the shared mmSQ_PERFCOUNTER_CTRL register, then the
* counter's LO and HI registers.
*/
GpuCounterRegInfo ViSqCounterRegAddr[] = {
{mmSQ_PERFCOUNTER0_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER0_LO__CI__VI,
mmSQ_PERFCOUNTER0_HI__CI__VI},
{mmSQ_PERFCOUNTER1_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER1_LO__CI__VI,
mmSQ_PERFCOUNTER1_HI__CI__VI},
{mmSQ_PERFCOUNTER2_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER2_LO__CI__VI,
mmSQ_PERFCOUNTER2_HI__CI__VI},
{mmSQ_PERFCOUNTER3_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER3_LO__CI__VI,
mmSQ_PERFCOUNTER3_HI__CI__VI},
{mmSQ_PERFCOUNTER4_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER4_LO__CI__VI,
mmSQ_PERFCOUNTER4_HI__CI__VI},
{mmSQ_PERFCOUNTER5_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER5_LO__CI__VI,
mmSQ_PERFCOUNTER5_HI__CI__VI},
{mmSQ_PERFCOUNTER6_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER6_LO__CI__VI,
mmSQ_PERFCOUNTER6_HI__CI__VI},
{mmSQ_PERFCOUNTER7_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER7_LO__CI__VI,
mmSQ_PERFCOUNTER7_HI__CI__VI},
{mmSQ_PERFCOUNTER8_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER8_LO__CI__VI,
mmSQ_PERFCOUNTER8_HI__CI__VI},
{mmSQ_PERFCOUNTER9_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER9_LO__CI__VI,
mmSQ_PERFCOUNTER9_HI__CI__VI},
{mmSQ_PERFCOUNTER10_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
mmSQ_PERFCOUNTER10_LO__CI__VI, mmSQ_PERFCOUNTER10_HI__CI__VI},
{mmSQ_PERFCOUNTER11_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
mmSQ_PERFCOUNTER11_LO__CI__VI, mmSQ_PERFCOUNTER11_HI__CI__VI},
{mmSQ_PERFCOUNTER12_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
mmSQ_PERFCOUNTER12_LO__CI__VI, mmSQ_PERFCOUNTER12_HI__CI__VI},
{mmSQ_PERFCOUNTER13_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
mmSQ_PERFCOUNTER13_LO__CI__VI, mmSQ_PERFCOUNTER13_HI__CI__VI},
{mmSQ_PERFCOUNTER14_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
mmSQ_PERFCOUNTER14_LO__CI__VI, mmSQ_PERFCOUNTER14_HI__CI__VI},
{mmSQ_PERFCOUNTER15_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
mmSQ_PERFCOUNTER15_LO__CI__VI, mmSQ_PERFCOUNTER15_HI__CI__VI}};
/*
* DRMDMA: the rows use the SDMA0 and SDMA1 registers, two rows per engine.
* Each row lists the engine's PERFMON_CNTL register, 0, one
* PERFCOUNTERn_RESULT register, and 0 in the last slot.
*/
GpuCounterRegInfo ViDrmdmaCounterRegAddr[] = {
{mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER0_RESULT__VI, 0},
{mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER1_RESULT__VI, 0},
{mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER0_RESULT__VI, 0},
{mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER1_RESULT__VI, 0},
};
/*
* IH: two rows sharing mmIH_PERFMON_CNTL. Each row lists that register, 0, one
* PERFCOUNTERn_RESULT register, and 0 in the last slot.
*/
GpuCounterRegInfo ViIhCounterRegAddr[] = {
{mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER0_RESULT__VI, 0},
{mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER1_RESULT__VI, 0}};
/*
* CPF: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViCpfCounterRegAddr[] = {
{mmCPF_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER0_LO__CI__VI,
mmCPF_PERFCOUNTER0_HI__CI__VI},
{mmCPF_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER1_LO__CI__VI,
mmCPF_PERFCOUNTER1_HI__CI__VI}};
/*
* DRM: two rows, numbered PERFCOUNTER1 and PERFCOUNTER2 (there is no
* PERFCOUNTER0 row). Each row lists the SELECT register, 0, then the LO and HI
* registers.
* NOTE(review): unlike the other tables in this file these macros carry no
* __VI / __CI__VI suffix - confirm that is intended.
*/
GpuCounterRegInfo ViDrmCounterRegAddr[] = {
{mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
{mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}};
/*
* GRBM: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViGrbmCounterRegAddr[] = {
{mmGRBM_PERFCOUNTER0_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER0_LO__CI__VI,
mmGRBM_PERFCOUNTER0_HI__CI__VI},
{mmGRBM_PERFCOUNTER1_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER1_LO__CI__VI,
mmGRBM_PERFCOUNTER1_HI__CI__VI}};
/*
* GRBM_SE: one row per SE0..SE3 register set. Each row lists that set's
* PERFCOUNTER_SELECT register, 0 in the second slot, then its LO and HI
* registers.
*/
GpuCounterRegInfo ViGrbmSeCounterRegAddr[] = {
{mmGRBM_SE0_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE0_PERFCOUNTER_LO__CI__VI,
mmGRBM_SE0_PERFCOUNTER_HI__CI__VI},
{mmGRBM_SE1_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE1_PERFCOUNTER_LO__CI__VI,
mmGRBM_SE1_PERFCOUNTER_HI__CI__VI},
{mmGRBM_SE2_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE2_PERFCOUNTER_LO__CI__VI,
mmGRBM_SE2_PERFCOUNTER_HI__CI__VI},
{mmGRBM_SE3_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE3_PERFCOUNTER_LO__CI__VI,
mmGRBM_SE3_PERFCOUNTER_HI__CI__VI}};
/*
* PA_SU: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViPaSuCounterRegAddr[] = {
{mmPA_SU_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER0_LO__CI__VI,
mmPA_SU_PERFCOUNTER0_HI__CI__VI},
{mmPA_SU_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER1_LO__CI__VI,
mmPA_SU_PERFCOUNTER1_HI__CI__VI},
{mmPA_SU_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER2_LO__CI__VI,
mmPA_SU_PERFCOUNTER2_HI__CI__VI},
{mmPA_SU_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER3_LO__CI__VI,
mmPA_SU_PERFCOUNTER3_HI__CI__VI}};
/*
* PA_SC: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
* NOTE(review): ViScCounterRegAddr in this file lists the same PA_SC
* registers for PERFCOUNTER0..7, so these four rows are duplicated there -
* confirm which of the two tables the PA_SC block is meant to use.
*/
GpuCounterRegInfo ViPaScCounterRegAddr[] = {
{mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
mmPA_SC_PERFCOUNTER0_HI__CI__VI},
{mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
mmPA_SC_PERFCOUNTER1_HI__CI__VI},
{mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
mmPA_SC_PERFCOUNTER2_HI__CI__VI},
{mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
mmPA_SC_PERFCOUNTER3_HI__CI__VI}};
/*
* SPI: one row per counter (PERFCOUNTER0..5). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViSpiCounterRegAddr[] = {
{mmSPI_PERFCOUNTER0_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER0_LO__CI__VI,
mmSPI_PERFCOUNTER0_HI__CI__VI},
{mmSPI_PERFCOUNTER1_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER1_LO__CI__VI,
mmSPI_PERFCOUNTER1_HI__CI__VI},
{mmSPI_PERFCOUNTER2_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER2_LO__CI__VI,
mmSPI_PERFCOUNTER2_HI__CI__VI},
{mmSPI_PERFCOUNTER3_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER3_LO__CI__VI,
mmSPI_PERFCOUNTER3_HI__CI__VI},
{mmSPI_PERFCOUNTER4_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER4_LO__CI__VI,
mmSPI_PERFCOUNTER4_HI__CI__VI},
{mmSPI_PERFCOUNTER5_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER5_LO__CI__VI,
mmSPI_PERFCOUNTER5_HI__CI__VI}};
/*
* TCA: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViTcaCounterRegAddr[] = {
{mmTCA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER0_LO__CI__VI,
mmTCA_PERFCOUNTER0_HI__CI__VI},
{mmTCA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER1_LO__CI__VI,
mmTCA_PERFCOUNTER1_HI__CI__VI},
{mmTCA_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER2_LO__CI__VI,
mmTCA_PERFCOUNTER2_HI__CI__VI},
{mmTCA_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER3_LO__CI__VI,
mmTCA_PERFCOUNTER3_HI__CI__VI}};
/*
* TCC: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViTccCounterRegAddr[] = {
{mmTCC_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER0_LO__CI__VI,
mmTCC_PERFCOUNTER0_HI__CI__VI},
{mmTCC_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER1_LO__CI__VI,
mmTCC_PERFCOUNTER1_HI__CI__VI},
{mmTCC_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER2_LO__CI__VI,
mmTCC_PERFCOUNTER2_HI__CI__VI},
{mmTCC_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER3_LO__CI__VI,
mmTCC_PERFCOUNTER3_HI__CI__VI}};
/*
* TCP: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViTcpCounterRegAddr[] = {
{mmTCP_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER0_LO__CI__VI,
mmTCP_PERFCOUNTER0_HI__CI__VI},
{mmTCP_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER1_LO__CI__VI,
mmTCP_PERFCOUNTER1_HI__CI__VI},
{mmTCP_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER2_LO__CI__VI,
mmTCP_PERFCOUNTER2_HI__CI__VI},
{mmTCP_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER3_LO__CI__VI,
mmTCP_PERFCOUNTER3_HI__CI__VI}};
/*
* CB: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViCbCounterRegAddr[] = {
{mmCB_PERFCOUNTER0_SELECT__CI__VI, 0, mmCB_PERFCOUNTER0_LO__CI__VI,
mmCB_PERFCOUNTER0_HI__CI__VI},
{mmCB_PERFCOUNTER1_SELECT__CI__VI, 0, mmCB_PERFCOUNTER1_LO__CI__VI,
mmCB_PERFCOUNTER1_HI__CI__VI},
{mmCB_PERFCOUNTER2_SELECT__CI__VI, 0, mmCB_PERFCOUNTER2_LO__CI__VI,
mmCB_PERFCOUNTER2_HI__CI__VI},
{mmCB_PERFCOUNTER3_SELECT__CI__VI, 0, mmCB_PERFCOUNTER3_LO__CI__VI,
mmCB_PERFCOUNTER3_HI__CI__VI}};
/*
* DB: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViDbCounterRegAddr[] = {
{mmDB_PERFCOUNTER0_SELECT__CI__VI, 0, mmDB_PERFCOUNTER0_LO__CI__VI,
mmDB_PERFCOUNTER0_HI__CI__VI},
{mmDB_PERFCOUNTER1_SELECT__CI__VI, 0, mmDB_PERFCOUNTER1_LO__CI__VI,
mmDB_PERFCOUNTER1_HI__CI__VI},
{mmDB_PERFCOUNTER2_SELECT__CI__VI, 0, mmDB_PERFCOUNTER2_LO__CI__VI,
mmDB_PERFCOUNTER2_HI__CI__VI},
{mmDB_PERFCOUNTER3_SELECT__CI__VI, 0, mmDB_PERFCOUNTER3_LO__CI__VI,
mmDB_PERFCOUNTER3_HI__CI__VI}};
/*
* RLC: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViRlcCounterRegAddr[] = {
{mmRLC_PERFCOUNTER0_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER0_LO__CI__VI,
mmRLC_PERFCOUNTER0_HI__CI__VI},
{mmRLC_PERFCOUNTER1_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER1_LO__CI__VI,
mmRLC_PERFCOUNTER1_HI__CI__VI}};
/*
* SC: one row per PA_SC counter (PERFCOUNTER0..7). Each row lists the
* counter's SELECT register, 0 in the second slot, then its LO and HI
* registers.
* NOTE(review): the first four rows are identical to ViPaScCounterRegAddr in
* this file - confirm which of the two tables the PA_SC block is meant to use.
*/
GpuCounterRegInfo ViScCounterRegAddr[] = {
{mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
mmPA_SC_PERFCOUNTER0_HI__CI__VI},
{mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
mmPA_SC_PERFCOUNTER1_HI__CI__VI},
{mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
mmPA_SC_PERFCOUNTER2_HI__CI__VI},
{mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
mmPA_SC_PERFCOUNTER3_HI__CI__VI},
{mmPA_SC_PERFCOUNTER4_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER4_LO__CI__VI,
mmPA_SC_PERFCOUNTER4_HI__CI__VI},
{mmPA_SC_PERFCOUNTER5_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER5_LO__CI__VI,
mmPA_SC_PERFCOUNTER5_HI__CI__VI},
{mmPA_SC_PERFCOUNTER6_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER6_LO__CI__VI,
mmPA_SC_PERFCOUNTER6_HI__CI__VI},
{mmPA_SC_PERFCOUNTER7_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER7_LO__CI__VI,
mmPA_SC_PERFCOUNTER7_HI__CI__VI}};
/*
* SX: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViSxCounterRegAddr[] = {
{mmSX_PERFCOUNTER0_SELECT__CI__VI, 0, mmSX_PERFCOUNTER0_LO__CI__VI,
mmSX_PERFCOUNTER0_HI__CI__VI},
{mmSX_PERFCOUNTER1_SELECT__CI__VI, 0, mmSX_PERFCOUNTER1_LO__CI__VI,
mmSX_PERFCOUNTER1_HI__CI__VI},
{mmSX_PERFCOUNTER2_SELECT__CI__VI, 0, mmSX_PERFCOUNTER2_LO__CI__VI,
mmSX_PERFCOUNTER2_HI__CI__VI},
{mmSX_PERFCOUNTER3_SELECT__CI__VI, 0, mmSX_PERFCOUNTER3_LO__CI__VI,
mmSX_PERFCOUNTER3_HI__CI__VI}};
/*
* TA: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViTaCounterRegAddr[] = {
{mmTA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTA_PERFCOUNTER0_LO__CI__VI,
mmTA_PERFCOUNTER0_HI__CI__VI},
{mmTA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTA_PERFCOUNTER1_LO__CI__VI,
mmTA_PERFCOUNTER1_HI__CI__VI}};
/*
* TD: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViTdCounterRegAddr[] = {
{mmTD_PERFCOUNTER0_SELECT__CI__VI, 0, mmTD_PERFCOUNTER0_LO__CI__VI,
mmTD_PERFCOUNTER0_HI__CI__VI},
{mmTD_PERFCOUNTER1_SELECT__CI__VI, 0, mmTD_PERFCOUNTER1_LO__CI__VI,
mmTD_PERFCOUNTER1_HI__CI__VI}};
/*
* GDS: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViGdsCounterRegAddr[] = {
{mmGDS_PERFCOUNTER0_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER0_LO__CI__VI,
mmGDS_PERFCOUNTER0_HI__CI__VI},
{mmGDS_PERFCOUNTER1_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER1_LO__CI__VI,
mmGDS_PERFCOUNTER1_HI__CI__VI},
{mmGDS_PERFCOUNTER2_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER2_LO__CI__VI,
mmGDS_PERFCOUNTER2_HI__CI__VI},
{mmGDS_PERFCOUNTER3_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER3_LO__CI__VI,
mmGDS_PERFCOUNTER3_HI__CI__VI}};
/*
* VGT: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViVgtCounterRegAddr[] = {
{mmVGT_PERFCOUNTER0_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER0_LO__CI__VI,
mmVGT_PERFCOUNTER0_HI__CI__VI},
{mmVGT_PERFCOUNTER1_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER1_LO__CI__VI,
mmVGT_PERFCOUNTER1_HI__CI__VI},
{mmVGT_PERFCOUNTER2_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER2_LO__CI__VI,
mmVGT_PERFCOUNTER2_HI__CI__VI},
{mmVGT_PERFCOUNTER3_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER3_LO__CI__VI,
mmVGT_PERFCOUNTER3_HI__CI__VI}};
/*
* IA: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViIaCounterRegAddr[] = {
{mmIA_PERFCOUNTER0_SELECT__CI__VI, 0, mmIA_PERFCOUNTER0_LO__CI__VI,
mmIA_PERFCOUNTER0_HI__CI__VI},
{mmIA_PERFCOUNTER1_SELECT__CI__VI, 0, mmIA_PERFCOUNTER1_LO__CI__VI,
mmIA_PERFCOUNTER1_HI__CI__VI},
{mmIA_PERFCOUNTER2_SELECT__CI__VI, 0, mmIA_PERFCOUNTER2_LO__CI__VI,
mmIA_PERFCOUNTER2_HI__CI__VI},
{mmIA_PERFCOUNTER3_SELECT__CI__VI, 0, mmIA_PERFCOUNTER3_LO__CI__VI,
mmIA_PERFCOUNTER3_HI__CI__VI}};
/*
* MC: four rows (CNT_A..CNT_D). Every row puts the same
* mmMC_SEQ_PERF_SEQ_CTL register in the first slot and 0 in the second; the
* last two slots hold that counter's _I0 and _I1 registers.
*/
GpuCounterRegInfo ViMcCounterRegAddr[] = {
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}};
/*
* SRBM: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViSrbmCounterRegAddr[] = {
{mmSRBM_PERFCOUNTER0_SELECT__VI, 0, mmSRBM_PERFCOUNTER0_LO__VI, mmSRBM_PERFCOUNTER0_HI__VI},
{mmSRBM_PERFCOUNTER1_SELECT__VI, 0, mmSRBM_PERFCOUNTER1_LO__VI, mmSRBM_PERFCOUNTER1_HI__VI}};
/*
* WD: one row per counter (PERFCOUNTER0..3). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViWdCounterRegAddr[] = {
{mmWD_PERFCOUNTER0_SELECT__CI__VI, 0, mmWD_PERFCOUNTER0_LO__CI__VI,
mmWD_PERFCOUNTER0_HI__CI__VI},
{mmWD_PERFCOUNTER1_SELECT__CI__VI, 0, mmWD_PERFCOUNTER1_LO__CI__VI,
mmWD_PERFCOUNTER1_HI__CI__VI},
{mmWD_PERFCOUNTER2_SELECT__CI__VI, 0, mmWD_PERFCOUNTER2_LO__CI__VI,
mmWD_PERFCOUNTER2_HI__CI__VI},
{mmWD_PERFCOUNTER3_SELECT__CI__VI, 0, mmWD_PERFCOUNTER3_LO__CI__VI,
mmWD_PERFCOUNTER3_HI__CI__VI}};
/*
* CPG: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViCpgCounterRegAddr[] = {
{mmCPG_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER0_LO__CI__VI,
mmCPG_PERFCOUNTER0_HI__CI__VI},
{mmCPG_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER1_LO__CI__VI,
mmCPG_PERFCOUNTER1_HI__CI__VI}};
/*
* CPC: one row per counter (PERFCOUNTER0..1). Each row lists the counter's
* SELECT register, 0 in the second slot, then its LO and HI registers.
*/
GpuCounterRegInfo ViCpcCounterRegAddr[] = {
{mmCPC_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER0_LO__CI__VI,
mmCPC_PERFCOUNTER0_HI__CI__VI},
{mmCPC_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER1_LO__CI__VI,
mmCPC_PERFCOUNTER1_HI__CI__VI}};
// Private counter-block identifiers for the SQ, MC, IOMMUv2 and kernel-driver
// blocks. Each one is initialized from four hexadecimal words.
// NOTE(review): the layout of GpuPrivCounterBlockId is declared outside this
// file - confirm how the four words are interpreted before changing them.
GpuPrivCounterBlockId ViBlockIdSq = {{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}};
GpuPrivCounterBlockId ViBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}};
GpuPrivCounterBlockId ViBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}};
GpuPrivCounterBlockId ViBlockIdKernelDriver = {{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}};
} // pm4_profile
+230
Просмотреть файл
@@ -0,0 +1,230 @@
#ifndef _VI_BLOCKINFO_H_
#define _VI_BLOCKINFO_H_
#include <stdint.h>
#include "rocr_profiler.h"
#include "gpu_enum.h"
#include "gpu_blockinfo.h"
namespace pm4_profile {
// Maximum number of block instances for Volcanic Islands (taken from Fiji).
// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
// @brief Number of block instances.
// CB and DB are indexed per shader engine (SE) and instance.
#define VI_NUM_CB 4 // CB has 4 instances per SE
#define VI_NUM_DB 4 // DB has 4 instances per SE
// For TA, TD and TCP, the values below are the same as the number of CUs
// per SH. They are indexed per SE and instance.
// NOTE(review): the original trailing comments said "11 instances" while the
// macros are 16; 16 agrees with the Ta0..Ta15 / Td0..Td15 / Tcp0..Tcp15
// enumerators of HsaViCounterBlockId, so the comments were stale.
#define VI_NUM_TA 16 // TA has 16 instances
#define VI_NUM_TD 16 // TD has 16 instances
#define VI_NUM_TCP 16 // TCP has 16 instances
// These values are per chip; the blocks are indexed directly per instance.
#define VI_NUM_TCA 2 // TCA has 2 instances per chip
#define VI_NUM_TCC 16 // TCC has 16 instances per chip
#define VI_NUM_SDMA 2 // There are two SDMA blocks on VI, exposed as 2
// instances here
// Number of counter registers per block for Volcanic Islands.
// Each value is the number of hardware counters one instance of the block
// exposes; for the blocks whose register tables are visible in
// vi_blockinfo.cpp (SRBM, WD, CPG, CPC) the value equals the table row count.
#define VI_COUNTER_NUM_PER_DRM 2
#define VI_COUNTER_NUM_PER_DRMDMA 2
#define VI_COUNTER_NUM_PER_IH 2
#define VI_COUNTER_NUM_PER_SRBM 2
#define VI_COUNTER_NUM_PER_CB 4
#define VI_COUNTER_NUM_PER_CPF 2
#define VI_COUNTER_NUM_PER_DB 4
#define VI_COUNTER_NUM_PER_GRBM 2
#define VI_COUNTER_NUM_PER_GRBMSE 4
#define VI_COUNTER_NUM_PER_PA_SU 4
#define VI_COUNTER_NUM_PER_RLC 2
#define VI_COUNTER_NUM_PER_PA_SC 8
#define VI_COUNTER_NUM_PER_SPI 6 // TODO(Shucai): double check the value
#define VI_COUNTER_NUM_PER_SQ 16
#define VI_COUNTER_NUM_PER_SX 4
#define VI_COUNTER_NUM_PER_TA 2
#define VI_COUNTER_NUM_PER_TCA 4
#define VI_COUNTER_NUM_PER_TCC 4
#define VI_COUNTER_NUM_PER_TD 2 // TODO(Shucai): double check the value
#define VI_COUNTER_NUM_PER_TCP 4
#define VI_COUNTER_NUM_PER_GDS 4
#define VI_COUNTER_NUM_PER_VGT 4
#define VI_COUNTER_NUM_PER_IA 4
#define VI_COUNTER_NUM_PER_MC 4
// NOTE(review): a comment next to the extern declarations in this header says
// there is no TCS counter block on VI; confirm whether this macro is still used.
#define VI_COUNTER_NUM_PER_TCS 4
#define VI_COUNTER_NUM_PER_WD 4
#define VI_COUNTER_NUM_PER_CPG 2
#define VI_COUNTER_NUM_PER_CPC 2
#define VI_COUNTER_NUM_PER_VM 1
#define VI_COUNTER_NUM_PER_VM_MD 1
#define VI_COUNTER_NUM_PER_PIPESTATS 12
// NOTE(review): Gfx8ThreadTrace hard-codes four shader engines (numSE_ = 4)
// while this limit is 1 - confirm which value the VI PMU code should use.
#define VI_MAX_NUM_SHADER_ENGINES 1
// Enumeration of VI hardware counter blocks.
// Enumerators are numbered consecutively starting at 0, so the declaration
// order below defines each block's numeric id; insert new blocks with care.
// Multi-instance blocks get one enumerator per instance (e.g. Ta0..Ta15).
typedef enum HsaViCounterBlockId {
// CB instances (VI_NUM_CB = 4)
kHsaViCounterBlockIdCb0 = 0,
kHsaViCounterBlockIdCb1,
kHsaViCounterBlockIdCb2,
kHsaViCounterBlockIdCb3,
kHsaViCounterBlockIdCpf,
// DB instances (VI_NUM_DB = 4)
kHsaViCounterBlockIdDb0,
kHsaViCounterBlockIdDb1,
kHsaViCounterBlockIdDb2,
kHsaViCounterBlockIdDb3,
kHsaViCounterBlockIdGrbm,
kHsaViCounterBlockIdGrbmSe,
kHsaViCounterBlockIdPaSu,
kHsaViCounterBlockIdPaSc,
kHsaViCounterBlockIdSpi,
// SQ and its per-shader-stage variants
kHsaViCounterBlockIdSq,
kHsaViCounterBlockIdSqEs,
kHsaViCounterBlockIdSqGs,
kHsaViCounterBlockIdSqVs,
kHsaViCounterBlockIdSqPs,
kHsaViCounterBlockIdSqLs,
kHsaViCounterBlockIdSqHs,
kHsaViCounterBlockIdSqCs,
kHsaViCounterBlockIdSx,
// TA instances (VI_NUM_TA = 16)
kHsaViCounterBlockIdTa0,
kHsaViCounterBlockIdTa1,
kHsaViCounterBlockIdTa2,
kHsaViCounterBlockIdTa3,
kHsaViCounterBlockIdTa4,
kHsaViCounterBlockIdTa5,
kHsaViCounterBlockIdTa6,
kHsaViCounterBlockIdTa7,
kHsaViCounterBlockIdTa8,
kHsaViCounterBlockIdTa9,
kHsaViCounterBlockIdTa10,
kHsaViCounterBlockIdTa11,
kHsaViCounterBlockIdTa12,
kHsaViCounterBlockIdTa13,
kHsaViCounterBlockIdTa14,
kHsaViCounterBlockIdTa15,
// TCA instances (VI_NUM_TCA = 2)
kHsaViCounterBlockIdTca0,
kHsaViCounterBlockIdTca1,
// TCC instances (VI_NUM_TCC = 16)
kHsaViCounterBlockIdTcc0,
kHsaViCounterBlockIdTcc1,
kHsaViCounterBlockIdTcc2,
kHsaViCounterBlockIdTcc3,
kHsaViCounterBlockIdTcc4,
kHsaViCounterBlockIdTcc5,
kHsaViCounterBlockIdTcc6,
kHsaViCounterBlockIdTcc7,
kHsaViCounterBlockIdTcc8,
kHsaViCounterBlockIdTcc9,
kHsaViCounterBlockIdTcc10,
kHsaViCounterBlockIdTcc11,
kHsaViCounterBlockIdTcc12,
kHsaViCounterBlockIdTcc13,
kHsaViCounterBlockIdTcc14,
kHsaViCounterBlockIdTcc15,
// TD instances (VI_NUM_TD = 16)
kHsaViCounterBlockIdTd0,
kHsaViCounterBlockIdTd1,
kHsaViCounterBlockIdTd2,
kHsaViCounterBlockIdTd3,
kHsaViCounterBlockIdTd4,
kHsaViCounterBlockIdTd5,
kHsaViCounterBlockIdTd6,
kHsaViCounterBlockIdTd7,
kHsaViCounterBlockIdTd8,
kHsaViCounterBlockIdTd9,
kHsaViCounterBlockIdTd10,
kHsaViCounterBlockIdTd11,
kHsaViCounterBlockIdTd12,
kHsaViCounterBlockIdTd13,
kHsaViCounterBlockIdTd14,
kHsaViCounterBlockIdTd15,
// TCP instances (VI_NUM_TCP = 16)
kHsaViCounterBlockIdTcp0,
kHsaViCounterBlockIdTcp1,
kHsaViCounterBlockIdTcp2,
kHsaViCounterBlockIdTcp3,
kHsaViCounterBlockIdTcp4,
kHsaViCounterBlockIdTcp5,
kHsaViCounterBlockIdTcp6,
kHsaViCounterBlockIdTcp7,
kHsaViCounterBlockIdTcp8,
kHsaViCounterBlockIdTcp9,
kHsaViCounterBlockIdTcp10,
kHsaViCounterBlockIdTcp11,
kHsaViCounterBlockIdTcp12,
kHsaViCounterBlockIdTcp13,
kHsaViCounterBlockIdTcp14,
kHsaViCounterBlockIdTcp15,
// Single-instance blocks
kHsaViCounterBlockIdGds,
kHsaViCounterBlockIdVgt,
kHsaViCounterBlockIdIa,
kHsaViCounterBlockIdMc,
kHsaViCounterBlockIdSrbm,
kHsaViCounterBlockIdTcs,
kHsaViCounterBlockIdWd,
kHsaViCounterBlockIdCpg,
kHsaViCounterBlockIdCpc,
// Counters retrieved by KFD
kHsaViCounterBlockIdIommuV2,
kHsaViCounterBlockIdKernelDriver,
kHsaViCounterBlockIdCpPipeStats,
kHsaViCounterBlockIdHwInfo,
// Inclusive bounds of the valid id range, for iteration and range checks
kHsaViCounterBlockIdBlocksFirst = kHsaViCounterBlockIdCb0,
kHsaViCounterBlockIdBlocksLast = kHsaViCounterBlockIdHwInfo
} HsaViCounterBlockId;
// Table describing every VI hardware block exposed by the PMU.
extern GpuBlockInfo ViPmuHwBlocks[];

// Per-block counter register tables. Each array holds one GpuCounterRegInfo
// entry per hardware counter of the block.
// NOTE(review): only the SRBM, WD, CPG and CPC definitions are visible next to
// this header; confirm that every array declared here (e.g. ViScCounterRegAddr
// alongside ViPaScCounterRegAddr) still has a definition.
extern GpuCounterRegInfo ViSqCounterRegAddr[];
extern GpuCounterRegInfo ViCbCounterRegAddr[];
extern GpuCounterRegInfo ViDrmdmaCounterRegAddr[];
extern GpuCounterRegInfo ViIhCounterRegAddr[];
extern GpuCounterRegInfo ViCpfCounterRegAddr[];
extern GpuCounterRegInfo ViCpgCounterRegAddr[];
extern GpuCounterRegInfo ViCpcCounterRegAddr[];
extern GpuCounterRegInfo ViDrmCounterRegAddr[];
extern GpuCounterRegInfo ViGrbmCounterRegAddr[];
extern GpuCounterRegInfo ViGrbmSeCounterRegAddr[];
extern GpuCounterRegInfo ViPaSuCounterRegAddr[];
extern GpuCounterRegInfo ViPaScCounterRegAddr[];
extern GpuCounterRegInfo ViSpiCounterRegAddr[];
extern GpuCounterRegInfo ViTcaCounterRegAddr[];
extern GpuCounterRegInfo ViTccCounterRegAddr[];
extern GpuCounterRegInfo ViTcpCounterRegAddr[];
extern GpuCounterRegInfo ViDbCounterRegAddr[];
extern GpuCounterRegInfo ViRlcCounterRegAddr[];
extern GpuCounterRegInfo ViScCounterRegAddr[];
extern GpuCounterRegInfo ViSxCounterRegAddr[];
extern GpuCounterRegInfo ViTaCounterRegAddr[];
extern GpuCounterRegInfo ViTdCounterRegAddr[];
extern GpuCounterRegInfo ViGdsCounterRegAddr[];
extern GpuCounterRegInfo ViVgtCounterRegAddr[];
extern GpuCounterRegInfo ViIaCounterRegAddr[];
extern GpuCounterRegInfo ViMcCounterRegAddr[];
extern GpuCounterRegInfo ViSrbmCounterRegAddr[];
// No Tcs Counter block on VI
// extern GpuCounterRegInfo ViTcsCounterRegAddr[];
extern GpuCounterRegInfo ViWdCounterRegAddr[];
// ViCpgCounterRegAddr and ViCpcCounterRegAddr are declared once, above; the
// second, redundant pair of declarations that used to follow here was removed.

// Private counter block identifiers (defined in vi_blockinfo.cpp).
extern GpuPrivCounterBlockId ViBlockIdSq;
extern GpuPrivCounterBlockId ViBlockIdMc;
extern GpuPrivCounterBlockId ViBlockIdIommuV2;
extern GpuPrivCounterBlockId ViBlockIdKernelDriver;
}
#endif
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+141
Просмотреть файл
@@ -0,0 +1,141 @@
#ifndef _VI_PMU_H_
#define _VI_PMU_H_
#include "hsa.h"
#include "cmdwriter.h"
#include "hsa_perf.h"
#include "info_set.h"
#include "parameter_set.h"
#include "vi_blockinfo.h"
#include "rocr_profiler.h"
#include <stdlib.h>
#include <stdint.h>
#include <map>
namespace pm4_profile {
// Associates each HsaViCounterBlockId with a pointer to its CounterBlock.
// The map stores raw pointers; ownership of the pointed-to blocks is not
// expressed by this type.
typedef std::map<HsaViCounterBlockId, pm4_profile::CounterBlock*> ViCounterBlockMap;
// This class implements the VI PMU. It is responsible for setting up
// CounterGroups to represent each VI hardware block which exposes performance
// counters.
// NOTE(review): this header uses std::string (getErrorString) but does not
// include <string> directly; it currently relies on another header pulling it in.
class ViPmu : public pm4_profile::Pmu {
public:
ViPmu();
~ViPmu();
// Returns the number of shader engines, for blocks that are instanced
// per shader engine
uint32_t getNumSe() { return num_se_; }
// Sets the buffer (address and size) used to collect PMC data.
// NOTE(review): the size unit (presumably bytes) is not visible here - confirm.
bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz);
// Returns the most recently recorded error code
int getLastError();
// Returns a textual description for the given error code
std::string getErrorString(int error);
// Builds into cmdBuff, via cmdWriter, the commands that start counter
// collection. 'reset' defaults to true.
// NOTE(review): presumably 'reset' selects whether counters are reset first
// (see ResetCounterBlocks) - confirm in the implementation.
virtual bool begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter,
bool reset = true);
// Builds into cmdBuff, via cmdWriter, the commands that stop counter collection
virtual bool end(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
// IPMU inherits IParameterSet and IInfoSet, so we implement them
// through composition and function forwarding (see parameter_set_ and
// info_set_ below)
bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
// Looks up a counter block by id.
// NOTE(review): presumably 'id' is an HsaViCounterBlockId value - confirm.
pm4_profile::CounterBlock* getCounterBlockById(uint32_t id);
// Returns the current profiling state
rocr_pmu_state_t getCurrentState() { return profiler_state_; }
// Returns the list of counter blocks and reports its length via num_groups
pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups);
private:
// Addr of Counter Data Buffer
uint32_t* pmcData_;
// Size of Counter Data Buffer
uint32_t pmcDataSz_;
void Init();
bool initCounterBlock();
bool isResultReady();
// Clear CounterBlockMap
void clearCounterBlockMap();
// Reset SQ and CB counters
void ResetCounterBlocks(pm4_profile::DefaultCmdBuf* cmdBuff,
pm4_profile::CommandWriter* cmdWriter);
// Program SQ block related counters
uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TA block related counters
uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TCA block related counters
uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TCC block related counters
uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TCP block related counters
uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Program TD block related counters
uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
uint32_t blkCntrIdx);
// Build counter selection register, return how many registers are built
uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal,
uint32_t blkId, pm4_profile::Counter* blkCntr);
// NOTE(review): the original comment here was a copy of the one above.
// Judging by the name, this builds the registers used to read counters
// back and returns how many were built - confirm in the implementation.
uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr,
uint32_t* reg_val);
private:
// Delete counter blocks in the PMU
hsa_status_t RemoveCounterBlocks();
private:
// This contains the available counter groups.
ViCounterBlockMap blk_map_;
// This stores the current profiling state.
rocr_pmu_state_t profiler_state_;
// Objects that the getParameter/setParameter and getInfo calls are
// forwarded to (see the composition note above)
pm4_profile::ParameterSet* parameter_set_;
pm4_profile::InfoSet* info_set_;
// Last error code
int error_code_;
// A flag to indicate the current packet is for copy register value.
// NOTE(review): these two macros are defined inside the class body but, being
// preprocessor macros, are visible to every file that includes this header.
#define COPY_DATA_FLAG 0xFFFFFFFF
#define MAX_REG_NUM 100
// Pointer used to store counter block list internally
uint32_t blk_list_size_;
pm4_profile::CounterBlock** blk_list_;
// Indicates the number of Shader Engines Present
uint32_t num_se_;
// Used to reset GRBM to its default state
uint32_t reset_grbm_;
};
}
#endif
+18
Просмотреть файл
@@ -0,0 +1,18 @@
#
# Source files for Rocr ThreadTrace.
# LIB_SRC is still set (as a single list) so that anything else in this
# directory scope that reads it keeps seeing the same value.
#
set ( LIB_SRC
  thread_trace.cpp
  gfx8_thread_trace.cpp
  gfx9_thread_trace.cpp
)

#
# Build ThreadTrace as a Static Library object.
# SQTT_LIB must be set by the including project before this file is processed.
#
add_library ( ${SQTT_LIB} STATIC ${LIB_SRC} )

#
# Header files include path(s), attached to the target instead of the
# directory so they cannot leak into unrelated targets. PRIVATE matches the
# old include_directories() behaviour: the paths are used to compile this
# library only and are not propagated to its consumers.
#
# ROCR_INC_DIR is read from the environment at configure time and
# HSA_RUNTIME_OSC_DIR is expected from the parent scope; both are left
# unquoted so that an unset value simply contributes no path.
#
target_include_directories ( ${SQTT_LIB}
  PRIVATE
    $ENV{ROCR_INC_DIR}
    ${PROJ_DIR}/commandwriter
    ${HSA_RUNTIME_OSC_DIR}
)
+360
Просмотреть файл
@@ -0,0 +1,360 @@
#include <iostream>
#include <fstream>
#include <iomanip>
#include <random>
#include <memory>
#include "core/util/os.h"
#include "gfx8_thread_trace.h"
/// @brief Extracts bits [31:0] of a 64-bit value.
/// @param u 64-bit input value
/// @return the least significant 32 bits of @p u
inline uint32_t Low32(uint64_t u) {
  // Narrowing to uint32_t keeps exactly the low 32 bits (modulo 2^32).
  return static_cast<uint32_t>(u);
}
/// @brief Extracts bits [63:32] of a 64-bit value.
/// @param u 64-bit input value
/// @return the most significant 32 bits of @p u
inline uint32_t High32(uint64_t u) {
  const uint64_t upperHalf = u >> 32;
  return static_cast<uint32_t>(upperHalf);
}
namespace pm4_profile {
// Constructs a GFX8 thread trace helper in a well-defined idle state.
//
// Previously only numSE_ was set, leaving ttStatus_, ttBuffSize_ and the
// register shadow copies holding indeterminate values until the matching
// setter / Init() was called. They are now given deterministic defaults.
// Initialisers follow the member declaration order in gfx8_thread_trace.h.
Gfx8ThreadTrace::Gfx8ThreadTrace()
    : numSE_(4),        // number of shader engines programmed by this class
      ttStatus_(NULL),  // no control buffer until setSqttCtrlBuff() is called
      ttBuffSize_(0),   // no data buffer until setSqttDataBuff() is called
      ttCfgRegs_() {}   // zero all register shadow copies

// Nothing to release: the data and control buffers are supplied by the caller.
Gfx8ThreadTrace::~Gfx8ThreadTrace() {}
// Initialises the thread trace object from a user configuration.
//
// Runs the base-class initialisation first; only when that succeeds are the
// GFX8 register shadow copies populated with their defaults.
//
// @param config thread trace configuration forwarded to ThreadTrace::Init
// @return true on success, false if the base-class initialisation failed
bool Gfx8ThreadTrace::Init(const ThreadTraceConfig* config) {
  if (ThreadTrace::Init(config)) {
    InitThreadTraceCfgRegs();
    return true;
  }
  return false;
}
void Gfx8ThreadTrace::InitThreadTraceCfgRegs() {
// Indicates the size of buffer to use per Shader Engine instance.
// The size is specified in terms of 4KB blocks
ttCfgRegs_.ttRegSize.u32All = 0;
// Indicates various attributes of a thread trace session.
//
// MASK_CS: Which shader types should be enabled for data collection
// Enable CS Shader types.
//
// WRAP: How trace buffer should be used as a ring buffer or as a linear
// buffer - Disable WRAP mode i.e use it as a linear buffer
//
// MODE: Enables a thread trace session
//
// CAPTURE_MODE: When thread trace data is collected immediately after MODE
// is enabled or wait until a Thread Trace Start event is received
//
// AUTOFLUSH_EN: Flush thread trace data to buffer often automatically
//
ttCfgRegs_.ttRegMode.u32All = 0;
ttCfgRegs_.ttRegMode.bits.WRAP = 0;
ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0;
ttCfgRegs_.ttRegMode.bits.MASK_CS = 1;
ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1;
ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
// Enable Thread Trace for all VM Id's
// Enable all of the SIMD's of the compute unit
// Enable Compute Unit (CU) at index Zero to be used for fine-grained data
// Enable Shader Array (SH) at index Zero to be used for fine-grained data
//
// @note: Not enabling REG_STALL_EN, SPI_STALL_EN and SQ_STALL_EN bits. They
// are useful if we wish to program buffer throttling.
//
ttCfgRegs_.ttRegMask.u32All = 0;
ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0;
ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF;
ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId();
ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI = 0x1;
ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI = 0x1;
ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI = 0x1;
ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId();
// Override Mask value if a user value is available
uint32_t ttMask = SetMask();
if (ttMask) {
ttCfgRegs_.ttRegMask.u32All = ttMask;
}
// Mask of compute units to get thread trace data from
ttCfgRegs_.ttRegPerfMask.u32All = 0;
ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF;
ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF;
// Indicate the different TT messages/tokens that should be enabled/logged
// Indicate the different TT tokens that specify register operations to be logged
ttCfgRegs_.ttRegTokenMask.u32All = 0;
ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF;
ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF;
ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI = 0x1;
// Override TokenMask1 value if a user value is available
uint32_t tokenMask1 = SetTokenMask();
if (tokenMask1) {
ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1;
}
// Indicate the different TT tokens that specify instruction operations to be logged
// Disabling specifically instruction operations updating Program Counter (PC).
// @note: The field is defined in the spec incorrectly as a 16-bit value
ttCfgRegs_.ttRegTokenMask2.u32All = 0;
ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F;
// Override TokenMask2 value if a user value is available
uint32_t tokenMask2 = SetTokenMask2();
if (tokenMask2) {
ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2;
}
}
// Registers the buffer that receives thread trace data and splits it evenly
// between the shader engines.
//
// @param sqttBuffer  start of the trace data buffer
// @param sqttBuffSz  total size of the buffer; each shader engine gets
//                    sqttBuffSz / numSE_ of it
//
// The per-engine start addresses are stored in devMemList_ (one entry per
// shader engine, in engine order) and the per-engine size is written to the
// SIZE field of the size register in units of (1 << TT_BUFF_ALIGN_SHIFT).
// NOTE(review): nothing here checks that the per-engine slices are aligned to
// that unit; BeginSession() shifts the addresses right by the same amount.
void Gfx8ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) {
  // Compute the size of buffer available for each shader engine
  ttBuffSize_ = sqttBuffSz / numSE_;

  // Forget addresses recorded by an earlier call. BeginSession() reads
  // entries 0..numSE_-1, so without this a second call would leave the old
  // buffer's addresses in those slots and the new ones would be ignored.
  devMemList_.clear();

  // Populate the sqtt buffer array submitted to device
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    const uint64_t sqttSEAddr = reinterpret_cast<uint64_t>(sqttBuffer + (ttBuffSize_ * idx));
    devMemList_.push_back(sqttSEAddr);
  }

  // Update the size bit-field of sqtt ctrl register
  ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT;
}
// Appends to cmdBuff the PM4 packets that configure and start a thread trace
// session on GFX8.
//
// Sequence: select broadcast mode, wait for idle, write the MASK / PERF_MASK /
// TOKEN_MASK / TOKEN_MASK2 / MODE registers, optionally the high-water mark,
// then per shader engine the buffer base, size and a buffer reset; finally
// restore broadcast mode and write MODE with tracing switched on.
//
// Preconditions (not checked here): Init() has populated ttCfgRegs_ and
// setSqttDataBuff() has filled devMemList_ with one address per shader engine.
//
// @param cmdBuff    command buffer the packets are appended to
// @param cmdWriter  writer used to encode each packet
void Gfx8ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program Grbm to broadcast messages to all shader engines
  regGRBM_GFX_INDEX grbm_gfx_index;
  grbm_gfx_index.u32All = 0;
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Disable RLC Perfmon Clock Gating
  // On Vega this is needed to collect Perf Cntrs
  // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL__VI, 1);

  // Program the Compute register to indicate SQTT is enabled
  /*
  regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI enableTT = {0};
  enableTT.bits.THREAD_TRACE_ENABLE = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
  mmCOMPUTE_THREAD_TRACE_ENABLE__CI__VI,
  enableTT.u32All);
  */

  // Program the thread trace mask - specifies SH, CU, SIMD and
  // VM Id masks to apply. Enabling SQ/SPI/REG_STALL_EN bits
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK__VI,
                                        ttCfgRegs_.ttRegMask.u32All);

  // Program the thread trace Perf mask
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK__VI,
                                        ttCfgRegs_.ttRegPerfMask.u32All);

  // Program the thread trace token mask
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK__VI,
                                        ttCfgRegs_.ttRegTokenMask.u32All);

  // Program the thread trace token mask2 to specify the list of instruction
  // tokens to record. Disabling INST_PC instruction tokens
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2__VI,
                                        ttCfgRegs_.ttRegTokenMask2.u32All);

  // Program the thread trace mode register (MODE field is still OFF here)
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
                                        ttCfgRegs_.ttRegMode.u32All);

  // Program the HiWaterMark register to support stalling; only needed when
  // at least one of the stall-related bits is set
  if ((ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI) ||
      (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI) ||
      (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI) ||
      (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI)) {
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER__VI, 0x06);
  }

  // Iterate through the list of SE's and program the register
  // for carrying address of thread trace buffer which is aligned
  // to 4KB per thread trace specification.
  // The index is unsigned to match numSE_ (avoids a signed/unsigned compare).
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    // Program Grbm to direct writes to one SE
    grbm_gfx_index.bitfields.SH_INDEX = 0;
    grbm_gfx_index.bitfields.SE_INDEX = idx;
    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI,
                                          grbm_gfx_index.u32All);

    // Program base2 address of buffer to use for thread trace
    // Encodes ATC bit, so the correct way to program is to use
    // ATC Bit property of the device
    /*
    regSQ_THREAD_TRACE_BASE2__CI__VI sqttBase2 = {};
    sqttBase2.u32All = 0;
    sqttBase2.bits.ATC = 0;
    sqttBase2.bits.ADDR_HI = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
    mmSQ_THREAD_TRACE_BASE2__VI,
    sqttBase2.u32All);
    */

    // Base address of this engine's slice, in units of the buffer alignment.
    // Only the low 32 bits of the shifted address are programmed because the
    // BASE2 (high bits) write above is disabled.
    const uint64_t baseAddr = devMemList_[idx] >> TT_BUFF_ALIGN_SHIFT;

    // Program base address of buffer to use for thread trace
    regSQ_THREAD_TRACE_BASE sqttBase = {};
    sqttBase.bits.ADDR = Low32(baseAddr);
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE__VI, sqttBase.u32All);

    // Program the size of thread trace buffer
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI,
                                          ttCfgRegs_.ttRegSize.u32All);

    // Program the thread trace ctrl register to reset the buffer
    regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
    sqttCtrl.u32All = 0;
    sqttCtrl.bits.RESET_BUFFER = 1;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All);
  }

  // Reset the GRBM to broadcast mode
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Program the thread trace mode register with tracing switched ON. The
  // shadow copy is put back to OFF right after the packet is built, because
  // StopSession() writes the shadow copy as-is to disable tracing.
  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
                                        ttCfgRegs_.ttRegMode.u32All);
  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
}
// Appends to cmdBuff the PM4 packets that stop the current thread trace
// session on GFX8 and capture its status.
//
// Sequence: select broadcast mode, wait for idle, write MODE (OFF), then per
// shader engine wait on the status register and copy the STATUS, CNTR and
// WPTR registers into ttStatus_; finally restore broadcast mode, flush the
// caches, clear the SIZE register and reset the trace buffer.
//
// Precondition (not checked here): setSqttCtrlBuff() has supplied a control
// buffer of at least StatusSizeInfo() bytes.
//
// @param cmdBuff    command buffer the packets are appended to
// @param cmdWriter  writer used to encode each packet
void Gfx8ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program Grbm to broadcast messages to all shader engines
  regGRBM_GFX_INDEX grbm_gfx_index;
  grbm_gfx_index.u32All = 0;
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Program the thread trace mode register to disable thread trace
  // The MODE register is set to disable thread trace by default
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
                                        ttCfgRegs_.ttRegMode.u32All);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Iterate through the list of SE's and read the Status, Counter and
  // Write Pointer registers of Thread Trace subsystem.
  // The index is unsigned to match numSE_ (avoids a signed/unsigned compare);
  // the unused 'baseAddr' local that used to precede this loop was removed.
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    // Program Grbm to direct writes to one SE
    grbm_gfx_index.bitfields.SH_INDEX = 0;
    grbm_gfx_index.bitfields.SE_INDEX = idx;
    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI,
                                          grbm_gfx_index.u32All);

    // Issue WaitRegMem command to wait until SQTT event has completed.
    // NOTE(review): the exact comparison performed depends on how
    // BuildWaitRegMemCommand interprets funcEq / maskVal / waitVal - confirm
    // that waiting on mask 0x40000000 with value 0x01 is intended.
    bool funcEq = false;
    bool memSpace = false;
    uint32_t waitVal = 0x01;
    uint32_t maskVal = 0x40000000L;
    uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS__VI - UCONFIG_SPACE_START__CI__VI;
    cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal);

    // Retrieve the values from various status registers. Each shader engine
    // owns a tuple of TT_STATUS_IDX_MAX words inside ttStatus_.
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_STATUS__VI, 0,
                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS),
                                   COPY_DATA_SEL_COUNT_1DW, true);

    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_CNTR, 0,
                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_CNTR),
                                   COPY_DATA_SEL_COUNT_1DW, true);

    uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_WPTR__VI, 0, ttStatus_ + wptrIdx,
                                   COPY_DATA_SEL_COUNT_1DW, true);
  }

  // Reset the GRBM to broadcast mode
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

  // Initialize cache flush request object: flush L1, L2, instruction and
  // constant caches
  FlushCacheOptions flush;
  flush.l1 = true;
  flush.l2 = true;
  flush.icache = true;
  flush.kcache = true;
  cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0);

  // Program the size of thread trace buffer back to zero
  regSQ_THREAD_TRACE_SIZE ttRegSize = {0};
  ttRegSize.u32All = 0;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI, ttRegSize.u32All);

  // Program the thread trace ctrl register to reset the buffer
  regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
  sqttCtrl.u32All = 0;
  sqttCtrl.bits.RESET_BUFFER = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All);

  // Program the compute_thread_trace_enable register
  /*
  regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI disableTT = {0};
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
  mmCOMPUTE_THREAD_TRACE_ENABLE__CI__VI,
  disableTT.u32All);
  */

  // Disable RLC Perfmon Clock Gating
  // On Vega this is needed to collect Perf Cntrs
  // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL__VI, 0);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
}
bool Gfx8ThreadTrace::Validate() {
// Iterate through the list of SE to verify
for (int idx = 0; idx < numSE_; idx++) {
// Determine if the buffer has wrapped
uint32_t statusIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS);
if (ttStatus_[statusIdx] & 0x80000000) {
return false;
}
// Adjust the value of Write Ptr which is bits [29-0]
uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
ttStatus_[wptrIdx] = (ttStatus_[wptrIdx] & TT_WRITE_PTR_MASK);
}
return true;
}
} // pm4_profile
+101
Просмотреть файл
@@ -0,0 +1,101 @@
#ifndef _GFX8_THREAD_TRACE_H_
#define _GFX8_THREAD_TRACE_H_
#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
#include "gfxip/gfx8/si_pm4defs.h"
#include "thread_trace.h"
#include <string>
namespace pm4_profile {
// Shadow copies of the SQ thread trace configuration registers for GFX8.
// Gfx8ThreadTrace fills these in and later writes their raw values to the
// hardware through PM4 packets.
struct Gfx8ThreadTraceCfgRegs {
  regSQ_THREAD_TRACE_SIZE ttRegSize;                   // size of thread trace buffer
  regSQ_THREAD_TRACE_MODE ttRegMode;                   // thread trace mode
  regSQ_THREAD_TRACE_MASK ttRegMask;                   // thread trace wave mask
  regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask;        // thread trace token mask
  regSQ_THREAD_TRACE_TOKEN_MASK2__VI ttRegTokenMask2;  // thread trace token mask2
  regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask;          // thread trace perf mask
};
// Encapsulates the various APIs and structures used to enable a thread
// trace session on GFX8 and collect its data.
// NOTE(review): this header uses std::vector (devMemList_) but includes only
// <string> directly; it relies on another header to provide <vector>.
class Gfx8ThreadTrace : public ThreadTrace {
public:
Gfx8ThreadTrace();
~Gfx8ThreadTrace();
// Initializes various data structures and handles that
// are needed to support a thread trace session.
// Returns false if the base-class initialization fails.
bool Init(const ThreadTraceConfig* config);
// Builds Pm4 command stream to program hardware registers that
// enable a thread trace session, including the issue of an event
// to begin thread session
void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
// Builds Pm4 command stream to program hardware registers that
// disable a thread trace session, including the issue of an event
// to stop currently ongoing thread session
void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
// Validates that thread trace session ran correctly i.e. did not
// encounter any errors. Also masks each shader engine's write pointer
// word in the control buffer down to its write-pointer bits.
bool Validate();
// Sets the buffer used to collect SQTT data; it is divided evenly
// between the shader engines
void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz);
// Sets the buffer used to read control (status) data of SQTT. The pointer
// is stored as-is; it must address at least StatusSizeInfo() bytes.
void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; }
// Returns the size in bytes of the status info: TT_STATUS_IDX_MAX 32-bit
// words per shader engine
uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; }
// Return number of Shader Engines
uint32_t getNumSe() { return numSE_; }
private:
// Holds number of Shader Engines present on device
uint32_t numSE_;
// Thread traces status register indices to determine
// status of thread trace run. These are offsets inside one shader
// engine's tuple in ttStatus_; TT_STATUS_IDX_MAX is the tuple length.
typedef enum {
TT_STATUS_IDX_STATUS = 0,
TT_STATUS_IDX_CNTR = 1,
TT_STATUS_IDX_WPTR = 2,
TT_STATUS_IDX_MAX = 3
} TTStatusReg;
// A list of tuples of TT_STATUS_IDX_MAX size,
// giving status of thread trace (one tuple per shader engine)
uint32_t* ttStatus_;
// Size of thread trace buffer per shader engine
uint32_t ttBuffSize_;
// Start addresses of the per-shader-engine slices of the trace buffer
std::vector<uint64_t> devMemList_;
// Registers that need to be programmed for Thread Trace
Gfx8ThreadTraceCfgRegs ttCfgRegs_;
// Initializes thread trace registers with default parameters.
// These are potentially updated based on updates to thread trace
// configuration object by user
void InitThreadTraceCfgRegs();
};
} // pm4_profile
#endif // _GFX8_THREAD_TRACE_H_
+356
Просмотреть файл
@@ -0,0 +1,356 @@
#include <iostream>
#include <fstream>
#include <iomanip>
#include <random>
#include <memory>
#include "core/util/os.h"
#include "gfx9_thread_trace.h"
/// @brief Extracts bits [31:0] of a 64-bit value.
/// @param u 64-bit input value
/// @return the least significant 32 bits of @p u
inline uint32_t Low32(uint64_t u) {
  // Narrowing to uint32_t keeps exactly the low 32 bits (modulo 2^32).
  return static_cast<uint32_t>(u);
}
/// @brief Extracts bits [63:32] of a 64-bit value.
/// @param u 64-bit input value
/// @return the most significant 32 bits of @p u
inline uint32_t High32(uint64_t u) {
  const uint64_t upperHalf = u >> 32;
  return static_cast<uint32_t>(upperHalf);
}
namespace pm4_profile {
// Constructs a GFX9 thread trace helper.
// Only numSE_ is set here; buffers and register shadow copies are set up
// later by the setters and Init().
// NOTE(review): the class declaration is not visible from this file, so it
// cannot be confirmed here whether other members need a default value.
Gfx9ThreadTrace::Gfx9ThreadTrace() {
// Initialize the number of shader engines (hard-coded to four)
numSE_ = 4;
}
// Empty destructor: this file shows nothing that the class itself releases.
Gfx9ThreadTrace::~Gfx9ThreadTrace() {}
// Initialises the thread trace object from a user configuration.
//
// Runs the base-class initialisation first; only when that succeeds are the
// GFX9 register shadow copies populated with their defaults.
//
// @param config thread trace configuration forwarded to ThreadTrace::Init
// @return true on success, false if the base-class initialisation failed
bool Gfx9ThreadTrace::Init(const ThreadTraceConfig* config) {
  if (ThreadTrace::Init(config)) {
    InitThreadTraceCfgRegs();
    return true;
  }
  return false;
}
void Gfx9ThreadTrace::InitThreadTraceCfgRegs() {
// Indicates the size of buffer to use per Shader Engine instance.
// The size is specified in terms of 4KB blocks
ttCfgRegs_.ttRegSize.u32All = 0;
// Indicates various attributes of a thread trace session.
//
// MASK_CS: Which shader types should be enabled for data collection
// Enable CS Shader types.
//
// WRAP: How trace buffer should be used as a ring buffer or as a linear
// buffer - Disable WRAP mode i.e use it as a linear buffer
//
// MODE: Enables a thread trace session
//
// CAPTURE_MODE: When thread trace data is collected immediately after MODE
// is enabled or wait until a Thread Trace Start event is received
//
// AUTOFLUSH_EN: Flush thread trace data to buffer often automatically
//
ttCfgRegs_.ttRegMode.u32All = 0;
ttCfgRegs_.ttRegMode.bits.WRAP = 0;
ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0;
ttCfgRegs_.ttRegMode.bits.MASK_CS = 1;
ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1;
ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
// Enable Thread Trace for all VM Id's
// Enable all of the SIMD's of the compute unit
// Enable Compute Unit (CU) at index Zero to be used for fine-grained data
// Enable Shader Array (SH) at index Zero to be used for fine-grained data
//
// @note: Not enabling REG_STALL_EN, SPI_STALL_EN and SQ_STALL_EN bits. They
// are useful if we wish to program buffer throttling.
//
ttCfgRegs_.ttRegMask.u32All = 0;
ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0;
ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF;
ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId();
ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN = 0x1;
ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN = 0x1;
ttCfgRegs_.ttRegMask.bits.REG_STALL_EN = 0x1;
ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId();
// Override Mask value if a user value is available
uint32_t ttMask = SetMask();
if (ttMask) {
ttCfgRegs_.ttRegMask.u32All = ttMask;
}
// Mask of compute units to get thread trace data from
ttCfgRegs_.ttRegPerfMask.u32All = 0;
ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF;
ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF;
// Indicate the different TT messages/tokens that should be enabled/logged
// Indicate the different TT tokens that specify register operations to be logged
ttCfgRegs_.ttRegTokenMask.u32All = 0;
ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF;
ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF;
ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL = 0x1;
// Override TokenMask1 value if a user value is available
uint32_t tokenMask1 = SetTokenMask();
if (tokenMask1) {
ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1;
}
// Indicate the different TT tokens that specify instruction operations to be logged
// Disabling specifically instruction operations updating Program Counter (PC).
// @note: The field is defined in the spec incorrectly as a 16-bit value
ttCfgRegs_.ttRegTokenMask2.u32All = 0;
ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F;
// Override TokenMask2 value if a user value is available
uint32_t tokenMask2 = SetTokenMask2();
if (tokenMask2) {
ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2;
}
}
// Registers the buffer that receives thread trace data and splits it evenly
// between the shader engines.
//
// @param sqttBuffer  start of the trace data buffer
// @param sqttBuffSz  total size of the buffer; each shader engine gets
//                    sqttBuffSz / numSE_ of it
//
// The per-engine start addresses are stored in devMemList_ (one entry per
// shader engine, in engine order) and the per-engine size is written to the
// SIZE field of the size register in units of (1 << TT_BUFF_ALIGN_SHIFT).
// NOTE(review): nothing here checks that the per-engine slices are aligned
// to that unit.
void Gfx9ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) {
  // Compute the size of buffer available for each shader engine
  ttBuffSize_ = sqttBuffSz / numSE_;

  // Forget addresses recorded by an earlier call; otherwise a second call
  // would append after the stale entries and index 0..numSE_-1 would still
  // refer to the previous buffer.
  devMemList_.clear();

  // Populate the sqtt buffer array submitted to device. The index is
  // unsigned to match numSE_.
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    const uint64_t sqttSEAddr = reinterpret_cast<uint64_t>(sqttBuffer + (ttBuffSize_ * idx));
    devMemList_.push_back(sqttSEAddr);
  }

  // Update the size bit-field of sqtt ctrl register
  ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT;
}
// Appends to cmdBuff, through cmdWriter, the PM4 commands that configure and
// start a thread trace session. The sequence emitted is:
//   1. set GRBM_GFX_INDEX to broadcast and issue a wait-idle packet,
//   2. write the mask, perf mask, token mask, token mask2 and mode registers
//      from ttCfgRegs_, plus the high-water mark register when any stall or
//      drop-on-stall bit is set,
//   3. for every shader engine: select it through GRBM_GFX_INDEX, then write
//      its buffer base address, the buffer size and a buffer reset,
//   4. set GRBM_GFX_INDEX back to broadcast, issue a wait-idle packet, write
//      the mode register with MODE set to ON and issue a final wait-idle.
// devMemList_ is indexed by shader engine here, so setSqttDataBuff() has to
// have populated it before this method runs.
void Gfx9ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
// Program Grbm to broadcast messages to all shader engines
regGRBM_GFX_INDEX grbm_gfx_index;
grbm_gfx_index.u32All = 0;
grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
// Issue a CSPartialFlush cmd including cache flush
cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
// The RLC perfmon clock gating write below is intentionally left disabled.
// Disable RLC Perfmon Clock Gating
// On Vega this is needed to collect Perf Cntrs
// cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 1);
// The COMPUTE_THREAD_TRACE_ENABLE write below is likewise left disabled.
// Program the Compute register to indicate SQTT is enabled
/*
regCOMPUTE_THREAD_TRACE_ENABLE enableTT = {0};
enableTT.bits.THREAD_TRACE_ENABLE = 1;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
mmCOMPUTE_THREAD_TRACE_ENABLE,
enableTT.u32All);
*/
// Program the thread trace mask - specifies SH, CU, SIMD and
// VM Id masks to apply. Enabling SQ/SPI/REG_STALL_EN bits
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK,
ttCfgRegs_.ttRegMask.u32All);
// Program the thread trace Perf mask
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK,
ttCfgRegs_.ttRegPerfMask.u32All);
// Program the thread trace token mask
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK,
ttCfgRegs_.ttRegTokenMask.u32All);
// Program the thread trace token mask2 to specify the list of instruction
// tokens to record. Disabling INST_PC instruction tokens
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2,
ttCfgRegs_.ttRegTokenMask2.u32All);
// Program the thread trace mode register. MODE still carries the value held
// in ttCfgRegs_ at this point; it is switched to ON only at the end.
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
ttCfgRegs_.ttRegMode.u32All);
// Program the HiWaterMark register to support stalling. The write is only
// emitted when at least one stall / drop-on-stall bit is configured.
if ((ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN) || (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN) ||
(ttCfgRegs_.ttRegMask.bits.REG_STALL_EN) ||
(ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL)) {
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER, 0x06);
}
// Iterate through the list of SE's and program the register
// for carrying address of thread trace buffer which is aligned
// to 4KB per thread trace specification
uint64_t baseAddr = 0;
for (int idx = 0; idx < numSE_; idx++) {
// Program Grbm to direct writes to one SE
grbm_gfx_index.bitfields.SH_INDEX = 0;
grbm_gfx_index.bitfields.SE_INDEX = idx;
grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
// The BASE2 (high address bits) write below is left disabled.
// Program base2 address of buffer to use for thread trace
/*
regSQ_THREAD_TRACE_BASE2 sqttBase2 = {};
sqttBase2.u32All = 0;
sqttBase2.bits.ADDR_HI = 0;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff,
mmSQ_THREAD_TRACE_BASE2,
sqttBase2.u32All);
*/
// Program the base address to use: the slice address recorded for this
// shader engine, shifted right by TT_BUFF_ALIGN_SHIFT
baseAddr = devMemList_[idx] >> TT_BUFF_ALIGN_SHIFT;
// Program base address of buffer to use for thread trace.
// NOTE(review): only the low 32 bits of the shifted address are written and
// the BASE2 write above is commented out, so any higher address bits are
// dropped - confirm buffers are always allocated low enough for this.
regSQ_THREAD_TRACE_BASE sqttBase = {};
sqttBase.bits.ADDR = Low32(baseAddr);
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE, sqttBase.u32All);
// Program the size of thread trace buffer
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE,
ttCfgRegs_.ttRegSize.u32All);
// Program the thread trace ctrl register with only RESET_BUFFER set
regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
sqttCtrl.u32All = 0;
sqttCtrl.bits.RESET_BUFFER = 1;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All);
}
// Reset the GRBM to broadcast mode
grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);
// Issue a CSPartialFlush cmd including cache flush
cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
// Program the thread trace mode register with MODE = ON. The member copy is
// put back to OFF right after the packet is built, so that StopSession can
// write ttRegMode unchanged to turn tracing off.
ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON;
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
ttCfgRegs_.ttRegMode.u32All);
ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
// Issue a CSPartialFlush cmd including cache flush
cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
return;
}
// Appends to cmdBuff, through cmdWriter, the PM4 commands that stop the
// thread trace session and capture its status. The sequence emitted is:
//   1. set GRBM_GFX_INDEX to broadcast and issue a wait-idle packet,
//   2. write the mode register from ttCfgRegs_ (BeginSession leaves MODE at
//      OFF in that copy) and issue a wait-idle packet,
//   3. for every shader engine: select it, emit a wait on the STATUS
//      register, then copy STATUS, CNTR and WPTR into that engine's
//      TT_STATUS_IDX_MAX-sized tuple of ttStatus_,
//   4. set GRBM_GFX_INDEX back to broadcast, flush caches, write a zero
//      buffer size, reset the buffer and issue a final wait-idle packet.
// ttStatus_ must have been set through setSqttCtrlBuff() beforehand.
void Gfx9ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program Grbm to broadcast messages to all shader engines
  regGRBM_GFX_INDEX grbm_gfx_index;
  grbm_gfx_index.u32All = 0;
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Program the thread trace mode register to disable thread trace.
  // The MODE field of the member copy is OFF by default.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
                                        ttCfgRegs_.ttRegMode.u32All);

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Iterate through the list of SE's and read the Status, Counter and
  // Write Pointer registers of Thread Trace subsystem
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    // Program Grbm to direct writes to one SE
    grbm_gfx_index.bitfields.SH_INDEX = 0;
    grbm_gfx_index.bitfields.SE_INDEX = idx;
    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);

    // Issue WaitRegMem command to wait until SQTT event has completed.
    // NOTE(review): a value masked with 0x40000000 can never equal 0x01, so
    // with funcEq == false this condition looks like it is always satisfied;
    // confirm the argument semantics of BuildWaitRegMemCommand.
    bool funcEq = false;
    bool memSpace = false;
    uint32_t waitVal = 0x01;
    uint32_t maskVal = 0x40000000L;
    uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS - UCONFIG_SPACE_START;
    cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal);

    // Retrieve the values from various status registers into this shader
    // engine's slot of ttStatus_
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_STATUS, 0,
                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS),
                                   COPY_DATA_SEL_COUNT_1DW, true);
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_CNTR, 0,
                                   ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_CNTR),
                                   COPY_DATA_SEL_COUNT_1DW, true);
    uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_WPTR, 0, ttStatus_ + wptrIdx,
                                   COPY_DATA_SEL_COUNT_1DW, true);
  }

  // Reset the GRBM to broadcast mode
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);

  // Initialize cache flush request object
  FlushCacheOptions flush;
  flush.l1 = true;
  flush.l2 = true;
  flush.icache = true;
  flush.kcache = true;
  cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0);

  // Program the size of thread trace buffer to zero
  regSQ_THREAD_TRACE_SIZE ttRegSize = {0};
  ttRegSize.u32All = 0;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE, ttRegSize.u32All);

  // Program the thread trace ctrl register with only RESET_BUFFER set
  regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
  sqttCtrl.u32All = 0;
  sqttCtrl.bits.RESET_BUFFER = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All);

  // Writes to mmCOMPUTE_THREAD_TRACE_ENABLE (value 0) and to
  // mmRLC_PERFMON_CLK_CNTL (value 0) are intentionally not issued here,
  // mirroring BeginSession where the matching enable writes are disabled.

  // Issue a CSPartialFlush cmd including cache flush
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
  return;
}
bool Gfx9ThreadTrace::Validate() {
// Iterate through the list of SE to verify
for (int idx = 0; idx < numSE_; idx++) {
// Determine if the buffer has wrapped
uint32_t statusIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS);
if (ttStatus_[statusIdx] & 0x80000000) {
return false;
}
// Adjust the value of Write Ptr which is bits [29-0]
uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
ttStatus_[wptrIdx] = (ttStatus_[wptrIdx] & TT_WRITE_PTR_MASK);
}
return true;
}
} // pm4_profile
+104
Просмотреть файл
@@ -0,0 +1,104 @@
#ifndef _GFX9_THREAD_TRACE_H_
#define _GFX9_THREAD_TRACE_H_
#include "gfxip/gfx9/gfx9_registers.h"
#include "gfxip/gfx9/gfx9_typedef.h"
#include "gfxip/gfx9/gfx9_enum.h"
#include "gfxip/gfx9/gfx9_offset.h"
#include "gfxip/gfx9/gfx9_pm4defs.h"
#include "thread_trace.h"
#include <string>
using namespace pm4_profile::gfx9;
namespace pm4_profile {
// Host-side copies of the Gfx9 thread trace configuration registers. The
// values are prepared ahead of time and written to the device by
// Gfx9ThreadTrace::BeginSession / StopSession.
typedef struct Gfx9ThreadTraceCfgRegs {
// Size of thread trace buffer; the SIZE field is filled in by
// Gfx9ThreadTrace::setSqttDataBuff with the per shader engine size
regSQ_THREAD_TRACE_SIZE ttRegSize;
// Thread trace mode; MODE is set to ON only while BeginSession builds the
// enabling packet and is OFF otherwise
regSQ_THREAD_TRACE_MODE ttRegMode;
// Thread trace wave mask (SH, CU, SIMD, VM id and stall enable fields)
regSQ_THREAD_TRACE_MASK ttRegMask;
// Thread trace token mask (token, register and drop-on-stall fields)
regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask;
// Thread trace token mask2 (instruction token selection)
regSQ_THREAD_TRACE_TOKEN_MASK2 ttRegTokenMask2;
// Thread trace perf mask (per shader array compute unit masks)
regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask;
} Gfx9ThreadTraceCfgRegs;
// Encapsulates the various Api and structures used to enable a thread
// trace session and collect its data
class Gfx9ThreadTrace : public ThreadTrace {
public:
Gfx9ThreadTrace();
~Gfx9ThreadTrace();
// Initializes various data structures and handles that
// are needed to support a thread trace session
bool Init(const ThreadTraceConfig* config);
// Builds Pm4 command stream to program hardware registers that
// enable a thread trace session, including the issue of an event
// to begin thread session
void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
// Builds Pm4 command stream to program hardware registers that
// disable a thread trace session, including the issue of an event
// to stop currently ongoing thread session
void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
// Validates that thread trace session ran correctly i.e. did not
// encounter any errors.
bool Validate();
// Initializes the handle of buffer used to collect SQTT data
void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz);
// Initializes the handle of buffer used to read control data of SQTT
void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; }
// Return status info size
uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; }
// Return number of Shader Engines
uint32_t getNumSe() { return numSE_; }
private:
// Holds number of Shader Engines present on device
uint32_t numSE_;
// Thread traces status register indices to determine
// status of thread trace run
typedef enum {
TT_STATUS_IDX_STATUS = 0,
TT_STATUS_IDX_CNTR = 1,
TT_STATUS_IDX_WPTR = 2,
TT_STATUS_IDX_MAX = 3
} TTStatusReg;
// A list of tuples of TT_STATUS_IDX_MAX size,
// giving status of thread trace
uint32_t* ttStatus_;
// Size of thread trace buffer per shader engine
uint32_t ttBuffSize_;
// Handles of Device memory used for thread trace
std::vector<uint64_t> devMemList_;
// Registers that need to be programmed for Thread Trace
Gfx9ThreadTraceCfgRegs ttCfgRegs_;
// Initializes thread trace registers with default parameters.
// These are potentially updated based on updates to thread trace
// configuration object by user
void InitThreadTraceCfgRegs();
};
} // pm4_profile
#endif // _GFX9_THREAD_TRACE_H_
+105
Просмотреть файл
@@ -0,0 +1,105 @@
#include <iostream>
#include "core/util/os.h"
#include "thread_trace.h"
namespace pm4_profile {
// Stores the session configuration: a copy of *config when one is supplied,
// otherwise the defaults produced by InitThreadTraceConfig. Always returns
// true.
bool ThreadTrace::Init(const ThreadTraceConfig* config) {
  if (config == nullptr) {
    InitThreadTraceConfig(&ttConfig_);
    return true;
  }

  ttConfig_ = *config;
  return true;
}
// Fills *config with the default thread trace configuration, which is all
// zeros: the whole object is cleared first and every known field is then
// set to zero explicitly.
void ThreadTrace::InitThreadTraceConfig(ThreadTraceConfig* config) const {
  memset(config, 0, sizeof(*config));

  ThreadTraceConfig& defaults = *config;
  defaults.threadTraceTargetCu = 0;
  defaults.threadTraceVmIdMask = 0;
  defaults.threadTraceMask = 0;
  defaults.threadTraceTokenMask = 0;
  defaults.threadTraceTokenMask2 = 0;
}
// Returns the compute unit id to trace. The value comes from the session
// configuration unless the environment variable HSA_THREAD_TRACE_SELECT_CU
// is non-empty, in which case that variable is parsed as a hexadecimal
// number and reported on stdout. Debug builds assert that the id is at
// most 15.
uint8_t ThreadTrace::SetCuId() {
  const std::string envValue = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_CU");

  uint32_t targetCu = ttConfig_.threadTraceTargetCu;
  if (!envValue.empty()) {
    // The environment setting takes precedence over the configuration
    targetCu = std::stol(envValue, nullptr, 16);
    std::cout << "Using " << targetCu << " as CUID for Thread Trace" << std::endl;
  }

  assert((targetCu <= 15) && "Cu Id must be between 0 and 15");
  return targetCu;
}
// Returns the VM id mask to trace. The value comes from the session
// configuration unless the environment variable
// HSA_THREAD_TRACE_SELECT_VMID is non-empty, in which case that variable is
// parsed as a hexadecimal number and reported on stdout. Debug builds
// assert that the value is at most 2.
uint8_t ThreadTrace::SetVmId() {
  const std::string envValue = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_VMID");

  uint32_t vmIdMask = ttConfig_.threadTraceVmIdMask;
  if (!envValue.empty()) {
    // The environment setting takes precedence over the configuration
    vmIdMask = std::stol(envValue, nullptr, 16);
    std::cout << "Using " << vmIdMask << " as VMID for Thread Trace" << std::endl;
  }

  assert((vmIdMask <= 2) && "VmId must be between 0 and 2");
  return vmIdMask;
}
// Returns the user supplied thread trace mask register value. The value
// comes from the session configuration unless the environment variable
// HSA_THREAD_TRACE_SELECT_MASK is non-empty, in which case that variable is
// parsed as a hexadecimal number and reported on stdout. Debug builds
// assert that none of the bits in 0x00C0D0 are set.
// NOTE(review): 0x00C0D0 covers bits 4, 6, 7, 14 and 15 while the assert
// text only names bits [4,6,7] - confirm which of the two is intended.
uint32_t ThreadTrace::SetMask() {
  const uint32_t rejectedBits = 0x00C0D0;
  const std::string envValue = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_MASK");

  uint32_t mask = ttConfig_.threadTraceMask;
  if (!envValue.empty()) {
    // The environment setting takes precedence over the configuration
    mask = std::stol(envValue, nullptr, 16);
    std::cout << "Using " << mask << " as Mask for Thread Trace" << std::endl;
  }

  assert(((mask & rejectedBits) == 0) && "Mask should have bits [4,6,7] set to Zero");
  return mask;
}
// Returns the user supplied thread trace token mask register value. The
// value comes from the session configuration unless the environment
// variable HSA_THREAD_TRACE_SELECT_TOKEN_MASK1 is non-empty, in which case
// that variable is parsed as a hexadecimal number and reported on stdout.
// Debug builds assert that bits [31:25] are zero.
//
// The rejected-bits constant is 0xFE000000 so that it matches the range
// named in the assert text. The previous constant (0xFF000000) also
// rejected bit 24.
// NOTE(review): bit 24 is taken to be REG_DROP_ON_STALL, which the default
// register setup enables - confirm the bit position against the Gfx9
// register headers.
uint32_t ThreadTrace::SetTokenMask() {
  uint32_t tokenMask = ttConfig_.threadTraceTokenMask;
  const uint32_t reservedBits = 0xFE000000;

  // Allow users to specify the TokenMask to choose for Target tokens
  std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK1");
  if (var.length() > 0) {
    tokenMask = std::stol(var, nullptr, 16);
    std::cout << "Using " << tokenMask << " as TokenMask for Thread Trace" << std::endl;
  }

  assert(((tokenMask & reservedBits) == 0) && "TokenMask should have bits [31:25] set to Zero");
  return tokenMask;
}
// Returns the user supplied thread trace token mask2 register value. The
// value comes from the session configuration unless the environment
// variable HSA_THREAD_TRACE_SELECT_TOKEN_MASK2 is non-empty, in which case
// that variable is parsed as a hexadecimal number and reported on stdout.
// Debug builds assert that bits [31:16] are zero.
uint32_t ThreadTrace::SetTokenMask2() {
  const uint32_t rejectedBits = 0xFFFF0000;
  const std::string envValue = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK2");

  uint32_t mask2 = ttConfig_.threadTraceTokenMask2;
  if (!envValue.empty()) {
    // The environment setting takes precedence over the configuration
    mask2 = std::stol(envValue, nullptr, 16);
    std::cout << "Using " << mask2 << " as TokenMask2 for Thread Trace" << std::endl;
  }

  assert(((mask2 & rejectedBits) == 0) && "TokenMask2 should have bits [31:16] set to Zero");
  return mask2;
}
} // pm4_profile
+104
Просмотреть файл
@@ -0,0 +1,104 @@
#ifndef _THREAD_TRACE_H_
#define _THREAD_TRACE_H_
#include <stdint.h>
#include "cmdwriter.h"
// Move them as static variables later on
#define TT_WRITE_PTR_MASK (0x3FFFFFFF)
#define TT_DEFAULT_BUFF_SIZE_SCALE (16)
#define TT_DEFAULT_BUFF_SIZE (1024 * 1024 * 8)
// Size of a block, in bytes, per increment of WPTR
#define TT_WRITE_PTR_BLK (32)
// Factor by which to shift buffer address
#define TT_BUFF_ALIGN_SHIFT (12)
// Align address to 64 Kilobytes
#define TT_BUFF_ADDR_ALIGN (0x10000)
namespace pm4_profile {
// ThreadTrace config: user supplied parameters of a thread trace session.
// ThreadTrace::InitThreadTraceConfig sets every field to zero. Each field
// can be overridden at run time through an HSA_THREAD_TRACE_SELECT_*
// environment variable (see the ThreadTrace::Set* methods).
typedef struct ThreadTraceConfig {
// Compute unit to trace; read by ThreadTrace::SetCuId
uint32_t threadTraceTargetCu;
// VM id mask; read by ThreadTrace::SetVmId
uint32_t threadTraceVmIdMask;
// Thread trace mask register value; read by ThreadTrace::SetMask
uint32_t threadTraceMask;
// Token mask register value; read by ThreadTrace::SetTokenMask
uint32_t threadTraceTokenMask;
// Token mask2 register value; read by ThreadTrace::SetTokenMask2
uint32_t threadTraceTokenMask2;
} ThreadTraceConfig;
// Device independent interface of the thread trace service. It keeps the
// session configuration and exposes helpers that resolve the effective
// configuration values; device specific subclasses implement the pure
// virtual methods to program the hardware and collect the trace.
class ThreadTrace {
 private:
  // Session configuration, filled in by Init
  // @note: Currently not used i.e. is not exposed to users
  ThreadTraceConfig ttConfig_;

 public:
  // Virtual destructor so that implementations can be destroyed through
  // a pointer to this interface
  virtual ~ThreadTrace() {}

  // Returns the CU id to use for thread tracing
  uint8_t SetCuId();

  // Returns the VM id to use for thread tracing
  uint8_t SetVmId();

  // Returns the Mask to use for thread tracing
  uint32_t SetMask();

  // Returns the Token Mask 1 to use for thread tracing
  uint32_t SetTokenMask();

  // Returns the Token Mask 2 to use for thread tracing
  uint32_t SetTokenMask2();

  // Prepares the data structures and handles a thread trace session
  // needs; a null config selects the default configuration
  virtual bool Init(const ThreadTraceConfig* config);

  // Writes the default parameters, which a user may later override,
  // into the given configuration object
  // @note: Currently not used i.e. is not exposed to users
  virtual void InitThreadTraceConfig(ThreadTraceConfig* config) const;

  // Placeholder for configuring individual session parameters; accepts
  // any key/value pair and reports success without storing anything
  // @note: Currently not used i.e. is not exposed to users
  bool Config(uint32_t key, uint32_t value) { return true; }

  // Builds the Pm4 command stream that programs the hardware registers
  // enabling a thread trace session, including the event that begins it
  virtual void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff,
                            pm4_profile::CommandWriter* cmdWriter) = 0;

  // Builds the Pm4 command stream that programs the hardware registers
  // disabling a thread trace session, including the event that stops the
  // ongoing session
  virtual void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff,
                           pm4_profile::CommandWriter* cmdWriter) = 0;

  // Reports whether the thread trace session ran without errors
  virtual bool Validate() = 0;

  // Sets the buffer, and its size, used to collect SQTT data
  virtual void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) = 0;

  // Sets the buffer used to read back SQTT control data
  virtual void setSqttCtrlBuff(uint32_t* ctrlBuff) = 0;

  // Returns the number of Shader Engines
  virtual uint32_t getNumSe() = 0;

  // Returns the size of the status information
  virtual uint32_t StatusSizeInfo() const = 0;
};
} // pm4_profile
#endif // _THREAD_TRACE_H_
+17
Просмотреть файл
@@ -0,0 +1,17 @@
#
# Rocr Utils library: built from the Linux OS abstraction source only
#
set ( MODULE_SRC ${CORE_UTIL_DIR}/lnx/os_linux.cpp )

#
# Header search path(s), appended in the order listed:
# the ROCr headers named by the ROCR_INC_DIR environment variable,
# then the runtime and core/util directories
#
include_directories (
  $ENV{ROCR_INC_DIR}
  ${HSA_RUNTIME_OSC_DIR}
  ${CORE_UTIL_DIR}
)

#
# Utils is built as a static library and linked against the
# system libraries it relies on
#
add_library ( ${UTIL_LIB} STATIC ${MODULE_SRC} )
target_link_libraries ( ${UTIL_LIB} c stdc++ dl pthread rt )
+48
Просмотреть файл
@@ -0,0 +1,48 @@
#
# Header files include path(s).
# These are set before the common library is added so that the
# 'common' subdirectory inherits them.
#
include_directories (
  $ENV{ROCR_INC_DIR}
  ${API_DIR}
  ${PROJ_DIR}/cmdwriter
  ${PROJ_DIR}/perfcounter
  ${PROJ_DIR}/threadtrace
  ${PROJ_DIR}/aqlprofile
  ${TEST_DIR}/common
  ${TEST_DIR}/ctrl
  ${CORE_UTIL_DIR}
)

#
# Specify the directory containing the libraries of HsaRt
# to be linked against for building a Hsa Perf application.
# The location is taken from the ROCR_LIB_DIR environment variable
# at configure time.
#
link_directories ( $ENV{ROCR_LIB_DIR} )
find_library ( ROCR_LIB NAMES hsa-runtime64 PATHS $ENV{ROCR_LIB_DIR} )

#
# Set Name for Common library and build it as a
# static library to be linked with others
#
set ( COMMON_LIB "common${ONLY64STR}" )
add_subdirectory ( ${TEST_DIR}/common "${PROJECT_BINARY_DIR}/common" )

#
# Build the test library
#
set ( TEST_NAME simple_convolution )
include_directories ( ${TEST_DIR}/${TEST_NAME} )
set ( LIB_NAME "${TEST_NAME}${ONLY64STR}" )
add_library ( ${LIB_NAME} STATIC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp )
target_link_libraries ( ${LIB_NAME} c stdc++ )

#
# Copy the pre-built code objects of the test next to the binaries.
# Done with CMake's own file commands instead of 'sh -c cp' so that it
# does not depend on a shell and copes with spaces in the paths.
#
file ( GLOB TEST_CODE_OBJECTS "${TEST_DIR}/${TEST_NAME}/*.hsaco" )
if ( TEST_CODE_OBJECTS )
  file ( COPY ${TEST_CODE_OBJECTS} DESTINATION "${PROJECT_BINARY_DIR}" )
endif ()
set ( TEST_LIBS ${LIB_NAME} )

#
# Build the test control
#
set ( SRC_LIST
  ${TEST_DIR}/ctrl/test.cpp
  ${TEST_DIR}/ctrl/test_pmgr.cpp
  ${TEST_DIR}/ctrl/test_hsa.cpp
)
set ( LIB_LIST ${TEST_LIBS} ${COMMON_LIB} ${CORE_UTILS_LIB} ${ROCR_LIB} ${TARGET_LIB} )
set ( EXE_NAME "ctrl" )
add_executable ( ${EXE_NAME} ${SRC_LIST} )
target_link_libraries ( ${EXE_NAME} ${LIB_LIST} c stdc++ dl pthread rt )
+876
Просмотреть файл
@@ -0,0 +1,876 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <string>
#include <iostream>
#include <climits>
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
// Returns from the enclosing function with the given HSA status when it is
// not HSA_STATUS_SUCCESS, after reporting the failing line and file on
// stdout. The argument is evaluated exactly once, so it is safe to pass an
// expression with side effects (for example the HSA call itself). The macro
// still expands to a brace-enclosed block, as before, so existing call sites
// keep compiling unchanged.
#define RET_IF_HSA_ERR(err) { \
  const auto ret_if_hsa_err_status_ = (err); \
  if (ret_if_hsa_err_status_ != HSA_STATUS_SUCCESS) { \
    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
              __FILE__ << ". Call returned " << ret_if_hsa_err_status_ << std::endl; \
    return ret_if_hsa_err_status_; \
  } \
}
static const uint32_t kBinarySearchLength = 512;
static const uint32_t kBinarySearchFindMe = 108;
static const uint32_t kWorkGroupSize = 256;
// Holds all the state of the binary search sample: the search parameters,
// the buffers shared with the GPU, the kernel description and the HSA/RocR
// handles. The functions of this file fill it in step by step.
typedef struct BinarySearch {
// Binary Search parameters.
// length, work_group_size, num_sub_divisions and find_me are set by
// InitializeBinarySearch; work_grid_size is not set there.
uint32_t length;
uint32_t work_group_size;
uint32_t work_grid_size;
uint32_t num_sub_divisions;
uint32_t find_me;
// Buffers needed for this application; allocated from the CPU pool by
// AllocateAndInitBuffers and made accessible to both the CPU and the GPU
uint32_t* input;
uint32_t* input_arr;
uint32_t* input_arr_local;
uint32_t* output;
// Kernel argument buffers and addresses
// Beginning of the allocated memory; this is the pointer to deallocate
void* kern_arg_buffer;
// Properly aligned address to be used in the aql packet
// (don't use for deallocation)
void* kern_arg_address;
// Kernel code: file holding the code object and name of the kernel in it
std::string kernel_file_name;
std::string kernel_name;
uint32_t kernarg_size;
uint32_t kernarg_align;
// HSA/RocR objects needed for this application
hsa_agent_t gpu_dev;
hsa_agent_t cpu_dev;
hsa_signal_t signal;
hsa_queue_t* queue;
hsa_amd_memory_pool_t cpu_pool;
hsa_amd_memory_pool_t gpu_pool;
hsa_amd_memory_pool_t kern_arg_pool;
// Other items we need to populate AQL packet
uint64_t kernel_object;
uint32_t group_segment_size; ///< Kernel group seg size
uint32_t private_segment_size; ///< Kernel private seg size
} BinarySearch;
// Sets the fixed parameters of the sample: the code object file and kernel
// name, the input length, the value to look for and the work group size.
// The numeric parameters come from the file-level constants so that there
// is a single place to change them. The number of sub divisions is derived
// as length / work_group_size.
void InitializeBinarySearch(BinarySearch* bs) {
  bs->kernel_file_name = "./binary_search_kernels.hsaco";
  bs->kernel_name = "binarySearch";
  bs->length = kBinarySearchLength;
  bs->find_me = kBinarySearchFindMe;
  bs->work_group_size = kWorkGroupSize;
  bs->num_sub_divisions = bs->length / bs->work_group_size;
}
// Shared implementation of the agent call-backs: checks whether "agent" is
// a device of type dev_type and, if so, stores it in the hsa_agent_t that
// "data" points to. Its signature does not match the call-back prototype of
// hsa_iterate_agents(), so it is always reached through a wrapper that has
// the right prototype.
//
// Return values:
//   HSA_STATUS_ERROR_INVALID_ARGUMENT -- "data" is null
//   HSA_STATUS_INFO_BREAK -- "agent" is of the specified type (dev_type)
//   HSA_STATUS_SUCCESS -- "agent" is not of the specified type
//   Other -- the device type query failed
static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
                              hsa_device_type_t dev_type) {
  if (nullptr == data) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Ask the runtime what kind of device this agent is
  hsa_device_type_t agent_type;
  const hsa_status_t status =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &agent_type);
  RET_IF_HSA_ERR(status);

  if (agent_type != dev_type) {
    // Not the kind of agent we are looking for; keep iterating
    return HSA_STATUS_SUCCESS;
  }

  *static_cast<hsa_agent_t*>(data) = agent;
  return HSA_STATUS_INFO_BREAK;
}
// Call-back for hsa_iterate_agents() that stops at the first GPU agent and
// stores it in the hsa_agent_t "data" points to. The prototype is dictated
// by the HSA specification.
hsa_status_t FindGPUDevice(hsa_agent_t agent, void* data) {
  const hsa_device_type_t wanted_type = HSA_DEVICE_TYPE_GPU;
  return FindAgent(agent, data, wanted_type);
}
// Call-back for hsa_iterate_agents() that stops at the first CPU agent and
// stores it in the hsa_agent_t "data" points to. The prototype is dictated
// by the HSA specification.
hsa_status_t FindCPUDevice(hsa_agent_t agent, void* data) {
  const hsa_device_type_t wanted_type = HSA_DEVICE_TYPE_CPU;
  return FindAgent(agent, data, wanted_type);
}
// Locates one GPU agent and one CPU agent and records them in bs->gpu_dev
// and bs->cpu_dev. Returns HSA_STATUS_SUCCESS when both were found and
// HSA_STATUS_ERROR otherwise.
hsa_status_t FindDevices(BinarySearch* bs) {
  // hsa_iterate_agents() walks the known agents for as long as the
  // call-back returns HSA_STATUS_SUCCESS and hands back whatever other
  // value stopped it. Our call-backs stop with HSA_STATUS_INFO_BREAK once
  // an agent of the requested type has been stored.
  bs->gpu_dev.handle = 0;
  hsa_status_t status = hsa_iterate_agents(FindGPUDevice, &bs->gpu_dev);
  if (HSA_STATUS_INFO_BREAK != status) {
    return HSA_STATUS_ERROR;
  }

  bs->cpu_dev.handle = 0;
  status = hsa_iterate_agents(FindCPUDevice, &bs->cpu_dev);
  if (HSA_STATUS_INFO_BREAK != status) {
    return HSA_STATUS_ERROR;
  }

  // A zero handle means the call-back never stored an agent
  const bool gpu_missing = (bs->gpu_dev.handle == 0);
  if (gpu_missing) {
    std::cout << "GPU Device is not Created properly!" << std::endl;
    RET_IF_HSA_ERR(HSA_STATUS_ERROR);
  }

  const bool cpu_missing = (bs->cpu_dev.handle == 0);
  if (cpu_missing) {
    std::cout << "CPU Device is not Created properly!" << std::endl;
    RET_IF_HSA_ERR(HSA_STATUS_ERROR);
  }

  return HSA_STATUS_SUCCESS;
}
// Shared implementation of the memory pool call-backs. A pool is accepted
// when it belongs to HSA_AMD_SEGMENT_GLOBAL and its
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag matches kern_arg: the
// flag must be set when kern_arg is true and clear when kern_arg is false.
//
// Return values:
//   HSA_STATUS_ERROR_INVALID_ARGUMENT -- "data" is null
//   HSA_STATUS_INFO_BREAK -- the pool was accepted and has been stored in
//                            the hsa_amd_memory_pool_t "data" points to
//   HSA_STATUS_SUCCESS -- the pool does not qualify
//   Other -- a pool query failed
//
// The signature does not match the call-back prototype required by
// hsa_amd_agent_iterate_memory_pools, so this function is always reached
// through a wrapper with the right prototype.
static hsa_status_t
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
  if (data == nullptr) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Only pools of the global segment are of interest
  hsa_amd_segment_t pool_segment;
  hsa_status_t status = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment);
  RET_IF_HSA_ERR(status);
  if (pool_segment != HSA_AMD_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }

  // The kernarg property of the pool has to match what was asked for
  uint32_t global_flags;
  status = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
  RET_IF_HSA_ERR(status);
  const bool is_kern_arg_pool =
      (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
  if (is_kern_arg_pool != kern_arg) {
    return HSA_STATUS_SUCCESS;
  }

  *static_cast<hsa_amd_memory_pool_t*>(data) = pool;
  return HSA_STATUS_INFO_BREAK;
}
// Call-back for hsa_amd_agent_iterate_memory_pools() that stops at the
// first HSA_AMD_SEGMENT_GLOBAL pool that does NOT have the
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property.
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kern_arg_pool = false;
  return FindGlobalPool(pool, data, want_kern_arg_pool);
}
// Call-back for hsa_amd_agent_iterate_memory_pools() that stops at the
// first HSA_AMD_SEGMENT_GLOBAL pool that DOES have the
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property.
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kern_arg_pool = true;
  return FindGlobalPool(pool, data, want_kern_arg_pool);
}
// Looks up the three memory pools the sample allocates from and stores them
// in the BinarySearch structure: a standard pool of the CPU agent
// (cpu_pool), a standard pool of the GPU agent (gpu_pool) and a kernel
// argument pool of the CPU agent (kern_arg_pool). Returns HSA_STATUS_SUCCESS
// when all three were found and HSA_STATUS_ERROR as soon as one is missing.
hsa_status_t FindPools(BinarySearch* bs) {
  // Each iteration is expected to be stopped by its call-back with
  // HSA_STATUS_INFO_BREAK; any other result means no pool qualified.
  hsa_status_t status = hsa_amd_agent_iterate_memory_pools(
      bs->cpu_dev, FindStandardPool, &bs->cpu_pool);
  if (HSA_STATUS_INFO_BREAK != status) {
    return HSA_STATUS_ERROR;
  }

  status = hsa_amd_agent_iterate_memory_pools(
      bs->gpu_dev, FindStandardPool, &bs->gpu_pool);
  if (HSA_STATUS_INFO_BREAK != status) {
    return HSA_STATUS_ERROR;
  }

  status = hsa_amd_agent_iterate_memory_pools(
      bs->cpu_dev, FindKernArgPool, &bs->kern_arg_pool);
  if (HSA_STATUS_INFO_BREAK != status) {
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}
// Allocates the four application buffers from the CPU pool found by
// FindPools, grants both the GPU and the CPU agent access to each of them,
// zeroes them and fills bs->input with increasing values.
//
// A pool is already accessible to the device it is associated with. Other
// devices that need to read or write the memory must be granted access
// explicitly, which is what the hsa_amd_agents_allow_access() calls do.
//
// Buffer sizes:
//   input                      -- bs->length elements (the loop below
//                                 writes that many)
//   input_arr, input_arr_local -- 2 elements per sub division
//   output                     -- 4 elements
//
// Returns the first failing HSA status, or HSA_STATUS_SUCCESS.
hsa_status_t AllocateAndInitBuffers(BinarySearch* bs) {
  hsa_status_t err;
  const size_t out_size = 4 * sizeof(uint32_t);
  const size_t input_size = bs->length * sizeof(uint32_t);
  const size_t in_arr_size = bs->num_sub_divisions * 2 * sizeof(uint32_t);

  // In all of these examples, we want both the cpu and gpu to have access to
  // the buffer in question. We use the array of agents below in the
  // subsequent calls to hsa_amd_agents_allow_access() for this purpose.
  hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};

  // Input values to search through
  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, input_size, 0,
                                     reinterpret_cast<void**>(&bs->input));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input);
  RET_IF_HSA_ERR(err);
  (void)memset(bs->input, 0, input_size);

  // Search result
  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, out_size, 0,
                                     reinterpret_cast<void**>(&bs->output));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->output);
  RET_IF_HSA_ERR(err);
  (void)memset(bs->output, 0, out_size);

  // Per sub division bounds
  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_arr_size, 0,
                                     reinterpret_cast<void**>(&bs->input_arr));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr);
  RET_IF_HSA_ERR(err);
  (void)memset(bs->input_arr, 0, in_arr_size);

  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_arr_size, 0,
                                     reinterpret_cast<void**>(&bs->input_arr_local));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local);
  RET_IF_HSA_ERR(err);
  (void)memset(bs->input_arr_local, 0, in_arr_size);

  // Binary-search application specific code...
  // Initialize input buffer with random values in an increasing order
  uint32_t max = bs->length * 20;
  bs->input[0] = 0;
  uint32_t seed = (unsigned int)time(NULL);
  srand(seed);
  for (uint32_t i = 1; i < bs->length; ++i) {
    // NOTE(review): max * rand_r() is computed in 32-bit unsigned arithmetic
    // and can wrap before the division, so the step is much smaller than
    // "max" suggests. Left unchanged because the expected search result may
    // rely on the current value distribution - confirm before changing.
    bs->input[i] = bs->input[i - 1] +
        static_cast<uint32_t>(max * rand_r(&seed) / static_cast<float>(RAND_MAX));
  }

// #define VERBOSE 1
#ifdef VERBOSE
  std::cout << "Input array values:" << std::endl;
  for (uint32_t i = 0; i < bs->length; ++i) {
    std::cout << "input[" << i << "] = " << bs->input[i] << " ";
    if (i % 4 == 0) {
      std::cout << std::endl;
    }
  }
  std::cout << std::endl;
#endif

  return err;
}
// The code in this function illustrates how to load a kernel from
// pre-compiled code. The goal is to get a handle that can be later
// used in an AQL packet and also to extract information about the kernel
// that we will need. All of this information and the kernel handle will
// be saved to the BinarySearch structure. It will be used when we
// populate the AQL packet.
//
// Reads:  bs->kernel_file_name, bs->kernel_name, bs->gpu_dev
// Writes: bs->kernel_object, bs->private_segment_size,
//         bs->group_segment_size, bs->kernarg_size, bs->kernarg_align
// Returns HSA_STATUS_ERROR if the file cannot be opened, the failing HSA
// status if any runtime call fails, otherwise the last (successful) status.
hsa_status_t LoadKernelFromObjFile(BinarySearch* bs) {
  hsa_status_t err;
  hsa_code_object_reader_t code_obj_rdr = {0};
  hsa_executable_t executable = {0};

  hsa_file_t file_handle = open(bs->kernel_file_name.c_str(), O_RDONLY);
  if (file_handle == -1) {
    std::cout << "failed to open " << bs->kernel_file_name.c_str() <<
      " at line " << __LINE__ << ", errno: " << errno << std::endl;
    return HSA_STATUS_ERROR;
  }

  err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
  // Close the descriptor before checking the status so that it is not leaked
  // when creating the reader fails.
  close(file_handle);
  RET_IF_HSA_ERR(err);

  // Create an executable, load the code object for the GPU agent and freeze
  // it; the symbol is looked up only after the executable has been frozen.
  err = hsa_executable_create_alt(HSA_PROFILE_FULL,
                  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable);
  RET_IF_HSA_ERR(err);
  err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev,
                                              code_obj_rdr, NULL, NULL);
  RET_IF_HSA_ERR(err);
  err = hsa_executable_freeze(executable, NULL);
  RET_IF_HSA_ERR(err);

  hsa_executable_symbol_t kern_sym;
  err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(),
                                  bs->gpu_dev, 0, &kern_sym);
  RET_IF_HSA_ERR(err);

  // Query the attributes the dispatch packet and kernarg buffer will need.
  err = hsa_executable_symbol_get_info(kern_sym,
                                     HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
                                     &bs->kernel_object);
  RET_IF_HSA_ERR(err);
  err = hsa_executable_symbol_get_info(kern_sym,
                       HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
                       &bs->private_segment_size);
  RET_IF_HSA_ERR(err);
  err = hsa_executable_symbol_get_info(kern_sym,
                         HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
                         &bs->group_segment_size);
  RET_IF_HSA_ERR(err);
  err = hsa_executable_symbol_get_info(kern_sym,
                       HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
                       &bs->kernarg_size);
  RET_IF_HSA_ERR(err);
  err = hsa_executable_symbol_get_info(kern_sym,
                  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
                  &bs->kernarg_align);
  RET_IF_HSA_ERR(err);
  return err;
}
// This function shows how to do an asynchronous copy. We have to create a
// signal and use the signal to notify us when the copy has completed.
//
// Copies `size` bytes from `src` (owned by agent `src_ag`) to `dst` (owned by
// agent `dst_ag`) and blocks until the copy's completion signal drops below 1.
// The temporary signal is destroyed on every path, including failures.
// Returns the failing HSA status, or the status of the final signal destroy.
hsa_status_t AgentMemcpy(void* dst, const void* src,
                         size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
  hsa_signal_t s;
  hsa_status_t err;

  err = hsa_signal_create(1, 0, NULL, &s);
  RET_IF_HSA_ERR(err);

  err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
  if (err != HSA_STATUS_SUCCESS) {
    // Do not leak the signal when the copy could not be submitted.
    (void)hsa_signal_destroy(s);
  }
  RET_IF_HSA_ERR(err);

  if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
    err = HSA_STATUS_ERROR;
    std::cout << "Async copy signal error" << std::endl;
    // Release the signal before reporting the failure.
    (void)hsa_signal_destroy(s);
    RET_IF_HSA_ERR(err);
    return err;
  }

  err = hsa_signal_destroy(s);
  RET_IF_HSA_ERR(err);
  return err;
}
// AlignDown and AlignUp are two utility functions used to find an aligned
// boundary either below or above a given value (address).
//
// AlignDown clears the low bits of `value` selected by (alignment - 1) and
// returns the result; the mask arithmetic assumes `alignment` is a power of
// two.
static intptr_t AlignDown(intptr_t value, size_t alignment) {
  const size_t low_bits = alignment - 1;
  const size_t rounded = static_cast<size_t>(value) & ~low_bits;
  return static_cast<intptr_t>(rounded);
}
// Returns the first address at or above `value` that AlignDown() considers
// aligned: the pointer is bumped by (alignment - 1) and then rounded down.
static void* AlignUp(void* value, size_t alignment) {
  const uintptr_t bumped = reinterpret_cast<uintptr_t>(value) + alignment - 1;
  return reinterpret_cast<void*>(AlignDown(bumped, alignment));
}
// This function populates the AQL packet with the information
// we have collected and stored in the BinarySearch structure thus far.
// The dispatch is one-dimensional: the y and z sizes are fixed to 1.
void PopulateAQLPacket(BinarySearch const* bs,
                       hsa_kernel_dispatch_packet_t* aql) {
  // Header is a placeholder here; the real value is written right before the
  // doorbell is rung.
  aql->header = 0;
  aql->setup = 1;

  // Work-group dimensions
  aql->workgroup_size_x = bs->work_group_size;
  aql->workgroup_size_y = 1;
  aql->workgroup_size_z = 1;

  // Grid dimensions
  aql->grid_size_x = bs->work_grid_size;
  aql->grid_size_y = 1;
  aql->grid_size_z = 1;

  // Segment sizes queried from the kernel symbol
  aql->private_segment_size = bs->private_segment_size;
  aql->group_segment_size = bs->group_segment_size;

  // Kernel code, its arguments and the completion signal
  aql->kernel_object = bs->kernel_object;
  aql->kernarg_address = bs->kern_arg_address;
  aql->completion_signal = bs->signal;
}
/*
 * Copy the provided AQL packet into the queue, leaving out the first 32 bits
 * (the header and setup fields). Those are written last, separately.
 *
 * The queue's write index is advanced by one here; the index returned by that
 * call, masked with (q->size - 1), selects the slot that is filled.
 */
void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql,
                     hsa_queue_t* q) {
  const uint64_t slot_idx = hsa_queue_add_write_index_relaxed(q, 1);
  const uint32_t wrap_mask = q->size - 1;

  hsa_kernel_dispatch_packet_t* const ring =
      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(q->base_address);
  hsa_kernel_dispatch_packet_t& slot = ring[slot_idx & wrap_mask];

  slot.workgroup_size_x = in_aql->workgroup_size_x;
  slot.workgroup_size_y = in_aql->workgroup_size_y;
  slot.workgroup_size_z = in_aql->workgroup_size_z;

  slot.grid_size_x = in_aql->grid_size_x;
  slot.grid_size_y = in_aql->grid_size_y;
  slot.grid_size_z = in_aql->grid_size_z;

  slot.private_segment_size = in_aql->private_segment_size;
  slot.group_segment_size = in_aql->group_segment_size;

  slot.kernel_object = in_aql->kernel_object;
  slot.kernarg_address = in_aql->kernarg_address;
  slot.completion_signal = in_aql->completion_signal;
}
// This function allocates memory from the kern_arg pool we already found, and
// then sets the argument values needed by the kernel code.
//
//   bs          - supplies kern_arg_pool, kernarg_align, kernarg_size and the
//                 agents; receives kern_arg_buffer and kern_arg_address
//   args        - source bytes of the kernel arguments
//   arg_size    - number of bytes to copy from args
//   aql_buf_ptr - receives the aligned kernarg address
//
// Returns the failing HSA status, or HSA_STATUS_SUCCESS.
hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args,
                                 size_t arg_size, void** aql_buf_ptr) {
  hsa_status_t err;

  // The kernel arguments must be written to memory at the alignment reported
  // by the executable (bs->kernarg_align). The pool allocation may not be
  // aligned that way, so twice the alignment is requested on top of the
  // payload to leave room for adjusting the start address.
  const size_t req_align = bs->kernarg_align;
  const size_t buf_size = arg_size + (req_align << 1);

  void* raw_buf = nullptr;
  err = hsa_amd_memory_pool_allocate(bs->kern_arg_pool, buf_size, 0,
                                     reinterpret_cast<void**>(&raw_buf));
  RET_IF_HSA_ERR(err);

  // Remember both the raw allocation (needed to free it later) and the
  // aligned address inside it where the arguments actually start.
  bs->kern_arg_buffer = raw_buf;
  bs->kern_arg_address = AlignUp(raw_buf, req_align);

  assert(arg_size >= bs->kernarg_size);
  assert(((uintptr_t)bs->kern_arg_address + arg_size) <
         ((uintptr_t)bs->kern_arg_buffer + buf_size));

  (void)memcpy(bs->kern_arg_address, args, arg_size);

  // Make sure both the CPU and GPU can access the kernel arguments
  hsa_agent_t both_agents[2] = {bs->gpu_dev, bs->cpu_dev};
  err = hsa_amd_agents_allow_access(2, both_agents, NULL, bs->kern_arg_buffer);
  RET_IF_HSA_ERR(err);

  // Hand the aligned address back to the caller.
  *aql_buf_ptr = bs->kern_arg_address;
  return HSA_STATUS_SUCCESS;
}
// This wrapper atomically writes the provided header and setup to the
// provided AQL packet. The provided AQL packet address should be in the
// queue memory space.
//
// `header` occupies the low 16 bits and `setup` the high 16 bits of the 32-bit
// word stored at the start of the packet; the store uses release ordering.
inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
                                  hsa_kernel_dispatch_packet_t* queue_packet) {
  // Widen to unsigned before shifting: a uint16_t would otherwise be promoted
  // to signed int, and shifting a value with bit 15 set into the sign bit is
  // not well defined.
  const uint32_t header_and_setup =
      static_cast<uint32_t>(header) | (static_cast<uint32_t>(setup) << 16);
  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
                   header_and_setup, __ATOMIC_RELEASE);
}
// Once all the required data for kernel execution is collected (in this
// application it is stored in the BinarySearch structure) we can put it in
// an AQL packet and ring the queue door bell to tell the command processor to
// execute it.
//
// The search runs in passes. Each pass splits the current [lower, upper] index
// range into bs->num_sub_divisions segments, dispatches the kernel to pick the
// segment that may contain bs->find_me, and narrows the range to it. The
// remaining range is then scanned linearly on the host.
//
// Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for an empty array or zero
// sub-divisions, the failing HSA status if a runtime call fails, and
// HSA_STATUS_SUCCESS otherwise (whether or not the element was found).
hsa_status_t Run(BinarySearch* bs) {
  hsa_status_t err;
  std::cout << "Executing kernel " << bs->kernel_name << std::endl;

  // Adjust the size of workgroup
  // This is mostly application specific.
  // NOTE(review): this may change num_sub_divisions after
  // AllocateAndInitBuffers() sized input_arr / input_arr_local from it;
  // confirm those buffers are large enough for the adjusted value.
  if (bs->work_group_size > 64) {
    bs->work_group_size = 64;
    bs->num_sub_divisions = bs->length / bs->work_group_size;
    if (bs->num_sub_divisions < bs->work_group_size) {
      bs->num_sub_divisions = bs->work_group_size;
    }
    bs->work_grid_size = bs->num_sub_divisions;
  }

  // Nothing to search, or no way to split the range: bail out before the
  // index arithmetic below underflows or divides by zero.
  if (bs->length == 0 || bs->num_sub_divisions == 0) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Explanation of BinarySearch algorithm.
  /*
   * Since a plain binary search on the GPU would not achieve much benefit
   * over the CPU we are doing an N'ary search. We split the array into N
   * segments every pass and therefore get log (base N) passes instead of log
   * (base 2) passes.
   *
   * In every pass, only the thread that can potentially have the element we
   * are looking for writes to the output array. For ex: if we are looking to
   * find 4567 in the array and every thread is searching over a segment of
   * 1000 values and the input array is 1, 2, 3, 4,... then the first thread
   * is searching in 1 to 1000, the second one from 1001 to 2000, etc. The
   * first one does not write to the output. The second one doesn't either.
   * The fifth one however is from 4001 to 5000. So it can potentially have
   * the element 4567 which lies between them.
   *
   * This particular thread writes to the output the lower bound, upper bound
   * and whether the element equals the lower bound element. So, it would be
   * 4001, 5000, 0
   *
   * The next pass would subdivide 4001 to 5000 into smaller segments and
   * continue the same process from there.
   *
   * When a pass returns 1 in the third element, it means the element has been
   * found and we can stop executing the kernel. If the element is not found,
   * then the execution stops after looking at segment of size 1.
   */
  uint32_t global_lower_bound = 0;
  uint32_t global_upper_bound = bs->length - 1;
  uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) /
                                                         bs->num_sub_divisions;

  // The array is sorted, so a value outside [first, last] cannot be present.
  if ((bs->input[0] > bs->find_me) ||
      (bs->input[bs->length - 1] < bs->find_me)) {
    bs->output[0] = 0;
    bs->output[1] = bs->length - 1;
    bs->output[2] = 0;
    std::cout << "Returning too early" << std::endl;
    return HSA_STATUS_SUCCESS;
  }
  bs->output[3] = 1;

  // Setup the kernel args
  // See the meta-data for the compiled OpenCL kernel code to ascertain
  // the sizes, padding and alignment required for kernel arguments.
  // This can be seen by executing
  // $ amdgcn-amd-amdhsa-readelf -aw ./binary_search_kernels.hsaco
  // The kernel code will expect the following arguments aligned as shown.
  typedef uint32_t uint2[2];
  typedef uint32_t uint4[4];
  struct __attribute__((aligned(16))) local_args_t {
    uint4* outputArray;
    uint2* sortedArray;
    uint32_t findMe;
    uint32_t pad;
    uint64_t global_offset_x;
    uint64_t global_offset_y;
    uint64_t global_offset_z;
  } local_args;
  local_args.outputArray = reinterpret_cast<uint4*>(bs->output);
  local_args.sortedArray = reinterpret_cast<uint2*>(bs->input_arr_local);
  local_args.findMe = bs->find_me;
  local_args.pad = 0;  // keep the bytes copied to kernarg memory deterministic
  local_args.global_offset_x = 0;
  local_args.global_offset_y = 0;
  local_args.global_offset_z = 0;

  // Copy the kernel args structure into kernel arg memory
  err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args),
                            &bs->kern_arg_address);
  RET_IF_HSA_ERR(err);

  // Populate an AQL packet with the info we've gathered
  hsa_kernel_dispatch_packet_t aql;
  PopulateAQLPacket(bs, &aql);

  uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);
  while ((sub_div_size > 1) && (bs->output[3] != 0)) {
    // Describe each segment of the *current* range by its first and last
    // value. Indices are offset by global_lower_bound so that later passes
    // subdivide the narrowed range rather than the start of the array.
    for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) {
      const uint32_t idx1 = global_lower_bound + i * sub_div_size;
      const uint32_t idx2 = global_lower_bound + ((i + 1) * sub_div_size) - 1;
      bs->input_arr[2 * i] = bs->input[idx1];
      bs->input_arr[2 * i + 1] = bs->input[idx2];
    }

    // Copy kernel parameter from system memory to local memory
    err = AgentMemcpy(reinterpret_cast<uint8_t*>(bs->input_arr_local),
                      reinterpret_cast<uint8_t*>(bs->input_arr),
                      in_length, bs->gpu_dev, bs->cpu_dev);
    RET_IF_HSA_ERR(err);

    // Reset the "segment selected" flag; the kernel sets it back to 1 when a
    // thread's segment brackets the searched value.
    bs->output[3] = 0;

    // Dispatch kernel with global work size, work group size with ONE
    // dimension and wait for kernel to complete

    // Compute the write index of queue and copy Aql packet into it
    uint64_t que_idx = hsa_queue_load_write_index_relaxed(bs->queue);
    const uint32_t mask = bs->queue->size - 1;

    // This function simply copies the data we've collected so far into the
    // queue's AQL packet, except the setup and header fields.
    WriteAQLToQueue(&aql, bs->queue);

    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
                  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
                  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Set the packet's type, acquire and release fences. This should be done
    // atomically after all the other fields have been set, using release
    // memory ordering to ensure all the fields are set when the door bell
    // signal is activated.
    void* q_base = bs->queue->base_address;
    AtomicSetPacketHeader(aql_header, aql.setup,
                          &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
                              (q_base))[que_idx & mask]);

    // Increment the write index and ring the doorbell to dispatch kernel.
    hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1));
    hsa_signal_store_relaxed(bs->queue->doorbell_signal, que_idx);

    // Wait on the dispatch signal until the kernel is finished.
    // Modify the wait condition to HSA_WAIT_STATE_ACTIVE (instead of
    // HSA_WAIT_STATE_BLOCKED) if polling is needed instead of blocking, as we
    // have below.
    // The call below will block until the condition is met. Below we have said
    // the condition is that the signal value (initialized to 1) associated with
    // the queue is less than 1. When the kernel associated with the queued AQL
    // packet has completed execution, the signal value is automatically
    // decremented by the packet processor.
    hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal,
                                       HSA_SIGNAL_CONDITION_LT, 1,
                                       UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
    // value should be 0, or we timed-out
    if (value) {
      std::cout << "Timed out waiting for kernel to complete?" << std::endl;
      RET_IF_HSA_ERR(HSA_STATUS_ERROR);
    }

    // Reset the signal to its initial value for the next iteration
    hsa_signal_store_screlease(bs->signal, 1);

    // Binary search algorithm stuff...
    // No thread selected a segment: output[0] is stale, so keep the current
    // range and let the linear scan below decide.
    if (bs->output[3] == 0) {
      break;
    }
    // output[0] is the selected segment's index within the current range.
    global_lower_bound += bs->output[0] * sub_div_size;
    global_upper_bound = global_lower_bound + sub_div_size - 1;
    sub_div_size = (global_upper_bound - global_lower_bound + 1) /
                                                        bs->num_sub_divisions;
  }

  // Scan what is left of the range on the host.
  uint32_t element_index = UINT_MAX;
  for (uint32_t i = global_lower_bound; i <= global_upper_bound; i++) {
    if (bs->input[i] == bs->find_me) {
      element_index = i;
      bs->output[0] = i;
      bs->output[1] = i + 1;
      bs->output[2] = 1;
      break;
    }
    // Element is not found in region specified
    // by global lower bound to global upper bound
    bs->output[2] = 0;
  }

  uint32_t is_elem_found = bs->output[2];
  std::cout << "Lower bound = " << global_lower_bound << std::endl;
  std::cout << "Upper bound = " << global_upper_bound << std::endl;
  std::cout << "Element search for = " << bs->find_me << std::endl;
  if (is_elem_found == 1) {
    std::cout << "Element found at index " << element_index << std::endl;
  } else {
    std::cout << "Element value " << bs->find_me << " not found" << std::endl;
  }
  return HSA_STATUS_SUCCESS;
}
// Release all the RocR resources we have acquired in this application:
// the pool allocations, the queue, the completion signal, and finally the
// runtime itself. Stops at, and returns, the first failing HSA status;
// returns HSA_STATUS_SUCCESS when everything was released.
hsa_status_t CleanUp(BinarySearch* bs) {
  hsa_status_t err;

  err = hsa_amd_memory_pool_free(bs->input);
  RET_IF_HSA_ERR(err);
  err = hsa_amd_memory_pool_free(bs->output);
  RET_IF_HSA_ERR(err);
  err = hsa_amd_memory_pool_free(bs->input_arr);
  RET_IF_HSA_ERR(err);
  // input_arr_local is allocated alongside input_arr and was previously
  // never released.
  err = hsa_amd_memory_pool_free(bs->input_arr_local);
  RET_IF_HSA_ERR(err);
  err = hsa_amd_memory_pool_free(bs->kern_arg_buffer);
  RET_IF_HSA_ERR(err);

  err = hsa_queue_destroy(bs->queue);
  RET_IF_HSA_ERR(err);
  err = hsa_signal_destroy(bs->signal);
  RET_IF_HSA_ERR(err);

  err = hsa_shut_down();
  RET_IF_HSA_ERR(err);
  return HSA_STATUS_SUCCESS;
}
// Entry point of the sample: initializes HSA, gathers the handles the search
// needs, runs it, and releases everything again. Every step's status is
// checked with RET_IF_HSA_ERR; 0 is returned when all steps succeeded.
int main(int argc, char* argv[]) {
  // This BinarySearch structure (bs) below holds all of the appl. specific
  // info we need to run the sample. This includes algorithm specific
  // information as well as handles to RocR/HSA objects.
  // The basic structure of this sample is to fill in this structure with the
  // required RocR/HSA handles to RocR resources (e.g., agents, memory pools,
  // queues, etc.) and then dispatch the packets to the queue, and examine the
  // output.
  BinarySearch bs;
  hsa_status_t err;

  // Set some working values specific to this application
  InitializeBinarySearch(&bs);

  // hsa_init() initializes internal data structures and causes devices
  // (agents), memory pools and other resources to be discovered.
  err = hsa_init();
  RET_IF_HSA_ERR(err);

  // Find the agents needed for the sample
  err = FindDevices(&bs);
  RET_IF_HSA_ERR(err);

  // Create the completion signal used when dispatching a packet
  err = hsa_signal_create(1, 0, NULL, &bs.signal);
  RET_IF_HSA_ERR(err);

  // Create a queue to submit our binary search AQL packets
  err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
                         UINT32_MAX, UINT32_MAX, &bs.queue);
  RET_IF_HSA_ERR(err);

  // Find the HSA memory pools we need to run this sample
  err = FindPools(&bs);
  RET_IF_HSA_ERR(err);

  // Allocate memory from the correct memory pool, and initialize them as
  // needed for the algorithm.
  err = AllocateAndInitBuffers(&bs);
  RET_IF_HSA_ERR(err);

  // Create a kernel object from the pre-compiled kernel, and read some
  // attributes associated with the kernel that we will need.
  err = LoadKernelFromObjFile(&bs);
  RET_IF_HSA_ERR(err);

  // Fill in the AQL packet, assign the kernel arguments, enqueue the packet,
  // "ring" the doorbell, and wait for completion.
  err = Run(&bs);
  RET_IF_HSA_ERR(err);

  // Release all the RocR resources we've acquired and shutdown HSA.
  // A failed cleanup is reported like any other step instead of being
  // silently dropped.
  err = CleanUp(&bs);
  RET_IF_HSA_ERR(err);
  return 0;
}
#undef RET_IF_HSA_ERR
+127
Просмотреть файл
@@ -0,0 +1,127 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
/**
 * One work-item handles one segment of the search space.
 *
 * sortedArray[tid] holds the segment's two bounding values (x = lower,
 * y = upper). When findMe lies within [x, y], the work-item records its own
 * id in outputArray[0].x and sets outputArray[0].w to 1 so the host can use
 * that segment as the search space of the next pass. Work-items whose
 * segment cannot contain findMe write nothing.
 */
__kernel void
binarySearch(__global uint4 * outputArray,
             __const __global uint2 * sortedArray,
             const unsigned int findMe) {
    const unsigned int tid = get_global_id(0);

    // Bounding values of this work-item's segment
    const uint2 bounds = sortedArray[tid];

    // Only the segment that brackets the value reports itself
    if ((bounds.x <= findMe) && (bounds.y >= findMe)) {
        outputArray[0].x = tid;
        outputArray[0].w = 1;
    }
}
/**
 * Multi-key variant: each work-item owns a fixed window of 256 consecutive
 * input elements starting at index (global id * 256). For every key whose
 * value lies between the window's first and last element, the window's start
 * index is stored in output[key index].
 */
__kernel void
binarySearch_mulkeys(__global int *keys,
                     __global uint *input,
                     const unsigned int numKeys,
                     __global int *output) {
    const int gid = get_global_id(0);
    const int lBound = gid * 256;
    const int uBound = lBound + 255;

    for (int k = 0; k < numKeys; k++) {
        // Skip keys that fall outside this window
        if (!(keys[k] >= input[lBound] && keys[k] <= input[uBound])) {
            continue;
        }
        output[k] = lBound;
    }
}
/**
 * Concurrent multi-key search. The input is split into numSubdivisions
 * segments; work-item g searches segment (g % numSubdivisions) for key
 * number (g / numSubdivisions) with a classic binary search and, on a match,
 * stores the matching index in output[key number].
 *
 * Both bounds are inclusive. The last segment is extended to inputSize - 1 so
 * that elements left over by the integer division are searched as well, and
 * no segment ever indexes past the end of the input.
 */
__kernel void
binarySearch_mulkeysConcurrent(__global uint *keys,
                               __global uint *input,
                               const unsigned int inputSize, // num. of inputs
                               const unsigned int numSubdivisions,
                               __global int *output) {
    const unsigned int segmentSize = inputSize / numSubdivisions;
    const unsigned int segment = get_global_id(0) % numSubdivisions;
    const unsigned int keyIndex = get_global_id(0) / numSubdivisions;

    int lBound = segment * segmentSize;
    // Previously lBound + segmentSize, i.e. one element past the segment,
    // which for the last segment is input[inputSize] (out of bounds).
    int uBound = (segment == numSubdivisions - 1)
                     ? (int)(inputSize - 1)
                     : (int)(lBound + segmentSize - 1);
    int myKey = keys[keyIndex];
    int mid;

    while(uBound >= lBound) {
        mid = (lBound + uBound) / 2;
        if(input[mid] == myKey) {
            output[keyIndex] = mid;
            return;
        } else if(input[mid] > myKey) {
            uBound = mid - 1;
        } else {
            lBound = mid + 1;
        }
    }
}
+15
Просмотреть файл
@@ -0,0 +1,15 @@
#
# Source files for the common test utilities library
#
file( GLOB MODULE_SRC "*.cpp" )

#
# Build the utilities as a static library
#
add_library( ${COMMON_LIB} STATIC ${MODULE_SRC} )

#
# Header files include path(s).
# ROCR_INC_DIR is read from the environment at configure time. The path is
# attached to the target (PUBLIC) instead of the directory scope so that it
# also reaches targets that link against this library.
#
target_include_directories( ${COMMON_LIB} PUBLIC $ENV{ROCR_INC_DIR} )

#
# System libraries; PUBLIC keeps the transitive linking the keyword-less
# signature provided.
#
target_link_libraries( ${COMMON_LIB} PUBLIC c stdc++ dl pthread rt )
+45
Просмотреть файл
@@ -0,0 +1,45 @@
#include "common.hpp"
// Terminates the process with EXIT_FAILURE, after printing a message to
// stderr, when the given status is anything other than HSA_STATUS_SUCCESS.
void ErrorCheck(hsa_status_t hsa_error_code) {
  if (hsa_error_code == HSA_STATUS_SUCCESS) {
    return;
  }
  std::cerr << "HSA reported error!" << std::endl;
  exit(EXIT_FAILURE);
}
// Agent callback: when `agent` is a GPU device, stores it in the hsa_agent_t
// that `data` points to. Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL
// `data`, the failing status if the device type cannot be queried, and
// HSA_STATUS_SUCCESS otherwise (also for non-GPU agents).
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_device_type_t device_kind;
  const hsa_status_t status =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_kind);

  if (status == HSA_STATUS_SUCCESS && device_kind == HSA_DEVICE_TYPE_GPU) {
    hsa_agent_t* out_agent = (hsa_agent_t*)data;
    *out_agent = agent;
  }
  return status;
}
// Region callback: when `region` reports HSA_AMD_REGION_INFO_HOST_ACCESSIBLE,
// stores it in the hsa_region_t that `data` points to. Returns
// HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL `data`, the failing status if
// the query fails, and HSA_STATUS_SUCCESS otherwise.
hsa_status_t FindHostRegion(hsa_region_t region, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  bool host_accessible = false;
  const hsa_status_t status = hsa_region_get_info(
      region, (hsa_region_info_t)HSA_AMD_REGION_INFO_HOST_ACCESSIBLE, &host_accessible);

  if (status == HSA_STATUS_SUCCESS && host_accessible) {
    hsa_region_t* out_region = (hsa_region_t*)data;
    *out_region = region;
  }
  return status;
}
+27
Просмотреть файл
@@ -0,0 +1,27 @@
#ifndef COMMON_COMMON_HPP
#define COMMON_COMMON_HPP
#include <cstdlib>
#include <iostream>
#include "hsa.h"
#include "hsa_ext_finalize.h"
#include "hsa_ext_amd.h"
// ALIGNED_(x): compiler-specific alignment attribute.
// Defined for MSVC (__declspec) and for compilers defining __GNUC__
// (__attribute__); it is left undefined for any other compiler.
#if defined(_MSC_VER)
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__((aligned(x)))
#endif // __GNUC__
#endif // _MSC_VER
// MULTILINE(...): turns its arguments into a single string literal via the
// preprocessor's stringizing operator.
#define MULTILINE(...) #__VA_ARGS__
// Prints a message and exits the process when the status is not
// HSA_STATUS_SUCCESS.
void ErrorCheck(hsa_status_t hsa_error_code);
// Agent callback: stores `agent` into *(hsa_agent_t*)data when it is a GPU.
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data);
// Region callback: stores `region` into *(hsa_region_t*)data when it is
// host accessible.
hsa_status_t FindHostRegion(hsa_region_t region, void* data);
#endif // COMMON_COMMON_HPP
+262
Просмотреть файл
@@ -0,0 +1,262 @@
/**********************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
• Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
• Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#include "helper_funcs.hpp"
#ifndef _WIN32
#include <unistd.h>
#endif
/*
 * Prints the array name followed by " :" and then every element of the
 * width x height array to stdout, one row per line, each row prefixed
 * with "> ". No limit is applied to the number of elements printed.
 */
template <typename T>
void printArray(const std::string header, const T* data, const int width, const int height) {
  std::cout << header << " :\n";
  int row = 0;
  while (row < height) {
    const int row_start = row * width;
    std::cout << "> ";
    for (int col = 0; col < width; ++col) {
      std::cout << data[row_start + col] << " ";
    }
    std::cout << "\n";
    ++row;
  }
}
/*
 * Fills a width x height array with pseudo-random values derived from rand()
 * and scaled into the range starting at rangeMin with a span of
 * (rangeMax - rangeMin) + 1. A seed of 0 selects the current time as seed.
 * Returns HSA_SDK_FAILURE for a NULL array, HSA_SDK_SUCCESS otherwise.
 */
template <typename T>
int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax,
               unsigned int seed) {
  if (arrayPtr == NULL) {
    error("Cannot fill array. NULL pointer.");
    return HSA_SDK_FAILURE;
  }

  // Seed the generator, falling back to the wall clock for seed == 0
  if (seed == 0) {
    seed = (unsigned int)time(NULL);
  }
  srand(seed);

  const double range = double(rangeMax - rangeMin) + 1.0;

  /* random initialisation of input, row by row */
  for (int row = 0; row < height; ++row) {
    T* row_ptr = arrayPtr + row * width;
    for (int col = 0; col < width; ++col) {
      row_ptr[col] = rangeMin + T(range * rand() / (RAND_MAX + 1.0));
    }
  }
  return HSA_SDK_SUCCESS;
}
/*
 * Fills a width x height array so that every element holds its own linear
 * position (row * width + column), computed in type T.
 * Returns HSA_SDK_FAILURE for a NULL array, HSA_SDK_SUCCESS otherwise.
 */
template <typename T> int fillPos(T* arrayPtr, const int width, const int height) {
  if (arrayPtr == NULL) {
    error("Cannot fill array. NULL pointer.");
    return HSA_SDK_FAILURE;
  }

  /* initialisation of input with positions*/
  for (T row = 0; row < height; row++) {
    for (T col = 0; col < width; col++) {
      const T position = row * width + col;
      arrayPtr[position] = position;
    }
  }
  return HSA_SDK_SUCCESS;
}
/*
 * Sets every element of a width x height array to `val`.
 * Returns HSA_SDK_FAILURE for a NULL array, HSA_SDK_SUCCESS otherwise.
 */
template <typename T>
int fillConstant(T* arrayPtr, const int width, const int height, const T val) {
  if (arrayPtr == NULL) {
    error("Cannot fill array. NULL pointer.");
    return HSA_SDK_FAILURE;
  }

  /* initialisation of input with constant value*/
  for (int row = 0; row < height; ++row) {
    const int row_start = row * width;
    for (int col = 0; col < width; ++col) {
      arrayPtr[row_start + col] = val;
    }
  }
  return HSA_SDK_SUCCESS;
}
/*
 * Rounds `val` up to the next power of two; a value that already is a power
 * of two is returned unchanged.
 *
 * Works by smearing the highest set bit of (val - 1) into every lower bit and
 * adding one. The shift distance doubles each step (1, 2, 4, ...) and stops
 * below the bit width of T, so all bits are covered for every instantiated
 * type. The previous loop was bounded by the byte count, which left high bits
 * unsmeared for 2- and 4-byte types and over-shifted 8-byte types.
 */
template <typename T> T roundToPowerOf2(T val) {
  const unsigned int bit_width = sizeof(T) * 8;
  val--;
  for (unsigned int shift = 1; shift < bit_width; shift <<= 1) val |= val >> shift;
  val++;
  return val;
}
/*
 * Returns HSA_SDK_SUCCESS when `val` is a non-zero value whose lowest set bit
 * equals the value itself (i.e. exactly one bit is set), HSA_SDK_FAILURE
 * otherwise. The check is done on a long long copy of the value.
 */
template <typename T> int isPowerOf2(T val) {
  const long long wide = val;
  const long long lowest_set_bit = wide & (-wide);
  if (wide != 0 && lowest_set_bit == wide) {
    return HSA_SDK_SUCCESS;
  }
  return HSA_SDK_FAILURE;
}
/*
 * Compares `input` with `reference`. On a mismatch `message` is reported via
 * error() and false is returned; on a match true is returned. `isAPIerror` is
 * accepted but not used by this implementation.
 */
template <typename T> bool checkVal(T input, T reference, std::string message, bool isAPIerror) {
  if (!(input == reference)) {
    error(message);
    return false;
  }
  return true;
}
/*
 * Formats `t` as a string after applying the stream manipulator `r`
 * (for example std::dec or std::hex) to the output stream.
 */
template <typename T> std::string toString(T t, std::ios_base& (*r)(std::ios_base&)) {
  std::ostringstream formatted;
  formatted << r;
  formatted << t;
  return formatted.str();
}
/*
 * Compares two float arrays by relative L2 error:
 *   sqrt(sum((ref - data)^2)) / sqrt(sum(ref^2)) < epsilon
 * Returns false when the reference's squared norm is (almost) zero, since the
 * relative error is undefined in that case.
 *
 * Every element, including index 0, takes part in the comparison; the loop
 * used to start at 1 and silently ignored the first element.
 */
bool compare(const float* refData, const float* data, const int length, const float epsilon) {
  float error = 0.0f;
  float ref = 0.0f;
  for (int i = 0; i < length; ++i) {
    float diff = refData[i] - data[i];
    error += diff * diff;
    ref += refData[i] * refData[i];
  }
  float normRef = ::sqrtf((float)ref);
  // Guard against dividing by a vanishing reference norm
  if (::fabs((float)ref) < 1e-7f) {
    return false;
  }
  float normError = ::sqrtf((float)error);
  error = normError / normRef;
  return error < epsilon;
}
/*
 * Compares two double arrays by relative L2 error:
 *   sqrt(sum((ref - data)^2)) / sqrt(sum(ref^2)) < epsilon
 * Returns false when the reference's squared norm is (almost) zero, since the
 * relative error is undefined in that case.
 *
 * Every element, including index 0, takes part in the comparison; the loop
 * used to start at 1 and silently ignored the first element.
 */
bool compare(const double* refData, const double* data, const int length, const double epsilon) {
  double error = 0.0;
  double ref = 0.0;
  for (int i = 0; i < length; ++i) {
    double diff = refData[i] - data[i];
    error += diff * diff;
    ref += refData[i] * refData[i];
  }
  double normRef = ::sqrt((double)ref);
  // Guard against dividing by a vanishing reference norm
  if (::fabs((double)ref) < 1e-7) {
    return false;
  }
  double normError = ::sqrt((double)error);
  error = normError / normRef;
  return error < epsilon;
}
// Prints "Error: <message>" followed by a newline to stdout (C-string form).
void error(const char* errorMsg) {
  std::cout << "Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
// Prints "Error: <message>" followed by a newline to stdout (std::string form).
void error(std::string errorMsg) {
  std::cout << "Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
// Prints "Expected Error: <message>" followed by a newline to stdout
// (C-string form).
void expectedError(const char* errorMsg) {
  std::cout << "Expected Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
// Prints "Expected Error: <message>" followed by a newline to stdout
// (std::string form).
void expectedError(std::string errorMsg) {
  std::cout << "Expected Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
/////////////////////////////////////////////////////////////////
// Template Instantiations
/////////////////////////////////////////////////////////////////
// The templates above are defined in this source file, so the element types
// that are supported are instantiated explicitly below. Add a line here when
// a helper is needed for a further type.

// printArray: short, unsigned char, unsigned int, int, long, float, double
template void printArray<short>(const std::string, const short*, int, int);
template void printArray<unsigned char>(const std::string, const unsigned char*, int, int);
template void printArray<unsigned int>(const std::string, const unsigned int*, int, int);
template void printArray<int>(const std::string, const int*, int, int);
template void printArray<long>(const std::string, const long*, int, int);
template void printArray<float>(const std::string, const float*, int, int);
template void printArray<double>(const std::string, const double*, int, int);
// fillRandom: unsigned char, unsigned int, int, long, float, double
template int fillRandom<unsigned char>(unsigned char* arrayPtr, const int width, const int height,
unsigned char rangeMin, unsigned char rangeMax,
unsigned int seed);
template int fillRandom<unsigned int>(unsigned int* arrayPtr, const int width, const int height,
unsigned int rangeMin, unsigned int rangeMax,
unsigned int seed);
template int fillRandom<int>(int* arrayPtr, const int width, const int height, int rangeMin,
int rangeMax, unsigned int seed);
template int fillRandom<long>(long* arrayPtr, const int width, const int height, long rangeMin,
long rangeMax, unsigned int seed);
template int fillRandom<float>(float* arrayPtr, const int width, const int height, float rangeMin,
float rangeMax, unsigned int seed);
template int fillRandom<double>(double* arrayPtr, const int width, const int height,
double rangeMin, double rangeMax, unsigned int seed);
// roundToPowerOf2: short, unsigned int, int, long
template short roundToPowerOf2<short>(short val);
template unsigned int roundToPowerOf2<unsigned int>(unsigned int val);
template int roundToPowerOf2<int>(int val);
template long roundToPowerOf2<long>(long val);
// isPowerOf2: short, unsigned int, int, long
template int isPowerOf2<short>(short val);
template int isPowerOf2<unsigned int>(unsigned int val);
template int isPowerOf2<int>(int val);
template int isPowerOf2<long>(long val);
// Explicit specialization declarations of the fill helpers.
// NOTE(review): unlike the explicit instantiations above, 'template <>' only
// declares a specialization; confirm matching definitions exist elsewhere.
template <> int fillPos<short>(short* arrayPtr, const int width, const int height);
template <> int fillPos<unsigned int>(unsigned int* arrayPtr, const int width, const int height);
template <> int fillPos<int>(int* arrayPtr, const int width, const int height);
template <> int fillPos<long>(long* arrayPtr, const int width, const int height);
template <>
int fillConstant<short>(short* arrayPtr, const int width, const int height, const short val);
template <>
int fillConstant(unsigned int* arrayPtr, const int width, const int height, const unsigned int val);
template <> int fillConstant(int* arrayPtr, const int width, const int height, const int val);
// Declared once; the identical 'long' declaration used to be repeated three times.
template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val);
template bool checkVal<char>(char input, char reference, std::string message, bool isAPIerror);
template bool checkVal<bool>(bool input, bool reference, std::string message, bool isAPIerror);
template bool checkVal<std::string>(std::string input, std::string reference, std::string message,
bool isAPIerror);
template bool checkVal<short>(short input, short reference, std::string message, bool isAPIerror);
template bool checkVal<unsigned int>(unsigned int input, unsigned int reference,
std::string message, bool isAPIerror);
template bool checkVal<int>(int input, int reference, std::string message, bool isAPIerror);
template bool checkVal<long>(long input, long reference, std::string message, bool isAPIerror);
template std::string toString<char>(char t, std::ios_base& (*r)(std::ios_base&));
template std::string toString<short>(short t, std::ios_base& (*r)(std::ios_base&));
template std::string toString<unsigned int>(unsigned int t, std::ios_base& (*r)(std::ios_base&));
template std::string toString<int>(int t, std::ios_base& (*r)(std::ios_base&));
template std::string toString<long>(long t, std::ios_base& (*r)(std::ios_base&));
template std::string toString<float>(float t, std::ios_base& (*r)(std::ios_base&));
template std::string toString<double>(double t, std::ios_base& (*r)(std::ios_base&));
+141
Просмотреть файл
@@ -0,0 +1,141 @@
/**********************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
• Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
• Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#ifndef HELPER_FUNCS_HPP_
#define HELPER_FUNCS_HPP_
#define HSA_SDK_SUCCESS 0
#define HSA_SDK_FAILURE 1
#define HSA_SDK_EXPECTED_FAILURE 2
#include <iostream>
#include <fstream>
#include <iomanip>
#include <sstream>
#include <string>
#include <ctime>
#include <cmath>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include <malloc.h>
/**
* error
* constant function, Prints error messages
* @param errorMsg char* message
*/
void error(const char* errorMsg);
/**
* error
* constant function, Prints error messages
* @param errorMsg std::string message
*/
void error(std::string errorMsg);
/**
* expectedError
* constant function, Prints error messages
* @param errorMsg char* message
*/
void expectedError(const char* errorMsg);
/**
* expectedError
* constant function, Prints error messages
* @param errorMsg string message
*/
void expectedError(std::string errorMsg);
/**
* compare
* compares two arrays of floating-point values (float and double overloads)
* @param refData reference values
* @param data values to check against refData
* @param length number of values to compare
* @param epsilon error window
*/
bool compare(const float* refData, const float* data, const int length,
const float epsilon = 1e-6f);
bool compare(const double* refData, const double* data, const int length,
const double epsilon = 1e-6);
/**
* printArray
* displays a array on std::out
*/
template <typename T>
void printArray(const std::string header, const T* data, const int width, const int height);
/**
* fillRandom
* fill array with random values
*/
template <typename T>
int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax,
unsigned int seed = 123);
/**
* fillPos
* fill the specified positions
*/
template <typename T> int fillPos(T* arrayPtr, const int width, const int height);
/**
* fillConstant
* fill the array with constant value
*/
template <typename T> int fillConstant(T* arrayPtr, const int width, const int height, const T val);
/**
* roundToPowerOf2
* rounds to a power of 2
*/
template <typename T> T roundToPowerOf2(T val);
/**
* isPowerOf2
* checks if input is a power of 2
*/
template <typename T> int isPowerOf2(T val);
/**
* checkVal
* Pass false for the isAPIerror parameter (it defaults to true)
* if checkVal is used to check something other than an OpenCL API error code
*/
template <typename T>
bool checkVal(T input, T reference, std::string message, bool isAPIerror = true);
/**
* toString
* convert a T type to string
*/
template <typename T> std::string toString(T t, std::ios_base& (*r)(std::ios_base&));
#endif
+155
Просмотреть файл
@@ -0,0 +1,155 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <cassert>
#include <iostream>
#include <vector>
#include <string>
#include "hsa.h"
#include "hsa_ext_profiler.h"
#include "amd_hsa_tools_interfaces.h"
#include "hsa_perf_cntrs.hpp"
using namespace std;
void PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
assert((dispParam->pre_dispatch) && "Pre Dispatch Callback Param is Malformed");
hsa_ext_tools_pmu_t* perfMgr = reinterpret_cast<hsa_ext_tools_pmu_t*>(usrArg);
hsa_status_t status =
hsa_ext_tools_pmu_begin(*perfMgr, dispParam->queue, dispParam->aql_translation_handle, true);
assert((status == HSA_STATUS_SUCCESS) && "Error in beginning Perf Cntr Session");
}
void PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
assert((!dispParam->pre_dispatch) && "Post Dispatch Callback Param is Malformed");
hsa_ext_tools_pmu_t* perfMgr = reinterpret_cast<hsa_ext_tools_pmu_t*>(usrArg);
hsa_status_t status =
hsa_ext_tools_pmu_end(*perfMgr, dispParam->queue, dispParam->aql_translation_handle);
assert((status == HSA_STATUS_SUCCESS) && "Error in endning Perf Cntr Session");
}
// Constructor of the class. Only resets the Perf Cntr Manager handle to NULL;
// the counter list stays empty until Init() is called.
RocrPerfCntrApp::RocrPerfCntrApp() : perfMgr_(NULL) {}
// Destructor of the class. Releases the CntrInfo descriptors that Init()
// allocated with new. The PMU and the HSA counter handles themselves are
// not released here.
// NOTE(review): the class has no user-defined copy operations, so copying an
// instance would make both copies delete the same descriptors.
RocrPerfCntrApp::~RocrPerfCntrApp() {
  for (size_t idx = 0; idx < cntrList_.size(); idx++) {
    delete cntrList_[idx];
  }
  cntrList_.clear();
}
// Number of perf counter descriptors currently held in the list
uint32_t RocrPerfCntrApp::GetNumPerfCntrs() {
  return static_cast<uint32_t>(cntrList_.size());
}
// Return the handle of perf counter at specified index,
// or NULL when the index is outside the counter list
CntrInfo* RocrPerfCntrApp::GetPerfCntr(uint32_t idx) {
  return (idx < cntrList_.size()) ? cntrList_[idx] : NULL;
}
// Print the various fields of Perf Cntrs being programmed
bool RocrPerfCntrApp::PrintCntrs() {
CntrInfo* info;
int size = uint32_t(cntrList_.size());
for (int idx = 0; idx < size; idx++) {
info = cntrList_[idx];
std::cout << std::endl;
std::cout << "Rocr Perf Cntr Id: " << info->cntrId << std::endl;
std::cout << "Rocr Perf Cntr Name: " << info->cntrName << std::endl;
std::cout << "Rocr Perf Cntr Blk Id: " << info->blkId << std::endl;
std::cout << "Rocr Perf Cntr Value: " << info->cntrResult << std::endl;
std::cout << "Rocr Perf Cntr Validation: " << info->cnfType << std::endl;
std::cout << std::endl;
}
return true;
}
// Builds the list of perf counters to collect, creates the PMU for the given
// agent, and then creates, configures and enables one HSA counter per entry.
// Both entries use block id 0x0E (kHsaAiCounterBlockSQ = 14).
//
// @param agent Agent for which the PMU is created
//
// @return hsa_status_t Status of the last HSA tools call that was made.
// Failures are only detected through assert(), so they are not acted upon
// when the code is compiled with NDEBUG.
//
// NOTE(review): every call appends two new CntrInfo entries, so this looks
// intended to be called once per instance - confirm with callers.
hsa_status_t RocrPerfCntrApp::Init(hsa_agent_t agent) {
// Populate the list of Perf Cntrs with the SQ counters of interest
CntrInfo* info = NULL;
cntrList_.reserve(23);
// Event 0x4: number of Waves. Handles start as NULL and are filled in below;
// 0xFFFFFFFF is the initial value stored in cntrResult
info = new CntrInfo(0x4, "SQ_SQ_PERF_SEL_WAVES", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF,
CntrValCnf_Exact);
cntrList_.push_back(info);
// Event 0xE: number of Threads (work-items)
info = new CntrInfo(0xE, "SQ_SQ_PERF_SEL_ITEMS", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF,
CntrValCnf_Exact);
cntrList_.push_back(info);
// Create an instance of Perf Mgr
hsa_status_t status;
status = hsa_ext_tools_create_pmu(agent, &perfMgr_);
assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr Mgr");
// Process each counter from the list: bind the descriptor to its
// perf block handle and create an instance of counter in that block
uint32_t size = GetNumPerfCntrs();
for (uint32_t idx = 0; idx < size; idx++) {
info = GetPerfCntr(idx);
// Obtain the handle of perf block, unless the descriptor already has one
if (info->blkHndl == NULL) {
status = hsa_ext_tools_get_counter_block_by_id(perfMgr_, info->blkId, &info->blkHndl);
assert((status == HSA_STATUS_SUCCESS) && "Error in getting Perf Cntr Blk Hndl");
}
// Create an instance of counter in the perf block
status = hsa_ext_tools_create_counter(info->blkHndl, &info->cntrHndl);
assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr in Perf Blk");
// Update the Event Index property of counter with the descriptor's cntrId
uint32_t cntrProp = HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX;
status = hsa_ext_tools_set_counter_parameter(info->cntrHndl, cntrProp, sizeof(uint32_t),
(void*)&info->cntrId);
assert((status == HSA_STATUS_SUCCESS) && "Error in updating Perf Cntr Property Event Index");
// Enable the updated perf counter
status = hsa_ext_tools_set_counter_enabled(info->cntrHndl, true);
assert((status == HSA_STATUS_SUCCESS) && "Error in enabing Perf Cntr");
}
return status;
}
// Install the pre/post dispatch callbacks on the given queue and hand both of
// them the address of this instance's Perf Cntr Manager as their argument.
// Failures are only detected through assert().
void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) {
  const hsa_status_t funcStatus =
      hsa_ext_tools_set_callback_functions(queue, PreDispatchCallback, PostDispatchCallback);
  assert((funcStatus == HSA_STATUS_SUCCESS) &&
         "Error in registering Pre & Post Dispatch Callbacks");

  const hsa_status_t argStatus =
      hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_);
  assert((argStatus == HSA_STATUS_SUCCESS) &&
         "Error in registering Pre & Post Dispatch Callback Params");
}
// Block until perf counter collection completes, passing 5000 as the timeout
// argument, and return the status reported by the PMU.
hsa_status_t RocrPerfCntrApp::Wait() {
  const hsa_status_t waitStatus = hsa_ext_tools_pmu_wait_for_completion(perfMgr_, 5000);
  assert((waitStatus == HSA_STATUS_SUCCESS) && "Error in Waiting for Perf Cntr Completion");
  return waitStatus;
}
// Retrieve the result of every perf counter in the list and print it.
//
// The values are not compared against expectedResult / cnfType here; the
// method only reads the results back.
//
// @return hsa_status_t HSA_STATUS_SUCCESS if every counter could be read,
// otherwise the status of the first counter whose result query failed
// (a later success no longer hides an earlier failure).
hsa_status_t RocrPerfCntrApp::Validate() {
  hsa_status_t result = HSA_STATUS_SUCCESS;

  const uint32_t size = GetNumPerfCntrs();
  for (uint32_t idx = 0; idx < size; idx++) {
    CntrInfo* info = GetPerfCntr(idx);
    const hsa_status_t status =
        hsa_ext_tools_get_counter_result(info->cntrHndl, &info->cntrResult);
    if (status != HSA_STATUS_SUCCESS) {
      // Do not print a stale value; remember the first failure and keep going
      std::cout << "Error in reading value of Perf Cntr at index: " << idx << std::endl;
      if (result == HSA_STATUS_SUCCESS) {
        result = status;
      }
      continue;
    }
    std::cout << "Value of Perf Cntr is: " << info->cntrResult << std::endl;
  }

  return result;
}
+110
Просмотреть файл
@@ -0,0 +1,110 @@
#ifndef ROCR_PERF_CNTR_APP_H_
#define ROCR_PERF_CNTR_APP_H_
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
#include "hsa.h"
#include "hsa_ext_profiler.h"
/// Kind of validation to apply to a perf counter value
typedef enum CntrValCnfType {
/// no counter value validation should be performed
CntrValCnf_None,
/// counter value should be an exact match to expectedResult
CntrValCnf_Exact,
/// counter value should be greater than expectedResult
CntrValCnf_GreaterThan,
/// counter value should be less than expectedResult
CntrValCnf_LessThan
} CntrValCnfType;
/// Struct used to encapsulate Counter Info
typedef struct CntrInfo {
  /// Id of counter in hardware block
  uint32_t cntrId;

  /// Name of counter; always null terminated, truncated to 71 characters
  char cntrName[72];

  /// Handle of perf counter
  hsa_ext_tools_counter_t cntrHndl;

  /// Id of hardware block containing the counter
  uint32_t blkId;

  /// Handle of counter block
  hsa_ext_tools_counter_block_t blkHndl;

  /// Expected value of perf counter
  uint64_t expectedResult;

  /// Value of perf counter as read back
  uint64_t cntrResult;

  /// Type of validation upon completion of dispatch
  CntrValCnfType cnfType;

  /// Builds a descriptor from the given values.
  ///
  /// @param cntrName Null terminated counter name; NULL is treated as an
  /// empty name. Taken as const char* so string literals can be passed.
  CntrInfo(uint32_t cntrId, const char* cntrName, void* cntrHndl, uint32_t blkId, void* blkHndl,
           uint64_t expResult, uint64_t result, CntrValCnfType cnfType) {
    this->cntrId = cntrId;
    this->cntrHndl = cntrHndl;
    this->blkId = blkId;
    this->blkHndl = blkHndl;
    this->expectedResult = expResult;
    this->cntrResult = result;
    this->cnfType = cnfType;

    // The previous memcpy(..., strlen(name)) copied no terminator and could
    // overrun the array. strncpy bounds the copy and zero-fills the rest;
    // the last byte is forced to '\0' in case the name was truncated.
    strncpy(this->cntrName, (cntrName != NULL) ? cntrName : "", sizeof(this->cntrName) - 1);
    this->cntrName[sizeof(this->cntrName) - 1] = '\0';
  }
} CntrInfo;
// Test helper that owns a list of perf counter descriptors and the
// Perf Cntr Manager (PMU) used to collect them around kernel dispatches
class RocrPerfCntrApp {
public:
// Constructor of the class. The list of perf counters that will be
// used to program the device is built later by Init()
RocrPerfCntrApp();
// Destructor of the class
~RocrPerfCntrApp();
// Return the number of perf counters
uint32_t GetNumPerfCntrs();
// Return the descriptor of perf counter at specified index
CntrInfo* GetPerfCntr(uint32_t idx);
// Print the list of perf counters
bool PrintCntrs();
// Initialize the list of perf counters and create the PMU for the agent
hsa_status_t Init(hsa_agent_t agent);
// Register Pre and Post dispatch callbacks
void RegisterCallbacks(hsa_queue_t* queue);
// Wait for perf counter collection to complete
hsa_status_t Wait();
// Read back and print perf counter values
hsa_status_t Validate();
private:
// List of perf counter descriptors
std::vector<CntrInfo*> cntrList_;
// Handle of Perf Cntr Manager
hsa_ext_tools_pmu_t perfMgr_;
};
#endif // ROCR_PERF_CNTR_APP_H_
+476
Просмотреть файл
@@ -0,0 +1,476 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <cassert>
#include <fstream>
#include <iostream>
#include <vector>
#include <string>
#include "hsa.h"
#include "hsa_rsrc_factory.hpp"
#include "hsa_ext_finalize.h"
#include "hsa_ext_profiler.h"
#include "common.hpp"
using namespace std;
// Provide access to command line arguments passed in by user
uint32_t hsa_cmdline_arg_cnt;
char** hsa_cmdline_arg_list;
// Region iteration callback. For regions in the global segment, records the
// region in the AgentInfo passed through 'data': as coarse_region when the
// coarse grained flag is set and as kernarg_region when the kernarg flag is
// set. Regions of other segments are ignored. Always reports success.
static hsa_status_t find_memregions(hsa_region_t region, void* data) {
  hsa_region_global_flag_t region_flags;
  hsa_region_segment_t segment;

  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  if (segment == HSA_REGION_SEGMENT_GLOBAL) {
    AgentInfo* info = (AgentInfo*)data;
    hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &region_flags);

    if (region_flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
      info->coarse_region = region;
    }
    if (region_flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
      info->kernarg_region = region;
    }
  }

  return HSA_STATUS_SUCCESS;
}
// Agent iteration callback. Builds an AgentInfo record for every Cpu and Gpu
// agent and registers it with the HsaRsrcFactory passed through 'data'.
// Dsp agents are skipped.
//
// @return hsa_status_t HSA_STATUS_SUCCESS to continue the iteration, the
// failing status if the device type cannot be queried, or
// HSA_STATUS_ERROR_OUT_OF_RESOURCES if the record cannot be allocated.
static hsa_status_t get_hsa_agents(hsa_agent_t agent, void* data) {
  HsaRsrcFactory* rsrcFactory = reinterpret_cast<HsaRsrcFactory*>(data);

  // Determine the device type; without it 'type' would be read uninitialized
  hsa_device_type_t type;
  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
  if (status != HSA_STATUS_SUCCESS) {
    return status;
  }

  if (type == HSA_DEVICE_TYPE_DSP) {
    return HSA_STATUS_SUCCESS;
  }

  // calloc zero-fills the record, so fields that are not set below (all but
  // dev_id / dev_type for Cpu agents) hold zeros instead of garbage
  AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(calloc(1, sizeof(AgentInfo)));
  if (agent_info == NULL) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }
  agent_info->dev_id = agent;

  if (type == HSA_DEVICE_TYPE_CPU) {
    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
    rsrcFactory->AddAgentInfo(agent_info, false);
    return HSA_STATUS_SUCCESS;
  }

  // Device is a Gpu agent, fill in the remaining properties
  agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
  hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
  agent_info->max_wave_size = 0;
  hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
  agent_info->max_queue_size = 0;
  hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
  // 108 is a placeholder that is overwritten by the profile query below
  agent_info->profile = hsa_profile_t(108);
  hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);

  // A handle of zero marks a region as "not found"
  agent_info->kernarg_region.handle = 0;
  agent_info->coarse_region.handle = 0;

  // Find and record the memory regions of the Gpu agent
  hsa_agent_iterate_regions(agent, find_memregions, agent_info);

  // Save the instance of AgentInfo
  rsrcFactory->AddAgentInfo(agent_info, true);
  return HSA_STATUS_SUCCESS;
}
// Definitions for Static Data members of the class.
// Values that ProcessCmdline() may override from the command line:
// brig_path_ defaults to NULL, num_cus_ to 4, print_debug_info_ to false;
// the counters without an initializer have static storage and start at zero.
char* HsaRsrcFactory::brig_path_ = NULL;
uint32_t HsaRsrcFactory::num_cus_ = 4;
uint32_t HsaRsrcFactory::num_waves_;
uint32_t HsaRsrcFactory::num_workitems_;
uint32_t HsaRsrcFactory::kernel_loop_count_;
bool HsaRsrcFactory::print_debug_info_ = false;
// Command line keys recognized by GetArgIndex().
// NOTE(review): these bind string literals to non-const char*, which newer
// C++ standards reject - consider const char* in the class declaration.
char* HsaRsrcFactory::num_cus_key_ = "num_cus";
char* HsaRsrcFactory::brig_path_key_ = "brig_path";
char* HsaRsrcFactory::num_waves_key_ = "waves_per_cu";
char* HsaRsrcFactory::num_workitems_key_ = "workitems_per_wave";
char* HsaRsrcFactory::print_debug_key_ = "print_debug";
char* HsaRsrcFactory::kernel_loop_count_key_ = "kernel_loop_count";
// Constructor of the class: brings up the Hsa Runtime, collects the agents
// of the platform and then reads the user's command line parameters.
// The check() macro terminates the process if a runtime call fails.
HsaRsrcFactory::HsaRsrcFactory() {
  // Bring up the Hsa Runtime
  const hsa_status_t init_status = hsa_init();
  check("Error in hsa_init", init_status);

  // Collect the Cpu and Gpu agents; the callback registers them with this instance
  const hsa_status_t iterate_status = hsa_iterate_agents(get_hsa_agents, this);
  check("Error Calling hsa_iterate_agents", iterate_status);

  // Pick up user supplied parameters
  ProcessCmdline();
}
// Destructor of the class. Currently does nothing: the AgentInfo records
// held in the agent lists are not released and the Hsa Runtime is not shut down.
// NOTE(review): records added by the agent iteration callback are malloc'ed
// and therefore leak - confirm ownership before freeing them here, since
// AddAgentInfo() is public.
HsaRsrcFactory::~HsaRsrcFactory() {}
// Number of Hsa Gpu agents registered with this instance
//
// @return uint32_t Number of Gpu agents on platform
//
uint32_t HsaRsrcFactory::GetCountOfGpuAgents() {
  return static_cast<uint32_t>(gpu_list_.size());
}
// Number of Hsa Cpu agents registered with this instance
//
// @return uint32_t Number of Cpu agents on platform
//
uint32_t HsaRsrcFactory::GetCountOfCpuAgents() {
  return static_cast<uint32_t>(cpu_list_.size());
}
// Look up the AgentInfo record of a Gpu device by index
//
// @param idx Index into the list of Gpu agents
//
// @param agent_info Output parameter, written only when idx is valid
//
// @return bool true if idx is within the list, false otherwise
//
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
  const uint32_t gpu_count = static_cast<uint32_t>(gpu_list_.size());
  const bool in_range = (idx < gpu_count);

  if (in_range) {
    *agent_info = gpu_list_[idx];
  }

  return in_range;
}
// Look up the AgentInfo record of a Cpu device by index
//
// @param idx Index into the list of Cpu agents
//
// @param agent_info Output parameter, written only when idx is valid
//
// @return bool true if idx is within the list, false otherwise
//
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
  const uint32_t cpu_count = static_cast<uint32_t>(cpu_list_.size());
  const bool in_range = (idx < cpu_count);

  if (in_range) {
    *agent_info = cpu_list_[idx];
  }

  return in_range;
}
// Create a multi-producer Queue object on the given agent and return its
// handle. The queue is created to hold the requested number of packets,
// with no callback and UINT32_MAX passed for both segment size arguments.
//
// @param agent_info Gpu Agent on which to create a queue object
//
// @param num_pkts Number of packets to be held by queue
//
// @param queue Output parameter updated with handle of queue object
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) {
  const hsa_status_t create_status =
      hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
                       UINT32_MAX, UINT32_MAX, queue);
  return create_status == HSA_STATUS_SUCCESS;
}
// Create a Signal object and return its handle.
//
// @param value Initial value of signal object
//
// @param signal Output parameter updated with handle of signal object
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
hsa_status_t status;
status = hsa_signal_create(value, 0, NULL, signal);
return (status == HSA_STATUS_SUCCESS);
}
// Allocate memory for use by a kernel. The buffer comes from the agent's
// coarse grained region when one was found (and is then assigned to the
// agent with read/write permission); otherwise it comes from the agent's
// kernarg region.
//
// @param agent_info Agent from whose memory region to allocate
//
// @param size Size of memory in terms of bytes
//
// @return uint8_t* Pointer to buffer, null if allocation fails.
//
uint8_t* HsaRsrcFactory::AllocateLocalMemory(AgentInfo* agent_info, size_t size) {
  hsa_status_t status;
  uint8_t* buffer = NULL;

  if (agent_info->coarse_region.handle == 0) {
    // No coarse grained region: fall back to the kernarg region
    status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer);
    return (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
  }

  status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer);
  if (status != HSA_STATUS_SUCCESS) {
    return NULL;
  }

  status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS) {
    // The caller only sees NULL, so release the buffer instead of leaking it
    hsa_memory_free(buffer);
    return NULL;
  }

  return buffer;
}
// Allocate memory from the agent's kernarg region, e.g. to pass kernel
// parameters.
//
// @param agent_info Agent from whose memory region to allocate
//
// @param size Size of memory in terms of bytes
//
// @return uint8_t* Pointer to buffer, null if allocation fails.
//
uint8_t* HsaRsrcFactory::AllocateSysMemory(AgentInfo* agent_info, size_t size) {
  uint8_t* sys_buffer = NULL;
  const hsa_status_t alloc_status =
      hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&sys_buffer);
  if (alloc_status != HSA_STATUS_SUCCESS) {
    return NULL;
  }
  return sys_buffer;
}
bool HsaRsrcFactory::TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length,
bool host_to_dev) {
hsa_status_t status;
status = hsa_memory_copy(dest_buff, src_buff, length);
return (status == HSA_STATUS_SUCCESS);
}
// Placeholder kept for compilation only: allocates from the agent's
// kernarg region, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateMemory(AgentInfo* agent_info, size_t size) {
  uint8_t* mem = NULL;
  const hsa_status_t alloc_status =
      hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&mem);
  if (alloc_status != HSA_STATUS_SUCCESS) {
    return NULL;
  }
  return mem;
}
// Loads a serialized code object from file, deserializes it, builds a frozen
// executable for the agent and looks up the requested kernel symbol.
//
// @param agent_info Gpu device for which to load the code object
//
// @param brig_path File path of the code object file
//
// @param kernel_name Name of the kernel symbol to look up
//
// @param code_desc Output parameter updated with the kernel symbol that
// could be used to submit for execution
//
// @return bool true if successful, false if the file cannot be read, memory
// cannot be allocated or the code object cannot be deserialized. Failures of
// the executable calls terminate the process through check().
//
bool HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path,
                                     char* kernel_name, hsa_executable_symbol_t* code_desc) {
  // Constructing std::string from a null pointer is undefined behavior
  if (brig_path == NULL) {
    std::cout << "Error: code object file path is not specified" << std::endl;
    return false;
  }

  hsa_status_t status;
  hsa_code_object_t code_object;

  // Build the code object filename
  std::string filename(brig_path);
  std::cout << "Code object filename: " << filename << std::endl;

  // Open the file positioned at its end so that tellg() yields the size
  std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
  if (!codeStream) {
    std::cout << "Error: failed to load " << filename << std::endl;
    assert(false);
    return false;
  }

  // tellg() reports -1 on failure; an empty file cannot hold a code object
  const std::streamoff fileSize = codeStream.tellg();
  if (fileSize <= 0) {
    std::cout << "Error: failed to determine size of " << filename << std::endl;
    return false;
  }

  // Allocate memory to read in code object from file
  const size_t size = static_cast<size_t>(fileSize);
  char* codeBuff = (char*)AllocateSysMemory(agent_info, size);
  if (!codeBuff) {
    std::cout << "Error: failed to allocate memory for code object." << std::endl;
    assert(false);
    return false;
  }

  // Read the code object into allocated memory and verify it arrived in full
  codeStream.seekg(0, std::ios::beg);
  codeStream.read(codeBuff, static_cast<std::streamsize>(size));
  if (!codeStream || static_cast<size_t>(codeStream.gcount()) != size) {
    std::cout << "Error: failed to read " << filename << std::endl;
    hsa_memory_free(codeBuff);
    return false;
  }

  // De-Serialize the code object that has been read into memory
  status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object);
  if (status != HSA_STATUS_SUCCESS) {
    std::cout << "Failed to deserialize code object" << std::endl;
    hsa_memory_free(codeBuff);
    return false;
  }
  // NOTE(review): codeBuff is intentionally kept alive on success; confirm
  // whether the runtime still references it before releasing it here.

  // Create executable. The full profile is requested unconditionally rather
  // than the profile reported by the agent.
  hsa_executable_t hsaExecutable;
  status =
      hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable);
  check("Error in creating executable object", status);

  // Load code object.
  status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, "");
  check("Error in loading executable object", status);

  // Freeze executable.
  status = hsa_executable_freeze(hsaExecutable, "");
  check("Error in freezing executable object", status);

  // Get symbol handle.
  hsa_executable_symbol_t kernelSymbol;
  status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0,
                                     &kernelSymbol);
  check("Error in looking up kernel symbol", status);

  // Update output parameter
  *code_desc = kernelSymbol;
  return true;
}
// Register an AgentInfo record: it goes to the Gpu list when 'gpu' is true
// and to the Cpu list otherwise
void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) {
  if (!gpu) {
    cpu_list_.push_back(agent_info);
  } else {
    gpu_list_.push_back(agent_info);
  }
}
// Print the various fields of Hsa Gpu Agents
bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
std::cout << header << " :" << std::endl;
AgentInfo* agent_info;
int size = uint32_t(gpu_list_.size());
for (int idx = 0; idx < size; idx++) {
agent_info = gpu_list_[idx];
std::cout << "> agent[" << idx << "] :" << std::endl;
std::cout << ">> Name : " << agent_info->name << std::endl;
std::cout << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
std::cout << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
std::cout << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
}
return true;
}
// Returns the file path given with the brig path key on the command line,
// NULL if none was given. Command line values are captured while an
// instance is being built.
char* HsaRsrcFactory::GetBrigPath() { return HsaRsrcFactory::brig_path_; }
// Returns the configured number of compute units: the value given on the
// command line, 4 otherwise. The platform itself is not queried.
uint32_t HsaRsrcFactory::GetNumOfCUs() { return HsaRsrcFactory::num_cus_; }
// Returns the configured number of waves per compute unit as given on
// the command line, zero if it was not given.
// NOTE(review): the original comment says the number that can actually be
// launched is affected by resource availability - nothing here checks that.
uint32_t HsaRsrcFactory::GetNumOfWavesPerCU() { return HsaRsrcFactory::num_waves_; }
// Returns the configured number of work-items per wave as given on the
// command line, zero if it was not given.
uint32_t HsaRsrcFactory::GetNumOfWorkItemsPerWave() { return HsaRsrcFactory::num_workitems_; }
// Returns the number of times kernel loop body should execute as given on
// the command line, zero if it was not given.
uint32_t HsaRsrcFactory::GetKernelLoopCount() { return HsaRsrcFactory::kernel_loop_count_; }
// Returns the flag indicating if debug info should be printed, converted
// to uint32_t (1 when the print debug key was given, 0 otherwise).
uint32_t HsaRsrcFactory::GetPrintDebugInfo() { return HsaRsrcFactory::print_debug_info_; }
// Process command line arguments. The method will capture
// various user command line parameters for tests to use.
//
// Arguments are read as "key value" pairs starting at index one, so the
// cursor always advances by two. A key that needs a value but is the last
// argument is ignored instead of reading past the end of the list.
void HsaRsrcFactory::ProcessCmdline() {
  for (uint32_t idx = 1; idx < hsa_cmdline_arg_cnt; idx += 2) {
    const uint32_t arg_idx = GetArgIndex((char*)hsa_cmdline_arg_list[idx]);

    // The debug flag carries no information in its value
    if (arg_idx == 5) {
      HsaRsrcFactory::print_debug_info_ = true;
      continue;
    }

    // Every other key needs the value that follows it
    if ((idx + 1) >= hsa_cmdline_arg_cnt) {
      break;
    }
    char* value = hsa_cmdline_arg_list[idx + 1];

    switch (arg_idx) {
      case 0:
        HsaRsrcFactory::brig_path_ = value;
        break;
      case 1:
        HsaRsrcFactory::num_cus_ = atoi(value);
        break;
      case 2:
        HsaRsrcFactory::num_waves_ = atoi(value);
        break;
      case 3:
        HsaRsrcFactory::num_workitems_ = atoi(value);
        break;
      case 4:
        HsaRsrcFactory::kernel_loop_count_ = atoi(value);
        break;
      default:
        // Unknown key: skip it together with its value
        break;
    }
  }
}
// Map a command line key to the index used by ProcessCmdline():
// 0 brig path, 1 number of compute units, 2 waves per CU,
// 3 workitems per wave, 4 kernel loop count, 5 print debug info.
// Returns 108 for a key that is not recognized.
uint32_t HsaRsrcFactory::GetArgIndex(char* arg_value) {
  // Position in this table is the index that is returned
  const char* const known_keys[] = {
      HsaRsrcFactory::brig_path_key_,         HsaRsrcFactory::num_cus_key_,
      HsaRsrcFactory::num_waves_key_,         HsaRsrcFactory::num_workitems_key_,
      HsaRsrcFactory::kernel_loop_count_key_, HsaRsrcFactory::print_debug_key_};
  const uint32_t key_count = sizeof(known_keys) / sizeof(known_keys[0]);

  for (uint32_t pos = 0; pos < key_count; pos++) {
    if (strcmp(known_keys[pos], arg_value) == 0) {
      return pos;
    }
  }

  return 108;
}
// Print the command line keys understood by ProcessCmdline() to std::cout.
// The print debug key was previously missing from this list.
void HsaRsrcFactory::PrintHelpMsg() {
  std::cout << "Key for passing Brig filepath: " << HsaRsrcFactory::brig_path_key_ << std::endl;
  std::cout << "Key for passing Number of Compute Units: " << HsaRsrcFactory::num_cus_key_
            << std::endl;
  std::cout << "Key for passing Number of Waves per CU: " << HsaRsrcFactory::num_waves_key_
            << std::endl;
  std::cout << "Key for passing Number of Workitems per Wave: "
            << HsaRsrcFactory::num_workitems_key_ << std::endl;
  std::cout << "Key for passing Kernel Loop Count: " << HsaRsrcFactory::kernel_loop_count_key_
            << std::endl;
  std::cout << "Key for enabling print of Debug Info: " << HsaRsrcFactory::print_debug_key_
            << std::endl;
}
+262
Просмотреть файл
@@ -0,0 +1,262 @@
#ifndef HSA_RSRC_FACTORY_H_
#define HSA_RSRC_FACTORY_H_
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
#include "hsatimer.h"
#include "hsa.h"
#include "hsa_ext_finalize.h"
#define HSA_ARGUMENT_ALIGN_BYTES 16
#define HSA_QUEUE_ALIGN_BYTES 64
#define HSA_PACKET_ALIGN_BYTES 64
// Prints 'msg' together with the runtime's description of 'status' and exits
// the process when 'status' is not HSA_STATUS_SUCCESS.
// Wrapped in do { } while (0) so that the macro acts as a single statement
// and is safe inside an unbraced if/else; arguments are parenthesized.
// Note that 'status' may be evaluated twice.
#define check(msg, status)                                        \
  do {                                                            \
    if ((status) != HSA_STATUS_SUCCESS) {                         \
      const char* emsg = 0;                                       \
      hsa_status_string((status), &emsg);                         \
      printf("%s: %s\n", (msg), emsg ? emsg : "<unknown error>"); \
      exit(1);                                                    \
    }                                                             \
  } while (0)
// Prints 'msg' and exits the process when 'status' is not STATUS_SUCCESS.
// Wrapped in do { } while (0) so that the macro acts as a single statement
// and is safe inside an unbraced if/else.
#define check_build(msg, status)     \
  do {                               \
    if ((status) != STATUS_SUCCESS) { \
      printf("%s\n", (msg));         \
      exit(1);                       \
    }                                \
  } while (0)
// Provide access to command line arguments passed in by user
extern uint32_t hsa_cmdline_arg_cnt;
extern char** hsa_cmdline_arg_list;
// Encapsulates information about a Hsa Agent such as its
// handle, name, max queue size, max wavefront size, etc.
// NOTE(review): in the agent iteration callback only dev_id and dev_type
// are assigned for Cpu agents; the other fields are filled for Gpu agents.
typedef struct {
// Handle of Agent
hsa_agent_t dev_id;
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint32_t dev_type;
// Name of Agent whose length is less than 64
char name[64];
// Wavefront size reported by the agent
uint32_t max_wave_size;
// Max size of Queue buffer
uint32_t max_queue_size;
// Hsail profile supported by agent
hsa_profile_t profile;
// Global memory region with the coarse grained flag set;
// its handle is zero when no such region was found
hsa_region_t coarse_region;
// Global memory region with the kernarg flag set, used for kernel
// arguments; its handle is zero when no such region was found
hsa_region_t kernarg_region;
} AgentInfo;
// Helper class used by the tests to obtain Hsa resources: agent discovery,
// queue/signal creation, memory allocation and code object loading.
// It also exposes the test configuration values (brig path, number of CUs,
// waves, work-items, loop count, debug flag) through static getters.
class HsaRsrcFactory {
public:
// Constructor of the class. Will initialize the Hsa Runtime and
// query the system topology to get the list of Cpu and Gpu devices
HsaRsrcFactory();
// Destructor of the class
~HsaRsrcFactory();
// Get the count of Hsa Gpu Agents available on the platform
//
// @return uint32_t Number of Gpu agents on platform
//
uint32_t GetCountOfGpuAgents();
// Get the count of Hsa Cpu Agents available on the platform
//
// @return uint32_t Number of Cpu agents on platform
//
uint32_t GetCountOfCpuAgents();
// Get the AgentInfo handle of a Gpu device
//
// @param idx Index of the Gpu Agent to look up
//
// @param agent_info Output parameter updated with AgentInfo
//
// @return bool true if successful, false otherwise
//
bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
// Get the AgentInfo handle of a Cpu device
//
// @param idx Index of the Cpu Agent to look up
//
// @param agent_info Output parameter updated with AgentInfo
//
// @return bool true if successful, false otherwise
//
bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
// Create a Queue object and return its handle. The queue object is expected
// to support user requested number of Aql dispatch packets.
//
// @param agent_info Gpu Agent on which to create a queue object
//
// @param num_pkts Number of packets to be held by queue
//
// @param queue Output parameter updated with handle of queue object
//
// @return bool true if successful, false otherwise
//
bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
// Create a Signal object and return its handle.
//
// @param value Initial value of signal object
//
// @param signal Output parameter updated with handle of signal object
//
// @return bool true if successful, false otherwise
//
bool CreateSignal(uint32_t value, hsa_signal_t* signal);
// Allocate memory for use by a kernel of specified size in specified
// agent's memory region. Currently supports Global segment whose Kernarg
// flag set.
//
// @param agent_info Agent from whose memory region to allocate
//
// @param size Size of memory in terms of bytes
//
// @return uint8_t* Pointer to buffer, null if allocation fails.
//
uint8_t* AllocateLocalMemory(AgentInfo* agent_info, size_t size);
// NOTE(review): undocumented in the original; presumably a generic variant of
// the allocation methods with the same parameters and return convention -
// confirm against the implementation.
uint8_t* AllocateMemory(AgentInfo* agent_info, size_t size);
// Transfer 'length' bytes from 'src_buff' to 'dest_buff'.
// NOTE(review): 'host_to_dev' presumably selects the copy direction
// (host-to-device when true) - confirm against the implementation.
//
// @return bool presumably true if successful, false otherwise - confirm
//
bool TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length, bool host_to_dev);
// Allocate memory to pass kernel parameters.
//
// @param agent_info Agent from whose memory region to allocate
//
// @param size Size of memory in terms of bytes
//
// @return uint8_t* Pointer to buffer, null if allocation fails.
//
uint8_t* AllocateSysMemory(AgentInfo* agent_info, size_t size);
// Loads an Assembled Brig file and Finalizes it into Device Isa
//
// @param agent_info Gpu device for which to finalize
//
// @param brig_path File path of the Assembled Brig file
//
// @param kernel_name Name of the kernel to finalize
//
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
//
// @return bool true if successful, false otherwise
//
bool LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, char* kernel_name,
hsa_executable_symbol_t* code_desc);
// Add an instance of AgentInfo representing a Hsa agent.
// NOTE(review): 'gpu' presumably selects the Gpu list (true) or the
// Cpu list (false) - confirm against the implementation.
void AddAgentInfo(AgentInfo* agent_info, bool gpu);
// Returns the file path where brig files are located
static char* GetBrigPath();
// Returns the number of compute units present on platform
static uint32_t GetNumOfCUs();
// Returns the maximum number of waves that can be launched
// per compute unit. The actual number that can be launched
// is affected by resource availability
static uint32_t GetNumOfWavesPerCU();
// Returns the number of work-items that can execute per wave
static uint32_t GetNumOfWorkItemsPerWave();
// Returns the number of times kernel loop body should execute.
static uint32_t GetKernelLoopCount();
// Returns the flag indicating if debug info should be printed.
// Note that the flag is returned as uint32_t although the backing
// member 'print_debug_info_' is a bool.
static uint32_t GetPrintDebugInfo();
// Print the various fields of Hsa Gpu Agents
//
// @param header Text passed by the caller, presumably printed as a banner
// before the agent list - confirm against the implementation
//
bool PrintGpuAgents(const std::string& header);
private:
// Number of queues to create
uint32_t num_queues_;
// Used to maintain a list of Hsa Queue handles
std::vector<hsa_queue_t*> queue_list_;
// Number of Signals to create
uint32_t num_signals_;
// Used to maintain a list of Hsa Signal handles
std::vector<hsa_signal_t*> signal_list_;
// Number of agents reported by platform
uint32_t num_agents_;
// Used to maintain a list of Hsa Gpu Agent Info
std::vector<AgentInfo*> gpu_list_;
// Used to maintain a list of Hsa Cpu Agent Info
std::vector<AgentInfo*> cpu_list_;
// NOTE(review): each '*_key_' member below is presumably the command line
// key string that selects the corresponding value - confirm against
// ProcessCmdline().
// Records the file path where Brig file is located.
// Value is available only after an instance has been built.
static char* brig_path_;
static char* brig_path_key_;
// Records the number of Compute units present on system.
// Value is available only after an instance has been built.
static uint32_t num_cus_;
static char* num_cus_key_;
// Records the number of waves that can be launched per Compute unit
// Value is available only after an instance has been built.
static uint32_t num_waves_;
static char* num_waves_key_;
// Records the number of work-items that can be packed into a wave
// Value is available only after an instance has been built.
static uint32_t num_workitems_;
static char* num_workitems_key_;
// Records the number of times kernel loop body should run. Value
// is available only after an instance has been built.
static uint32_t kernel_loop_count_;
static char* kernel_loop_count_key_;
// Records whether debug info should be printed (see GetPrintDebugInfo).
// Presumably available only after an instance has been built, like the
// other settings - confirm.
static bool print_debug_info_;
static char* print_debug_key_;
// Process command line arguments. The method will capture
// various user command line parameters for tests to use
static void ProcessCmdline();
// Prints the help banner on user arg keys
static void PrintHelpMsg();
// Maps an index for the user argument
static uint32_t GetArgIndex(char* arg_value);
};
#endif // HSA_RSRC_FACTORY_H_
+168
Просмотреть файл
@@ -0,0 +1,168 @@
#include "hsatimer.h"
// Measures the TSC frequency once at construction; the value is used by the
// _AMD timing path in StopTimer() to convert TSC ticks into milliseconds.
PerfTimer::PerfTimer() : freq_in_100mhz(MeasureTSCFreqHz()) {}
// Releases every Timer allocated by CreateTimer(), newest first.
PerfTimer::~PerfTimer() {
  for (std::vector<Timer*>::reverse_iterator it = _timers.rbegin(); it != _timers.rend(); ++it) {
    delete *it;
  }
  _timers.clear();
}
// a new cretaed timer instantance index will be returned
int PerfTimer::CreateTimer() {
Timer* newTimer = new Timer;
newTimer->_start = 0;
newTimer->_clocks = 0;
#ifdef _WIN32
QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->_freq);
#else
newTimer->_freq = (long long)1.0E3;
#endif
/* Push back the address of new Timer instance created */
_timers.push_back(newTimer);
return (int)(_timers.size() - 1);
}
int PerfTimer::StartTimer(int index) {
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef _WIN32
// General Windows timing method
#ifndef _AMD
long long tmpStart;
QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart));
_timers[index]->_start = (double)tmpStart;
#else
// AMD Windows timing method
#endif
#else
// General Linux timing method
#ifndef _AMD
struct timeval s;
gettimeofday(&s, 0);
_timers[index]->_start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3));
#else
// AMD timing method
unsigned int unused;
_timers[index]->_start = __rdtscp(&unused);
#endif
#endif
return HSA_SUCCESS;
}
int PerfTimer::StopTimer(int index) {
double n = 0;
if (index >= (int)_timers.size()) {
Error("Cannot reset timer. Invalid handle.");
return HSA_FAILURE;
}
#ifdef _WIN32
#ifndef _AMD
long long n1;
QueryPerformanceCounter((LARGE_INTEGER*)&(n1));
n = (double)n1;
#else
// AMD Window Timing
#endif
#else
// General Linux timing method
#ifndef _AMD
struct timeval s;
gettimeofday(&s, 0);
n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3);
#else
// AMD Linux timing
unsigned int unused;
n = __rdtscp(&unused);
#endif
#endif
n -= _timers[index]->_start;
_timers[index]->_start = 0;
#ifndef _AMD
_timers[index]->_clocks += n;
#else
//_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns
_timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms
cout << "_AMD is enabled!!!" << endl;
#endif
return HSA_SUCCESS;
}
void PerfTimer::Error(string str) { cout << str << endl; }
// Returns the value accumulated by the timer 'index', i.e. the accumulated
// clocks divided by the timer frequency.
//
// @param index Handle returned by CreateTimer()
//
// @return double Accumulated reading; HSA_FAILURE (converted to double) if
// 'index' is not a valid handle. Note that this error value cannot be
// told apart from a genuine reading of the same magnitude.
//
double PerfTimer::ReadTimer(int index) {
  // Reject negative handles as well: they would otherwise pass the upper
  // bound check and index the vector out of range.
  if (index < 0 || index >= (int)_timers.size()) {
    Error("Cannot read timer. Invalid handle.");
    return HSA_FAILURE;
  }

  double reading = double(_timers[index]->_clocks);
  reading = double(reading / _timers[index]->_freq);
  return reading;
}
// Returns a coarse timestamp in microseconds, read from the performance
// counter on Windows and from CLOCK_MONOTONIC_RAW elsewhere.
uint64_t PerfTimer::CoarseTimestampUs() {
#ifdef _WIN32
  uint64_t freqHz, ticks;
  QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz);
  QueryPerformanceCounter((LARGE_INTEGER*)&ticks);

  // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t.
  for (; ticks > (1ULL << 44); ticks /= 16, freqHz /= 16) {
  }

  return (ticks * 1000000) / freqHz;
#else
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC_RAW, &now);

  // Whole seconds and the sub-second remainder, both expressed in microseconds
  const uint64_t secondsUs = uint64_t(now.tv_sec) * 1000000;
  const uint64_t fractionUs = now.tv_nsec / 1000;
  return secondsUs + fractionUs;
#endif
}
// Measures the TSC frequency against the coarse clock by busy-waiting until
// 1 gigacycle of TSC ticks has elapsed. Despite the name, the returned value
// is expressed in units of 100MHz (ticks per nanosecond times 10, rounded to
// the nearest integer).
uint64_t PerfTimer::MeasureTSCFreqHz() {
  unsigned int aux;

  // Sample both clocks at the beginning of the interval
  const uint64_t startUs = CoarseTimestampUs();
  const uint64_t startTicks = __rdtscp(&aux);

  // Spin until at least 1 gigacycle of TSC ticks has elapsed
  uint64_t lastTicks = __rdtscp(&aux);
  while (lastTicks - startTicks < 1000000000) {
    lastTicks = __rdtscp(&aux);
  }
  const uint64_t endUs = CoarseTimestampUs();

  // Compute the TSC frequency and round to nearest 100MHz.
  const uint64_t elapsedNs = (endUs - startUs) * 1000;
  const uint64_t elapsedTicks = lastTicks - startTicks;
  const uint64_t roundedNumerator = elapsedTicks * 10 + (elapsedNs / 2);
  return roundedNumerator / elapsedNs;
}
+68
Просмотреть файл
@@ -0,0 +1,68 @@
#ifndef __MYTIME__
#define __MYTIME__
// Timer utility. The AMD (TSC based) timing method or the general OS timing
// method is selected at compile time through the _AMD compilation flag;
// the implementation also depends on the platform (Windows or Linux).
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <iostream>
#include <vector>
#include <string>
using namespace std;
#if defined(_MSC_VER)
#include <time.h>
#include <windows.h>
#include <intrin.h>
#else
#if defined(__GNUC__)
#include <sys/time.h>
#include <x86intrin.h>
#endif // __GNUC__
#endif //_MSC_VER
#define HSA_FAILURE 1
#define HSA_SUCCESS 0
// Collection of timers addressed by the integer handle returned by
// CreateTimer(). A timer accumulates the intervals measured between the
// StartTimer() and StopTimer() calls; ReadTimer() returns the accumulated
// value divided by the timer frequency.
class PerfTimer {
 private:
  struct Timer {
    std::string name; /* < name name of time object*/
    long long _freq;  /* < _freq frequency*/
    double _clocks;   /* < _clocks accumulated elapsed value*/
    double _start;    /* < _start start point ticks*/
  };

  std::vector<Timer*> _timers; /*< _timers vector of owned Timer objects */
  double freq_in_100mhz;       /*< TSC frequency in units of 100MHz */

 public:
  PerfTimer();
  ~PerfTimer();

  // The instance owns the Timer objects and deletes them in the destructor,
  // so a member-wise copy would delete every Timer twice. Copying is disabled.
  PerfTimer(const PerfTimer&) = delete;
  PerfTimer& operator=(const PerfTimer&) = delete;

 private:
  // AMD timing method
  uint64_t CoarseTimestampUs();
  uint64_t MeasureTSCFreqHz();

  // General Linux timing method

 public:
  // Create a timer and return its handle
  int CreateTimer();
  // Start/stop the timer with the given handle;
  // return HSA_SUCCESS or HSA_FAILURE
  int StartTimer(int index);
  int StopTimer(int index);

 public:
  // retrieve time
  double ReadTimer(int index);

  // write into a file
  // NOTE(review): no definition is visible next to the other methods -
  // confirm it is implemented before calling it.
  double WriteTimer(int index);

 public:
  // Print an error message
  void Error(std::string str);
};
#endif
+91
Просмотреть файл
@@ -0,0 +1,91 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <assert.h>
#include "simple_convolution.h"
#include "test_hsa.h"
#include "test_pgen_pmc.h"
#include "test_pgen_sqtt.h"
int main(int argc, char* argv[]) {
#if defined(NDEBUG)
clog.rdbuf(NULL);
#endif
bool ret_val = true;
// Create SimpleConvolution test object
TestKernel* test_kernel = new SimpleConvolution();
TestAql* test_aql = new TestHSA(test_kernel);
const bool pmc_enable = (getenv("ROCR_ENABLE_PMC") != NULL);
const bool sqtt_enable = (getenv("ROCR_ENABLE_SQTT") != NULL);
if (pmc_enable)
test_aql = new TestPGenPMC(test_aql);
else if (sqtt_enable)
test_aql = new TestPGenSQTT(test_aql);
assert(test_aql != NULL);
if (test_aql == NULL) return 1;
// Initialization of Hsa Runtime
ret_val = test_aql->initialize(argc, argv);
if (ret_val == false) {
std::cout << "Error in the test initialization" << std::endl;
assert(ret_val);
return 1;
}
// Setup Hsa resources needed for execution
ret_val = test_aql->setup();
if (ret_val == false) {
std::cout << "Error in creating hsa resources" << std::endl;
assert(ret_val);
return 1;
}
// Run SimpleConvolution kernel
ret_val = test_aql->run();
if (ret_val == false) {
std::cout << "Error in running the test kernel" << std::endl;
assert(ret_val);
return 1;
}
// Verify the results of the execution
ret_val = test_aql->verify_results();
if (ret_val) {
std::cout << "Test : Passed" << std::endl;
} else {
std::cout << "Test : Failed" << std::endl;
}
// Print time taken by sample
test_aql->print_time();
test_aql->cleanup();
return (ret_val) ? 0 : 1;
}
+87
Просмотреть файл
@@ -0,0 +1,87 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TESTAQL_H_
#define _TESTAQL_H_
#include "hsa.h"
#include "hsa_rsrc_factory.hpp"
#include "hsa_ext_amd_aql_profile.h"
// Prints "ASSERT FAILED: <cond> : <file>(<line>)" to the standard output and
// aborts the process.
// NOTE(review): as written the macro fires when 'cond' evaluates to TRUE,
// i.e. the argument is treated as the failure condition, which is the
// opposite of the standard assert(). Confirm against the call sites before
// changing the polarity.
#define test_assert(cond) \
{ \
if (cond) { \
std::cout << "ASSERT FAILED: " << #cond << " : " << __FILE__ << "(" << __LINE__ << ")" \
<< std::endl; \
abort(); \
} \
}
// Test AQL interface.
// An instance may wrap another TestAql instance; every method forwards the
// call to the wrapped instance if there is one and otherwise returns the
// neutral result (null pointer or true). Derived tests override the methods
// they need.
class TestAql {
  // Wrapped test instance, may be null
  TestAql* const test_aql;

 public:
  TestAql(TestAql* t = 0) : test_aql(t) {}
  virtual ~TestAql() {}

  // Return the wrapped test instance
  TestAql* testAql() { return test_aql; }

  // Getters of the Agent Info, the queue and the resources factory;
  // null if there is no wrapped instance
  virtual AgentInfo* getAgentInfo() {
    if (!test_aql) return 0;
    return test_aql->getAgentInfo();
  }
  virtual hsa_queue_t* getQueue() {
    if (!test_aql) return 0;
    return test_aql->getQueue();
  }
  virtual HsaRsrcFactory* getRsrcFactory() {
    if (!test_aql) return 0;
    return test_aql->getRsrcFactory();
  }

  // Initialize application environment including setting
  // up of various configuration parameters based on
  // command line arguments
  // @return bool true on success and false on failure
  virtual bool initialize(int argc, char** argv) {
    if (!test_aql) return true;
    return test_aql->initialize(argc, argv);
  }

  // Setup application parameters for execution
  // @return bool true on success and false on failure
  virtual bool setup() {
    if (!test_aql) return true;
    return test_aql->setup();
  }

  // Run the kernel
  // @return bool true on success and false on failure
  virtual bool run() {
    if (!test_aql) return true;
    return test_aql->run();
  }

  // Verify results
  // @return bool true on success and false on failure
  virtual bool verify_results() {
    if (!test_aql) return true;
    return test_aql->verify_results();
  }

  // Print to console the time taken to execute kernel
  virtual void print_time() {
    if (!test_aql) return;
    test_aql->print_time();
  }

  // Release resources e.g. memory allocations
  // @return bool true on success and false on failure
  virtual bool cleanup() {
    if (!test_aql) return true;
    return test_aql->cleanup();
  }
};
#endif // _TESTAQL_H_
+234
Просмотреть файл
@@ -0,0 +1,234 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "os.h"
#include "helper_funcs.hpp"
#include "hsa_rsrc_factory.hpp"
#include "test_hsa.h"
// Initializes the test environment: stores the command line arguments,
// creates the timers and the Hsa resources factory, selects the Gpu agent
// (index taken from the ROCR_AGENT_IND environment variable, 0 by default),
// creates the Aql queue and the completion signal and composes the name of
// the code object file, "<gfx8|gfx9>_<kernel name>.hsaco".
//
// @param arg_cnt Number of command line arguments
//
// @param arg_list Command line arguments
//
// @return bool true on success; false if the agent is not found, the queue
// or the signal cannot be created or the agent is not a gfx8/gfx9 device
//
bool TestHSA::initialize(int arg_cnt, char** arg_list) {
  std::cout << "TestHSA::initialize :" << std::endl;

  // Initialize command line arguments
  hsa_cmdline_arg_cnt = arg_cnt;
  hsa_cmdline_arg_list = arg_list;

  // Instantiate the timers
  setup_timer_idx_ = hsa_timer_.CreateTimer();
  dispatch_timer_idx_ = hsa_timer_.CreateTimer();

  // Instantiate an instance of Hsa Resources Factory
  hsa_rsrc_ = new HsaRsrcFactory();

  // Print properties of the agents
  hsa_rsrc_->PrintGpuAgents("> GPU agents");

  // Select the Gpu agent
  const char* p = getenv("ROCR_AGENT_IND");
  const uint32_t agent_ind = (p == NULL) ? 0 : atol(p);
  if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) {
    std::cout << "> error: agent[" << agent_ind << "] is not found" << std::endl;
    return false;
  }
  std::cout << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl;

  // Create an instance of Aql Queue; a failure is reported instead of
  // leaving 'hsa_queue_' unusable for the following dispatch
  uint32_t num_pkts = 128;
  if (!hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_)) {
    std::cout << "> error: failed to create a queue of " << num_pkts << " packets" << std::endl;
    return false;
  }

  // Obtain handle of signal; a failure is reported for the same reason
  if (!hsa_rsrc_->CreateSignal(1, &hsa_signal_)) {
    std::cout << "> error: failed to create the completion signal" << std::endl;
    return false;
  }

  // Obtain the code object file name
  std::string agentName(agent_info_->name);
  if (agentName.compare(0, 4, "gfx8") == 0) {
    brig_path_obj_.append("gfx8");
  } else if (agentName.compare(0, 4, "gfx9") == 0) {
    brig_path_obj_.append("gfx9");
  } else {
    // Report the reason as well, the assert is compiled out in release builds
    std::cout << "> error: unsupported agent '" << agentName << "'" << std::endl;
    assert(false);
    return false;
  }
  brig_path_obj_.append("_" + name_ + ".hsaco");

  return true;
}
bool TestHSA::setup() {
std::cout << "TestHSA::setup :" << std::endl;
// Start the timer object
hsa_timer_.StartTimer(setup_timer_idx_);
mem_map_t& mem_map = test_->get_mem_map();
for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) {
mem_descr_t& des = it->second;
void* ptr = (des.local) ? hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size)
: hsa_rsrc_->AllocateSysMemory(agent_info_, des.size);
des.ptr = ptr;
assert(ptr != NULL);
if (ptr == NULL) return false;
}
test_->init();
// Load and Finalize Kernel Code Descriptor
char* brig_path = (char*)brig_path_obj_.c_str();
const bool ret_val =
hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, strdup(name_.c_str()), &kernel_code_desc_);
if (ret_val == false) {
std::cout << "Error in loading and finalizing Kernel" << std::endl;
return ret_val;
}
// Stop the timer object
hsa_timer_.StopTimer(setup_timer_idx_);
setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_);
total_time_taken_ = setup_time_taken_;
return true;
}
// Dispatches the test kernel once on 'hsa_queue_' and blocks until the
// completion signal 'hsa_signal_' drops below 1. The dispatch time is added
// to the total time, then the kernel output is copied from the local buffer
// to the output buffer and printed.
//
// @return bool true on success; false if the kernarg segment size reported
// for the kernel symbol differs from the size expected by the test
//
// NOTE(review): the statuses returned by hsa_executable_symbol_get_info and
// TransferData are not checked.
bool TestHSA::run() {
std::cout << "TestHSA::run :" << std::endl;
// Dispatch dimensions: one-dimensional grid of 'get_elements_count()'
// work-items split into work-groups of 64 work-items
const uint32_t work_group_size = 64;
const uint32_t work_grid_size = test_->get_elements_count();
uint32_t group_segment_size = 0;
uint32_t private_segment_size = 0;
const size_t kernarg_segment_size = test_->get_kernarg_size();
uint64_t code_handle = 0;
// Retrieve the amount of group memory needed
hsa_executable_symbol_get_info(
kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
// Retrieve the amount of private memory needed
hsa_executable_symbol_get_info(kernel_code_desc_,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&private_segment_size);
// Check that the kernarg size reported for the kernel symbol matches
// the size of the kernarg buffer prepared by the test.
// NOTE(review): the value is read into a size_t while the two queries above
// use uint32_t - confirm the attribute type against the HSA specification.
size_t size_info = 0;
hsa_executable_symbol_get_info(
kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info);
assert(kernarg_segment_size == size_info);
if (kernarg_segment_size != size_info) return false;
// Retrieve handle of the code block
hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&code_handle);
// Initialize the dispatch packet.
hsa_kernel_dispatch_packet_t aql;
memset(&aql, 0, sizeof(aql));
// Set the packet's type, acquire and release fences
aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Populate Aql packet with default values
// (setup = 1 presumably selects a one-dimensional dispatch - confirm
// against the 'setup' field encoding of the dispatch packet)
aql.setup = 1;
aql.grid_size_x = work_grid_size;
aql.grid_size_y = 1;
aql.grid_size_z = 1;
aql.workgroup_size_x = work_group_size;
aql.workgroup_size_y = 1;
aql.workgroup_size_z = 1;
// Bind the kernel code descriptor and arguments
aql.kernel_object = code_handle;
aql.kernarg_address = test_->get_kernarg_ptr();
aql.group_segment_size = group_segment_size;
aql.private_segment_size = private_segment_size;
// Initialize Aql packet with handle of signal
aql.completion_signal = hsa_signal_;
// Compute the write index of queue and copy Aql packet into it.
// The queue slot is 'que_idx & mask'.
const uint64_t que_idx = hsa_queue_load_write_index_relaxed(hsa_queue_);
const uint32_t mask = hsa_queue_->size - 1;
std::cout << "> Executing kernel: \"" << name_ << "\"" << std::endl;
// Start the timer object
hsa_timer_.StartTimer(dispatch_timer_idx_);
// Save the valid header and mark the local packet copy as INVALID, so that
// the queue slot does not hold a valid packet type until the whole packet
// has been copied into it
const auto header = aql.header;
const uint8_t packet_type_mask = (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
aql.header &= (~packet_type_mask) << HSA_PACKET_HEADER_TYPE;
aql.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
// Copy Aql packet into queue buffer
((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask] = aql;
// After AQL packet is fully copied into queue buffer
// update packet header from invalid state to valid state.
// The release fence orders the packet copy before the header store.
std::atomic_thread_fence(std::memory_order_release);
((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask].header = header;
// Increment the write index and ring the doorbell to dispatch the kernel.
hsa_queue_store_write_index_relaxed(hsa_queue_, (que_idx + 1));
hsa_signal_store_relaxed(hsa_queue_->doorbell_signal, que_idx);
std::cout << "> Waiting on kernel dispatch signal" << std::endl;
// Wait on the dispatch signal until the kernel is finished, i.e. until the
// signal value becomes less than 1, with no timeout.
// Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling.
// NOTE(review): the returned 'value' is not used.
hsa_signal_value_t value = hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t)-1, HSA_WAIT_STATE_BLOCKED);
// Stop the timer object
hsa_timer_.StopTimer(dispatch_timer_idx_);
dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_);
total_time_taken_ += dispatch_time_taken_;
// Copy kernel buffers from local memory into system memory
hsa_rsrc_->TransferData((uint8_t*)test_->get_output_ptr(), (uint8_t*)test_->get_local_ptr(),
test_->get_output_size(), false);
test_->print_output();
return true;
}
// Compares the kernel output buffer with the reference output buffer,
// byte by byte over the output size.
//
// @return bool true if the buffers match, false otherwise
//
bool TestHSA::verify_results() {
  const void* produced = test_->get_output_ptr();
  const void* expected = test_->get_refout_ptr();
  const uint32_t bytes = test_->get_output_size();
  return memcmp(produced, expected, bytes) == 0;
}
void TestHSA::print_time() {
std::cout << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_
<< std::endl;
std::cout << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_
<< std::endl;
std::cout << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_
<< std::endl;
}
bool TestHSA::cleanup() {
// shutdown Hsa Runtime system
hsa_status_t ret_val = hsa_shut_down();
return (HSA_STATUS_SUCCESS == ret_val);
}
+115
Просмотреть файл
@@ -0,0 +1,115 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TEST_HSA_H_
#define _TEST_HSA_H_
#include "test_aql.h"
#include "test_kernel.h"
#include "hsa_rsrc_factory.hpp"
// Class implements HSA test: runs a TestKernel on a Gpu agent through the
// Hsa Runtime and measures the setup and dispatch times.
class TestHSA : public TestAql {
 public:
  // Constructor
  //
  // @param test Test kernel to run, must not be null (its name is read here)
  //
  // Every member is initialized so that the getters return null, rather than
  // an indeterminate value, when called before initialize().
  TestHSA(TestKernel* test)
      : test_(test),
        total_time_taken_(0),
        setup_time_taken_(0),
        dispatch_time_taken_(0),
        agent_info_(NULL),
        hsa_queue_(NULL),
        hsa_signal_(),
        kernel_code_desc_(),
        setup_timer_idx_(0),
        dispatch_timer_idx_(0),
        hsa_rsrc_(NULL),
        name_(test->Name()) {}

  // Get methods for Agent Info, HSA queue, HSA Resources Manager
  AgentInfo* getAgentInfo() { return agent_info_; }
  hsa_queue_t* getQueue() { return hsa_queue_; }
  HsaRsrcFactory* getRsrcFactory() { return hsa_rsrc_; }

  // Initialize application environment including setting
  // up of various configuration parameters based on
  // command line arguments
  // @return bool true on success and false on failure
  bool initialize(int argc, char** argv);

  // Setup application parameters for execution
  // @return bool true on success and false on failure
  bool setup();

  // Run the test kernel
  // @return bool true on success and false on failure
  bool run();

  // Verify against reference implementation
  // @return bool true on success and false on failure
  bool verify_results();

  // Print to console the time taken to execute kernel
  void print_time();

  // Release resources e.g. memory allocations
  // @return bool true on success and false on failure
  bool cleanup();

 private:
  typedef TestKernel::mem_descr_t mem_descr_t;
  typedef TestKernel::mem_map_t mem_map_t;
  typedef TestKernel::mem_it_t mem_it_t;

  // Test object
  TestKernel* test_;

  // Path of the kernel code object file, composed by initialize()
  std::string brig_path_obj_;

  // Used to track time taken to run the sample
  double total_time_taken_;
  double setup_time_taken_;
  double dispatch_time_taken_;

  // Handle to an Hsa Gpu Agent
  AgentInfo* agent_info_;

  // Handle to an Hsa Queue
  hsa_queue_t* hsa_queue_;

  // Handle of signal
  hsa_signal_t hsa_signal_;

  // Handle of Kernel Code Descriptor
  hsa_executable_symbol_t kernel_code_desc_;

  // Timer handles and the timer object
  uint32_t setup_timer_idx_;
  uint32_t dispatch_timer_idx_;
  PerfTimer hsa_timer_;

  // Instance of Hsa Resources Factory
  HsaRsrcFactory* hsa_rsrc_;

  // Test kernel name
  std::string name_;
};
#endif // _TEST_HSA_H_
+105
Просмотреть файл
@@ -0,0 +1,105 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TEST_KERNEL_H_
#define _TEST_KERNEL_H_
#include <map>
#include <stdint.h>
// Class implements Kernel test: the base class of the test kernels.
// A derived kernel registers its buffers in the memory map as memory
// descriptors (identified by the descriptor IDs below); the buffers are
// allocated by the test which fills in the descriptor pointers.
class TestKernel {
 public:
  // Memory descriptors IDs
  enum { INPUT_DES_ID, OUTPUT_DES_ID, LOCAL_DES_ID, MASK_DES_ID, KERNARG_DES_ID, REFOUT_DES_ID };

  // Memory descriptor declaration
  struct mem_descr_t {
    void* ptr;      // Buffer pointer, NULL until the buffer is allocated
    uint32_t size;  // Buffer size
    bool local;     // true for local memory, false for system memory
  };

  // Memory map declaration
  typedef std::map<uint32_t, mem_descr_t> mem_map_t;
  typedef mem_map_t::iterator mem_it_t;
  typedef mem_map_t::const_iterator mem_const_it_t;

  // Virtual destructor: the class is a polymorphic base and the kernels are
  // handled through TestKernel pointers, so deleting through the base pointer
  // must run the derived destructor
  virtual ~TestKernel() {}

  // Initialize method
  virtual void init() = 0;

  // Return kernel memory map
  mem_map_t& get_mem_map() { return mem_map_; }

  // Return NULL descriptor
  static mem_descr_t null_descriptor() { return {0, 0, 0}; }

  // Methods to get the kernel attributes;
  // a NULL pointer or zero size is returned for a not registered descriptor
  void* get_kernarg_ptr() const { return get_descr(KERNARG_DES_ID).ptr; }
  uint32_t get_kernarg_size() const { return get_descr(KERNARG_DES_ID).size; }
  void* get_output_ptr() const { return get_descr(OUTPUT_DES_ID).ptr; }
  uint32_t get_output_size() const { return get_descr(OUTPUT_DES_ID).size; }
  void* get_local_ptr() const { return get_descr(LOCAL_DES_ID).ptr; }
  void* get_refout_ptr() const { return get_descr(REFOUT_DES_ID).ptr; }
  virtual uint32_t get_elements_count() const = 0;

  // Print output
  virtual void print_output() const = 0;

  // Return name
  virtual std::string Name() const = 0;

 protected:
  // Set system memory descriptor
  // @return bool true if registered, false if the ID is already registered
  bool set_sys_descr(const uint32_t& id, const uint32_t& size) {
    return set_mem_descr(id, size, false);
  }

  // Set local memory descriptor
  // @return bool true if registered, false if the ID is already registered
  bool set_local_descr(const uint32_t& id, const uint32_t& size) {
    return set_mem_descr(id, size, true);
  }

  // Get memory descriptor (a copy); the NULL descriptor if not registered
  mem_descr_t get_descr(const uint32_t& id) const {
    mem_const_it_t it = mem_map_.find(id);
    return (it != mem_map_.end()) ? it->second : null_descriptor();
  }

 private:
  // Set memory descriptor; an already registered ID is left unchanged
  bool set_mem_descr(const uint32_t& id, const uint32_t& size, const bool& local) {
    const mem_descr_t des = {NULL, size, local};
    auto ret = mem_map_.insert(mem_map_t::value_type(id, des));
    return ret.second;
  }

  // Kernel memory map object
  mem_map_t mem_map_;
};
#endif // _TEST_KERNEL_H_
+46
Просмотреть файл
@@ -0,0 +1,46 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TEST_PGEN_H_
#define _TEST_PGEN_H_
#include "test_pmgr.h"
#include "hsa_ext_amd_aql_profile.h"
// TestPGen: common base of the profile-generator tests; exposes the pre/post
// packets inherited from TestPMgr through the AQL profile packet type
class TestPGen : public TestPMgr {
  // Packet type used by the AQL profile API; hides TestPMgr::packet_t in this class
  using packet_t = hsa_ext_amd_aql_pm4_packet_t;

 public:
  TestPGen(TestAql* t) : TestPMgr(t) {}

 protected:
  // Pre-dispatch packet storage reinterpreted as an AQL profile packet
  packet_t* PrePacket() {
    packet_t* const pre = reinterpret_cast<packet_t*>(&prePacket);
    return pre;
  }
  // Post-dispatch packet storage reinterpreted as an AQL profile packet
  packet_t* PostPacket() {
    packet_t* const post = reinterpret_cast<packet_t*>(&postPacket);
    return post;
  }
};
#endif // _TEST_PGEN_H_
+142
Просмотреть файл
@@ -0,0 +1,142 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TEST_PGEN_PMC_H_
#define _TEST_PGEN_PMC_H_
#include "test_pgen.h"
// Data callback passed to hsa_ext_amd_aql_profile_iterate_data(): appends a copy
// of the reported record to the std::vector handed over through 'callback_data'.
//
// Declared 'inline' because the definition lives in a header: a non-inline
// definition would be emitted by every translation unit that includes it and
// clash at link time.
//
// @param info_type      kind of the reported record (not inspected here)
// @param info_data      record to store; must not be null
// @param callback_data  pointer to a std::vector<hsa_ext_amd_aql_profile_info_data_t>
// @return HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR if a required pointer is null
inline hsa_status_t TestPGenPMC_Callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                         hsa_ext_amd_aql_profile_info_data_t* info_data,
                                         void* callback_data) {
  typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> passed_data_t;
  (void)info_type;  // every record is stored regardless of its type
  if (!info_data || !callback_data) return HSA_STATUS_ERROR;
  // 'callback_data' is a void* to the vector, so static_cast is sufficient
  static_cast<passed_data_t*>(callback_data)->push_back(*info_data);
  return HSA_STATUS_SUCCESS;
}
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
class TestPGenPMC : public TestPGen {
const static uint32_t buffer_alignment = 0x1000; // 4K
hsa_agent_t agent;
hsa_ext_amd_aql_profile_profile_t profile;
hsa_ext_amd_aql_profile_event_t events[2];
bool buildPackets() { return true; }
bool dumpData() {
std::cout << "TestPGenPMC::dumpData :" << std::endl;
typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> callback_data_t;
callback_data_t data;
hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenPMC_Callback, &data);
for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) {
std::cout << "> sample(" << dec << it->sample_id << ") block("
<< it->pmc_data.event.block_name << "_" << it->pmc_data.event.block_index
<< ") result(" << hex << it->pmc_data.result << ")" << std::endl;
}
return true;
}
public:
TestPGenPMC(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen PMC" << std::endl; }
bool initialize(int arg_cnt, char** arg_list) {
if (!TestPMgr::initialize(arg_cnt, arg_list)) return false;
hsa_status_t status;
hsa_agent_t agent;
uint32_t command_buffer_alignment;
uint32_t command_buffer_size;
uint32_t output_buffer_alignment;
uint32_t output_buffer_size;
// GPU identificator
agent = getAgentInfo()->dev_id;
// Instantiation of the profile object
// //////////////////////////////////////////////////////////////
// Set the event fields
events[0].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ;
events[0].block_index = 0;
events[0].counter_id = 0x4; // SQ_SQ_PERF_SEL_WAVES
events[1].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ;
events[1].block_index = 0;
events[1].counter_id = 0xe; // SQ_SQ_PERF_SEL_ITEMS
// Initialization the profile
memset(&profile, 0, sizeof(profile));
profile.agent = agent;
profile.type = HSA_EXT_AQL_PROFILE_EVENT_PMC;
// set enabled events list
profile.events = events;
profile.event_count = 2;
// Profile buffers attributes
command_buffer_alignment = buffer_alignment;
status = hsa_ext_amd_aql_profile_get_info(
&profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size);
assert(status == HSA_STATUS_SUCCESS);
output_buffer_alignment = buffer_alignment;
status = hsa_ext_amd_aql_profile_get_info(&profile, HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE,
&output_buffer_size);
assert(status == HSA_STATUS_SUCCESS);
// Application is allocating the command buffer
// Allocate(command_buffer_alignment, command_buffer_size,
// MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA)
profile.command_buffer.ptr =
getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size);
profile.command_buffer.size = command_buffer_size;
// Application is allocating the output buffer
// Allocate(output_buffer_alignment, output_buffer_size,
// MODE_HOST_ACC|MODE_DEV_ACC)
profile.output_buffer.ptr =
getRsrcFactory()->AllocateSysMemory(getAgentInfo(), output_buffer_size);
profile.output_buffer.size = output_buffer_size;
memset(profile.output_buffer.ptr, 0x77, output_buffer_size);
// Populating the AQL start packet
status = hsa_ext_amd_aql_profile_start(&profile, PrePacket());
assert(status == HSA_STATUS_SUCCESS);
if (status != HSA_STATUS_SUCCESS) return false;
// Populating the AQL stop packet
status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket());
assert(status == HSA_STATUS_SUCCESS);
return (status == HSA_STATUS_SUCCESS);
}
};
#endif // _TEST_PGEN_PMC_H_
+160
Просмотреть файл
@@ -0,0 +1,160 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TEST_PGEN_SQTT_H_
#define _TEST_PGEN_SQTT_H_
#include <iostream>
#include <iomanip>
#include <fstream>
#include "test_pgen.h"
// Data callback passed to hsa_ext_amd_aql_profile_iterate_data(): appends a copy
// of the reported record to the std::vector handed over through 'callback_data'.
//
// Declared 'inline' because the definition lives in a header: a non-inline
// definition would be emitted by every translation unit that includes it and
// clash at link time.
//
// @param info_type      kind of the reported record (not inspected here)
// @param info_data      record to store; must not be null
// @param callback_data  pointer to a std::vector<hsa_ext_amd_aql_profile_info_data_t>
// @return HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR if a required pointer is null
inline hsa_status_t TestPGenSQTT_Callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                          hsa_ext_amd_aql_profile_info_data_t* info_data,
                                          void* callback_data) {
  typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> passed_data_t;
  (void)info_type;  // every record is stored regardless of its type
  if (!info_data || !callback_data) return HSA_STATUS_ERROR;
  // 'callback_data' is a void* to the vector, so static_cast is sufficient
  static_cast<passed_data_t*>(callback_data)->push_back(*info_data);
  return HSA_STATUS_SUCCESS;
}
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
class TestPGenSQTT : public TestPGen {
const static uint32_t buffer_alignment = 0x1000; // 4K
const static uint32_t buffer_size = 0x2000000; // 32M
hsa_agent_t agent;
hsa_ext_amd_aql_profile_profile_t profile;
bool buildPackets() { return true; }
bool dumpData() {
std::cout << "TestPGenSQTT::dumpData :" << std::endl;
typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> callback_data_t;
callback_data_t data;
hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenSQTT_Callback, &data);
for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) {
std::cout << "> sample(" << dec << it->sample_id << ") ptr(" << hex << it->sqtt_data.ptr
<< ") size(" << dec << it->sqtt_data.size << ")" << std::endl;
void* sys_buf = getRsrcFactory()->AllocateSysMemory(getAgentInfo(), it->sqtt_data.size);
assert(sys_buf != NULL);
if (sys_buf == NULL) return HSA_STATUS_ERROR;
hsa_status_t status = hsa_memory_copy(sys_buf, it->sqtt_data.ptr, it->sqtt_data.size);
assert(status == HSA_STATUS_SUCCESS);
if (status != HSA_STATUS_SUCCESS) return status;
std::string file_name;
file_name.append("sqtt_dump_");
file_name.append(std::to_string(it->sample_id));
file_name.append(".txt");
std::ofstream out_file;
out_file.open(file_name);
// Write the buffer in terms of shorts (16 bits)
short* sqtt_data = (short*)sys_buf;
for (int i = 0; i < (it->sqtt_data.size / sizeof(short)); ++i) {
out_file << std::setw(4) << std::setfill('0') << std::hex << sqtt_data[i] << "\n";
}
out_file.close();
}
return true;
}
public:
TestPGenSQTT(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen SQTT" << std::endl; }
bool initialize(int arg_cnt, char** arg_list) {
if (!TestPMgr::initialize(arg_cnt, arg_list)) return false;
hsa_status_t status;
hsa_agent_t agent;
uint32_t command_buffer_alignment;
uint32_t command_buffer_size;
uint32_t output_buffer_alignment;
uint32_t output_buffer_size;
// GPU identificator
agent = getAgentInfo()->dev_id;
// Instantiation of the profile object
// //////////////////////////////////////////////////////////////
// Set the parameters
// parameters = ....;
// Initialization the profile
memset(&profile, 0, sizeof(profile));
profile.agent = agent;
profile.type = HSA_EXT_AQL_PROFILE_EVENT_SQTT;
// set parameters
// profile.parameters = &event;
// profile.parameter_count = 1;
// Profile buffers attributes
command_buffer_alignment = buffer_alignment;
status = hsa_ext_amd_aql_profile_get_info(
&profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size);
assert(status == HSA_STATUS_SUCCESS);
output_buffer_alignment = buffer_alignment;
output_buffer_size = buffer_size;
// Application is allocating the command buffer
// AllocateSystem(command_buffer_alignment, command_buffer_size,
// MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA)
profile.command_buffer.ptr =
getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size);
profile.command_buffer.size = command_buffer_size;
// Application is allocating the output buffer
// AllocateLocal(output_buffer_alignment, output_buffer_size,
// MODE_DEV_ACC)
profile.output_buffer.ptr =
getRsrcFactory()->AllocateLocalMemory(getAgentInfo(), output_buffer_size);
profile.output_buffer.size = output_buffer_size;
// Populating the AQL start packet
status = hsa_ext_amd_aql_profile_start(&profile, PrePacket());
assert(status == HSA_STATUS_SUCCESS);
if (status != HSA_STATUS_SUCCESS) return false;
// Populating the AQL stop packet
status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket());
assert(status == HSA_STATUS_SUCCESS);
return (status == HSA_STATUS_SUCCESS);
}
};
#endif // _TEST_PGEN_SQTT_H_
+98
Просмотреть файл
@@ -0,0 +1,98 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <atomic>
#include <assert.h>
#include "test_pmgr.h"
bool TestPMgr::addPacket(const packet_t* packet) {
packet_t aql_packet = *packet;
// Compute the write index of queue and copy Aql packet into it
uint64_t que_idx = hsa_queue_load_write_index_relaxed(getQueue());
const uint32_t mask = getQueue()->size - 1;
// Disable packet so that submission to HW is complete
const auto header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
aql_packet.header &= (~((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) << HSA_PACKET_HEADER_TYPE;
aql_packet.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
// Copy Aql packet into queue buffer
((packet_t*)(getQueue()->base_address))[que_idx & mask] = aql_packet;
// After AQL packet is fully copied into queue buffer
// update packet header from invalid state to valid state
std::atomic_thread_fence(std::memory_order_release);
((packet_t*)(getQueue()->base_address))[que_idx & mask].header = header;
// Increment the write index and ring the doorbell to dispatch the kernel.
hsa_queue_store_write_index_relaxed(getQueue(), (que_idx + 1));
hsa_signal_store_relaxed(getQueue()->doorbell_signal, que_idx);
return true;
}
// Runs the wrapped test, bracketed by the pre/post packets when the subclass
// reports them as built, and dumps the collected data afterwards.
// Always returns true; the results of the wrapped test and of dumpData() are
// not propagated.
bool TestPMgr::run() {
  // Build Aql Pkts; 'false' means there is nothing to submit around the test
  const bool active = buildPackets();

  if (active) {
    // Submit Pre-Dispatch Aql packet
    addPacket(&prePacket);
  }

  testAql()->run();

  if (active) {
    // Set post packet completion signal
    postPacket.completion_signal = postSignal;

    // Submit Post-Dispatch Aql packet
    addPacket(&postPacket);

    // Wait for Post-Dispatch packet to complete. The wait may return before the
    // condition holds, so the returned signal value is re-checked until it
    // actually drops below 1.
    while (hsa_signal_wait_acquire(postSignal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
                                   HSA_WAIT_STATE_BLOCKED) >= 1) {
    }

    // Dumping profiling data
    dumpData();
  }

  return true;
}
bool TestPMgr::initialize(int argc, char** argv) {
TestAql::initialize(argc, argv);
hsa_status_t status = hsa_signal_create(1, 0, NULL, &postSignal);
assert(status == HSA_STATUS_SUCCESS);
return (status == HSA_STATUS_SUCCESS);
}
// Wraps test 't'. Both packets are value-initialized (zeroed) so that they never
// hold indeterminate bytes before a subclass populates them, and both signals
// start out with a zero handle until initialize() creates the real post signal.
TestPMgr::TestPMgr(TestAql* t) : TestAql(t), prePacket(), postPacket() {
  dummySignal.handle = 0;
  postSignal = dummySignal;
}
+57
Просмотреть файл
@@ -0,0 +1,57 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _TEST_SMGR_H_
#define _TEST_SMGR_H_
#include "test_aql.h"
#include "amd_aql_pm4_ib_packet.h"
// TestPMgr: test wrapper that submits a pre-dispatch and a post-dispatch PM4 IB
// packet around the wrapped test and lets subclasses dump the profiling data
class TestPMgr : public TestAql {
public:
// Packet type placed into the queue by addPacket()
typedef amd_aql_pm4_ib_packet_t packet_t;
private:
// Copies the packet into the next queue slot and rings the doorbell
bool addPacket(const packet_t* packet);
protected:
// Packet submitted by run() before the wrapped test
packet_t prePacket;
// Packet submitted by run() after the wrapped test
packet_t postPacket;
// Signal with a zero handle, set by the constructor
hsa_signal_t dummySignal;
// Completion signal of the post packet; created in initialize()
hsa_signal_t postSignal;
// Returns true when the pre/post packets should be submitted by run();
// the default reports that there is nothing to submit
virtual bool buildPackets() { return false; }
// Called by run() after the post packet has completed; the default does nothing
virtual bool dumpData() { return false; }
// Initializes the wrapped test and creates postSignal
virtual bool initialize(int argc, char** argv);
public:
TestPMgr(TestAql* t);
// Runs the wrapped test, bracketed by the packets when buildPackets() returns true
bool run();
};
#endif // _TEST_SMGR_H_
+81
Просмотреть файл
@@ -0,0 +1,81 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************************/
/**
* SimpleConvolution is where each pixel of the output image
* is the weighted sum of the neighborhood pixels of the input image
* The neighborhood is defined by the dimensions of the mask and
* weight of each neighbor is defined by the mask itself.
* @param output Output matrix after performing convolution
* @param input Input matrix on which convolution is to be performed
* @param mask mask matrix used to perform the convolution
* @param inputDimensions dimensions of the input matrix
* @param maskDimensions dimensions of the mask matrix
*/
__kernel void simpleConvolution(__global uint * output,
                                __global uint * input,
                                __global float * mask,
                                const uint2 inputDimensions,
                                const uint2 maskDimensions) {
    // One work-item per output element; the flat id is split into a
    // column/row pair of the row-major image
    const uint gid = get_global_id(0);

    const uint imgWidth = inputDimensions.x;
    const uint imgHeight = inputDimensions.y;
    const uint col = gid % imgWidth;
    const uint row = gid / imgWidth;

    const uint mWidth = maskDimensions.x;
    const uint mHeight = maskDimensions.y;

    // Half extents of the mask around its centre element
    const uint halfW = (mWidth - 1) / 2;
    const uint halfH = (mHeight - 1) / 2;

    // Clamp the neighbourhood window so that the indices do not go beyond
    // the image boundaries
    const uint colFirst = (col < halfW) ? 0 : (col - halfW);
    const uint colLast = ((col + halfW) >= imgWidth) ? imgWidth - 1 : (col + halfW);
    const uint rowFirst = (row < halfH) ? 0 : (row - halfH);
    const uint rowLast = ((row + halfH) >= imgHeight) ? imgHeight - 1 : (row + halfH);

    // Weighted sum over the window, columns in the outer loop
    float acc = 0;
    for (uint c = colFirst; c <= colLast; ++c) {
        for (uint r = rowFirst; r <= rowLast; ++r) {
            // Position inside the mask; (row - halfH) and (col - halfW) may wrap
            // around for border pixels, the unsigned arithmetic cancels it out
            const uint maskIndex = (r - (row - halfH)) * mWidth + (c - (col - halfW));
            const uint pixelIndex = r * imgWidth + c;
            acc += ((float)input[pixelIndex] * mask[maskIndex]);
        }
    }

    // Round to the nearest integer before the conversion truncates
    acc += 0.5f;
    output[gid] = (uint)acc;
}
+157
Просмотреть файл
@@ -0,0 +1,157 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "helper_funcs.hpp"
#include "simple_convolution.h"
// Fixes the problem size (64x64 input, square 3x3 mask), normalizes it and
// registers the memory descriptors the kernel needs.
SimpleConvolution::SimpleConvolution() {
  width_ = 64;
  height_ = 64;
  mask_width_ = 3;
  mask_height_ = mask_width_;

  // Input dimensions are rounded to powers of two
  if (!isPowerOf2(width_)) width_ = roundToPowerOf2(width_);
  if (!isPowerOf2(height_)) height_ = roundToPowerOf2(height_);

  // Mask dimensions are made odd so that the mask has a centre element
  if (mask_width_ % 2 == 0) ++mask_width_;
  if (mask_height_ % 2 == 0) ++mask_height_;

  // Inputs with fewer than 256 elements fall back to 64x64
  const bool too_small = (width_ * height_ < 256);
  if (too_small) {
    width_ = 64;
    height_ = 64;
  }

  // Input, output, local and reference buffers all share the image size
  const uint32_t image_bytes = width_ * height_ * sizeof(uint32_t);
  const uint32_t mask_bytes = mask_width_ * mask_height_ * sizeof(float);

  set_sys_descr(KERNARG_DES_ID, sizeof(kernel_args_t));
  set_sys_descr(INPUT_DES_ID, image_bytes);
  set_sys_descr(OUTPUT_DES_ID, image_bytes);
  set_local_descr(LOCAL_DES_ID, image_bytes);
  set_sys_descr(MASK_DES_ID, mask_bytes);
  set_sys_descr(REFOUT_DES_ID, image_bytes);
}
void SimpleConvolution::init() {
std::cout << "SimpleConvolution::init :" << std::endl;
mem_descr_t input_des = get_descr(INPUT_DES_ID);
mem_descr_t local_des = get_descr(LOCAL_DES_ID);
mem_descr_t mask_des = get_descr(MASK_DES_ID);
mem_descr_t refout_des = get_descr(REFOUT_DES_ID);
mem_descr_t kernarg_des = get_descr(KERNARG_DES_ID);
uint32_t* input = (uint32_t*)input_des.ptr;
uint32_t* output_local = (uint32_t*)local_des.ptr;
float* mask = (float*)mask_des.ptr;
kernel_args_t* kernel_args = (kernel_args_t*)kernarg_des.ptr;
// random initialisation of input
fillRandom<uint32_t>(input, width_, height_, 0, 255);
// Fill a blurr filter or some other filter of your choice
const float val = 1.0f / (mask_width_ * 2.0f - 1.0f);
for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) {
mask[i] = 0;
}
for (uint32_t i = 0; i < mask_width_; i++) {
uint32_t y = mask_height_ / 2;
mask[y * mask_width_ + i] = val;
}
for (uint32_t i = 0; i < mask_height_; i++) {
uint32_t x = mask_width_ / 2;
mask[i * mask_width_ + x] = val;
}
// Print the INPUT array.
printArray<uint32_t>("> Input[0]", input, width_, 1);
printArray<float>("> Mask", mask, mask_width_, mask_height_);
// Fill the kernel args
kernel_args->arg1 = output_local;
kernel_args->arg2 = input;
kernel_args->arg3 = mask;
kernel_args->arg4 = width_;
kernel_args->arg41 = height_;
kernel_args->arg5 = mask_width_;
kernel_args->arg51 = mask_height_;
// Calculate the reference output
memset(refout_des.ptr, 0, refout_des.size);
reference_impl((uint32_t*)refout_des.ptr, input, mask, width_, height_, mask_width_,
mask_height_);
}
// Prints the start of the output buffer: width_ x 1 elements, presumably the
// first row (the argument order is taken from the other printArray calls)
void SimpleConvolution::print_output() const {
  uint32_t* const out = (uint32_t*)get_output_ptr();
  printArray<uint32_t>("> Output[0]", out, width_, 1);
}
// CPU reference of the convolution: every output pixel is the weighted sum of
// the input pixels inside the mask window, clamped to the image boundaries and
// rounded to the nearest integer. Always returns true.
bool SimpleConvolution::reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
                                       const uint32_t width, const uint32_t height,
                                       const uint32_t mask_width, const uint32_t mask_height) {
  // Half extents of the mask around its centre element
  const uint32_t half_w = (mask_width - 1) / 2;
  const uint32_t half_h = (mask_height - 1) / 2;

  // For each pixel in the input
  for (uint32_t col = 0; col < width; ++col) {
    for (uint32_t row = 0; row < height; ++row) {
      // Window limits, kept inside the image boundaries
      const uint32_t col_first = (col >= half_w) ? (col - half_w) : 0;
      const uint32_t col_last = ((col + half_w) < width) ? (col + half_w) : (width - 1);
      const uint32_t row_first = (row >= half_h) ? (row - half_h) : 0;
      const uint32_t row_last = ((row + half_h) < height) ? (row + half_h) : (height - 1);

      // Weighted sum over the window, columns in the outer loop
      float acc = 0;
      for (uint32_t c = col_first; c <= col_last; ++c) {
        for (uint32_t r = row_first; r <= row_last; ++r) {
          // Position inside the mask; (row - half_h) and (col - half_w) may wrap
          // around for border pixels, the unsigned arithmetic cancels it out
          const uint32_t mask_idx = (r - (row - half_h)) * mask_width + (c - (col - half_w));
          const uint32_t pixel_idx = r * width + c;
          acc += ((float)input[pixel_idx] * mask[mask_idx]);
        }
      }

      // Round to the nearest integer before the conversion truncates
      acc += 0.5f;
      output[row * width + col] = uint32_t(acc);
    }
  }

  return true;
}
+90
Просмотреть файл
@@ -0,0 +1,90 @@
/******************************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#ifndef _SIMPLE_CONVOLUTION_H_
#define _SIMPLE_CONVOLUTION_H_
#include <vector>
#include <map>
#include "test_kernel.h"
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
class SimpleConvolution : public TestKernel {
public:
// Constructor
SimpleConvolution();
// Initialize method
void init();
// Return number of compute elements
uint32_t get_elements_count() const { return width_ * height_; }
// Print output
void print_output() const;
// Return name
std::string Name() const { return std::string("simpleConvolution"); }
private:
// Local kernel arguments declaration
struct kernel_args_t {
void* arg1;
void* arg2;
void* arg3;
uint32_t arg4;
uint32_t arg41;
uint32_t arg5;
uint32_t arg51;
};
// Width of the Input array
uint32_t width_;
// Height of the Input array
uint32_t height_;
// Mask dimensions
uint32_t mask_width_;
// Mask dimensions
uint32_t mask_height_;
// Reference CPU implementation of Simple Convolution
// @param output Output matrix after performing convolution
// @param input Input matrix on which convolution is to be performed
// @param mask mask matrix using which convolution was to be performed
// @param input_dimensions dimensions of the input matrix
// @param mask_dimensions dimensions of the mask matrix
// @return bool true on success and false on failure
bool reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
const uint32_t width, const uint32_t height, const uint32_t maskWidth,
const uint32_t maskHeight);
};
#endif // _SIMPLE_CONVOLUTION_H_
+154
Просмотреть файл
@@ -0,0 +1,154 @@
module &m:1:0:$full:$large:$default;
extension "amd:gcn";
extension "IMAGE";
decl prog function &abort()();
// Kernel __OpenCL_SimpleConvolution (compiler-generated HSAIL; the %entry and
// %for.* names in the basic-block comments are the compiler's own labels).
//
// What one work-item computes, using the shorthand
//   gid = low 32 bits of (workitemabsid(0) + __global_offset_0)
//   W  = inputDimensions[0]   H  = inputDimensions[1]
//   MW = maskDimensions[0]    MH = maskDimensions[1]
// is:
//   x = gid % W                 y = gid / W
//   vstep = (MW - 1) >> 1       hstep = (MH - 1) >> 1
//   left  = (x < vstep) ? 0 : x - vstep
//   right = (x + vstep >= W) ? W - 1 : x + vstep
//   top    = (y < hstep) ? 0 : y - hstep
//   bottom = (y + hstep >= H) ? H - 1 : y + hstep
//   sum = 0.0f
//   for i in [left, right], for j in [top, bottom]:
//       sum += (float)input[j * W + i] * mask[m]
//     where m starts at MW * (max(hstep, y) - y) + (max(x, vstep) - x),
//     advances by 1 per i step and by MW per j step
//   output[gid] = (u32)(sum + 0.5f)
//
// Arguments:
//   %__global_offset_0  u64 added to the work-item absolute id in dimension 0
//   %output             u64 address; one u32 element is stored at index gid
//   %input              u64 address; u32 elements are loaded from it
//   %mask               u64 address; f32 elements are loaded from it
//   %inputDimensions    u32[2], read as (W, H) above
//   %maskDimensions     u32[2], read as (MW, MH) above
//
// Notes:
//   - All index arithmetic is unsigned 32-bit; "add ..., 4294967295" is a
//     subtraction of 1 modulo 2^32.
//   - The kernel never compares gid against W * H, and it divides by W without
//     checking it for zero.
prog kernel &__OpenCL_SimpleConvolution(kernarg_u64 %__global_offset_0,
kernarg_u64 %output,
kernarg_u64 %input,
kernarg_u64 %mask,
kernarg_u32 %inputDimensions[2],
kernarg_u32 %maskDimensions[2]) {
// "AMD RTI" pragmas: compiler-emitted metadata strings bracketed by ARGSTART /
// ARGEND. They list each kernel argument (pointer:/value: entries) and a
// source-level type per argument index (reflection: entries). They are not
// instructions. NOTE(review): the exact field layout is defined by the AMD
// toolchain and is not derivable from this file.
pragma "AMD RTI", "ARGSTART:__OpenCL_SimpleConvolution";
pragma "AMD RTI", "version:3:1:104";
pragma "AMD RTI", "device:generic";
pragma "AMD RTI", "uniqueid:1024";
pragma "AMD RTI", "memory:private:0";
pragma "AMD RTI", "memory:region:0";
pragma "AMD RTI", "memory:local:0";
pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0";
pragma "AMD RTI", "pointer:output:u32:1:1:96:uav:7:4:RW:0:0:0";
pragma "AMD RTI", "pointer:input:u32:1:1:112:uav:7:4:RW:0:0:0";
pragma "AMD RTI", "pointer:mask:float:1:1:128:uav:7:4:RW:0:0:0";
pragma "AMD RTI", "value:inputDimensions:u32:2:1:144";
pragma "AMD RTI", "constarg:4:inputDimensions";
pragma "AMD RTI", "value:maskDimensions:u32:2:1:160";
pragma "AMD RTI", "constarg:5:maskDimensions";
pragma "AMD RTI", "function:1:0";
pragma "AMD RTI", "memory:64bitABI";
pragma "AMD RTI", "privateid:8";
pragma "AMD RTI", "enqueue_kernel:0";
pragma "AMD RTI", "kernel_index:0";
pragma "AMD RTI", "reflection:0:size_t";
pragma "AMD RTI", "reflection:1:uint*";
pragma "AMD RTI", "reflection:2:uint*";
pragma "AMD RTI", "reflection:3:float*";
pragma "AMD RTI", "reflection:4:uint2";
pragma "AMD RTI", "reflection:5:uint2";
pragma "AMD RTI", "ARGEND:__OpenCL_SimpleConvolution";
@__OpenCL_SimpleConvolution_Entry:
// BB#0: // %entry
// $d0 = 64-bit global id = workitemabsid(0) + __global_offset_0;
// $s5 = gid (low 32 bits). $s6 keeps the raw workitemabsid and $d4 the offset;
// both are reused in @BB0_2.
workitemabsid_u32 $s6, 0;
cvt_u64_u32 $d0, $s6;
ld_kernarg_align(8)_width(all)_u64 $d4, [%__global_offset_0];
add_u64 $d0, $d0, $d4;
cvt_u32_u64 $s5, $d0;
// ($s0, $s4) = (W, H); ($s1, $s9) = (MW, MH).
ld_v2_kernarg_align(4)_width(all)_u32 ($s0, $s4), [%inputDimensions];
ld_v2_kernarg_align(4)_width(all)_u32 ($s1, $s9), [%maskDimensions];
// $s7 = x = gid % W.
rem_u32 $s7, $s5, $s0;
// $s8 = vstep = (MW - 1) >> 1.
add_u32 $s2, $s1, 4294967295;
shr_u32 $s8, $s2, 1;
// $s2 = right = (x + vstep >= W) ? W - 1 : x + vstep.
add_u32 $s2, $s7, $s8;
add_u32 $s3, $s0, 4294967295;
cmp_ge_b1_u32 $c0, $s2, $s0;
cmov_b32 $s2, $c0, $s3, $s2;
// $s3 = left = (x < vstep) ? 0 : x - vstep. $s3 doubles as the outer loop
// counter i.
sub_u32 $s3, $s7, $s8;
cmp_lt_b1_u32 $c0, $s7, $s8;
cmov_b32 $s3, $c0, 0, $s3;
// $d1 = output base address (used only in @BB0_6).
ld_kernarg_align(8)_width(all)_u64 $d1, [%output];
// Enter the loop nest only if the column range [left, right] is non-empty.
cmp_le_b1_u32 $c0, $s3, $s2;
cbr_b1 $c0, @BB0_2;
// BB#1:
// Empty column range: the accumulator $s6 is set to 0 (the bit pattern of
// 0.0f) and control goes straight to the store.
mov_b32 $s6, 0;
br @BB0_6;
// @BB0_2: // %for.cond32.preheader.lr.ph
@BB0_2:
// Loop preheader. Registers are re-purposed here:
//   $s5: gid -> y -> top,  $s9: MH -> hstep,  $s4: H -> bottom.
// $s5 = y = gid / W.
div_u32 $s5, $s5, $s0;
// $s9 = hstep = (MH - 1) >> 1.
add_u32 $s9, $s9, 4294967295;
shr_u32 $s9, $s9, 1;
// $s4 = bottom = (y + hstep >= H) ? H - 1 : y + hstep.
add_u32 $s10, $s5, $s9;
add_u32 $s11, $s4, 4294967295;
cmp_ge_b1_u32 $c0, $s10, $s4;
cmov_b32 $s4, $c0, $s11, $s10;
// $s5 = top = (y < hstep) ? 0 : y - hstep.
sub_u32 $s10, $s5, $s9;
cmp_lt_b1_u32 $c0, $s5, $s9;
cmov_b32 $s5, $c0, 0, $s10;
// $d2 = mask base address, $d3 = input base address.
ld_kernarg_align(8)_width(all)_u64 $d2, [%mask];
ld_kernarg_align(8)_width(all)_u64 $d3, [%input];
// Recompute y into $s6 from the saved workitemabsid ($s6) and offset ($d4),
// because $s5 no longer holds y.
cvt_u64_u32 $d5, $s6;
add_u64 $d4, $d4, $d5;
cvt_u32_u64 $s6, $d4;
div_u32 $s6, $s6, $s0;
// $s10 = max(hstep, y); $s12 = max(hstep, y) - y (mask row of the first tap).
max_u32 $s10, $s9, $s6;
sub_u32 $s12, $s10, $s6;
// $s11 = max(x, vstep).
max_u32 $s11, $s7, $s8;
// $s6 becomes the f32 accumulator "sum", initialised to 0.0f.
mov_b32 $s6, 0;
// $s7 = mask index for (i = left, j = top)
//     = MW * (max(hstep, y) - y) + max(x, vstep) - x.
mad_u32 $s12, $s1, $s12, $s11;
sub_u32 $s7, $s12, $s7;
// $s8 = input index for (i = left, j = top)
//     = W * (max(hstep, y) - hstep) + max(x, vstep) - vstep, i.e. W * top + left.
sub_u32 $s9, $s10, $s9;
mad_u32 $s9, $s0, $s9, $s11;
sub_u32 $s8, $s9, $s8;
// @BB0_3: // %for.cond32.preheader
@BB0_3:
// Outer loop (over i): seed the inner-loop cursors from the per-column start
// values ($s9 = mask index, $s10 = input index, $s11 = j = top) and skip the
// inner loop when the row range [top, bottom] is empty.
cmp_gt_b1_u32 $c0, $s5, $s4;
mov_b32 $s9, $s7;
mov_b32 $s10, $s8;
mov_b32 $s11, $s5;
cbr_b1 $c0, @BB0_5;
// @BB0_4: // %for.body35
@BB0_4:
// Inner loop (over j): $s12 = mask[$s9] as f32 (4-byte elements, hence shl 2).
cvt_u64_u32 $d4, $s9;
shl_u64 $d4, $d4, 2;
add_u64 $d4, $d2, $d4;
ld_global_align(4)_f32 $s12, [$d4];
// $s13 = (float)input[$s10], loaded as u32 then converted.
cvt_u64_u32 $d4, $s10;
shl_u64 $d4, $d4, 2;
add_u64 $d4, $d3, $d4;
ld_global_align(4)_u32 $s13, [$d4];
cvt_f32_u32 $s13, $s13;
mul_ftz_f32 $s12, $s13, $s12;
// Step one row down: mask index += MW, input index += W, j += 1.
add_u32 $s9, $s9, $s1;
add_u32 $s10, $s10, $s0;
add_u32 $s11, $s11, 1;
// sum += product; repeat while j <= bottom.
add_ftz_f32 $s6, $s6, $s12;
cmp_le_b1_u32 $c0, $s11, $s4;
cbr_b1 $c0, @BB0_4;
// @BB0_5: // %for.inc48
@BB0_5:
// Step one column right: per-column mask and input start indices += 1,
// i += 1; repeat while i <= right.
add_u32 $s7, $s7, 1;
add_u32 $s8, $s8, 1;
add_u32 $s3, $s3, 1;
cmp_le_b1_u32 $c0, $s3, $s2;
cbr_b1 $c0, @BB0_3;
// @BB0_6: // %for.end50
@BB0_6:
// Address of output[gid]: keep the low 32 bits of the 64-bit global id, scale
// by 4 bytes, add the output base.
and_b64 $d0, $d0, 4294967295;
shl_u64 $d0, $d0, 2;
add_u64 $d0, $d1, $d0;
// 0F3f000000 is the IEEE-754 single-precision encoding of 0.5: add 0.5 to the
// sum, convert to u32 and store. NOTE(review): the +0.5 presumably implements
// round-to-nearest for non-negative sums; the cvt carries no explicit rounding
// modifier, so the conversion's rounding is the HSAIL default. Confirm against
// the spec.
add_ftz_f32 $s0, $s6, 0F3f000000;
cvt_ftz_u32_f32 $s0, $s0;
st_global_align(4)_u32 $s0, [$d0];
ret;
};