diff --git a/runtime/hsa-ext-aql-profile/CMakeLists.txt b/runtime/hsa-ext-aql-profile/CMakeLists.txt new file mode 100644 index 0000000000..ceb476dca3 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/CMakeLists.txt @@ -0,0 +1,28 @@ +# +# Minimum version of cmake required +# +cmake_minimum_required ( VERSION 3.5.0 ) + +# +# Setup flag to be verbose or not +# +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +set ( ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) +set ( PROJ_DIR ${ROOT_DIR}/src ) +set ( TEST_DIR ${ROOT_DIR}/test ) + +# +# Build sources +# +include ( ${PROJ_DIR}/CMakeLists.txt ) + +# +# Build tests +# +add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) + +# +# Style format: the -name tests are grouped so that -exec applies to all three patterns +# +execute_process ( COMMAND sh -xc "/usr/bin/find ${ROOT_DIR} \\( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \\) -exec /usr/bin/clang-format -i -style=file \{\} \;" ) diff --git a/runtime/hsa-ext-aql-profile/Readme.txt b/runtime/hsa-ext-aql-profile/Readme.txt new file mode 100644 index 0000000000..d4cd2ff5df --- /dev/null +++ b/runtime/hsa-ext-aql-profile/Readme.txt @@ -0,0 +1,40 @@ +HSA extension AMD AQL profile library. +Provides AQL packets helper methods for +perfcounters (PMC) and SQ threadtraces (SQTT). + +Current library implementation supports only GFX9. 
+The library source tree: + - doc - Documentation, the API specification and the presentation + - inc - Public API + - hsa_ext_amd_aql_profile.h - AMD AQL profile library public API + - amd_aql_pm4_ib_packet.h - AQL PM4 IB packet type + - src - AMD AQL profile library sources + - aqlprofile - AMD AQL profile library + - commandwriter - PM4 command writer originated from 'hsa-runtime/tools' + - perfcounter - PM4 perfcounter manager originated from 'hsa-runtime/tools' + - threadtrace - PM4 threadtrace manager originated from 'hsa-runtime/tools' + - util - core/utils library built based on 'hsa-runtime/core/util' + - test - the library test suite + - ctrl - Test control + - common - Test common utils + - SimpleConvolution - Simple convolution test + +To build the library: + +$ cd ..../hsa-ext-aql-profile +$ mkdir build +$ cd build +$ cmake .. +$ make + +To run the test: + +$ cd ..../hsa-ext-aql-profile/build +$ cp ../test/SimpleConvolution/gfx9_SimpleConvolution.hsaco . +$ test/SimpleConvolution + +to enable PMC profiling: +export ROCR_ENABLE_PMC=1 + +to enable SQTT profiling: +export ROCR_ENABLE_SQTT=1 diff --git a/runtime/hsa-ext-aql-profile/cmake_modules/exportToolFlags.cmake b/runtime/hsa-ext-aql-profile/cmake_modules/exportToolFlags.cmake new file mode 100644 index 0000000000..59b470a991 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/cmake_modules/exportToolFlags.cmake @@ -0,0 +1,66 @@ +# +# Compiler Preprocessor definitions. 
+# +add_definitions ( -D__linux__ ) +add_definitions ( -DUNIX_OS ) +add_definitions ( -DLINUX ) +add_definitions ( -D__AMD64__ ) +add_definitions ( -D__x86_64__ ) +add_definitions ( -DAMD_INTERNAL_BUILD ) +add_definitions ( -DLITTLEENDIAN_CPU=1 ) +add_definitions ( -DHSA_LARGE_MODEL= ) +add_definitions ( -DHSA_DEPRECATED= ) + +# +# Linux Compiler options (appended to any flags supplied by the user/environment) +# +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=sign-compare" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=enum-compare" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment " ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pointer-arith" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-comment" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pointer-arith" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-write-strings" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-conversion-null" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) + +# +# Extend Compiler flags based on build type +# +set ( CMAKE_BUILD_TYPE ${BUILD_TYPE} ) +if ( "${CMAKE_BUILD_TYPE}" STREQUAL "Debug" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" ) +endif () + +# +# Extend Compiler flags based on Processor architecture +# +if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" ) 
+ set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" ) +elseif ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" ) +endif () + +# +# Basic Tool Chain Information +# +message ( "-------------IS64BIT: " ${IS64BIT} ) +message ( "-----------BuildType: " ${BUILD_TYPE} ) +message ( " -----------Compiler: " ${CMAKE_CXX_COMPILER} ) +message ( " ------------Version: " ${CMAKE_CXX_COMPILER_VERSION} ) +message ( " ------------ProjDir: " ${PROJ_DIR} ) +message ( " ------------TestDir: " ${TEST_DIR} ) +message ( "------HSA-RuntimeDir: " ${HSA_RUNTIME_DIR} ) +message ( " -----------CoreUtil: " ${CORE_UTIL_DIR} ) diff --git a/runtime/hsa-ext-aql-profile/cmake_modules/validateBldEnv.cmake b/runtime/hsa-ext-aql-profile/cmake_modules/validateBldEnv.cmake new file mode 100644 index 0000000000..d38352bbbc --- /dev/null +++ b/runtime/hsa-ext-aql-profile/cmake_modules/validateBldEnv.cmake @@ -0,0 +1,52 @@ +# +# Build is not supported on Windows platform +# +if ( WIN32 ) + message ( FATAL_ERROR "Windows build is not supported." 
) +endif () + +# +# External dependencies for Rocr Header files +# +if ( NOT DEFINED ENV{ROCR_INC_DIR} ) + message ( FATAL_ERROR "ERROR: Environment variable ROCR_INC_DIR is not set" ) + return () +endif () + +# +# External dependencies for Rocr Library files +# +if ( NOT DEFINED ENV{ROCR_LIB_DIR} ) + message ( FATAL_ERROR "ERROR: Environment variable ROCR_LIB_DIR is not set" ) + return () +endif () + +# +# Process Env to determine build type (anything other than "debug" selects Release) +# +string ( TOLOWER "$ENV{ROCR_BLD_TYPE}" type ) +if ( "${type}" STREQUAL "debug" ) + set ( ISDEBUG 1 ) + set ( BUILD_TYPE "Debug" ) +else () + set ( ISDEBUG 0 ) + set ( BUILD_TYPE "Release" ) +endif () + +# +# Determine build is 32-bit or 64-bit +# @note: 64-bit is selected unless ROCR_BLD_BITS is set to 32 +# +if ( "$ENV{ROCR_BLD_BITS}" STREQUAL "32" ) + set ( ONLY64STR "" ) + set ( IS64BIT 0 ) +else () + set ( ONLY64STR "64" ) + set ( IS64BIT 1 ) +endif () + +# +# Build information +# +message ( "---------ROCR-HdrDir: " $ENV{ROCR_INC_DIR} ) +message ( "---------ROCR-LibDir: " $ENV{ROCR_LIB_DIR} ) diff --git a/runtime/hsa-ext-aql-profile/doc/HSA_ext_profile_api.pptx b/runtime/hsa-ext-aql-profile/doc/HSA_ext_profile_api.pptx new file mode 100644 index 0000000000..a3df42bba0 Binary files /dev/null and b/runtime/hsa-ext-aql-profile/doc/HSA_ext_profile_api.pptx differ diff --git a/runtime/hsa-ext-aql-profile/doc/HSA_ext_profile_api_v1_1_0.docx b/runtime/hsa-ext-aql-profile/doc/HSA_ext_profile_api_v1_1_0.docx new file mode 100644 index 0000000000..35ae8579aa Binary files /dev/null and b/runtime/hsa-ext-aql-profile/doc/HSA_ext_profile_api_v1_1_0.docx differ diff --git a/runtime/hsa-ext-aql-profile/inc/amd_aql_pm4_ib_packet.h b/runtime/hsa-ext-aql-profile/inc/amd_aql_pm4_ib_packet.h new file mode 100644 index 0000000000..bd364593b3 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/inc/amd_aql_pm4_ib_packet.h @@ -0,0 +1,67 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// Copyright 2017 ADVANCED MICRO DEVICES, INC. 
+// +// AMD is granting you permission to use this software and documentation(if any) +// (collectively, the "Materials") pursuant to the terms and conditions of the +// Software License Agreement included with the Materials.If you do not have a +// copy of the Software License Agreement, contact your AMD representative for a +// copy. +// +// You agree that you will not reverse engineer or decompile the Materials, in +// whole or in part, except as allowed by applicable law. +// +// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY, +// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE +// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM +// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF +// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion +// of implied warranties, so the above exclusion may not apply to You. +// +// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT, +// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT, +// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF +// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total +// liability to You for all damages, losses, and causes of action (whether in +// contract, tort (including negligence) or otherwise) exceed the amount of $100 +// USD. 
You agree to defend, indemnify and hold harmless AMD and its licensors, +// and any of their directors, officers, employees, affiliates or agents from +// and against any and all loss, damage, liability and other expenses (including +// reasonable attorneys' fees), resulting from Your use of the Software or +// violation of the terms and conditions of this Agreement. +// +// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with +// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is +// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 - +// 7013, et seq., or its successor.Use of the Materials by the Government +// constitutes acknowledgement of AMD's proprietary rights in them. +// +// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as +// stated in the Software License Agreement. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _AMD_AQL_PM4_IB_PACKET_H_ +#define _AMD_AQL_PM4_IB_PACKET_H_ + +// NOTE: this header includes nothing itself; the includer must already provide +// the fixed-width integer types and hsa_signal_t. + +// Value of 'pm4_ib_format' field of amd_aql_pm4_ib_packet_t packet +const static uint32_t AMD_AQL_PM4_IB_FORMAT = 1; +// Value of 'dw_count_remain' field of amd_aql_pm4_ib_packet_t packet +const static uint32_t AMD_AQL_PM4_IB_DW_COUNT_REMAIN = 10; +// Size of 'reserved' array of amd_aql_pm4_ib_packet_t packet +const static uint32_t AMD_AQL_PM4_IB_RESERVED_COUNT = 8; + +// AQL Vendor Specific Packet which carries a PM4 IB command +typedef struct { + uint16_t header; + uint16_t pm4_ib_format; + uint32_t pm4_ib_command[4]; + uint32_t dw_count_remain; + uint32_t reserved[AMD_AQL_PM4_IB_RESERVED_COUNT]; + hsa_signal_t completion_signal; +} amd_aql_pm4_ib_packet_t; + +#endif // _AMD_AQL_PM4_IB_PACKET_H_ diff --git a/runtime/hsa-ext-aql-profile/inc/hsa_ext_amd_aql_profile.h b/runtime/hsa-ext-aql-profile/inc/hsa_ext_amd_aql_profile.h new file mode 100644 index 0000000000..478b3cf1fb --- /dev/null +++ b/runtime/hsa-ext-aql-profile/inc/hsa_ext_amd_aql_profile.h @@ 
-0,0 +1,262 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// Copyright 2017 ADVANCED MICRO DEVICES, INC. +// +// AMD is granting you permission to use this software and documentation(if any) +// (collectively, the "Materials") pursuant to the terms and conditions of the +// Software License Agreement included with the Materials.If you do not have a +// copy of the Software License Agreement, contact your AMD representative for a +// copy. +// +// You agree that you will not reverse engineer or decompile the Materials, in +// whole or in part, except as allowed by applicable law. +// +// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY, +// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE +// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM +// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF +// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion +// of implied warranties, so the above exclusion may not apply to You. +// +// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT, +// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT, +// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF +// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total +// liability to You for all damages, losses, and causes of action (whether in +// contract, tort (including negligence) or otherwise) exceed the amount of $100 +// USD. 
You agree to defend, indemnify and hold harmless AMD and its licensors, +// and any of their directors, officers, employees, affiliates or agents from +// and against any and all loss, damage, liability and other expenses (including +// reasonable attorneys' fees), resulting from Your use of the Software or +// violation of the terms and conditions of this Agreement. +// +// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with +// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is +// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 - +// 7013, et seq., or its successor.Use of the Materials by the Government +// constitutes acknowledgement of AMD's proprietary rights in them. +// +// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as +// stated in the Software License Agreement. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _HSA_EXT_AMD_AQL_PROFILE_H_ +#define _HSA_EXT_AMD_AQL_PROFILE_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/////////////////////////////////////////////////////////////////////// +// Library API: +// The library provides helper methods for instantiation of +// the profile context object and for populating of the start +// and stop AQL packets. The profile object contains a profiling +// events list and needed for profiling buffers descriptors, +// a command buffer and an output data buffer. To check if there +// was an error the library methods return a status code. Also +// the library provides methods for querying required buffers +// attributes, to validate the event attributes and to get profiling +// output data. 
+// +// Returned status: +// hsa_status_t - HSA status codes are used from hsa.h header +// +// Supported profiling features: +// +// Supported profiling events +typedef enum { + HSA_EXT_AQL_PROFILE_EVENT_PMC, + HSA_EXT_AQL_PROFILE_EVENT_SQTT +} hsa_ext_amd_aql_profile_event_type_t; + +// Supported performance counters (PMC) blocks +// The block ID is the same for a block instances set, for example +// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN +// will have the same block ID HSA_EXT_AQL_PROFILE_BLOCK_TCC. +typedef enum { + HSA_EXT_AQL_PROFILE_BLOCK_CB, + HSA_EXT_AQL_PROFILE_BLOCK_CPF, + HSA_EXT_AQL_PROFILE_BLOCK_DB, + HSA_EXT_AQL_PROFILE_BLOCK_GRBM, + HSA_EXT_AQL_PROFILE_BLOCK_GRBMSE, + HSA_EXT_AQL_PROFILE_BLOCK_PASU, + HSA_EXT_AQL_PROFILE_BLOCK_PASC, + HSA_EXT_AQL_PROFILE_BLOCK_SPI, + HSA_EXT_AQL_PROFILE_BLOCK_SQ, + HSA_EXT_AQL_PROFILE_BLOCK_SQGS, + HSA_EXT_AQL_PROFILE_BLOCK_SQVS, + HSA_EXT_AQL_PROFILE_BLOCK_SQPS, + HSA_EXT_AQL_PROFILE_BLOCK_SQHS, + HSA_EXT_AQL_PROFILE_BLOCK_SQCS, + HSA_EXT_AQL_PROFILE_BLOCK_SX, + HSA_EXT_AQL_PROFILE_BLOCK_TA, + HSA_EXT_AQL_PROFILE_BLOCK_TCA, + HSA_EXT_AQL_PROFILE_BLOCK_TCC, + HSA_EXT_AQL_PROFILE_BLOCK_TD, + HSA_EXT_AQL_PROFILE_BLOCK_TCP, + HSA_EXT_AQL_PROFILE_BLOCK_GDS, + HSA_EXT_AQL_PROFILE_BLOCK_VGT, + HSA_EXT_AQL_PROFILE_BLOCK_IA, + HSA_EXT_AQL_PROFILE_BLOCK_MC, + HSA_EXT_AQL_PROFILE_BLOCK_TCS, + HSA_EXT_AQL_PROFILE_BLOCK_WD, + HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER +} hsa_ext_amd_aql_profile_block_name_t; + +// PMC event object structure +// 'counter_id' value is specified in GFXIPs perfcounter user guides +// which is the counters select value, "Performance Counters Selection" +// chapter. 
+typedef struct { + hsa_ext_amd_aql_profile_block_name_t block_name; + uint32_t block_index; + uint32_t counter_id; +} hsa_ext_amd_aql_profile_event_t; + +// Check if event is valid for the specific GPU +hsa_status_t hsa_ext_amd_aql_profile_validate_event( + hsa_agent_t agent, // HSA handle for the profiling GPU + const hsa_ext_amd_aql_profile_event_t* event, // Pointer to the event to validate + bool* result); // True if the event is valid, False otherwise + +// Profiling parameters +// All parameters are generic and if not applicable for a specific +// profile configuration then an error status will be returned. +typedef enum { + // SQTT applicable parameters + HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET, + HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK, + HSA_EXT_AQL_PROFILE_PARAM_MASK, + HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK, + HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2 +} hsa_ext_amd_aql_profile_parameter_name_t; + +// Profile parameter object +typedef struct { + hsa_ext_amd_aql_profile_parameter_name_t parameter_name; + uint32_t value; +} hsa_ext_amd_aql_profile_parameters_t; + +// +// Profile context object: +// The library provides a profile object structure which contains +// the events array, a buffer for the profiling start/stop commands +// and a buffer for the output data. +// The buffers are specified by the buffer descriptors and allocated +// by the application. The buffers allocation attributes, the command +// buffer size, the PMC output buffer size as well as profiling output +// data can be obtained using the generic get profile info helper _get_info. 
+// +// Buffer descriptor: base pointer and size of an application-allocated buffer +typedef struct { + void* ptr; + uint32_t size; +} hsa_ext_amd_aql_profile_descriptor_t; + +// Profile context object structure, contains the profiling events list and +// the descriptors of the buffers needed for profiling: a command buffer and +// an output data buffer +typedef struct { + hsa_agent_t agent; // GFXIP handle + hsa_ext_amd_aql_profile_event_type_t type; // Events type + const hsa_ext_amd_aql_profile_event_t* events; // Events array + uint32_t event_count; // Events count + const hsa_ext_amd_aql_profile_parameters_t* parameters; // Parameters array + uint32_t parameter_count; // Parameters count + hsa_ext_amd_aql_profile_descriptor_t output_buffer; // Output buffer + hsa_ext_amd_aql_profile_descriptor_t command_buffer; // PM4 commands +} hsa_ext_amd_aql_profile_profile_t; + +// +// AQL packets populating methods: +// Helper methods that populate the START and STOP AQL packets provided by +// the application, which the application is required to submit before and +// after the profiled GPU task packets respectively. 
+// +// AQL Vendor Specific packet which carries a PM4 command +typedef struct { + uint16_t header; + uint16_t pm4_command[27]; + hsa_signal_t completion_signal; +} hsa_ext_amd_aql_pm4_packet_t; + +// Method to populate the provided AQL packet with profiling start commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ext_amd_aql_profile_start( + const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet + +// Method to populate the provided AQL packet with profiling stop commands +// Only 'pm4_command' fields of the packet are set and the application +// is responsible to set Vendor Specific header type and a completion signal +hsa_status_t hsa_ext_amd_aql_profile_stop( + const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet + +// Legacy PM4 profiling packet size ('static' so that C translation units do not each emit an external definition) +static const unsigned HSA_EXT_AQL_PROFILE_LEGACY_PM4_PACKET_SIZE = 64; +// Converting of the profiling AQL packet to PM4 packet, GFX8 support +hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4( + const hsa_ext_amd_aql_pm4_packet_t* aql_packet, // AQL packet + void* pm4); // PM4 packet blob + +// +// Get profile info: +// Generic method for getting various profile info including profile buffers +// attributes like the command buffer size and the profiling PMC results. +// It's implied that all counters are 64bit values. 
+// +// Profile generic output data: +typedef struct { + uint32_t sample_id; // PMC sample or SQTT buffer index + union { + struct { + hsa_ext_amd_aql_profile_event_t event; // PMC event + uint64_t result; // PMC result + } pmc_data; + hsa_ext_amd_aql_profile_descriptor_t sqtt_data; // SQTT output data descriptor + }; +} hsa_ext_amd_aql_profile_info_data_t; + +// Profile attributes +typedef enum { + HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, // get_info returns uint32_t value + HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE, // get_info returns uint32_t value + HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, // get_info returns PMC uint64_t value + // in info_data object + HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA // get_info returns SQTT buffer ptr/size + // in info_data object +} hsa_ext_amd_aql_profile_info_type_t; + +// Definition of output data iterator callback +typedef hsa_status_t (*hsa_ext_amd_aql_profile_data_callback_t)( + hsa_ext_amd_aql_profile_info_type_t info_type, // [in] data type, PMC or SQTT data + hsa_ext_amd_aql_profile_info_data_t* info_data, // [in] info_data object + void* callback_data); // [in/out] data passed to the callback + +// Method for getting the profile info +hsa_status_t hsa_ext_amd_aql_profile_get_info( + const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_profile_info_type_t attribute, // [in] requested profile attribute + void* value); // [in/out] returned value + +// Method for iterating the events output data +hsa_status_t hsa_ext_amd_aql_profile_iterate_data( + const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object + hsa_ext_amd_aql_profile_data_callback_t callback, // [in] callback to iterate the output data + void* data); // [in/out] data passed to the callback + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // _HSA_EXT_AMD_AQL_PROFILE_H_ diff --git a/runtime/hsa-ext-aql-profile/src/CMakeLists.txt b/runtime/hsa-ext-aql-profile/src/CMakeLists.txt new file mode 
100644 index 0000000000..0c17919ed7 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/CMakeLists.txt @@ -0,0 +1,72 @@ +# +# Minimum version of cmake required +# +cmake_minimum_required ( VERSION 3.5.0 ) + +# +# Setup flag to be verbose or not +# +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +# +# Set name for the project +# @note: Must come before adding any sub-directories +# +set ( TARGET_NAME "aqlprofile" ) +project ( ${TARGET_NAME} ) + +# PROJ_DIR/ROOT_DIR are derived here only when no including file has set PROJ_DIR already +if ( NOT DEFINED PROJ_DIR ) + set ( PROJ_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) + set ( ROOT_DIR ${PROJ_DIR}/.. ) +endif () + +set ( API_DIR ${ROOT_DIR}/inc ) +set ( HSA_RUNTIME_DIR ${PROJ_DIR}/../../.. ) +set ( HSA_RUNTIME_OSC_DIR ${HSA_RUNTIME_DIR}/opensrc/hsa-runtime ) +set ( CORE_UTIL_DIR ${HSA_RUNTIME_OSC_DIR}/core/util ) +include_directories ( ${ROOT_DIR} ) + +# +# Validate required build environment is setup correctly +# +include ( ${ROOT_DIR}/cmake_modules/validateBldEnv.cmake ) + +# +# Setup tool chain flags - preprocessor, compiler and linker +# +include ( ${ROOT_DIR}/cmake_modules/exportToolFlags.cmake ) + +# +# Set Name for Utils library and build it as a +# static library to be linked with others +# +set ( UTIL_LIB "util${ONLY64STR}" ) +add_subdirectory ( ${PROJ_DIR}/util "${PROJECT_BINARY_DIR}/util" ) + +# +# Set Name for Cmdwriter library and build it as a +# static library to be linked with others +# +set ( CMDWRITER_LIB "commandwriter${ONLY64STR}" ) +add_subdirectory ( ${PROJ_DIR}/commandwriter "${PROJECT_BINARY_DIR}/commandwriter" ) + +# +# Set Name for ThreadTrace library and build it as a +# static library to be linked with others +# +set ( SQTT_LIB "sqtt${ONLY64STR}" ) +add_subdirectory ( ${PROJ_DIR}/threadtrace "${PROJECT_BINARY_DIR}/threadtrace" ) + +# +# Set Name for PerfCounter (PMC) library and build it as a +# static library to be linked with others +# +set ( PMC_LIB "pmc${ONLY64STR}" ) +add_subdirectory ( ${PROJ_DIR}/perfcounter "${PROJECT_BINARY_DIR}/perfcounter" ) + +# +# Build the 
library and link it with other static +# libraries that have been built in this regard +# +set ( TARGET_LIB "${TARGET_NAME}${ONLY64STR}" ) +add_subdirectory ( ${PROJ_DIR}/${TARGET_NAME} "${PROJECT_BINARY_DIR}/${TARGET_NAME}" ) diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/CMakeLists.txt b/runtime/hsa-ext-aql-profile/src/aqlprofile/CMakeLists.txt new file mode 100644 index 0000000000..7bb981c8ae --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Source files for the AQL profile library +# +set ( LIB_SRC aql_profile.cpp populate_aql.cpp gfx8_factory.cpp gfx9_factory.cpp ) + +# +# Header files include path(s). +# +include_directories ( $ENV{ROCR_INC_DIR} ) +include_directories ( ${PROJ_DIR}/perfcounter ) +include_directories ( ${PROJ_DIR}/threadtrace ) +include_directories ( ${PROJ_DIR}/commandwriter ) +include_directories ( ${API_DIR} ) + +# +# Build the AQL profile library as a shared library +# +set ( LIB_LIST ${PMC_LIB} ${SQTT_LIB} ${CMDWRITER_LIB} ${UTIL_LIB} ) +add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) +target_link_libraries( ${TARGET_LIB} ${LIB_LIST} c stdc++ dl pthread rt ) diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/aql_profile.cpp b/runtime/hsa-ext-aql-profile/src/aqlprofile/aql_profile.cpp new file mode 100644 index 0000000000..f06fb161b3 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/aql_profile.cpp @@ -0,0 +1,398 @@ +#include <string.h> // memcpy; NOTE(review): header name was lost in extraction - confirm + +#include "aql_profile.h" +#include "pm4_factory.h" +#include "cmdwriter.h" // commandwriter +#include "hsa_perf.h" // perfcounter +#include "thread_trace.h" // threadtrace +#include "gpu_enum.h" +#include "gpu_blockinfo.h" + +#define PUBLIC_API __attribute__((visibility("default"))) + +namespace aql_profile { + +// Command buffer partitioning manager +// Supports Pre/Post commands partitioning +// and postfix control partition +class CommandBufferMgr { + const static uint32_t align_size = 0x100; + const static uint32_t align_mask = 
align_size - 1; + + // Sizes of the start ('pre') and stop ('post') command partitions; stored inside the command buffer itself + struct info_t { + uint32_t precmds_size; + uint32_t postcmds_size; + }; + + descriptor_t buffer; + uint32_t postfix_size; + info_t* info; + + // Rounds 'size' up to the next multiple of align_size + uint32_t align(const uint32_t& size) { return (size + align_mask) & ~align_mask; } + + public: + // Takes the profile's command buffer and reserves an info_t at its tail; + // 'info' stays NULL (and getSize() returns 0) when the buffer is too small + CommandBufferMgr(const profile_t* profile) + : buffer(profile->command_buffer), postfix_size(0), info(NULL) { + info = (info_t*)setPostfix(sizeof(info_t)); + } + + // Size left for commands after the postfix has been carved off + uint32_t getSize() { return buffer.size; } + + // Grows the tail (postfix) reservation to at least 'size' and returns the + // start of the tail region, or NULL when no room is left for commands. + // NOTE(review): a later call with a larger size returns a region that also + // covers the info_t reserved by the constructor - confirm this overlap is intended. + void* setPostfix(const uint32_t& size) { + if (size > postfix_size) { + const uint32_t delta = size - postfix_size; + postfix_size = size; + buffer.size -= (delta < buffer.size) ? delta : buffer.size; + } + return (buffer.size != 0) ? buffer.ptr + buffer.size : NULL; + } + + // Records the size of the start commands; fails if they do not fit + bool setPreSize(const uint32_t& size) { + bool suc = (size <= buffer.size); + if (suc) info->precmds_size = size; + return suc; + } + + // Stop commands are placed at the aligned end of the start commands + uint32_t getPostOffset() { return align(info->precmds_size); } + + // 'size' is the combined start+stop command size; derives and records the + // stop size and checks that the aligned layout still fits in the buffer + bool checkTotalSize(const uint32_t& size) { + bool suc = (size <= buffer.size); + if (suc) suc = (size >= info->precmds_size); + if (suc) { + info->postcmds_size = size - info->precmds_size; + suc = ((getPostOffset() + info->postcmds_size) <= buffer.size); + } + return suc; + } + + // Descriptor of the start-command partition (buffer head) + descriptor_t getPreDescr() { + descriptor_t descr; + descr.ptr = buffer.ptr; + descr.size = info->precmds_size; + return descr; + } + + // Descriptor of the stop-command partition (at getPostOffset()) + descriptor_t getPostDescr() { + descriptor_t descr; + descr.ptr = buffer.ptr + getPostOffset(); + descr.size = info->postcmds_size; + return descr; + } +}; + +// True when both events name the same block, block instance and counter +static inline bool is_event_match(const event_t& event1, const event_t& event2) { + return (event1.block_name == event2.block_name) && (event1.block_index == event2.block_index) && + (event1.counter_id == event2.counter_id); +} + +// PMC data callback: for the event given in 'callback_data', either sums the results of +// all samples (sample_id == UINT32_MAX) or picks the result of the requested sample +hsa_status_t default_pmcdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type, + hsa_ext_amd_aql_profile_info_data_t* info_data, + void* callback_data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + 
hsa_ext_amd_aql_profile_info_data_t* passed_data = + reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data); + + if (info_type == HSA_EXT_AQL_PROFILE_INFO_PMC_DATA) { + if (is_event_match(info_data->pmc_data.event, passed_data->pmc_data.event)) { + if (passed_data->sample_id == UINT32_MAX) { + passed_data->pmc_data.result += info_data->pmc_data.result; + } else if (passed_data->sample_id == info_data->sample_id) { + passed_data->pmc_data.result = info_data->pmc_data.result; + status = HSA_STATUS_INFO_BREAK; + } + } + } + + return status; +} + +struct sqtt_ctrl_t { + uint32_t status; + uint32_t counter; + uint32_t writePtr; +}; + +// SQTT data callback: copies the SQTT descriptor of the sample requested in 'callback_data' +hsa_status_t default_sqttdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type, + hsa_ext_amd_aql_profile_info_data_t* info_data, + void* callback_data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + hsa_ext_amd_aql_profile_info_data_t* passed_data = + reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data); + + if (info_type == HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA) { + if (info_data->sample_id == passed_data->sample_id) { + passed_data->sqtt_data = info_data->sqtt_data; + status = HSA_STATUS_INFO_BREAK; + } + } + + return status; +} + +} // aql_profile + +extern "C" { + +// Check if event is valid for the specific GPU +// NOTE(review): stub - returns success without writing '*result'; confirm intended +PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_validate_event( + hsa_agent_t agent, const hsa_ext_amd_aql_profile_event_t* event, bool* result) { + return HSA_STATUS_SUCCESS; +} + +// Method to populate the provided AQL packet with profiling start commands +PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_start( + const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_start_packet) { + + aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile); + if (pm4_factory == NULL) return HSA_STATUS_ERROR; + + pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter(); + if (cmdWriter == NULL) return HSA_STATUS_ERROR; + + pm4_profile::DefaultCmdBuf commands; + aql_profile::CommandBufferMgr cmdBufMgr(profile); + if 
(cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR; + + if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) { + pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr(); + if (pmcMgr == NULL) return HSA_STATUS_ERROR; + + pmcMgr->setPmcDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size); + + // Create and enable one counter per requested event + for (const hsa_ext_amd_aql_profile_event_t* p = profile->events; + p < profile->events + profile->event_count; ++p) { + pm4_profile::CounterBlock* block = + pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p)); + if (block == NULL) return HSA_STATUS_ERROR; + + pm4_profile::Counter* counter = block->createCounter(); + if (counter == NULL) return HSA_STATUS_ERROR; + + counter->setParameter(HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX, sizeof(uint32_t), + &(p->counter_id)); + counter->setEnable(true); + } + + // Generate start commands and record their size as the 'pre' partition + pmcMgr->begin(&commands, cmdWriter); + cmdBufMgr.setPreSize(commands.Size()); + // Generate stop commands, appended to the same 'commands' buffer + pmcMgr->end(&commands, cmdWriter); + } else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) { + pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr(); + if (sqttMgr == NULL) return HSA_STATUS_ERROR; + + // Start from the manager's default config and apply the caller's overrides + pm4_profile::ThreadTraceConfig sqtt_config; + sqttMgr->InitThreadTraceConfig(&sqtt_config); + if (profile->parameters) { + for (const hsa_ext_amd_aql_profile_parameters_t* p = profile->parameters; + p < (profile->parameters + profile->parameter_count); ++p) { + switch (p->parameter_name) { + case HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET: + sqtt_config.threadTraceTargetCu = p->value; + break; + case HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK: + sqtt_config.threadTraceVmIdMask = p->value; + break; + case HSA_EXT_AQL_PROFILE_PARAM_MASK: + sqtt_config.threadTraceMask = p->value; + break; + case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK: + sqtt_config.threadTraceTokenMask = p->value; + break; + case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2: + sqtt_config.threadTraceTokenMask2 = p->value; + break; + default: + return 
HSA_STATUS_ERROR; + } + } + } + sqttMgr->Init(&sqtt_config); + + sqttMgr->setSqttDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size); + + const uint32_t status_size = sqttMgr->StatusSizeInfo(); + void* status_ptr = cmdBufMgr.setPostfix(status_size); + if (status_ptr == NULL) return HSA_STATUS_ERROR; + // Control buffer registering + sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr); + + // Generate start commands + sqttMgr->BeginSession(&commands, cmdWriter); + cmdBufMgr.setPreSize(commands.Size()); + // Generate stop commands + sqttMgr->StopSession(&commands, cmdWriter); + } else + return HSA_STATUS_ERROR; + + if (!cmdBufMgr.checkTotalSize(commands.Size())) return HSA_STATUS_ERROR; + + const aql_profile::descriptor_t pre_descr = cmdBufMgr.getPreDescr(); + const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr(); + memcpy(pre_descr.ptr, commands.Base(), pre_descr.size); + memcpy(post_descr.ptr, commands.Base() + pre_descr.size, post_descr.size); + // Populate start aql packet + aql_profile::populateAql(pre_descr.ptr, pre_descr.size, cmdWriter, aql_start_packet); + + return HSA_STATUS_SUCCESS; +} + +// Method to populate the provided AQL packet with profiling stop commands +PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_stop( + const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_stop_packet) { + + aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile); + if (pm4_factory == NULL) return HSA_STATUS_ERROR; + + pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter(); + if (cmdWriter == NULL) return HSA_STATUS_ERROR; + + aql_profile::CommandBufferMgr cmdBufMgr(profile); + if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR; + + const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr(); + // Populate stop aql packet + aql_profile::populateAql(post_descr.ptr, post_descr.size, cmdWriter, aql_stop_packet); + + return HSA_STATUS_SUCCESS; +} + +// Converting of 
the profiling AQL packet to PM4 packet, GFX8 support +PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4( + const aql_profile::packet_t* aql_packet, void* pm4) { + return HSA_STATUS_ERROR; +} + +// Method for getting the profile info +PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_get_info( + const hsa_ext_amd_aql_profile_profile_t* profile, hsa_ext_amd_aql_profile_info_type_t attribute, + void* value) { + hsa_status_t status = HSA_STATUS_SUCCESS; + + switch (attribute) { + case HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE: + *(uint32_t*)value = 0x1000; // a current approximation as 4K is big enaugh + break; + case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE: + *(uint32_t*)value = 0x1000; // a current approximation as 4K is big enaugh + break; + case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA: + reinterpret_cast(value)->pmc_data.result = 0; + status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_pmcdata_callback, + value); + break; + case HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA: + status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_sqttdata_callback, + value); + break; + default: + status = HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + return status; +} + +// Method for iterating the events output data +PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_iterate_data( + const hsa_ext_amd_aql_profile_profile_t* profile, + hsa_ext_amd_aql_profile_data_callback_t callback, void* data) { + + hsa_status_t status = HSA_STATUS_SUCCESS; + aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile); + if (pm4_factory == NULL) return HSA_STATUS_ERROR; + + if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) { + uint32_t info_size = 0; + void* info_data; + uint64_t* samples = (uint64_t*)profile->output_buffer.ptr; + const uint32_t sample_count = profile->output_buffer.size / sizeof(uint64_t); + uint32_t sample_index = 0; + + pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr(); + if (pmcMgr == NULL) return HSA_STATUS_ERROR; 
+ + for (const hsa_ext_amd_aql_profile_event_t* p = profile->events; + p < (profile->events + profile->event_count); ++p) { + pm4_profile::CounterBlock* block = + pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p)); + if (block == NULL) return HSA_STATUS_ERROR; + if (!block->getInfo(pm4_profile::GPU_BLK_INFO_CONTROL_METHOD, info_size, &info_data)) { + return HSA_STATUS_ERROR; + } + const pm4_profile::CntlMethod method = + static_cast(*(static_cast(info_data))); + // A perfcounter data sample per ShaderEngine + const uint32_t block_samples_count = (method == pm4_profile::CntlMethodBySe || + method == pm4_profile::CntlMethodBySeAndInstance) + ? pmcMgr->getNumSe() + : 1; + for (uint32_t i = 0; i < block_samples_count; ++i) { + assert(sample_index < sample_count); + if (sample_index >= sample_count) return HSA_STATUS_ERROR; + + hsa_ext_amd_aql_profile_info_data_t sample_info; + sample_info.sample_id = i; + sample_info.pmc_data.event = *p; + sample_info.pmc_data.result = samples[sample_index]; + status = callback(HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, &sample_info, data); + if (status == HSA_STATUS_INFO_BREAK) { + status = HSA_STATUS_SUCCESS; + break; + } + if (status != HSA_STATUS_SUCCESS) break; + ++sample_index; + } + } + } else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) { + pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr(); + if (sqttMgr == NULL) return HSA_STATUS_ERROR; + + aql_profile::CommandBufferMgr cmdBufMgr(profile); + if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR; + + const uint32_t status_size = sqttMgr->StatusSizeInfo(); + // Control buffer was allocated as the CmdBuffer postfix partition + void* status_ptr = cmdBufMgr.setPostfix(status_size); + if (status_ptr == NULL) return HSA_STATUS_ERROR; + // Control buffer registering + sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr); + // Validate SQTT status and normalize WRPTR + if (sqttMgr->Validate() == false) return HSA_STATUS_ERROR; + + const uint32_t se_number = 
sqttMgr->getNumSe(); + // Casting status pointer to SQTT control per ShaderEngine array + aql_profile::sqtt_ctrl_t* sqtt_ctrl = (aql_profile::sqtt_ctrl_t*)status_ptr; + assert(status_size == sizeof(aql_profile::sqtt_ctrl_t) * se_number); + if (status_size != sizeof(aql_profile::sqtt_ctrl_t) * se_number) { + return HSA_STATUS_ERROR; + } + // SQTT output buffer and capacity per ShaderEngine + void* sample_ptr = profile->output_buffer.ptr; + const uint32_t sample_capacity = profile->output_buffer.size / se_number; + // The samples sizes are returned in the control buffer + for (int i = 0; i < se_number; ++i) { + // WPTR specifies the index in thread trace buffer where next token will be + // written by hardware. The index is incremented by size of 32 bytes. + uint32_t sample_size = sqtt_ctrl[i].writePtr * TT_WRITE_PTR_BLK; + + hsa_ext_amd_aql_profile_info_data_t sample_info; + sample_info.sample_id = i; + sample_info.sqtt_data.ptr = sample_ptr; + sample_info.sqtt_data.size = sample_size; + status = callback(HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA, &sample_info, data); + if (status == HSA_STATUS_INFO_BREAK) { + status = HSA_STATUS_SUCCESS; + break; + } + if (status != HSA_STATUS_SUCCESS) break; + + sample_ptr += sample_capacity; + } + } else { + status = HSA_STATUS_ERROR; + } + + return status; +} +} diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/aql_profile.h b/runtime/hsa-ext-aql-profile/src/aqlprofile/aql_profile.h new file mode 100644 index 0000000000..b51d754001 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/aql_profile.h @@ -0,0 +1,23 @@ +#ifndef _AQL_PROFILE_H_ +#define _AQL_PROFILE_H_ + +#include "hsa_ext_amd_aql_profile.h" + +namespace pm4_profile { +class CommandWriter; +} + +namespace aql_profile { + +typedef hsa_ext_amd_aql_profile_descriptor_t descriptor_t; +typedef hsa_ext_amd_aql_profile_profile_t profile_t; +typedef hsa_ext_amd_aql_profile_info_type_t info_type_t; +typedef hsa_ext_amd_aql_profile_data_callback_t data_callback_t; 
// GFX8 block ID mapping table.
// One entry per profile block name (HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER entries);
// Gfx8Factory::getBlockId indexes it with event->block_name and adds
// event->block_index to the selected kHsaViCounterBlockId* value.
uint32_t Gfx8Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
    pm4_profile::kHsaViCounterBlockIdCb0,    pm4_profile::kHsaViCounterBlockIdCpf,
    pm4_profile::kHsaViCounterBlockIdDb0,    pm4_profile::kHsaViCounterBlockIdGrbm,
    pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
    pm4_profile::kHsaViCounterBlockIdPaSc,   pm4_profile::kHsaViCounterBlockIdSpi,
    pm4_profile::kHsaViCounterBlockIdSq,     pm4_profile::kHsaViCounterBlockIdSqGs,
    pm4_profile::kHsaViCounterBlockIdSqVs,   pm4_profile::kHsaViCounterBlockIdSqPs,
    pm4_profile::kHsaViCounterBlockIdSqHs,   pm4_profile::kHsaViCounterBlockIdSqCs,
    pm4_profile::kHsaViCounterBlockIdSx,     pm4_profile::kHsaViCounterBlockIdTa0,
    pm4_profile::kHsaViCounterBlockIdTca0,   pm4_profile::kHsaViCounterBlockIdTcc0,
    pm4_profile::kHsaViCounterBlockIdTd0,    pm4_profile::kHsaViCounterBlockIdTcp0,
    pm4_profile::kHsaViCounterBlockIdGds,    pm4_profile::kHsaViCounterBlockIdVgt,
    pm4_profile::kHsaViCounterBlockIdIa,     pm4_profile::kHsaViCounterBlockIdMc,
    pm4_profile::kHsaViCounterBlockIdTcs,    pm4_profile::kHsaViCounterBlockIdWd};
+pm4_profile::Pmu * Gfx8Factory::getPmcMgr() { + return new pm4_profile::ViPmu(); +} + +pm4_profile::ThreadTrace * Gfx8Factory::getSqttMgr() { + return new pm4_profile::Gfx8ThreadTrace(); +} + +uint32_t Gfx8Factory::getBlockId(const event_t* event) { + return block_id_table[event->block_name] + event->block_index; +} + +} // aql_profile diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/gfx8_pm4_factory.cpp b/runtime/hsa-ext-aql-profile/src/aqlprofile/gfx8_pm4_factory.cpp new file mode 100644 index 0000000000..3bba202f54 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/gfx8_pm4_factory.cpp @@ -0,0 +1,70 @@ +#include "pm4_factory.h" +// Commandwriter includes +#include "gfx8_cmdwriter.h" +#include "gfx9_cmdwriter.h" +// PMC includes +#include "vi_pmu.h" +#include "ai_pmu.h" +// SQTT includes +#include "gfx8_thread_trace.h" +#include "gfx9_thread_trace.h" + +namespace aql_profile { + +// GFX8 block ID mapping table +uint32_t gfx8_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = { + pm4_profile::kHsaViCounterBlockIdCb0, pm4_profile::kHsaViCounterBlockIdCpf, + pm4_profile::kHsaViCounterBlockIdDb0, pm4_profile::kHsaViCounterBlockIdGrbm, + pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu, + pm4_profile::kHsaViCounterBlockIdPaSc, pm4_profile::kHsaViCounterBlockIdSpi, + pm4_profile::kHsaViCounterBlockIdSq, pm4_profile::kHsaViCounterBlockIdSqGs, + pm4_profile::kHsaViCounterBlockIdSqVs, pm4_profile::kHsaViCounterBlockIdSqPs, + pm4_profile::kHsaViCounterBlockIdSqHs, pm4_profile::kHsaViCounterBlockIdSqCs, + pm4_profile::kHsaViCounterBlockIdSx, pm4_profile::kHsaViCounterBlockIdTa0, + pm4_profile::kHsaViCounterBlockIdTca0, pm4_profile::kHsaViCounterBlockIdTcc0, + pm4_profile::kHsaViCounterBlockIdTd0, pm4_profile::kHsaViCounterBlockIdTcp0, + pm4_profile::kHsaViCounterBlockIdGds, pm4_profile::kHsaViCounterBlockIdVgt, + pm4_profile::kHsaViCounterBlockIdIa, pm4_profile::kHsaViCounterBlockIdMc, + 
pm4_profile::kHsaViCounterBlockIdTcs, pm4_profile::kHsaViCounterBlockIdWd}; + +// GFX9 block ID mapping table +uint32_t gfx9_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = { + pm4_profile::kHsaAiCounterBlockIdCb0, pm4_profile::kHsaAiCounterBlockIdCpf, + pm4_profile::kHsaAiCounterBlockIdDb0, pm4_profile::kHsaAiCounterBlockIdGrbm, + pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu, + pm4_profile::kHsaAiCounterBlockIdPaSc, pm4_profile::kHsaAiCounterBlockIdSpi, + pm4_profile::kHsaAiCounterBlockIdSq, pm4_profile::kHsaAiCounterBlockIdSqGs, + pm4_profile::kHsaAiCounterBlockIdSqVs, pm4_profile::kHsaAiCounterBlockIdSqPs, + pm4_profile::kHsaAiCounterBlockIdSqHs, pm4_profile::kHsaAiCounterBlockIdSqCs, + pm4_profile::kHsaAiCounterBlockIdSx, pm4_profile::kHsaAiCounterBlockIdTa0, + pm4_profile::kHsaAiCounterBlockIdTca0, pm4_profile::kHsaAiCounterBlockIdTcc0, + pm4_profile::kHsaAiCounterBlockIdTd0, pm4_profile::kHsaAiCounterBlockIdTcp0, + pm4_profile::kHsaAiCounterBlockIdGds, pm4_profile::kHsaAiCounterBlockIdVgt, + pm4_profile::kHsaAiCounterBlockIdIa, pm4_profile::kHsaAiCounterBlockIdMc, + pm4_profile::kHsaAiCounterBlockIdTcs, pm4_profile::kHsaAiCounterBlockIdWd}; + +pm4_profile::CommandWriter * Pm4Factory::getCommandWriter() { + return (is_gfx9 == true) ? + new pm4_profile::gfx9::Gfx9CmdWriter(false, true) : + new pm4_profile::gfx8::Gfx8CmdWriter(false, true); +} + +pm4_profile::Pmu * Pm4Factory::getPmcMgr() { + return (is_gfx9 == true) ? + new pm4_profile::AiPmu() : + new pm4_profile::ViPmu(); +} + +pm4_profile::ThreadTrace * Pm4Factory::getSqttMgr() { + return (is_gfx9 == true) ? + new pm4_profile::Gfx9ThreadTrace() : + new pm4_profile::Gfx8ThreadTrace(); +} + +uint32_t Pm4Factory::getBlockId(const event_t* event) { + return (is_gfx9 == true) ? 
+ gfx9_block_id_table[event->block_name] + event->block_index : + gfx8_block_id_table[event->block_name] + event->block_index : +} + +} // aql_profile diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/gfx9_factory.cpp b/runtime/hsa-ext-aql-profile/src/aqlprofile/gfx9_factory.cpp new file mode 100644 index 0000000000..6909eb3d63 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/gfx9_factory.cpp @@ -0,0 +1,43 @@ +#include "pm4_factory.h" +// Commandwriter includes +#include "gfx9_cmdwriter.h" +// PMC includes +#include "ai_pmu.h" +// SQTT includes +#include "gfx9_thread_trace.h" + +namespace aql_profile { + +// GFX9 block ID mapping table +uint32_t Gfx9Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = { + pm4_profile::kHsaAiCounterBlockIdCb0, pm4_profile::kHsaAiCounterBlockIdCpf, + pm4_profile::kHsaAiCounterBlockIdDb0, pm4_profile::kHsaAiCounterBlockIdGrbm, + pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu, + pm4_profile::kHsaAiCounterBlockIdPaSc, pm4_profile::kHsaAiCounterBlockIdSpi, + pm4_profile::kHsaAiCounterBlockIdSq, pm4_profile::kHsaAiCounterBlockIdSqGs, + pm4_profile::kHsaAiCounterBlockIdSqVs, pm4_profile::kHsaAiCounterBlockIdSqPs, + pm4_profile::kHsaAiCounterBlockIdSqHs, pm4_profile::kHsaAiCounterBlockIdSqCs, + pm4_profile::kHsaAiCounterBlockIdSx, pm4_profile::kHsaAiCounterBlockIdTa0, + pm4_profile::kHsaAiCounterBlockIdTca0, pm4_profile::kHsaAiCounterBlockIdTcc0, + pm4_profile::kHsaAiCounterBlockIdTd0, pm4_profile::kHsaAiCounterBlockIdTcp0, + pm4_profile::kHsaAiCounterBlockIdGds, pm4_profile::kHsaAiCounterBlockIdVgt, + pm4_profile::kHsaAiCounterBlockIdIa, pm4_profile::kHsaAiCounterBlockIdMc, + pm4_profile::kHsaAiCounterBlockIdTcs, pm4_profile::kHsaAiCounterBlockIdWd}; + +pm4_profile::CommandWriter * Gfx9Factory::getCommandWriter() { + return new pm4_profile::gfx9::Gfx9CmdWriter(false, true); +} + +pm4_profile::Pmu * Gfx9Factory::getPmcMgr() { + return new pm4_profile::AiPmu(); +} + 
+pm4_profile::ThreadTrace * Gfx9Factory::getSqttMgr() { + return new pm4_profile::Gfx9ThreadTrace(); +} + +uint32_t Gfx9Factory::getBlockId(const event_t* event) { + return block_id_table[event->block_name] + event->block_index; +} + +} // aql_profile diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/pm4_factory.h b/runtime/hsa-ext-aql-profile/src/aqlprofile/pm4_factory.h new file mode 100644 index 0000000000..53889f9033 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/pm4_factory.h @@ -0,0 +1,62 @@ +#ifndef _PM4_FACTORY_H_ +#define _PM4_FACTORY_H_ + +#include +#include + +#include "aql_profile.h" + +namespace pm4_profile { +class CommandWriter; +class Pmu; +class ThreadTrace; +} + +namespace aql_profile { + +class Pm4Factory { + public: + static Pm4Factory* Create(const hsa_ext_amd_aql_profile_profile_t* profile); + virtual pm4_profile::CommandWriter* getCommandWriter() = 0; + virtual pm4_profile::Pmu* getPmcMgr() = 0; + virtual pm4_profile::ThreadTrace* getSqttMgr() = 0; + virtual uint32_t getBlockId(const event_t* event) = 0; +}; + +class Gfx8Factory : public Pm4Factory { + public: + pm4_profile::CommandWriter* getCommandWriter(); + pm4_profile::Pmu* getPmcMgr(); + pm4_profile::ThreadTrace* getSqttMgr(); + uint32_t getBlockId(const event_t* event); + + private: + static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER]; +}; + +class Gfx9Factory : public Pm4Factory { + public: + pm4_profile::CommandWriter* getCommandWriter(); + pm4_profile::Pmu* getPmcMgr(); + pm4_profile::ThreadTrace* getSqttMgr(); + uint32_t getBlockId(const event_t* event); + + private: + static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER]; +}; + +inline Pm4Factory* Pm4Factory::Create(const hsa_ext_amd_aql_profile_profile_t* profile) { + Pm4Factory* instance = NULL; + char agent_name[64]; + hsa_agent_get_info(profile->agent, HSA_AGENT_INFO_NAME, agent_name); + if (strncmp(agent_name, "gfx8", 4) == 0) { + instance = new Gfx8Factory(); + } else if 
(strncmp(agent_name, "gfx9", 4) == 0) { + instance = new Gfx9Factory(); + } + return instance; +} + +} // aql_profile + +#endif // _PM4_FACTORY_H_ diff --git a/runtime/hsa-ext-aql-profile/src/aqlprofile/populate_aql.cpp b/runtime/hsa-ext-aql-profile/src/aqlprofile/populate_aql.cpp new file mode 100644 index 0000000000..aa3da9f463 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/aqlprofile/populate_aql.cpp @@ -0,0 +1,41 @@ +#include +#include + +#include "aql_profile.h" +#include "cmdwriter.h" +#include "amd_aql_pm4_ib_packet.h" + +namespace aql_profile { + +void populateAql(uint32_t* ib_packet, packet_t* aql_packet) { + // Populate relevant fields of Aql pkt + // Size of IB pkt is four DWords + // Header and completion sinal are not set + amd_aql_pm4_ib_packet_t* aql_pm4_ib = reinterpret_cast(aql_packet); + aql_pm4_ib->pm4_ib_format = AMD_AQL_PM4_IB_FORMAT; + aql_pm4_ib->pm4_ib_command[0] = ib_packet[0]; + aql_pm4_ib->pm4_ib_command[1] = ib_packet[1]; + aql_pm4_ib->pm4_ib_command[2] = ib_packet[2]; + aql_pm4_ib->pm4_ib_command[3] = ib_packet[3]; + aql_pm4_ib->dw_count_remain = AMD_AQL_PM4_IB_DW_COUNT_REMAIN; + for (int i = 0; i < AMD_AQL_PM4_IB_RESERVED_COUNT; ++i) { + aql_pm4_ib->reserved[i] = 0; + } + + uint32_t* words = (uint32_t*)aql_packet; + std::clog << std::setw(40) << std::left << "AQL 'IB' size(16)" + << ":"; + for (int idx = 0; idx < 16; idx++) { + std::clog << " " << std::hex << std::setw(8) << std::setfill('0') << words[idx]; + } + std::clog << std::setfill(' ') << std::endl; +} + +void populateAql(void* cmd_buffer, uint32_t cmd_size, + pm4_profile::CommandWriter* cmd_writer, packet_t* ppt_packet) { + pm4_profile::DefaultCmdBuf ib_buffer; + cmd_writer->BuildIndirectBufferCmd(&ib_buffer, cmd_buffer, (size_t)cmd_size); + uint32_t* ib_cmds = (uint32_t*)ib_buffer.Base(); + populateAql(ib_cmds, ppt_packet); +} +} diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/CMakeLists.txt b/runtime/hsa-ext-aql-profile/src/commandwriter/CMakeLists.txt new 
#
# Source files for Rocr Cmdwriter: one translation unit per GPU generation
#
set(CmdWriterSrcs
  gfx8_cmdwriter.cpp
  gfx9_cmdwriter.cpp
)

#
# Header search path(s); taken from the ROCR_INC_DIR environment variable
# at configure time.
#
include_directories($ENV{ROCR_INC_DIR})

#
# Build Cmdwriter as a static library named by CMDWRITER_LIB
#
add_library(${CMDWRITER_LIB} STATIC ${CmdWriterSrcs})
+ virtual bool Reset(void) = 0; + + /// @brief Appends input command into a buffer that could + /// be queried for its size and other properties. The append + /// does not verify the contents. + /// + /// @param cmd Buffer containing one or more instances of Gpu commands + /// + /// @param size size of the Gpu commands in bytes. + /// + /// @return void + virtual void AppendCommand(const void* cmd, uint32_t size) = 0; + + /// @brief Returns the total size (in bytes) of the accumulated commands. + /// + /// @return size_t size of Gpu commands in bytes + virtual size_t Size() const = 0; + + private: + /// Indexes the command buffer by dwords. Allows accessing constants + /// in an assembled command buffer. + virtual uint32_t& operator[](size_t index) = 0; + + friend class CommandWriter; +}; + +/// @brief Implements the interface CmdBuf and thus can be used to +/// translate various Gpu commands as byte stream. +/// +/// @note: The Api does not require implementations to be thread safe. +/// Users are therefore required to be access in a serialized manner. +class DefaultCmdBuf : public CmdBuf { + public: + /// @brief Append the command into the underlying buffer + /// + /// @param cmd Buffer containing one or more instances of Gpu commands + /// + /// @param size Size of Gpu command(s) in bytes + /// + /// @retur void + virtual void AppendCommand(const void* cmd, uint32_t size) { + memcpy(ReserveCmdbufSpace(size), cmd, size); + } + + /// @brief Resets the Gpu command buffer + bool Reset() { + cmdbuf_.clear(); + return true; + } + + /// Size of Gpu commands in bytes in the underlying buffer + size_t Size() const { return cmdbuf_.size() * sizeof(StorageType); } + + /// Address of the start of accumulated commands. 
+ const void* Base() const { return &cmdbuf_[0]; } + + private: + /// @brief Returns reference to the value of Gpu command buffer + /// at specified index + /// + /// @param index Specifies the buffer index whose value is needed + /// + /// @return uint32_t & Reference of the value being returned + uint32_t& operator[](size_t index) { return cmdbuf_[index]; } + + /// @brief Increase Gpu command buffer by specified size + /// + /// @param size Size in bytes by which command buffer should + /// be resized. + /// + /// @return void * Pointer into the buffer where the next + /// command can be written + void* ReserveCmdbufSpace(std::size_t size) { + const size_t len = cmdbuf_.size(); + cmdbuf_.resize(len + size / sizeof(StorageType)); + return &cmdbuf_[len]; + } + + /// @brief Defines Gpu command buffer as a vector of StorageType + typedef uint32_t StorageType; + std::vector cmdbuf_; +}; + +/// @brief Specifies the public interface of CommandWriter for use by +/// clients to build Gpu command streams. +class CommandWriter { + public: + /// @brief These enums specify the operation to perform in the packet + /// generated by BuildAtomicPacket. The commenting for each enum uses + /// the arguments to the function BuildAtomicPacket to express the + /// resulting operation. + enum AtomicType { + + /// *destination = *destination + 1; + kAtomicTypeIncrement, + + /// *destination = *destination - 1; + kAtomicTypeDecrement, + + /// if (*destination == compare) *destination = value; + kAtomicTypeCompareAndSwap, + + /// while (*destination != compare); + /// *destination = value; + kAtomicTypeBlockingCompareAndSwap, + + /// *destination = *destination + value; + kAtomicAdd, + + /// *destination = *destination - value; + kAtomicSubtract, + + /// *destination = value; + kAtomicSwap + }; + + /// @brief These enums specify the VGT EVENT TYPE to issue and wait for. 
+ /// Command Processor (CP) uses these events to communicate with SPI to + /// learn about outstanding waves and determine kernel completion. + enum VgtEventType { + + /// Enable Performance Counters + kPerfCntrsStart, + + /// Disable Performance Counters + kPerfCntrsStop, + + /// Read Performance Counters + kPerfCntrsSample, + + /// Enable a Thread Trace session + kThrdTraceStart, + + /// Disable a Thread Trace session + kThrdTraceStop, + + /// Enable flushing of thread trace buffers + kThrdTraceFlush, + + /// Enables resetting of BASE register to its last value + /// including flushing of thread trace buffers. This could + /// be used to toggle between two buffers so as to allow + /// collection of large token data + kThrdTraceFinish + }; + + /// @brief Returns the Dword that encodes a No-Op for the CP + /// + /// @return uint32_t Dword that can be used to populate a Pm4 + /// command queue. + /// + virtual uint32_t GetNoOpCmd() = 0; + + /// @brief Build an instance of Barrier command and copy it into + /// the input commmand buffer + /// + /// @param cmdbuf Pointer to command buffer which is updated with + /// an instance of Barrier command. + /// + /// @return void + virtual void BuildBarrierCommand(CmdBuf* cmdbuf) = 0; + + /// @brief Builds the Gpu command to reference indirectly a stream + /// of other Gpu commands. The launch command is then copied into + /// the command buffer parameter. + /// + /// @param cmdBuf command buffer to be appended with launch command + /// + /// @param cmd_addr Address of command buffer carrying command stream + /// + /// @param cmd_size Size of dispatch command stream in bytes + /// + /// @return void + virtual void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, + std::size_t cmd_size) = 0; + + /// @brief Build a Gpu command that triggers an event whose type + /// is specified by input parameter. 
It then copies it into the input + /// command buffer + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param event Id of Event to be triggered by Gpu + /// + /// @return void + virtual void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) = 0; + + /// @bried Builds a Gpu command to wait until condition is realized + /// + /// @param cmdbuf command buffer to be appended with launch command + /// + /// @param mem_space if the address is in memory or is a register offset + /// + /// @param wait_addr address to wait on + /// + /// @param func_eq true means equal, false means not-equal + /// + /// @param mask_val Mask to apply on value from addr in comparison + /// + /// @param wait_val value to apply for the func given above + virtual void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, + bool func_eq, uint32_t mask_val, uint32_t wait_val) = 0; + + virtual void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) = 0; + + /// @brief Build CP command to program a Gpu register + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// @param addr Register to be programmed + /// @param value Value to write into register + /// + /// @return void + virtual void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0; + + /// @brief Build and copy WriteShReg command + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param addr Offset of the register + /// + /// @param value Value to write into register + /// + /// @return void + virtual void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0; + + /// @brief Builds a Gpu command to flush Gpu caches and write a + /// user defined value at a configurable location that is Gpu + /// accessible. 
+ /// + /// @param cmdBuf Command buffer to be appended with bottom of pipe + /// notification command + /// + /// @param write_addr Address into which Gpu should write + /// + /// @param write_val Value to write into user provided address + /// + /// @param interrupt True if Gpu should raise an interrupt upon writing + /// the user value + /// + /// @return void + virtual void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val, + bool intrpt) = 0; + + + /// @brief Build a Gpu command that copies data from a specified + /// source to destination + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param reg_to_mem flag to indicate if values are being read from a + /// Register or a memory location + /// + /// @param src_addr_lo Low 32-bit Source address of the data to read from + /// + /// @param src_addr_hi High 32-bit Source address of the data to read from + /// + /// @param dst_addr Destination address for the data to be written to + /// + /// @param size Size of the data to be written + /// + /// @param wait True if Gpu command should confirm the write operation + /// operation has completed successfully + /// + /// @return void + /// + /// @NOTE Change interface to use void* for Src and void* for Dest + virtual void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo, + uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, + bool wait) = 0; + + /// @brief Build and copy a WaitIdle Gpu command into command buffer + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @return void + virtual void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) = 0; + + // Will issue a VGT event including a cache flush later on + virtual void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) = 0; + + /// @brief Build and copy a WriteRegister Gpu command into command buffer + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param addr Register into 
which to write + /// + /// @param value Value to write into register + /// + /// @return void + virtual void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0; + + /// @brief Build and copy a Gpu command to query the status of a + /// WriteEvent into command buffer + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param event Id of Event whose status is to be queried + /// + /// @param addr Address to update the status of WriteEvent operation + /// + /// @return void + virtual void BuildWriteEventQueryPacket(CmdBuf* cmdBuf, uint32_t event, uint32_t* addr) = 0; + + /// @brief Builds and copies a Gpu comamnd to peform user specified + /// operation atomically. The various atomic operations on integers + /// that are supported include: increment, decrement, add, subtract, + /// compare-and-swap and swap. The operation to perform is specified + /// by the enum AtomicType. + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param atomic_op Id of the atomic operation to perform + /// + /// @param addr Pointer to the memory block where atomic operation + /// would be performed + /// + /// @param value New value to write if atomic operation can be performed + /// + /// @param compare Value to compare if atomic operation is a compare-and-swap + /// + /// @return void + virtual void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr, + uint32_t value = 0, uint32_t compare = 0) = 0; + + /// @brief Builds and copies a Gpu comamnd to peform user specified + /// operation atomically. The various atomic operations on integers + /// that are supported include: increment, decrement, add, subtract, + /// compare-and-swap and swap. The operation to perform is specified + /// by the enum AtomicType. 
+ /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param atomic_op Id of the atomic operation to perform + /// + /// @param addr Pointer to the memory block where atomic operation + /// would be performed + /// + /// @param value New value to write if atomic operation can be performed + /// + /// @param compare Value to compare if atomic operation is a compare-and-swap + /// + /// @return void + virtual void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr, + uint64_t value = 0, uint64_t compare = 0) = 0; + + /// @brief Returns the size of an atomic packet + /// + /// @return size_t Size of atomic packet; the Gfx8 implementation reports it + /// in 32-bit dwords (sizeof(packet) / sizeof(uint32_t)), not in bytes + virtual size_t SizeOfAtomicPacket() const = 0; + + /// @brief Build and copy a Gpu command that will tell command processor + /// to conditionally execute or skip the next sequence of packets. + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param signal Pointer to an integer that tells the command processor + /// whether to skip or execute the next block of packets. If it is set + /// to 0 the following packets will be skipped, else it will execute the + /// following packets + /// + /// @param count The number of dwords in the following packet stream + /// that will be conditionally executed + /// + /// @return void + virtual void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) = 0; + + /// @brief Builds a CP command to write user specified value + /// at a user specified address. The command is then copied + /// into the command buffer for submission to a device queue. 
+ /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param write_addr Address into which CP will write the user + /// specified value + /// + /// @param write_value Value to write into the user specified address + /// + /// @return void + virtual void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, + uint32_t write_value) = 0; + + /// @brief Builds a CP command to write user specified 64-bit value + /// at a user specified address. The command is then copied + /// into the command buffer for submission to a device queue. + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// + /// @param write_addr Address into which CP will write the user + /// specified value + /// + /// @param write_value Value to write into the user specified address + /// + /// @return void + virtual void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, + uint64_t write_value) = 0; + + /// @brief Writes into input buffer Gpu commands to flush its cache. It is + /// necessary that the buffer provided for flush commands is large + /// enough to accommodate the full set of commands. It should be at + /// least 512 bytes. + /// + /// @param cmdbuf Buffer to write commands to. + /// @param options Selects which caches the flush commands act on. + /// @param writeAddr Registered address into which GPU should write + /// a user provided value upon executing the flush commands. + /// @param writeVal User provided value written by GPU at user provided + /// address, upon executing the flush commands. + /// + /// @return void + virtual void BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options, uint32_t* writeAddr, + uint32_t writeVal) = 0; + + /// @brief Builds Gpu command to copy data from source to destination + /// buffer using DMA engine. 
+ /// + /// @param cmdbuf Buffer updated with Gpu copy command + /// @param srcAddrLo Address of source buffer + /// @param dstAddr Address of destination buffer + /// @param copySize Size of data to copy in bytes + /// @param waitForCompletion if command should wait for copying to complete + virtual void BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddrLo, uint32_t* dstAddr, + uint32_t copySize, bool waitForCompletion) = 0; + + /// @brief Release resources used by CommandWriter + virtual ~CommandWriter() {} + + protected: + /// @brief Return the reference to a value in the command buffer + uint32_t& IndexBuffer(CmdBuf* cmdbuf, uint32_t index) { return (*cmdbuf)[index]; } +}; + +/// @brief Returns u rounded up to the next multiple of r. The mask arithmetic +/// is only correct when r is a power of two +inline uint32_t RoundUp(uint32_t u, uint32_t r) { return ((u + (r - 1)) & ~(r - 1)); } + +/// @brief Returns the lower 32-bits of a value +inline uint32_t Low32(uint64_t u) { return (u & 0xFFFFFFFFUL); } + +/// @brief Returns the upper 32-bits of a value +inline uint32_t High32(uint64_t u) { return (u >> 32); } + +/// @brief Returns bits [39:8] of a 48-bit address. The assert requires the +/// address to be 256-byte aligned and to fit in 48 bits +inline uint32_t Ptr48Low32(const void* p) { + uintptr_t ptr = reinterpret_cast<uintptr_t>(p); + assert((ptr & 0xFFFFFFFFFF00ULL) == ptr); + return (uint32_t)((ptr & 0xFFFFFFFFFFULL) >> 8); +} + +/// @brief Returns bits [47:40] (the upper 8 bits) of a 48-bit address +inline uint8_t Ptr48High8(const void* p) { + uintptr_t ptr = reinterpret_cast<uintptr_t>(p); + return (uint8_t)((ptr & 0xFF0000000000ULL) >> 40); +} + +/// @brief Returns the lower 32-bits of an address +inline uint32_t PtrLow32(const void* p) { + return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p)); +} + +/// @brief Returns the upper 32-bits of an address; always 0 when +/// HSA_LARGE_MODEL is not defined +inline uint32_t PtrHigh32(const void* p) { + uint32_t hi_32 = 0; +#ifdef HSA_LARGE_MODEL + hi_32 = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32); + static_assert(sizeof(void*) == 8, "HSA_LARGE_MODEL is not set properly here!"); +#else + static_assert(sizeof(void*) == 4, "HSA_LARGE_MODEL is not set 
properly here!"); +#endif + return hi_32; +} + +} // namespace pm4_profile + +#endif // _CMDWRITER_H_ diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmds.h b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmds.h new file mode 100644 index 0000000000..2db962c03b --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmds.h @@ -0,0 +1,161 @@ +#ifndef _GFX8_CMDS_H_ +#define _GFX8_CMDS_H_ + +#include "gfxip/gfx8/si_ci_vi_merged_enum.h" +#include "gfxip/gfx8/si_ci_vi_merged_mask.h" +#include "gfxip/gfx8/si_ci_vi_merged_offset.h" +#include "gfxip/gfx8/si_ci_vi_merged_registers.h" +#include "gfxip/gfx8/si_ci_vi_merged_typedef.h" +#include "gfxip/gfx8/si_ci_vi_merged_pm4_it_opcodes.h" +#include "gfxip/gfx8/si_pm4defs.h" + +namespace pm4_profile { + +namespace gfx8 { + +// Desc: Defines the Gpu command to dispatch a kernel. It embeds +// various Gpu hardware specific data structures for initialization +// and configuration before a dispatch begins to run +struct DispatchTemplate { + // Desc: Structure used to initialize the group dimensions + // of a kernel dispatch and if performance counters are enabled + struct DispatchDimensionRegs { + // Packet header and register offset (header / regOffset members) + PM4CMDSETDATA cmd_set_data; + regCOMPUTE_START_X compute_start_x; + regCOMPUTE_START_Y compute_start_y; + regCOMPUTE_START_Z compute_start_z; + regCOMPUTE_NUM_THREAD_X compute_num_thread_x; + regCOMPUTE_NUM_THREAD_Y compute_num_thread_y; + regCOMPUTE_NUM_THREAD_Z compute_num_thread_z; + regCOMPUTE_PIPELINESTAT_ENABLE__CI__VI compute_pipelinestat_enable; + } dimension_regs; + + // Desc: Structure used to initialize kernel Isa, trap + // handler, trap handler buffer, number of SGPR and VGPR + // registers needed, amount of Group memory and LDS needed, + // Rounding mode for Floating point numbers, etc. 
+ struct DispatchProgramRegs { + // Packet header and register offset (header / regOffset members) + PM4CMDSETDATA cmd_set_data; + regCOMPUTE_PGM_LO compute_pgm_lo; + regCOMPUTE_PGM_HI compute_pgm_hi; + regCOMPUTE_TBA_LO compute_tba_lo; + regCOMPUTE_TBA_HI compute_tba_hi; + regCOMPUTE_TMA_LO compute_tma_lo; + regCOMPUTE_TMA_HI compute_tma_hi; + regCOMPUTE_PGM_RSRC1 compute_pgm_rsrc1; + regCOMPUTE_PGM_RSRC2 compute_pgm_rsrc2; + } program_regs; + + // Desc: Structure used to initialize parameters related to + // thread management i.e. number of waves to issue and number + // of Compute Units to use + struct DispatchResourceRegs { + // Packet header and register offset (header / regOffset members) + PM4CMDSETDATA cmd_set_data; + regCOMPUTE_RESOURCE_LIMITS compute_resource_limits; + regCOMPUTE_STATIC_THREAD_MGMT_SE0 compute_static_thread_mgmt_se0; + regCOMPUTE_STATIC_THREAD_MGMT_SE1 compute_static_thread_mgmt_se1; + regCOMPUTE_TMPRING_SIZE compute_tmpring_size; + regCOMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI compute_static_thread_mgmt_se2; + regCOMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI compute_static_thread_mgmt_se3; + regCOMPUTE_RESTART_X__CI__VI compute_restart_x; + regCOMPUTE_RESTART_Y__CI__VI compute_restart_y; + regCOMPUTE_RESTART_Z__CI__VI compute_restart_z; + regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI compute_thread_trace_enable; + } resource_regs; + + // Desc: Structure used to pass handles of the Aql dispatch + // packet, Aql queue, Kernel argument address block, Scratch + // buffer + struct DispatchComputeUserDataRegs { + // Packet header and register offset (header / regOffset members) + PM4CMDSETDATA cmd_set_data; + uint32_t compute_user_data[16]; + } compute_user_data_regs; + + // Desc: Structure used to configure Cache flush policy + // and dimensions of total work size + PM4CMDDISPATCHDIRECT dispatch_direct; +}; + +// Desc: Structure used to issue a Gpu Barrier command +// (an EVENT_WRITE packet) +struct BarrierTemplate { + PM4CMDEVENTWRITE event_write; +}; + +// Desc: Structure used to configure the flushing +// of various caches - instruction, constants, L1 +// and L2 +struct AcquireMemTemplate { + PM4CMDACQUIREMEM acquire_mem; +}; + +// Desc: Structure used to reference another Gpu command 
+// indirectly. Generally used to reference a list of Gpu +// commands (dispatch cmds) indirectly +struct LaunchTemplate { + PM4CMDINDIRECTBUFFER indirect_buffer; +}; + +// Desc: Structure used to determine the end of +// a kernel including cache flushes and writing to +// a user configurable memory location +struct EndofKernelNotifyTemplate { + PM4CMDRELEASEMEM release_mem; +}; + +// Desc: Structure used to perform various atomic +// operations - add, subtract, increment, etc +struct AtomicTemplate { + PM4CMDATOMIC atomic; +}; + +// Desc: Structure used to conditionalize the execution +// of a Gpu command stream +struct ConditionalExecuteTemplate { + PM4CMDCONDEXEC_CI conditional; +}; + +// Desc: PM4 command to write a 32-bit value into a memory +// location accessible to Gpu +struct WriteDataTemplate { + PM4CMDWRITEDATA write_data; + uint32_t write_data_value; +}; + +// Desc: PM4 command to write a 64-bit value into a memory +// location accessible to Gpu +struct WriteData64Template { + PM4CMDWRITEDATA write_data; + uint64_t write_data_value; +}; + +// Desc: PM4 command to wait for a certain event before proceeding +// to process another command on the queue +struct WaitRegMemTemplate { + PM4CMDWAITREGMEM wait_reg_mem; +}; + +// Desc: Initializer for commands that set shader registers. +// T must expose a cmd_set_data member; the packet size is sizeof(T) in dwords +template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = + PM4_TYPE_3_HDR(IT_SET_SH_REG, sizeof(T) / sizeof(uint32_t), ShaderCompute, 0); + pm4->cmd_set_data.regOffset = reg_addr - PERSISTENT_SPACE_START; +} + +// Desc: Initializer for various Gpu command headers. +// T must expose a header member; the packet size is sizeof(T) in dwords +template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) { + pm4->header.u32All = PM4_TYPE_3_HDR(op_code, sizeof(T) / sizeof(uint32_t), ShaderCompute, 0); +} + +// Desc: Initializer for commands that set configuration registers +template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = + PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, 
sizeof(T) / sizeof(uint32_t), ShaderCompute, 0); + pm4->cmd_set_data.regOffset = reg_addr - CONFIG_SPACE_START; +} + + +} // gfx8 + +} // pm4_profile + +#endif // _GFX8_CMDS_H_ diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmdwriter.cpp b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmdwriter.cpp new file mode 100644 index 0000000000..1dbe1183e2 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmdwriter.cpp @@ -0,0 +1,768 @@ +#include <assert.h> +#include <string.h> +#include <iomanip> +#include <iostream> +#include <sstream> + +#include "gfx8_cmdwriter.h" +#include "gfxip/gfx8/gfx8_utils.h" + +// RELEASE MEM DST SEL Definitions +#define RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER 0 +#define RELEASE_MEM_DST_SEL_TC_L2 1 + +// RELEASE MEM CACHE POLICY Definitions +#define RELEASE_MEM_CACHE_POLICY_LRU 0 +#define RELEASE_MEM_CACHE_POLICY_STREAM 1 +#define RELEASE_MEM_CACHE_POLICY_BYPASS 2 + +// Debug helper: dumps a PM4 packet to std::clog as 8-digit hex dwords, +// prefixed with the given name and the packet size in dwords. +// Compiles to an empty function when NDEBUG is defined. +template <class T> +static void PrintPm4Packet(const T& command, const char* name) { +#if ! defined(NDEBUG) + const uint32_t* cmd = reinterpret_cast<const uint32_t*>(&command); + uint32_t size = sizeof(command) / sizeof(uint32_t); + std::ostringstream oss; + oss << "'" << name << "' size(" << std::dec << size << ")"; + std::clog << std::setw(40) << std::left << oss.str() << ":"; + for (uint32_t idx = 0; idx < size; idx++) { + // std::left above is sticky; switch to std::right so the '0' fill pads on the left + std::clog << " " << std::hex << std::right << std::setw(8) << std::setfill('0') << cmd[idx]; + } + // Restore fill and number base so later std::clog output is not printed in hex + std::clog << std::setfill(' ') << std::dec << std::endl; +#endif +} + +// Logs the packet (debug builds only) and appends it to the command buffer. +// Wrapped in do/while(0) so the macro expands to a single statement. +#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \ + do { \ + PrintPm4Packet(command, __FUNCTION__); \ + AppendCommand(cmdbuf, command); \ + } while (0) + +namespace pm4_profile { +namespace gfx8 { + +// Appends the raw bytes of a packet structure to the command buffer +template <class T> void Gfx8CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) { + cmdbuf->AppendCommand(&command, sizeof(command)); +} + +// Zeroes the atomic template, writes its ATOMIC_MEM header and sets the ATC +// bit when ATC is supported +void Gfx8CmdWriter::InitializeAtomicTemplate() { + memset(&atomic_template_, 0, sizeof(atomic_template_)); + GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM__CI); + + if (atc_support_) { + const uint32_t kAtcShift = 24; + atomic_template_.atomic.ordinal2 |= 
1 << kAtcShift; + } +} + +// Zeroes the conditional-execute template, writes its COND_EXEC header and +// sets the ATC bit when ATC is supported +void Gfx8CmdWriter::InitializeConditionalTemplate() { + memset(&conditional_template_, 0, sizeof(conditional_template_)); + gfx8::GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC); + + if (atc_support_) { + const uint32_t kAtcShift = 24; + conditional_template_.conditional.ordinal4 |= 1 << kAtcShift; + } +} + +// Zeroes the indirect-buffer (launch) template, writes its INDIRECT_BUFFER +// header and marks the buffer as valid +void Gfx8CmdWriter::InitializeLaunchTemplate() { + memset(&launch_template_, 0, sizeof(launch_template_)); + + GenerateCmdHeader(&launch_template_.indirect_buffer, IT_INDIRECT_BUFFER); + launch_template_.indirect_buffer.CI.valid = true; +} + +// Prepares the template used by BuildWriteDataCommand (32-bit payload) +void Gfx8CmdWriter::InitializeWriteDataTemplate() { + // Zero the whole template before the individual fields are filled in + memset(&write_data_template_, 0, sizeof(write_data_template_)); + + // Initialize the header of command packet + PM4CMDWRITEDATA* command = &(write_data_template_.write_data); + uint32_t cmd_size = sizeof(write_data_template_) / sizeof(uint32_t); + command->ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, cmd_size, ShaderCompute, 0); + + // Set the ATC bit of command template - specifies if the address + // belongs to system memory + write_data_template_.write_data.atc__CI = (atc_support_) ? 
1 : 0; + + // Set the bit to confirm the write operation and cache policy + write_data_template_.write_data.wrConfirm = 1; + write_data_template_.write_data.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS; + + // Specify the module that will execute the write data command + write_data_template_.write_data.engineSel = WRITE_DATA_ENGINE_ME; + + // Specify the class to which the write destination belongs + write_data_template_.write_data.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC; +} + +// Prepares the template used by BuildWriteData64Command (64-bit payload) +void Gfx8CmdWriter::InitializeWriteData64Template() { + // Zero the whole template before the individual fields are filled in + memset(&write_data64_template_, 0, sizeof(write_data64_template_)); + + // Initialize the header of command packet; the size covers the whole + // template structure, counted in dwords + PM4CMDWRITEDATA* command = &(write_data64_template_.write_data); + uint32_t cmd_size = sizeof(write_data64_template_) / sizeof(uint32_t); + command->ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, cmd_size, ShaderCompute, 0); + + // Set the ATC bit of command template - specifies if the address + // belongs to system memory + write_data64_template_.write_data.atc__CI = (atc_support_) ? 1 : 0; + + // Set the bit to confirm the write operation and cache policy + write_data64_template_.write_data.wrConfirm = 1; + write_data64_template_.write_data.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS; + + // Specify the module that will execute the write data command + write_data64_template_.write_data.engineSel = WRITE_DATA_ENGINE_ME; + + // Specify the class to which the write destination belongs + // write_data64_template_.write_data.dstSel = WRITE_DATA_DST_SEL_TCL2; + // TODO: For Hawaii bring up only. 
+ write_data64_template_.write_data.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC; +} + +// Prepares the barrier template: an EVENT_WRITE packet carrying the +// CS_PARTIAL_FLUSH event (appended by BuildBarrierCommand) +void Gfx8CmdWriter::InitializeBarrierTemplate() { + memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_)); + + gfx8::GenerateCmdHeader(&pending_dispatch_template_.event_write, IT_EVENT_WRITE); + pending_dispatch_template_.event_write.eventType = CS_PARTIAL_FLUSH; + pending_dispatch_template_.event_write.eventIndex = EventTypeToIndexTable[CS_PARTIAL_FLUSH]; +} + +// Prepares the ACQUIRE_MEM template: base address 0, size fields set to all +// ones and no poll interval. The coherence control bits are left at zero +// here and are set by BuildBarrierFenceCommands +void Gfx8CmdWriter::InitializeAcquireMemTemplate() { + memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_)); + + gfx8::GenerateCmdHeader(&invalidate_cache_template_.acquire_mem, IT_ACQUIRE_MEM__CI__VI); + invalidate_cache_template_.acquire_mem.cpCoherBase.u32All = 0x00; + invalidate_cache_template_.acquire_mem.cpCoherBaseHi.u32All = 0x00; + invalidate_cache_template_.acquire_mem.cpCoherSize.u32All = 0xFFFFFFFF; + invalidate_cache_template_.acquire_mem.cpCoherSizeHi.u32All = 0xFF; + invalidate_cache_template_.acquire_mem.pollInterval = 0; +} + +// Prepares the WAIT_REG_MEM template; the address, function, mask and +// reference value are filled in per call by BuildWaitRegMemCommand +void Gfx8CmdWriter::InitializeWaitRegMemTemplate() { + memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_)); + + gfx8::GenerateCmdHeader(&wait_reg_mem_template_.wait_reg_mem, IT_WAIT_REG_MEM); + wait_reg_mem_template_.wait_reg_mem.atc__CI = (atc_support_) ? 
1 : 0; + wait_reg_mem_template_.wait_reg_mem.cachePolicy__CI = 2; // bypass + wait_reg_mem_template_.wait_reg_mem.pollInterval = 0; + wait_reg_mem_template_.wait_reg_mem.engine = WAIT_REG_MEM_ENGINE_ME; +} + +// Stores the capability flags and pre-builds every packet template +Gfx8CmdWriter::Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support) { + // Initialize various state variables related to + // atomic operations and atc support. This must happen first: several of + // the Initialize*Template() helpers below read atc_support_ + pcie_atomic_support_ = pcie_atomic_support; + atc_support_ = atc_support; + + InitializeLaunchTemplate(); + InitializeAtomicTemplate(); + InitializeConditionalTemplate(); + InitializeWriteDataTemplate(); + InitializeWriteData64Template(); + InitializeBarrierTemplate(); + InitializeAcquireMemTemplate(); + InitializeWaitRegMemTemplate(); +} + +// Appends a WAIT_REG_MEM packet that waits until (value & mask_val) compares +// equal / not equal (func_eq) to wait_val, where value is read from memory +// (mem_space == true) or from a register at wait_addr +void Gfx8CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, + bool func_eq, uint32_t mask_val, uint32_t wait_val) { + gfx8::WaitRegMemTemplate wait_cmd = wait_reg_mem_template_; + + // Apply the space to which addr belongs + if (mem_space) { + wait_cmd.wait_reg_mem.memSpace = WAIT_REG_MEM_SPACE_MEMORY; + } else { + wait_cmd.wait_reg_mem.memSpace = WAIT_REG_MEM_SPACE_REGISTER; + } + + // Apply the function - equal / not equal desired by user + if (func_eq) { + wait_cmd.wait_reg_mem.function = WAIT_REG_MEM_FUNC_EQUAL; + } else { + wait_cmd.wait_reg_mem.function = WAIT_REG_MEM_FUNC_NOT_EQUAL; + } + + // Apply the mask on value at address/register + wait_cmd.wait_reg_mem.mask = mask_val; + + // Value to use in applying equal / not equal function + wait_cmd.wait_reg_mem.reference = wait_val; + + // A memory address must be dword aligned (debug builds only) + if (mem_space) { + assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned"); + } + wait_cmd.wait_reg_mem.pollAddressLo = Low32(wait_addr); + // Update upper 32 bit address if addr is not a register + if (mem_space) { + wait_cmd.wait_reg_mem.pollAddressHi = High32(wait_addr); + } + + APPEND_COMMAND_WRAPPER(cmdbuf, wait_cmd); +} + +// Appends a command that stores value at the 64-bit host address addr +void Gfx8CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) { + 
// If Atomics are supported, use it + if (pcie_atomic_support_) { + BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr, + value); + return; + } + + // Otherwise fall back to a plain 64-bit write + BuildWriteData64Command(cmdbuf, addr, value); + return; +} + +// Appends an INDIRECT_BUFFER packet referencing cmd_size bytes of commands +// located at cmd_addr; the size is encoded in dwords +void Gfx8CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, + std::size_t cmd_size) { + gfx8::LaunchTemplate launch = launch_template_; + + launch.indirect_buffer.ibBaseLo = PtrLow32(cmd_addr); + launch.indirect_buffer.ibBaseHi = PtrHigh32(cmd_addr); + launch.indirect_buffer.CI.ibSize = cmd_size / sizeof(uint32_t); + + APPEND_COMMAND_WRAPPER(cmdbuf, launch); +} + +// Appends a RELEASE_MEM packet that writes write_val to write_addr on the +// BOTTOM_OF_PIPE_TS event, optionally raising an interrupt +void Gfx8CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val, + bool interrupt) { + // Initialize the command including its header + gfx8::EndofKernelNotifyTemplate eopCmd; + memset(&eopCmd, 0, sizeof(eopCmd)); + gfx8::GenerateCmdHeader(&eopCmd.release_mem, IT_RELEASE_MEM__CI__VI); + + // Program CP to wait until following event is notified by SPI + eopCmd.release_mem.eventType = BOTTOM_OF_PIPE_TS; + eopCmd.release_mem.eventIndex = EventTypeToIndexTable[BOTTOM_OF_PIPE_TS]; + + // Program CP to perform various cache operations + // which complete before Write operation commences + eopCmd.release_mem.atc = atc_support_; + eopCmd.release_mem.l2Invlidate = true; + eopCmd.release_mem.l2WriteBack = true; + + // Set destination as Memory with Write bypassing Cache + eopCmd.release_mem.cachePolicy = RELEASE_MEM_CACHE_POLICY_BYPASS; + eopCmd.release_mem.dstSel = RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER; + + // Program CP to write user specified value to user specified address. + // The address is split with the same pointer helpers used by the other builders + eopCmd.release_mem.ordinal4 = PtrLow32(write_addr); + eopCmd.release_mem.addrHi = PtrHigh32(write_addr); + eopCmd.release_mem.dataLo = Low32(write_val); + eopCmd.release_mem.dataHi = High32(write_val); + eopCmd.release_mem.dataSel = EVENTWRITEEOP_DATA_SEL_SEND_DATA32; + + // Determine if host will poll or wait for interrupt + 
eopCmd.release_mem.intSel = + (interrupt == false) ? EVENTWRITEEOP_INT_SEL_NONE : EVENTWRITEEOP_INT_SEL_SEND_INT_ON_CONFIRM; + + APPEND_COMMAND_WRAPPER(cmdbuf, eopCmd); +} + + +// Appends an ACQUIRE_MEM packet built from the pre-initialized template with +// the TC action and TC write-back action bits enabled +void Gfx8CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) { + gfx8::AcquireMemTemplate invalidate_src_caches = invalidate_cache_template_; + + // wbINVL2 by default writes-back and invalidates both L1 and L2 + invalidate_src_caches.acquire_mem.coherCntl = + CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI; + + APPEND_COMMAND_WRAPPER(cmdbuf, invalidate_src_caches); +} + +// PM4 packet for profilers +#define PM4_PACKET3 (0xC0000000) +#define PM4_PACKET3_CMD_SHIFT 8 +#define PM4_PACKET3_COUNT_SHIFT 16 + +// Builds a type-3 packet header; note that (count - 1) is what gets encoded +#define PACKET3(cmd, count) \ + (PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | ((cmd) << PM4_PACKET3_CMD_SHIFT)) + +// Raw 3-dword register write packet: header, register offset, value +typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket; + +// Raw 7-dword packet; used by BuildCacheFlushPacket for an ACQUIRE_MEM command +typedef struct WriteEventPacket_ { uint32_t item[7]; } WriteEventPacket; + +// Appends an EVENT_WRITE packet for one of the perf counter events +// (start / stop / sample). Any other id trips the assert in debug builds +void Gfx8CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) { + + PM4CMDEVENTWRITE cp_event_initiator; + cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 1); + cp_event_initiator.ordinal2 = 0; + + VGT_EVENT_TYPE eventType = Reserved_0x00; + switch (event) { + case kPerfCntrsStart: + eventType = PERFCOUNTER_START; + break; + case kPerfCntrsStop: + eventType = PERFCOUNTER_STOP; + break; + case kPerfCntrsSample: + eventType = PERFCOUNTER_SAMPLE; + break; + default: + assert(false && "Illegal VGT Event Id"); + } + + cp_event_initiator.eventType = eventType; + cp_event_initiator.eventIndex = EventTypeToIndexTable[eventType]; + + APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator); + + return; +} + +// Appends a SET_UCONFIG_REG packet writing value to register addr, using the +// ShaderGraphics shader type +void Gfx8CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = (PM4_TYPE_3_HDR(IT_SET_UCONFIG_REG__CI__VI, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, + ShaderGraphics, 
0)); + packet.item[1] = (addr - UCONFIG_SPACE_START__CI__VI); + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); + + return; +} + +// Appends a SET_UCONFIG_REG packet writing value to register addr. Differs +// from BuildWriteUnshadowRegPacket only in using the ShaderCompute shader type +void Gfx8CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = (PM4_TYPE_3_HDR(IT_SET_UCONFIG_REG__CI__VI, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, + ShaderCompute, 0)); + packet.item[1] = (addr - UCONFIG_SPACE_START__CI__VI); + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); + + return; +} + +// Appends a SET_SH_REG packet writing value to the shader register addr; +// the offset is encoded relative to PERSISTENT_SPACE_START +void Gfx8CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = (PM4_TYPE_3_HDR(IT_SET_SH_REG, 1 + PM4_CMD_SET_SH_REG_DWORDS, ShaderCompute, 0)); + packet.item[1] = (addr - PERSISTENT_SPACE_START); + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); + + return; +} + +// Appends a COPY_DATA packet that copies from the source selected by src_sel +// (address src_addr_hi:src_addr_lo) to memory at dst_addr +void Gfx8CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo, + uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, + bool wait) { + PM4CMDCOPYDATA cmd_data; + memset(&cmd_data, 0, sizeof(PM4CMDCOPYDATA)); + + cmd_data.header.u32All = PACKET3(IT_COPY_DATA, 5); + + cmd_data.srcAtc__CI = atc_support_; + cmd_data.srcCachePolicy__CI = COPY_DATA_SRC_CACHE_POLICY_BYPASS; + cmd_data.srcSel = src_sel; + + cmd_data.dstAtc__CI = atc_support_; + cmd_data.dstSel = COPY_DATA_SEL_DST_ASYNC_MEMORY; + cmd_data.dstCachePolicy__CI = COPY_DATA_DST_CACHE_POLICY_BYPASS; + + uint32_t dst_addr_lo, dst_addr_hi; + + dst_addr_lo = PtrLow32(dst_addr); + dst_addr_hi = PtrHigh32(dst_addr); + + cmd_data.srcAddressLo = src_addr_lo; + cmd_data.srcAddressHi = src_addr_hi; + cmd_data.dstAddressLo = dst_addr_lo; + cmd_data.dstAddressHi = dst_addr_hi; + + // NOTE(review): size is stored in the countSel field as-is; confirm which + // values callers are expected to pass + cmd_data.countSel = size; + cmd_data.wrConfirm = wait; + cmd_data.engineSel = COPY_DATA_ENGINE_ME; + + APPEND_COMMAND_WRAPPER(cmdbuf, cmd_data); + + return; +} + +// Appends a hand-encoded 7-dword ACQUIRE_MEM packet +void Gfx8CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) { + 
WriteEventPacket packet; + // NOTE(review): raw dword values; presumably they map onto the + // PM4CMDACQUIREMEM fields (coherence control, size, base, poll interval) + // - confirm against the packet definition + packet.item[0] = PACKET3(IT_ACQUIRE_MEM__CI__VI, 6); + packet.item[1] = 0x28C00000; + packet.item[2] = 0xFFFFFFFF; + packet.item[3] = 0; + packet.item[4] = 0; + packet.item[5] = 0; + packet.item[6] = 0x00000004; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); +} + +// Appends the barrier (CS_PARTIAL_FLUSH event) followed by a cache flush +void Gfx8CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) { + BuildBarrierCommand(cmdbuf); + BuildCacheFlushPacket(cmdbuf); + return; +} + +// Will issue a VGT event including a cache flush later on. +// Accepts the perf counter and thread trace event ids handled in the switch; +// any other id trips the assert in debug builds +void Gfx8CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) { + PM4CMDEVENTWRITE cp_event_initiator; + + cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 1); + cp_event_initiator.ordinal2 = 0; + + VGT_EVENT_TYPE eventType = Reserved_0x00; + switch (vgtEvent) { + case kPerfCntrsStart: + eventType = PERFCOUNTER_START; + break; + case kPerfCntrsStop: + eventType = PERFCOUNTER_STOP; + break; + case kPerfCntrsSample: + eventType = PERFCOUNTER_SAMPLE; + break; + case kThrdTraceStart: + eventType = THREAD_TRACE_START; + break; + case kThrdTraceStop: + eventType = THREAD_TRACE_STOP; + break; + case kThrdTraceFlush: + eventType = THREAD_TRACE_FLUSH; + break; + case kThrdTraceFinish: + eventType = THREAD_TRACE_FINISH; + break; + default: + assert(false && "Illegal VGT Event Id"); + } + + cp_event_initiator.eventType = eventType; + cp_event_initiator.eventIndex = EventTypeToIndexTable[eventType]; + + APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator); + + // TODO: check whether a cache flush really has to be issued here as well; + // test and remove it if not + BuildCacheFlushPacket(cmdbuf); + return; +} + +// Appends a SET_CONFIG_REG packet writing value to register addr; the offset +// is encoded relative to CONFIG_SPACE_START +void Gfx8CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = + (PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0)); + packet.item[1] = addr - CONFIG_SPACE_START; + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); + + return; +} + +void 
Gfx8CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) { + PM4CMDEVENTWRITEQUERY cp_event_initiator; + cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 3); + cp_event_initiator.ordinal2 = 0; + + // Update switch statements you want to support. + // No event id is handled yet, so every call trips the assert in debug builds + VGT_EVENT_TYPE eventType = Reserved_0x00; + switch (event) { + default: + assert(false && "Illegal VGT Event Id"); + } + + cp_event_initiator.eventType = eventType; + cp_event_initiator.eventIndex = EventTypeToIndexTable[eventType]; + + // set the address + uint32_t addrLo = PtrLow32(addr); + uint32_t addrHi = PtrHigh32(addr); + assert(!(addrLo & 0x7) && "query address must be 8 byte aligned"); + + cp_event_initiator.ordinal3 = 0; + cp_event_initiator.ordinal4 = 0; + cp_event_initiator.addressLo = addrLo; + cp_event_initiator.addressHi = addrHi; + + APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator); + + return; +} + +// Appends the pre-built barrier (EVENT_WRITE) packet +void Gfx8CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) { + APPEND_COMMAND_WRAPPER(cmdBuf, pending_dispatch_template_); +} + +// Copies count dwords from src_addr to dst_addr +void Gfx8CmdWriter::WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr) { + memcpy(dst_addr, src_addr, count * sizeof(uint32_t)); +} + + +// Appends an ATOMIC_MEM packet performing the 32-bit operation atomic_op on addr +void Gfx8CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, + volatile uint32_t* addr, uint32_t value, + uint32_t compare) { + gfx8::AtomicTemplate atomic = atomic_template_; + + // make sure the destination address is aligned + uint32_t address_low = PtrLow32((void*)addr); + uint32_t address_high = PtrHigh32((void*)addr); + assert(!(address_low & 0x7) && "destination address must be 8 byte aligned"); + + atomic.atomic.addressLo = address_low; + atomic.atomic.addressHi = address_high; + + switch (atomic_op) { + case CommandWriter::kAtomicTypeIncrement: { + atomic.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_32; + atomic.atomic.srcDataLo = 1; + break; + } + case CommandWriter::kAtomicTypeDecrement: { + atomic.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_32; + atomic.atomic.srcDataLo = 1; + break; + } + 
case CommandWriter::kAtomicTypeCompareAndSwap: { + atomic.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32; + atomic.atomic.srcDataLo = value; + atomic.atomic.cmpDataLo = compare; + break; + } + case CommandWriter::kAtomicTypeBlockingCompareAndSwap: { + // Same operation as compare-and-swap, plus the command / loopInterval + // fields. NOTE(review): presumably these make the CP retry until the + // compare succeeds - confirm against the PM4 ATOMIC_MEM definition + atomic.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32; + atomic.atomic.srcDataLo = value; + atomic.atomic.cmpDataLo = compare; + atomic.atomic.command = 1; + atomic.atomic.loopInterval = 128; + break; + } + case CommandWriter::kAtomicAdd: { + atomic.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_32; + atomic.atomic.srcDataLo = value; + break; + } + case CommandWriter::kAtomicSubtract: { + atomic.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_32; + atomic.atomic.srcDataLo = value; + break; + } + case CommandWriter::kAtomicSwap: { + atomic.atomic.atomOp = TC_OP_ATOMIC_SWAP_RTN_32; + atomic.atomic.srcDataLo = value; + break; + } + } + + APPEND_COMMAND_WRAPPER(cmdbuf, atomic); +} + +// Appends an ATOMIC_MEM packet performing the 64-bit operation atomic_op on addr +void Gfx8CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, + volatile uint64_t* addr, uint64_t value, + uint64_t compare) { + AtomicTemplate atomic = atomic_template_; + + // make sure the destination address is aligned + uint32_t address_low = PtrLow32((void*)addr); + uint32_t address_high = PtrHigh32((void*)addr); + assert(!(address_low & 0x7) && "destination address must be 8 byte aligned"); + + atomic.atomic.addressLo = address_low; + atomic.atomic.addressHi = address_high; + + // NOTE(review): the 32-bit BuildAtomicPacket does not set atc / cachePolicy + // explicitly - confirm the difference is intentional + atomic.atomic.atc = (atc_support_) ? 
1 : 0; + // NOTE(review): presumably 2 selects the bypass cache policy, as in the + // other templates of this file - confirm + atomic.atomic.cachePolicy = 2; + + switch (atomic_op) { + case CommandWriter::kAtomicTypeIncrement: { + atomic.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_64; + atomic.atomic.srcDataLo = 1; + break; + } + case CommandWriter::kAtomicTypeDecrement: { + atomic.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_64; + atomic.atomic.srcDataLo = 1; + break; + } + case CommandWriter::kAtomicTypeCompareAndSwap: { + atomic.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64; + atomic.atomic.srcDataLo = Low32(value); + atomic.atomic.srcDataHi = High32(value); + atomic.atomic.cmpDataLo = Low32(compare); + atomic.atomic.cmpDataHi = High32(compare); + break; + } + case CommandWriter::kAtomicTypeBlockingCompareAndSwap: { + atomic.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64; + atomic.atomic.srcDataLo = Low32(value); + atomic.atomic.srcDataHi = High32(value); + atomic.atomic.cmpDataLo = Low32(compare); + atomic.atomic.cmpDataHi = High32(compare); + atomic.atomic.command = 1; + atomic.atomic.loopInterval = 128; + break; + } + case CommandWriter::kAtomicAdd: { + atomic.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_64; + atomic.atomic.srcDataLo = Low32(value); + atomic.atomic.srcDataHi = High32(value); + break; + } + case CommandWriter::kAtomicSubtract: { + atomic.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_64; + atomic.atomic.srcDataLo = Low32(value); + atomic.atomic.srcDataHi = High32(value); + break; + } + case CommandWriter::kAtomicSwap: { + atomic.atomic.atomOp = TC_OP_ATOMIC_SWAP_RTN_64; + atomic.atomic.srcDataLo = Low32(value); + atomic.atomic.srcDataHi = High32(value); + break; + } + } + + APPEND_COMMAND_WRAPPER(cmdbuf, atomic); +} + +// Returns the size of the atomic packet in 32-bit dwords (not bytes) +size_t Gfx8CmdWriter::SizeOfAtomicPacket() const { + return sizeof(AtomicTemplate) / sizeof(uint32_t); +} + +// Appends a COND_EXEC packet: the next count dwords are executed or skipped +// depending on the value stored at signal +void Gfx8CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) { + ConditionalExecuteTemplate conditional = conditional_template_; + + uint32_t address_low = PtrLow32(signal); + uint32_t address_high = PtrHigh32(signal); + assert(!(address_low 
& 0x7) && "destination address must be 8 byte aligned"); + + conditional.conditional.boolAddrLo = address_low; + conditional.conditional.boolAddrHi = address_high; + conditional.conditional.execCount = count; + + APPEND_COMMAND_WRAPPER(cmdbuf, conditional); +} + +void Gfx8CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, + uint32_t write_value) { + // Copy the initialize command packet + gfx8::WriteDataTemplate command = write_data_template_; + + // Encode the user specified value to write + command.write_data_value = write_value; + + // Encode the user specified address to write to + command.write_data.dstAddrLo = PtrLow32(write_addr); + command.write_data.dstAddrHi = PtrHigh32(write_addr); + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, command); +} + +void Gfx8CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, + uint64_t write_value) { + // Copy the initialize command packet + gfx8::WriteData64Template command = write_data64_template_; + + // Encode the user specified value to write + command.write_data_value = write_value; + + // Encode the user specified address to write to + command.write_data.dstAddrLo = PtrLow32(write_addr); + command.write_data.dstAddrHi = PtrHigh32(write_addr); + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, command); +} + +void Gfx8CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options, + uint32_t* writeAddr, uint32_t writeVal) { + PM4CMDACQUIREMEM flushCmd; + memset(&flushCmd, 0, sizeof(flushCmd)); + + // Verify write back address is valid. Note that this address is NOT + // used on CI. But to have a same interface as that on SI, we keep + // the address argument in this function. Thus, this check always pass + // no matter the address is NULL or not. + (writeAddr == NULL) ? 
assert(true) : assert(true); + + // Initialize the command header + gfx8::GenerateCmdHeader(&flushCmd, IT_ACQUIRE_MEM__CI__VI); + + // Specify the base address of memory being synchronized. + // The starting address is indicated as follows: bits [0-48]. + flushCmd.cpCoherBase.u32All = 0; + flushCmd.cpCoherBaseHi.u32All = 0; + + // Specify the size of memory being synchronized. It is indicated + // as follows: + // COHER_SIZE_256B_MASK = 0xffffffffL + // COHER_SIZE_HI_256B_MASK__CI__VI = 0x000000ffL + flushCmd.cpCoherSize.u32All = CP_COHER_SIZE__COHER_SIZE_256B_MASK; + flushCmd.cpCoherSizeHi.u32All = CP_COHER_SIZE_HI__COHER_SIZE_HI_256B_MASK__CI__VI; + + // Periodicity of polling - interval to wait from the time + // of unsuccessful polling result is returned and a new + // poll is issued + flushCmd.pollInterval = 0x04; + + // Program Coherence Control Register. Initialize L2 Cache flush + // for Non-Coherent memory blocks + uint32_t coher_cntl = 0; + + coher_cntl |= (options->l1) ? CP_COHER_CNTL__TCL1_ACTION_ENA_MASK : 0; + coher_cntl |= (options->l2) + ? (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI) + : 0; + coher_cntl |= (options->icache) ? CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK : 0; + coher_cntl |= (options->kcache) ? 
CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK : 0; + flushCmd.coherCntl = coher_cntl; + + // Copy AcquireMem command buffer stream + APPEND_COMMAND_WRAPPER(cmdbuf, flushCmd); + return; +} + +void Gfx8CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr, + uint32_t copySize, bool waitForConfirm) { + PM4CMDDMADATA cmdDmaData; + memset(&cmdDmaData, 0, sizeof(PM4CMDDMADATA)); + cmdDmaData.header.u32All = + (PM4_TYPE_3_HDR(IT_DMA_DATA__CI__VI, PM4_CMD_DMA_DATA_DWORDS, ShaderCompute, 0)); + + // Id of Micro Engine + cmdDmaData.engine = 0; + + // Specify attributes of source buffer such as its + // location, ATC property, Cache policy and Volatile + // A value of 1 for cache policy means to Stream + cmdDmaData.srcSel = 0; + cmdDmaData.srcATC = atc_support_; + cmdDmaData.srcCachePolicy = 1; + cmdDmaData.srcVolatile = 0; + + // Specify attributes of destination buffer such as + // its location, ATC property, Cache policy and Volatile + // A value of 1 for cache policy means to Stream + cmdDmaData.dstSel = 0; + cmdDmaData.dstATC = atc_support_; + cmdDmaData.dstCachePolicy = 1; + cmdDmaData.dstVolatile = 0; + + // Specify the source and destination addr + cmdDmaData.srcAddrHi = PtrHigh32(srcAddr); + cmdDmaData.srcAddrLoOrData = PtrLow32(srcAddr); + cmdDmaData.dstAddrLo = PtrLow32(dstAddr); + cmdDmaData.dstAddrHi = PtrHigh32(dstAddr); + + // Number of bytes to copy. 
The command restricts + // the size to be (2 MB - 1) - 21 Bits + assert(copySize < 0x1FFFFF); + cmdDmaData.command.byteCount = copySize; + + // Indicate that DMA Cmd should wait if its source + // is the destination of a previous DMA Cmd + cmdDmaData.command.rawWait = waitForConfirm; + + APPEND_COMMAND_WRAPPER(cmdbuf, cmdDmaData); + return; +} + +} // gfx8 +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmdwriter.h b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmdwriter.h new file mode 100644 index 0000000000..9e14c1ab29 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx8_cmdwriter.h @@ -0,0 +1,201 @@ +#ifndef _GFX8_CMDWRITER_H_ +#define _GFX8_CMDWRITER_H_ + +#include "cmdwriter.h" +#include "gfx8_cmds.h" + +namespace pm4_profile { + +namespace gfx8 { + +/// @brief class Gfx8CmdWriter implements the virtual class CommandWriter +/// for Sea Islands (CI) and VI chipset +class Gfx8CmdWriter : public CommandWriter { + public: + Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support); + + /// @brief Dword specifying NOOP command for SI/CI/VI chipsets. The macro + /// populates the NOOP command which is 32-bits wide. The second parameter, + /// the COUNT field of NOOP command, specifies the number of Dwords to skip. + /// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro + /// decrements the second parameter by TWO, an artifact of its definition, + /// the value is incremented by TWO to 0x4001 (0x3FFF + 2). 
+ /// + inline uint32_t GetNoOpCmd() { + static const uint32_t nopCmd = PM4_TYPE_3_HDR(IT_NOP, 0x4001, ShaderCompute, 0); + return nopCmd; + } + + void BuildBarrierCommand(CmdBuf* cmdBuf); + + void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size); + + void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val, + bool interrupt); + + void BuildBarrierFenceCommands(CmdBuf* cmdbuf); + + void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event); + + void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq, + uint32_t mask_val, uint32_t wait_val); + + void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + /// @brief Build CP command to program a Gpu register + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// @param addr Register to be programmed + /// @param value Value to write into register + /// + /// @return void + void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo, + uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait); + + void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf); + + // Will issue a VGT event including a cache flush later on + void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent); + + void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr); + + void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr, + uint32_t value, uint32_t compare); + + void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr, + uint64_t value = 0, uint64_t compare = 0); + + size_t SizeOfAtomicPacket() const; + + void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* 
signal, uint16_t count); + + void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value); + + void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value); + + void BuildCacheFlushPacket(CmdBuf* cmdbuf); + + /// Writes into input buffer Gpu commands to flush its cache. It is + /// necessary that the buffer provided for flush commands is large + /// enough to accommodate the full set of commands. It should be at + /// least 512 bytes. + /// + /// @param tsCmdBuf Buffer to write commands to. + /// @param writeAddr Registered address into which GPU should write + /// a user provided value upon executing the flush commands. + /// @param writeVal User provided value written by GPU at user provided + /// address, upon executing the flush commands. + /// + /// @return void + void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr, + uint32_t writeVal); + + /// Builds Gpu command to copy data from source to destination buffer + /// using DMA engine. + /// + /// @param cmdbuf Buffer updated with Gpu copy command + /// @param srcAddr Address of source buffer address + /// @param dstAddr Address of destination buffer address + /// @param copySize Size of data to copy in bytes + /// @param waitForCompletion if command should wait for copying to complete + void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize, + bool waitForCompletion); + + protected: + /// @brief Copies data from source buffer to destination buffer + /// + /// @param dst_addr Address of destination buffer data + /// + /// @count Size of data to copy in 32-bit words + /// + /// @param src_addr Address of buffer containing source data + /// + /// @return void + virtual void WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr); + + /// @brief Append an instance of Gpu command into input command buffer stream. 
+ /// + /// @param cmdbuf CommandWriter object appended with anohter Gpu command + /// + /// @param cmd Gpu command to be appended into command buffer + /// + /// @return void + template void AppendCommand(CmdBuf* cmdbuf, const T& cmd); + + private: + /// @brief Initializes a Gpu command which can be used to + /// reference a Gpu command stream indirectly + void InitializeLaunchTemplate(); + + /// @brief Initializes a Gpu command to perform atomic operations + //// + void InitializeAtomicTemplate(); + + /// @brief Initializes a Gpu command to allow conditional execution + /// of a Gpu command stream + void InitializeConditionalTemplate(); + + /// @brief Initializes a Gpu command to let command processor + /// wait for some update before letting other commands to be + /// processed + void InitializeWaitRegMemTemplate(); + + /// @brief Initializes the template for Barrier command. + /// Applications can use Barrier command to ensure their + /// command is executed only after all other commands have + /// completed their execution. + void InitializeBarrierTemplate(); + + void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value); + + /// @brief Initializes Acquire Memory command template. Users + /// can submit this command to invalidate Gpu caches - L1 and + /// or L2. 
+ void InitializeAcquireMemTemplate(); + + /// @brief Initializes an instance of Write Data command + /// for use by an application + void InitializeWriteDataTemplate(); + void InitializeWriteData64Template(); + + /// @brief Instance of Gpu command to reference dispatch commands + LaunchTemplate launch_template_; + + /// @brief Instance of Gpu command to use in performing atomic operations + AtomicTemplate atomic_template_; + + /// @brief Instance of Gpu command to use in conditional execution + /// of a command stream + ConditionalExecuteTemplate conditional_template_; + + /// @brief Instance of Pm4 command WRITE_DATA + WriteDataTemplate write_data_template_; + WriteData64Template write_data64_template_; + + /// @brief Instance of Pm4 command EVENT_WRITE + BarrierTemplate pending_dispatch_template_; + + /// @brief Instance of Pm4 command ACQUIRE_MEM + AcquireMemTemplate invalidate_cache_template_; + + /// @brief Instance of Pm4 command WAIT_REG_MEM + WaitRegMemTemplate wait_reg_mem_template_; + + /// @brief ATC support. + bool atc_support_; + + /// @brief PCIe atomic support. 
+ bool pcie_atomic_support_; +}; + +} // gfx8 + +} // pm4_profile + +#endif // _GFX8_CMDWRITER_H_ diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmds.h b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmds.h new file mode 100644 index 0000000000..5dac4f07d1 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmds.h @@ -0,0 +1,90 @@ +#ifndef _GFX9_CMDS_H_ +#define _GFX9_CMDS_H_ + +#include "gfxip/gfx9/gfx9_utils.h" +#include "gfxip/gfx9/gfx9_enum.h" +#include "gfxip/gfx9/gfx9_mask.h" +#include "gfxip/gfx9/gfx9_offset.h" +#include "gfxip/gfx9/gfx9_typedef.h" +#include "gfxip/gfx9/gfx9_registers.h" +#include "gfxip/gfx9/gfx9_pm4_it_opcodes.h" +#include "gfxip/gfx9/f32_mec_pm4_packets_vg10.h" +#include "gfxip/gfx9/f32_pfp_pm4_packets_vg10.h" + +namespace pm4_profile { + +namespace gfx9 { + +/// @brief Initializer for commands that set shader registers +template void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, sizeof(T) / sizeof(uint32_t)); + pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START; +} + +// @brief Initializer for various Gpu command headers +template void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) { + pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t)); +} + +// @brief Initializer for commands that set configuration registers +template void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) { + pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_CONFIG_REG, sizeof(T) / sizeof(uint32_t)); + pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - CONFIG_SPACE_START; +} + +/// @brief Structure used to issue a Gpu Barrier command +struct BarrierTemplate { + PM4MEC_EVENT_WRITE event_write; +}; + +/// @brief Structure used to configure the flushing of +/// various caches - instruction, constants, L1 and L2 +struct AcquireMemTemplate { + PM4MEC_ACQUIRE_MEM acquire_mem; +}; + +/// @brief Structure 
used to reference another Gpu command +/// indirectly. Generally used to reference a list of Gpu +/// commands (dispatch cmds) indirectly +struct LaunchTemplate { + PM4MEC_INDIRECT_BUFFER indirect_buffer; +}; + +/// @brief Structure used to determine the end of +/// a kernel including cache flushes and writing to +/// a user configurable memory location +struct EndofKernelNotifyTemplate { + PM4MEC_RELEASE_MEM release_mem; +}; + +// Desc: Strucuture used to perform various atomic +// operations - add, subtract, increment, etc +struct AtomicTemplate { + PM4MEC_ATOMIC_MEM atomic; +}; + +/// @brief PM4 command to write a 32-bit value into a memory +/// location accessible to Gpu +struct WriteDataTemplate { + PM4MEC_WRITE_DATA write_data; + uint32_t write_data_value; +}; + +/// @brief PM4 command to write a 64-bit value into a memory +/// location accessible to Gpu +struct WriteData64Template { + PM4MEC_WRITE_DATA write_data; + uint64_t write_data_value; +}; + +/// @brief PM4 command to wait for a certain event before proceeding +/// to process another command on the queue +struct WaitRegMemTemplate { + PM4MEC_WAIT_REG_MEM wait_reg_mem; +}; + +} // gfx9 + +} // pm4_profile + +#endif // _GFX9_CMDS_H_ diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmdwriter.cpp b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmdwriter.cpp new file mode 100644 index 0000000000..32ec820b17 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmdwriter.cpp @@ -0,0 +1,743 @@ +#include +#include +#include +#include + +#include "gfx9_cmdwriter.h" + +template +static void PrintPm4Packet(const T& command, const char* name) { +#if ! 
defined(NDEBUG) + uint32_t * cmd = (uint32_t*)&command; + uint32_t size = sizeof(command) / sizeof(uint32_t); + std::ostringstream oss; + oss << "'" << name << "' size(" << std::dec << size << ")"; + std::clog << std::setw(40) << std::left << oss.str() << ":"; + for (uint32_t idx = 0; idx < size; idx++) { + std::clog << " " << std::hex << std::setw(8) << std::setfill('0') << cmd[idx]; + } + std::clog << std::setfill(' ') << std::endl; +#endif +} + +#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \ + PrintPm4Packet(command, __FUNCTION__); \ + AppendCommand(cmdbuf, command); + +namespace pm4_profile { +namespace gfx9 { + +template void Gfx9CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) { + cmdbuf->AppendCommand(&command, sizeof(command)); +} + +void Gfx9CmdWriter::InitializeLaunchTemplate() { + memset(&launch_template_, 0, sizeof(launch_template_)); + GenerateCmdHeader(&launch_template_.indirect_buffer, IT_INDIRECT_BUFFER); +} + +void Gfx9CmdWriter::InitializeAtomicTemplate() { + memset(&atomic_template_.atomic, 0, sizeof(atomic_template_)); + GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM); + + // Specify the micro engine and cache policies + PM4MEC_ATOMIC_MEM* atomicCmd = &atomic_template_.atomic; + atomicCmd->bitfields2.cache_policy = cache_policy__mec_atomic_mem__stream; +} + +void Gfx9CmdWriter::InitializeBarrierTemplate() { + memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_)); + GenerateCmdHeader(&pending_dispatch_template_.event_write, IT_EVENT_WRITE); + + MEC_EVENT_WRITE_event_index_enum index; + index = event_index__mec_event_write__cs_partial_flush; + pending_dispatch_template_.event_write.bitfields2.event_index = index; + pending_dispatch_template_.event_write.bitfields2.event_type = CS_PARTIAL_FLUSH; +} + +void Gfx9CmdWriter::InitializeAcquireMemTemplate() { + memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_)); + GenerateCmdHeader(&invalidate_cache_template_.acquire_mem, 
IT_ACQUIRE_MEM); + + // Specify the CP module which will process this packet + PM4MEC_ACQUIRE_MEM* acquire_mem = &invalidate_cache_template_.acquire_mem; + + // Specify the size of memory to invalidate. Size is + // specified in terms of 256 byte chunks. A coher_size + // of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits) + // of memory. The field coher_size_hi specifies memory from + // bits 40-64 for a total of 256 TB. + acquire_mem->coher_size = 0xFFFFFFFF; + acquire_mem->bitfields4.coher_size_hi = 0xFFFFFF; + + // Specify the address of memory to invalidate. The + // address must be 256 byte aligned. + acquire_mem->coher_base_lo = 0x00; + acquire_mem->bitfields6.coher_base_hi = 0x00; + + // Specify the poll interval for determing if operation is complete + acquire_mem->bitfields7.poll_interval = 0x04; +} + +void Gfx9CmdWriter::InitializeWaitRegMemTemplate() { + memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_)); + GenerateCmdHeader(&wait_reg_mem_template_.wait_reg_mem, IT_WAIT_REG_MEM); + + PM4MEC_WAIT_REG_MEM* wait_reg_mem = &wait_reg_mem_template_.wait_reg_mem; + + wait_reg_mem->bitfields7.poll_interval = 0x04; + wait_reg_mem->bitfields2.operation = operation__mec_wait_reg_mem__wait_reg_mem; +} + +void Gfx9CmdWriter::InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32) { + // Initialize the header of command packet by adjusting the + // size of payload - one 32bit DWord or two 32bit DWords + uint32_t cmd_size = (bit32) ? 
1 : 2; + memset(write_data, 0, sizeof(PM4MEC_WRITE_DATA)); + cmd_size = cmd_size + (sizeof(PM4MEC_WRITE_DATA) / sizeof(uint32_t)); + write_data->ordinal1 = PM4_TYPE3_HDR(IT_WRITE_DATA, cmd_size); + + // Set the bit to confirm the write operation and cache policy + write_data->bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation; + write_data->bitfields2.cache_policy = cache_policy__mec_write_data__stream; + + // Specify the command to increment address if writing more than one DWord + write_data->bitfields2.addr_incr = addr_incr__mec_write_data__increment_address; + + // Specify the class to which the write destination belongs + write_data->bitfields2.dst_sel = dst_sel__mec_write_data__memory; +} + +void Gfx9CmdWriter::InitializeWriteDataTemplate() { + InitializeWriteDataTemplate(&write_data_template_.write_data, true); +} + +void Gfx9CmdWriter::InitializeWriteData64Template() { + InitializeWriteDataTemplate(&write_data64_template_.write_data, false); +} + +void Gfx9CmdWriter::InitializeConditionalTemplate() { + /* + memset(&conditional_template_.conditional, 0, sizeof(conditional_template_)); + GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC); + + if (atc_support_) { + const uint32_t kAtcShift = 24; + conditional_template_.conditional.ordinal4 |= 1 << kAtcShift; + } + */ +} + +void Gfx9CmdWriter::InitializeEndOfKernelNotifyTemplate() { + memset(¬ify_template_, 0, sizeof(notify_template_)); + GenerateCmdHeader(¬ify_template_.release_mem, IT_RELEASE_MEM); + + // Set the event type to be bottom of pipe and cache policy + PM4MEC_RELEASE_MEM* rel_mem; + rel_mem = ¬ify_template_.release_mem; + rel_mem->bitfields2.event_type = BOTTOM_OF_PIPE_TS; + rel_mem->bitfields2.cache_policy = cache_policy__mec_release_mem__stream; + rel_mem->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; + + // Specify the attributes of source and destinations of data + rel_mem->bitfields3.int_sel = int_sel__mec_release_mem__none; + 
rel_mem->bitfields3.data_sel = data_sel__mec_release_mem__none; + rel_mem->bitfields3.dst_sel = dst_sel__mec_release_mem__memory_controller; +} + +Gfx9CmdWriter::Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support) { + // Initialize various state variables related to + // atomic operations and atc support + this->atc_support_ = atc_support; + this->pcie_atomic_support_ = pcie_atomic_support; + + // Initialize various command templates + InitializeLaunchTemplate(); + InitializeAtomicTemplate(); + InitializeBarrierTemplate(); + InitializeAcquireMemTemplate(); + InitializeWaitRegMemTemplate(); + InitializeWriteDataTemplate(); + InitializeWriteData64Template(); + InitializeConditionalTemplate(); + InitializeEndOfKernelNotifyTemplate(); +} + +void Gfx9CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, + std::size_t cmd_size) { + // Verify the address is 4-byte aligned + uint64_t addr = uintptr_t(cmd_addr); + assert(!(addr & 0x3) && "IndirectBuffer address must be 4 byte aligned"); + + // Specify the address of indirect buffer encoding cmd stream + LaunchTemplate launch = launch_template_; + + launch.indirect_buffer.bitfields2.ib_base_lo = (PtrLow32(cmd_addr) >> 2); + launch.indirect_buffer.ib_base_hi = PtrHigh32(cmd_addr); + + // Specify the size of indirect buffer and cache policy to set + // upon executing the cmds of indirect buffer + launch.indirect_buffer.bitfields4.priv = 0; + launch.indirect_buffer.bitfields4.valid = 1; + launch.indirect_buffer.bitfields4.ib_size = cmd_size / sizeof(uint32_t); + launch.indirect_buffer.bitfields4.cache_policy = cache_policy__mec_indirect_buffer__stream; + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, launch); +} + +void Gfx9CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr, + uint32_t value, uint32_t compare) { + AtomicTemplate atomicTemplate = atomic_template_; + PM4MEC_ATOMIC_MEM* atomicCmd = &atomicTemplate.atomic; + + 
// make sure the destination adddress is aligned + uint32_t address_low = PtrLow32((void*)addr); + uint32_t address_high = PtrHigh32((void*)addr); + assert(!(address_low & 0x7) && "destination address must be 8 byte aligned"); + atomicCmd->addr_lo = address_low; + atomicCmd->addr_hi = address_high; + + switch (atomic_op) { + case CommandWriter::kAtomicTypeIncrement: + assert(!(value != 0x01) && "Atomic Increment value should be 1"); + case CommandWriter::kAtomicAdd: + atomicCmd->src_data_lo = value; + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_32; + break; + case CommandWriter::kAtomicTypeDecrement: + assert(!(value != 0x01) && "Atomic Decrement value should be 1"); + case CommandWriter::kAtomicSubtract: + atomicCmd->src_data_lo = value; + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_32; + break; + case CommandWriter::kAtomicTypeBlockingCompareAndSwap: + atomicCmd->bitfields9.loop_interval = 128; + atomicCmd->bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied; + case CommandWriter::kAtomicTypeCompareAndSwap: + atomicCmd->src_data_lo = value; + atomicCmd->cmp_data_lo = compare; + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_32; + break; + case CommandWriter::kAtomicSwap: + atomicCmd->src_data_lo = value; + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_32; + break; + default: + assert((false) && "Atomic operation id is invalid"); + } + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, atomicTemplate); +} + +void Gfx9CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, + volatile uint64_t* addr, uint64_t value, uint64_t compare) { + AtomicTemplate atomicTemplate = atomic_template_; + PM4MEC_ATOMIC_MEM* atomicCmd = &atomicTemplate.atomic; + + // make sure the destination adddress is aligned + uint32_t address_low = PtrLow32((void*)addr); + uint32_t address_high = PtrHigh32((void*)addr); + assert(!(address_low & 0x7) && "destination address must be 8 byte 
aligned"); + atomicCmd->addr_lo = address_low; + atomicCmd->addr_hi = address_high; + + switch (atomic_op) { + case CommandWriter::kAtomicTypeIncrement: + assert(!(value != 0x01) && "Atomic Increment value should be 1"); + case CommandWriter::kAtomicAdd: + atomicCmd->src_data_lo = Low32(value); + atomicCmd->src_data_hi = High32(value); + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_64; + break; + case CommandWriter::kAtomicTypeDecrement: + assert(!(value != 0x01) && "Atomic Decrement value should be 1"); + case CommandWriter::kAtomicSubtract: + atomicCmd->src_data_lo = Low32(value); + atomicCmd->src_data_hi = High32(value); + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_64; + break; + case CommandWriter::kAtomicTypeBlockingCompareAndSwap: + atomicCmd->bitfields9.loop_interval = 128; + atomicCmd->bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied; + case CommandWriter::kAtomicTypeCompareAndSwap: + atomicCmd->src_data_lo = Low32(value); + atomicCmd->src_data_hi = High32(value); + atomicCmd->cmp_data_lo = Low32(compare); + atomicCmd->cmp_data_hi = High32(compare); + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_64; + break; + case CommandWriter::kAtomicSwap: + atomicCmd->src_data_lo = Low32(value); + atomicCmd->src_data_hi = High32(value); + atomicCmd->bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_64; + break; + default: + assert((false) && "Atomic operation id is invalid"); + } + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, atomicTemplate); +} + +void Gfx9CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) { + APPEND_COMMAND_WRAPPER(cmdBuf, pending_dispatch_template_); +} + +void Gfx9CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, + uint32_t write_value) { + // Copy the initialized command packet and its payload + WriteDataTemplate command = write_data_template_; + + // Encode the user specified address to write to + uint64_t addr = uintptr_t(write_addr); + 
assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned"); + + // Specify the value to write + command.write_data_value = write_value; + + // Test Code to see if this makes a difference + command.write_data.dst_mem_addr_hi = PtrHigh32(write_addr); + command.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2); + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, command); +} + +void Gfx9CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, + uint64_t write_value) { + // Copy the initialized command packet and its payload + WriteData64Template command = write_data64_template_; + + // Encode the user specified address to write to + uint64_t addr = uintptr_t(write_addr); + assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned"); + + command.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2); + command.write_data.dst_mem_addr_hi = PtrHigh32(write_addr); + + // Specify the value to write + command.write_data_value = write_value; + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, command); +} + +void Gfx9CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, + bool func_eq, uint32_t mask_val, uint32_t wait_val) { + WaitRegMemTemplate wait_cmd = wait_reg_mem_template_; + + // Apply the space to which addr belongs + if (mem_space) { + wait_cmd.wait_reg_mem.bitfields2.mem_space = mem_space__mec_wait_reg_mem__memory_space; + } else { + wait_cmd.wait_reg_mem.bitfields2.mem_space = mem_space__mec_wait_reg_mem__register_space; + } + + // Apply the function - equal / not equal desired by user + if (func_eq) { + wait_cmd.wait_reg_mem.bitfields2.function = + function__mec_wait_reg_mem__equal_to_the_reference_value; + } else { + wait_cmd.wait_reg_mem.bitfields2.function = + function__mec_wait_reg_mem__not_equal_reference_value; + } + + // Value to use in applying equal / not equal function + 
wait_cmd.wait_reg_mem.reference = wait_val; + + // Apply the mask on value at address/register + wait_cmd.wait_reg_mem.mask = mask_val; + + // The address to poll should be DWord (4 byte) aligned + // Update upper 32 bit address if addr is not a register + if (mem_space) { + assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned"); + } + wait_cmd.wait_reg_mem.bitfields3a.mem_poll_addr_lo = (Low32(wait_addr) >> 2); + if (mem_space) { + wait_cmd.wait_reg_mem.mem_poll_addr_hi = High32(wait_addr); + } + + // Append the command to cmd stream + APPEND_COMMAND_WRAPPER(cmdbuf, wait_cmd); +} + +void Gfx9CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) { + assert(false && "BuildConditionalExecute method is not implemented"); + /* + ConditionalExecuteTemplate conditional = conditional_template_; + + uint32_t address_low = PtrLow32(signal); + uint32_t address_high = PtrHigh32(signal); + assert(!(address_low & 0x7) && "destination address must be 8 byte aligned"); + + conditional.conditional.boolAddrLo = address_low; + conditional.conditional.boolAddrHi = address_high; + conditional.conditional.execCount = count; + + APPEND_COMMAND_WRAPPER(cmdbuf, conditional); + */ +} + +void Gfx9CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) { + // If Atomics are supported, use it + if (pcie_atomic_support_) { + BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr, + value); + return; + } + + BuildWriteData64Command(cmdbuf, addr, value); + return; +} + +void Gfx9CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_value, + bool interrupt) { + // Initialize the command including its header + EndofKernelNotifyTemplate eop = notify_template_; + PM4MEC_RELEASE_MEM* rel_mem = &eop.release_mem; + + // Program CP to perform various cache operations + // before issuing the write operation commences + rel_mem->bitfields2.tc_action_ena = true; + 
rel_mem->bitfields2.tc_wb_action_ena = true; + + // Update cmd to write a user specified 32-bit value + rel_mem->data_lo = write_value; + rel_mem->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; + + // Update cmd with user specified address to write to + rel_mem->address_hi = High32(uint64_t(write_addr)); + rel_mem->bitfields4b.address_lo_64b = (Low32(uint64_t(write_addr) >> 3)); + + // Update cmd to issue interrupt if user has requested it + if (interrupt) { + rel_mem->bitfields3.int_sel = int_sel__mec_release_mem__send_interrupt_after_write_confirm; + } + + // Serialize the command as stream of Dwords + APPEND_COMMAND_WRAPPER(cmdbuf, eop); +} + +void Gfx9CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) { + // TODO: temporarily remove the check because some OpenCL tests + // (test_buffers, test_relationals) are failing. + // if (using_cc_memory_policy_) + // return; + AcquireMemTemplate invalidate_src_caches = invalidate_cache_template_; + + // wbINVL2 by default writes-back and invalidates both L1 and L2 + invalidate_src_caches.acquire_mem.bitfields2.coher_cntl = CP_COHER_CNTL__TC_ACTION_ENA_MASK; + invalidate_src_caches.acquire_mem.bitfields2.coher_cntl |= CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK; + + APPEND_COMMAND_WRAPPER(cmdbuf, invalidate_src_caches); +} + +/* +// PM4 packet for profilers +#define PM4_PACKET3 (0xC0000000) +#define PM4_PACKET3_CMD_SHIFT 8 +#define PM4_PACKET3_COUNT_SHIFT 16 + +#define PACKET3(cmd, count) \ + (PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | \ + ((cmd) << PM4_PACKET3_CMD_SHIFT)) +*/ + +// Structure to store the event PM4 packet +typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket; + +void Gfx9CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) { + PM4MEC_EVENT_WRITE cp_event_initiator; + memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE)); + cp_event_initiator.ordinal1 = + PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE) / sizeof(uint32_t))); + 
cp_event_initiator.ordinal2 = 0; + + VGT_EVENT_TYPE eventType = Reserved_0x00; + switch (event) { + case kPerfCntrsStart: + eventType = PERFCOUNTER_START; + break; + case kPerfCntrsStop: + eventType = PERFCOUNTER_STOP; + break; + case kPerfCntrsSample: + eventType = PERFCOUNTER_SAMPLE; + break; + default: + assert(false && "Illegal VGT Event Id"); + } + + MEC_EVENT_WRITE_event_index_enum index; + index = event_index__mec_event_write__other; + cp_event_initiator.bitfields2.event_index = index; + cp_event_initiator.bitfields2.event_type = eventType; + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator); +} + +void Gfx9CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = + PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t))); + packet.item[1] = (addr - UCONFIG_SPACE_START); + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); +} + +void Gfx9CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = + PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t))); + packet.item[1] = (addr - UCONFIG_SPACE_START); + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); +} + +void Gfx9CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + WriteRegPacket packet; + packet.item[0] = + PM4_TYPE3_HDR(IT_SET_SH_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t))); + packet.item[1] = (addr - PERSISTENT_SPACE_START); + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); +} + +void Gfx9CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo, + uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, + bool wait) { + PM4MEC_COPY_DATA cmd_data; + memset(&cmd_data, 0, sizeof(PM4MEC_COPY_DATA)); + 
cmd_data.ordinal1 = PM4_TYPE3_HDR(IT_COPY_DATA, (sizeof(PM4MEC_COPY_DATA) / sizeof(uint32_t))); + + MEC_COPY_DATA_src_sel_enum data_src = src_sel__mec_copy_data__memory; + switch (src_sel) { + case 0: + data_src = src_sel__mec_copy_data__mem_mapped_register; + break; + case 4: + data_src = src_sel__mec_copy_data__perfcounters; + break; + default: + assert(false && "CopyData Illegal value for source of data"); + break; + } + cmd_data.bitfields2.src_sel = data_src; + cmd_data.bitfields2.src_cache_policy = src_cache_policy__mec_copy_data__stream; + + cmd_data.bitfields2.dst_sel = dst_sel__mec_copy_data__memory; + cmd_data.bitfields2.dst_cache_policy = dst_cache_policy__mec_copy_data__stream; + + cmd_data.bitfields2.wr_confirm = (MEC_COPY_DATA_wr_confirm_enum)wait; + cmd_data.bitfields2.count_sel = (size == 0) ? count_sel__mec_copy_data__32_bits_of_data + : count_sel__mec_copy_data__64_bits_of_data; + + // Specify the source register offset + cmd_data.bitfields3a.src_reg_offset = src_addr_lo; + + // Specify the destination memory address + cmd_data.dst_addr_hi = PtrHigh32(dst_addr); + if (size == 0) { + cmd_data.bitfields5b.dst_32b_addr_lo = (PtrLow32(dst_addr) >> 2); + } else { + cmd_data.bitfields5c.dst_64b_addr_lo = (PtrLow32(dst_addr) >> 3); + } + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, cmd_data); +} + +void Gfx9CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) { + // Initialize the command header + PM4MEC_ACQUIRE_MEM cache_flush = invalidate_cache_template_.acquire_mem; + + // Program Coherence Control Register. 
Initialize L2 Cache flush + // for Non-Coherent memory blocks + uint32_t coher_cntl = 0; + + coher_cntl |= CP_COHER_CNTL__TC_ACTION_ENA_MASK; + coher_cntl |= CP_COHER_CNTL__TCL1_ACTION_ENA_MASK; + coher_cntl |= CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK; + coher_cntl |= CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK; + coher_cntl |= CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK; + cache_flush.bitfields2.coher_cntl = coher_cntl; + + // Copy AcquireMem command buffer stream + APPEND_COMMAND_WRAPPER(cmdbuf, cache_flush); +} + +void Gfx9CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) { + BuildBarrierCommand(cmdbuf); + BuildCacheFlushPacket(cmdbuf); +} + +// Will issue a VGT event including a cache flush later on +void Gfx9CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) { + PM4MEC_EVENT_WRITE cp_event_initiator; + memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE)); + cp_event_initiator.ordinal1 = + PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE) / sizeof(uint32_t))); + cp_event_initiator.ordinal2 = 0; + + VGT_EVENT_TYPE eventType = Reserved_0x00; + switch (vgtEvent) { + case kPerfCntrsStart: + eventType = PERFCOUNTER_START; + break; + case kPerfCntrsStop: + eventType = PERFCOUNTER_STOP; + break; + case kPerfCntrsSample: + eventType = PERFCOUNTER_SAMPLE; + break; + case kThrdTraceStart: + eventType = THREAD_TRACE_START; + break; + case kThrdTraceStop: + eventType = THREAD_TRACE_STOP; + break; + case kThrdTraceFlush: + eventType = THREAD_TRACE_FLUSH; + break; + case kThrdTraceFinish: + eventType = THREAD_TRACE_FINISH; + break; + default: + assert(false && "Illegal VGT Event Id"); + } + + MEC_EVENT_WRITE_event_index_enum index; + index = event_index__mec_event_write__other; + cp_event_initiator.bitfields2.event_index = index; + cp_event_initiator.bitfields2.event_type = eventType; + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator); + + // Check If I should be issuing a cache flush operation 
as well + // test and remove it + BuildCacheFlushPacket(cmdbuf); +} + +void Gfx9CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) { + /* + WriteRegPacket packet; + packet.item[0] = (PM4_TYPE3_HDR( + IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0)); + packet.item[1] = addr - CONFIG_SPACE_START; + packet.item[2] = value; + + APPEND_COMMAND_WRAPPER(cmdbuf, packet); + + return; + */ +} + +void Gfx9CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) { + PM4MEC_EVENT_WRITE_QUERY cp_event_initiator; + memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE_QUERY)); + cp_event_initiator.ordinal1 = + PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE_QUERY) / sizeof(uint32_t))); + cp_event_initiator.ordinal2 = 0; + + // No event id is handled yet; add a case for each event you want to support + VGT_EVENT_TYPE eventType = Reserved_0x00; + switch (event) { + default: + assert(false && "Illegal VGT Event Id"); + } + + MEC_EVENT_WRITE_event_index_enum index; + cp_event_initiator.bitfields2.event_type = eventType; + index = (MEC_EVENT_WRITE_event_index_enum)EventTypeToIndexTable[eventType]; + cp_event_initiator.bitfields2.event_index = index; + + // Set the address; its low 3 bits are shifted out below, so it must be 8 byte aligned + uint32_t addrLo = PtrLow32(addr); + uint32_t addrHi = PtrHigh32(addr); + assert(!(addrLo & 0x7) && "EventWriteQuery address must be 8 byte aligned"); + + cp_event_initiator.address_hi = addrHi; + cp_event_initiator.bitfields3.address_lo = (addrLo >> 3); + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator); +} + +size_t Gfx9CmdWriter::SizeOfAtomicPacket() const { + return sizeof(AtomicTemplate) / sizeof(uint32_t); +} + +void Gfx9CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options, + uint32_t* writeAddr, uint32_t writeVal) { + PM4MEC_ACQUIRE_MEM cache_flush = invalidate_cache_template_.acquire_mem; + + // Verify write back address is valid. Note that this address is NOT + // used on CI. 
But to have a same interface as that on SI, we keep + // the address argument in this function. Thus, nothing is checked here: the address + // and value are deliberately ignored, no matter the address is NULL or not. + (void)writeAddr, (void)writeVal; + + // Program Coherence Control Register. Initialize L2 Cache flush + // for Non-Coherent memory blocks + uint32_t coher_cntl = 0; + coher_cntl |= (options->l1) ? CP_COHER_CNTL__TCL1_ACTION_ENA_MASK : 0; + coher_cntl |= (options->l2) + ? (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK) + : 0; + coher_cntl |= (options->icache) ? CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK : 0; + coher_cntl |= (options->kcache) ? CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK : 0; + cache_flush.bitfields2.coher_cntl = coher_cntl; + + // Append the built command into output Command Buffer + APPEND_COMMAND_WRAPPER(cmdbuf, cache_flush); + return; +} + +void Gfx9CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr, + uint32_t copySize, bool waitForConfirm) { + PM4MEC_DMA_DATA cmdDmaData; + memset(&cmdDmaData, 0, sizeof(PM4MEC_DMA_DATA)); + cmdDmaData.header.u32All = + PM4_TYPE3_HDR(IT_DMA_DATA, (sizeof(PM4MEC_DMA_DATA) / sizeof(uint32_t))); + + // Specify attributes of source buffer such as its + // location and Cache policy + cmdDmaData.bitfields2.src_sel = src_sel__mec_dma_data__src_addr_using_sas; + cmdDmaData.bitfields2.src_cache_policy = src_cache_policy__mec_dma_data__stream; + + // Specify attributes of destination buffer such as its + // location and Cache policy + cmdDmaData.bitfields2.dst_sel = dst_sel__mec_dma_data__dst_addr_using_das; + cmdDmaData.bitfields2.dst_cache_policy = dst_cache_policy__mec_dma_data__stream; + + // Specify the source and destination addr + cmdDmaData.src_addr_lo_or_data = PtrLow32(srcAddr); + cmdDmaData.src_addr_hi = PtrHigh32(srcAddr); + cmdDmaData.dst_addr_lo = PtrLow32(dstAddr); + cmdDmaData.dst_addr_hi = PtrHigh32(dstAddr); + + // Number of bytes to copy. 
The command restricts + // the size to be (64 MB - 1) - 26 Bits. NOTE(review): the assert below only admits sizes below 0x1FFFFF (21 bits) - confirm which limit is intended + assert(copySize < 0x1FFFFF); + cmdDmaData.bitfields7.byte_count = copySize; + + // Indicate that DMA Cmd should wait if its source + // is the destination of a previous DMA Cmd + cmdDmaData.bitfields7.raw_wait = waitForConfirm; + + APPEND_COMMAND_WRAPPER(cmdbuf, cmdDmaData); + return; +} + + +} // gfx9 namespace + +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmdwriter.h b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmdwriter.h new file mode 100644 index 0000000000..fe7e3ce216 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/commandwriter/gfx9_cmdwriter.h @@ -0,0 +1,199 @@ +#ifndef _GFX9_CMDWRITER_H_ +#define _GFX9_CMDWRITER_H_ + +#include "cmdwriter.h" +#include "gfx9_cmds.h" + +namespace pm4_profile { + +namespace gfx9 { + + +/// @brief class Gfx9CmdWriter implements the virtual class CommandWriter +/// for GFX9 chipsets +class Gfx9CmdWriter : public CommandWriter { + public: + Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support); + + /// @brief Dword specifying NOOP command for GFX9 chipsets. The macro + /// populates the NOOP command which is 32-bits wide. The second parameter, + /// the COUNT field of NOOP command, specifies the number of Dwords to skip. + /// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro + /// decrements the second parameter by TWO, an artifact of its definition, + /// the value is incremented by TWO to 0x4001 (0x3FFF + 2). 
+ /// + inline uint32_t GetNoOpCmd() { + static const uint32_t nopCmd = PM4_TYPE3_HDR(IT_NOP, 0x4001); + return nopCmd; + } + + void BuildBarrierCommand(CmdBuf* cmdBuf); + + void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size); + + void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val, + bool interrupt); + + void BuildBarrierFenceCommands(CmdBuf* cmdbuf); + + void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event); + + void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq, + uint32_t mask_val, uint32_t wait_val); + + void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + /// @brief Build CP command to program a Gpu register + /// + /// @param cmdbuf Pointer to command buffer to be appended + /// @param addr Register to be programmed + /// @param value Value to write into register + /// + /// @return void + void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo, + uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait); + + void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf); + + // Will issue a VGT event including a cache flush later on + void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent); + + void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value); + + void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr); + + void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr, + uint32_t value, uint32_t compare); + + void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr, + uint64_t value = 0, uint64_t compare = 0); + + size_t SizeOfAtomicPacket() const; + + void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count); 
+ + void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value); + + void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value); + + void BuildCacheFlushPacket(CmdBuf* cmdbuf); + + /// Writes into input buffer Gpu commands to flush its cache. It is + /// necessary that the buffer provided for flush commands is large + /// enough to accommodate the full set of commands. It should be at + /// least 512 bytes. + /// + /// @param cmdBuf Buffer to write commands to. + /// @param options Selects which caches to flush (l1, l2, icache, kcache). + /// @param writeAddr Address argument kept for interface compatibility. + /// NOTE(review): the gfx9 implementation appears to ignore it - confirm. + /// @param writeVal Value paired with writeAddr; it appears to be ignored as well. + /// + /// @return void + void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr, + uint32_t writeVal); + + /// Builds Gpu command to copy data from source to destination buffer + /// using DMA engine. + /// + /// @param cmdBuf Buffer updated with Gpu copy command + /// @param srcAddr Address of source buffer + /// @param dstAddr Address of destination buffer + /// @param copySize Size of data to copy in bytes + /// @param waitForCompletion Stored in the packet's raw_wait bit: wait if the source is the destination of a previous DMA command + void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize, + bool waitForCompletion); + + protected: + /// @brief Append an instance of Gpu command into input command buffer stream. 
+ /// + /// @param cmdbuf Command buffer to be appended with another Gpu command + /// + /// @param cmd Gpu command to be appended into command buffer + /// + /// @return void + template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd); + + private: + /// @brief Initializes a Gpu command which can be used to + /// reference a Gpu command stream indirectly + void InitializeLaunchTemplate(); + + /// @brief Initializes a Gpu command which can be used to + /// flush Gpu caches and write to a user configurable address + /// to indicate an end of kernel + void InitializeEndOfKernelNotifyTemplate(); + + /// @brief Initializes a Gpu command to perform atomic operations + /// + void InitializeAtomicTemplate(); + + /// @brief Initializes a Gpu command to allow conditional execution + /// of a Gpu command stream + void InitializeConditionalTemplate(); + + /// @brief Initializes a Gpu command to let command processor + /// wait for some update before letting other commands to be + /// processed + void InitializeWaitRegMemTemplate(); + + /// @brief Initializes the template for Barrier command. + /// Applications can use Barrier command to ensure their + /// command is executed only after all other commands have + /// completed their execution. + void InitializeBarrierTemplate(); + + void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value); + + /// @brief Initializes Acquire Memory command template. Users + /// can submit this command to invalidate Gpu caches - L1 and + /// or L2. 
+ void InitializeAcquireMemTemplate(); + + /// @brief Initializes an instance of Write Data command + /// for use by an application + void InitializeWriteDataTemplate(); + void InitializeWriteData64Template(); + void InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32); + + /// @brief Builds wait_reg_mem with EQUALS condition + void BuildWaitRegMemCommand(CmdBuf* cmdbuf, uint64_t wait_addr, uint32_t wait_value); + + /// @brief Instance of Gpu command to reference dispatch commands + LaunchTemplate launch_template_; + + /// @brief Instance of Gpu command to use in determining end of kernel + EndofKernelNotifyTemplate notify_template_; + + /// @brief Instance of Gpu command to use in performing atomic operations + AtomicTemplate atomic_template_; + + /// @brief Instance of Pm4 command WRITE_DATA + WriteDataTemplate write_data_template_; + WriteData64Template write_data64_template_; + + /// @brief Instance of Pm4 command EVENT_WRITE + BarrierTemplate pending_dispatch_template_; + + /// @brief Instance of Pm4 command ACQUIRE_MEM + AcquireMemTemplate invalidate_cache_template_; + + /// @brief Instance of Pm4 command WAIT_REG_MEM + WaitRegMemTemplate wait_reg_mem_template_; + + /// @brief ATC support. + bool atc_support_; + + /// @brief PCIe atomic support. 
+ bool pcie_atomic_support_; +}; + +} // gfx9 + +} // pm4_profile + +#endif // _GFX9_CMDWRITER_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/CMakeLists.txt b/runtime/hsa-ext-aql-profile/src/perfcounter/CMakeLists.txt new file mode 100644 index 0000000000..c8902cc036 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Source files for Rocr PerfCntr +# +set ( LIB_SRC var_data.cpp ) +set ( LIB_SRC ${LIB_SRC} info_set.cpp ) +set ( LIB_SRC ${LIB_SRC} parameter_set.cpp ) +set ( LIB_SRC ${LIB_SRC} gpu_counter.cpp ) +set ( LIB_SRC ${LIB_SRC} gpu_countergroup.cpp ) +set ( LIB_SRC ${LIB_SRC} vi_blockinfo.cpp ) +set ( LIB_SRC ${LIB_SRC} vi_pmu.cpp ) +set ( LIB_SRC ${LIB_SRC} ai_blockinfo.cpp ) +set ( LIB_SRC ${LIB_SRC} ai_pmu.cpp ) + +# +# Header files include path(s). +# +include_directories ( $ENV{ROCR_INC_DIR} ) +include_directories ( ${PROJ_DIR}/commandwriter ) +include_directories ( ${CORE_UTIL_DIR} ) + +# +# Build PerfCntr as a Static Library object +# +add_library ( ${PMC_LIB} STATIC ${LIB_SRC} ) diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/ai_blockinfo.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_blockinfo.cpp new file mode 100644 index 0000000000..f99dbe4b13 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_blockinfo.cpp @@ -0,0 +1,555 @@ +#include "ai_blockinfo.h" +#include "gfxip/gfx9/gfx9_offset.h" +#include "gfxip/gfx9/gfx9_typedef.h" + +namespace pm4_profile { +/** + * Table containing CounterGroups which represent AI hardware blocks + * as defined by \ref GpuBlockInfo structure + */ +GpuBlockInfo AiPmuHwBlocks[] = { + // Counter block CB + {"AI_CB0", kHsaAiCounterBlockIdCb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB, + CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_CB1", kHsaAiCounterBlockIdCb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB, + CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 
0, 0}, + {"AI_CB2", kHsaAiCounterBlockIdCb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB, + CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_CB3", kHsaAiCounterBlockIdCb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB, + CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + + // Temp commented for Vega10 + // Counter block CPF + /* + {"AI_CPF", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19, + AI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0}, + */ + {"AI_CB3", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB, + CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block DB + {"AI_DB0", kHsaAiCounterBlockIdDb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB, + CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_DB1", kHsaAiCounterBlockIdDb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB, + CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_DB2", kHsaAiCounterBlockIdDb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB, + CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_DB3", kHsaAiCounterBlockIdDb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB, + CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block GRBM + {"AI_GRBM", kHsaAiCounterBlockIdGrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33, + AI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block GRBMSE + {"AI_GRBMSE", kHsaAiCounterBlockIdGrbmSe, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14, + AI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block PA_SU + {"AI_PA_SU", kHsaAiCounterBlockIdPaSu, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152, + AI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block PA_SC + 
{"AI_PA_SC", kHsaAiCounterBlockIdPaSc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396, + AI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block SPI + {"AI_SPI", kHsaAiCounterBlockIdSpi, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196, + AI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block SQ + {"AI_SQ", kHsaAiCounterBlockIdSq, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_SQ_GS", kHsaAiCounterBlockIdSqGs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_SQ_VS", kHsaAiCounterBlockIdSqVs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_SQ_PS", kHsaAiCounterBlockIdSqPs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_SQ_HS", kHsaAiCounterBlockIdSqHs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_SQ_CS", kHsaAiCounterBlockIdSqCs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block SX + {"AI_SX", kHsaAiCounterBlockIdSx, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33, + AI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TA + {"AI_TA0", kHsaAiCounterBlockIdTa0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA1", kHsaAiCounterBlockIdTa1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA2", kHsaAiCounterBlockIdTa2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA3", kHsaAiCounterBlockIdTa3, 
AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA4", kHsaAiCounterBlockIdTa4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA5", kHsaAiCounterBlockIdTa5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA6", kHsaAiCounterBlockIdTa6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA7", kHsaAiCounterBlockIdTa7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA8", kHsaAiCounterBlockIdTa8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA9", kHsaAiCounterBlockIdTa9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA10", kHsaAiCounterBlockIdTa10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA11", kHsaAiCounterBlockIdTa11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA12", kHsaAiCounterBlockIdTa12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA13", kHsaAiCounterBlockIdTa13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TA14", kHsaAiCounterBlockIdTa14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + 
{"AI_TA15", kHsaAiCounterBlockIdTa15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA, + CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TCA + {"AI_TCA0", kHsaAiCounterBlockIdTca0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA, + CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCA1", kHsaAiCounterBlockIdTca1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA, + CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TCC + {"AI_TCC0", kHsaAiCounterBlockIdTcc0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC1", kHsaAiCounterBlockIdTcc1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC2", kHsaAiCounterBlockIdTcc2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC3", kHsaAiCounterBlockIdTcc3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC4", kHsaAiCounterBlockIdTcc4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC5", kHsaAiCounterBlockIdTcc5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC6", kHsaAiCounterBlockIdTcc6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC7", kHsaAiCounterBlockIdTcc7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC8", kHsaAiCounterBlockIdTcc8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, 
AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC9", kHsaAiCounterBlockIdTcc9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC10", kHsaAiCounterBlockIdTcc10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC11", kHsaAiCounterBlockIdTcc11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC12", kHsaAiCounterBlockIdTcc12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC13", kHsaAiCounterBlockIdTcc13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC14", kHsaAiCounterBlockIdTcc14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCC15", kHsaAiCounterBlockIdTcc15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC, + CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TD + {"AI_TD0", kHsaAiCounterBlockIdTd0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD1", kHsaAiCounterBlockIdTd1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD2", kHsaAiCounterBlockIdTd2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD3", kHsaAiCounterBlockIdTd3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD4", kHsaAiCounterBlockIdTd4, AI_MAX_NUM_SHADER_ENGINES, 2, 
AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD5", kHsaAiCounterBlockIdTd5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD6", kHsaAiCounterBlockIdTd6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD7", kHsaAiCounterBlockIdTd7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD8", kHsaAiCounterBlockIdTd8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD9", kHsaAiCounterBlockIdTd9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD10", kHsaAiCounterBlockIdTd10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD11", kHsaAiCounterBlockIdTd11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD12", kHsaAiCounterBlockIdTd12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD13", kHsaAiCounterBlockIdTd13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD14", kHsaAiCounterBlockIdTd14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TD15", kHsaAiCounterBlockIdTd15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD, + CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TCP + {"AI_TCP0", 
kHsaAiCounterBlockIdTcp0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP1", kHsaAiCounterBlockIdTcp1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP2", kHsaAiCounterBlockIdTcp2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP3", kHsaAiCounterBlockIdTcp3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP4", kHsaAiCounterBlockIdTcp4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP5", kHsaAiCounterBlockIdTcp5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP6", kHsaAiCounterBlockIdTcp6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP7", kHsaAiCounterBlockIdTcp7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP8", kHsaAiCounterBlockIdTcp8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP9", kHsaAiCounterBlockIdTcp9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP10", kHsaAiCounterBlockIdTcp10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP11", kHsaAiCounterBlockIdTcp11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, 
AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP12", kHsaAiCounterBlockIdTcp12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP13", kHsaAiCounterBlockIdTcp13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP14", kHsaAiCounterBlockIdTcp14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"AI_TCP15", kHsaAiCounterBlockIdTcp15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP, + CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block GDS + {"AI_GDS", kHsaAiCounterBlockIdGds, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120, + AI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block VGT + {"AI_VGT", kHsaAiCounterBlockIdVgt, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145, + AI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block IA + {"AI_IA", kHsaAiCounterBlockIdIa, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23, + AI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block MC + {"AI_MC", kHsaAiCounterBlockIdMc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22, + AI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0}, + + // Temp commented out for Vega10 + // Counter block SRBM + /* + {"AI_SRBM", kHsaAiCounterBlockIdSrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19, + AI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0}, + */ + + // Counter block WD + {"AI_WD", kHsaAiCounterBlockIdWd, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36, + AI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block CPG + // Temp commented for Vega10 + /* + {"AI_CPG", kHsaAiCounterBlockIdCpg, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48, + 
AI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0}, + */ + + // Counter block CPC + // Temp commented for Vega10 + /* + {"AI_CPC", kHsaAiCounterBlockIdCpc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24, + AI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0}, + */ + + // Counter block IOMMUV2 + {"AI_IOMMUV2", kHsaAiCounterBlockIdIommuV2, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25, + 8, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block KernelDriver + {"AI_KD", kHsaAiCounterBlockIdKernelDriver, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0, + 0, 0, 0, true, 0, 0, false, 0, 0}, + + // Name of the last line should be empty to indicate end of all counter groups + {"", kHsaAiCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0, + 0}}; + +/* + * The following tables contain register addresses of the SQ counter registers + */ + +/* + * SQ + */ +GpuCounterRegInfo AiSqCounterRegAddr[] = { + {mmSQ_PERFCOUNTER0_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER0_LO, mmSQ_PERFCOUNTER0_HI}, + {mmSQ_PERFCOUNTER1_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER1_LO, mmSQ_PERFCOUNTER1_HI}, + {mmSQ_PERFCOUNTER2_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER2_LO, mmSQ_PERFCOUNTER2_HI}, + {mmSQ_PERFCOUNTER3_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER3_LO, mmSQ_PERFCOUNTER3_HI}, + {mmSQ_PERFCOUNTER4_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER4_LO, mmSQ_PERFCOUNTER4_HI}, + {mmSQ_PERFCOUNTER5_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER5_LO, mmSQ_PERFCOUNTER5_HI}, + {mmSQ_PERFCOUNTER6_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER6_LO, mmSQ_PERFCOUNTER6_HI}, + {mmSQ_PERFCOUNTER7_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER7_LO, mmSQ_PERFCOUNTER7_HI}, + {mmSQ_PERFCOUNTER8_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER8_LO, mmSQ_PERFCOUNTER8_HI}, + {mmSQ_PERFCOUNTER9_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER9_LO, mmSQ_PERFCOUNTER9_HI}, + {mmSQ_PERFCOUNTER10_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER10_LO, + 
mmSQ_PERFCOUNTER10_HI},
    {mmSQ_PERFCOUNTER11_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER11_LO,
     mmSQ_PERFCOUNTER11_HI},
    {mmSQ_PERFCOUNTER12_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER12_LO,
     mmSQ_PERFCOUNTER12_HI},
    {mmSQ_PERFCOUNTER13_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER13_LO,
     mmSQ_PERFCOUNTER13_HI},
    {mmSQ_PERFCOUNTER14_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER14_LO,
     mmSQ_PERFCOUNTER14_HI},
    {mmSQ_PERFCOUNTER15_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER15_LO,
     mmSQ_PERFCOUNTER15_HI}};

/*
 * Layout shared by the tables below: one initializer per hardware counter,
 * written as {select register, control register, low result register,
 * high result register}. Only the SQ table fills the second slot
 * (mmSQ_PERFCOUNTER_CTRL); every other table passes 0 there.
 * NOTE(review): the field names of GpuCounterRegInfo are declared elsewhere
 * (gpu_blockinfo.h) - confirm the slot order against that declaration.
 */

/*
 * DRMDMA
 * Two counters for each of SDMA0 and SDMA1. Both counters of one engine share
 * that engine's PERFMON_CNTL register, and only a single RESULT register is
 * listed per counter (the fourth slot is 0).
 */
GpuCounterRegInfo AiDrmdmaCounterRegAddr[] = {
    {mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER0_RESULT, 0},
    {mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER1_RESULT, 0},
    {mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER0_RESULT, 0},
    {mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER1_RESULT, 0},
};

/*
 * IH
 * Two counters sharing mmIH_PERFMON_CNTL, one RESULT register each.
 */
GpuCounterRegInfo AiIhCounterRegAddr[] = {{mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER0_RESULT, 0},
                                          {mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER1_RESULT, 0}};

/*
 * CPF
 * Two counters.
 */
GpuCounterRegInfo AiCpfCounterRegAddr[] = {
    {mmCPF_PERFCOUNTER0_SELECT, 0, mmCPF_PERFCOUNTER0_LO, mmCPF_PERFCOUNTER0_HI},
    {mmCPF_PERFCOUNTER1_SELECT, 0, mmCPF_PERFCOUNTER1_LO, mmCPF_PERFCOUNTER1_HI}};

/*
 * DRM
 * NOTE(review): every entry is commented out, which leaves an unsized array
 * with an empty initializer, i.e. a zero-length array. ISO C++ does not allow
 * that (GCC/Clang accept it as an extension) and any index into the table is
 * out of bounds - confirm nothing reads it.
 */
GpuCounterRegInfo AiDrmCounterRegAddr[] = {
    /*
    {mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
    {mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}
    */
};

/*
 * GRBM
 * Two counters.
 */
GpuCounterRegInfo AiGrbmCounterRegAddr[] = {
    {mmGRBM_PERFCOUNTER0_SELECT, 0, mmGRBM_PERFCOUNTER0_LO, mmGRBM_PERFCOUNTER0_HI},
    {mmGRBM_PERFCOUNTER1_SELECT, 0, mmGRBM_PERFCOUNTER1_LO, mmGRBM_PERFCOUNTER1_HI}};

/*
 * GRBM_SE
 * One entry per GRBM_SE0..GRBM_SE3 register set (four entries).
 */
GpuCounterRegInfo AiGrbmSeCounterRegAddr[] = {
    {mmGRBM_SE0_PERFCOUNTER_SELECT, 0, mmGRBM_SE0_PERFCOUNTER_LO, mmGRBM_SE0_PERFCOUNTER_HI},
    {mmGRBM_SE1_PERFCOUNTER_SELECT, 0, mmGRBM_SE1_PERFCOUNTER_LO, mmGRBM_SE1_PERFCOUNTER_HI},
    {mmGRBM_SE2_PERFCOUNTER_SELECT, 0, mmGRBM_SE2_PERFCOUNTER_LO, mmGRBM_SE2_PERFCOUNTER_HI},
    {mmGRBM_SE3_PERFCOUNTER_SELECT, 0, mmGRBM_SE3_PERFCOUNTER_LO, mmGRBM_SE3_PERFCOUNTER_HI}};

/*
 * PA_SU
 * Four counters.
 */
GpuCounterRegInfo AiPaSuCounterRegAddr[] = {
    {mmPA_SU_PERFCOUNTER0_SELECT, 0, mmPA_SU_PERFCOUNTER0_LO, mmPA_SU_PERFCOUNTER0_HI},
    {mmPA_SU_PERFCOUNTER1_SELECT, 0, mmPA_SU_PERFCOUNTER1_LO, mmPA_SU_PERFCOUNTER1_HI},
    {mmPA_SU_PERFCOUNTER2_SELECT, 0, mmPA_SU_PERFCOUNTER2_LO, mmPA_SU_PERFCOUNTER2_HI},
    {mmPA_SU_PERFCOUNTER3_SELECT, 0, mmPA_SU_PERFCOUNTER3_LO, mmPA_SU_PERFCOUNTER3_HI}};

/*
 * PA_SC
 * Four counters (PA_SC_PERFCOUNTER0..3). The eight-entry AiScCounterRegAddr
 * table further down lists the same registers plus PERFCOUNTER4..7.
 */
GpuCounterRegInfo AiPaScCounterRegAddr[] = {
    {mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
    {mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
    {mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
    {mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI}};

/*
 * SPI
 * Six counters.
 */
GpuCounterRegInfo AiSpiCounterRegAddr[] = {
    {mmSPI_PERFCOUNTER0_SELECT, 0, mmSPI_PERFCOUNTER0_LO, mmSPI_PERFCOUNTER0_HI},
    {mmSPI_PERFCOUNTER1_SELECT, 0, mmSPI_PERFCOUNTER1_LO, mmSPI_PERFCOUNTER1_HI},
    {mmSPI_PERFCOUNTER2_SELECT, 0, mmSPI_PERFCOUNTER2_LO, mmSPI_PERFCOUNTER2_HI},
    {mmSPI_PERFCOUNTER3_SELECT, 0, mmSPI_PERFCOUNTER3_LO, mmSPI_PERFCOUNTER3_HI},
    {mmSPI_PERFCOUNTER4_SELECT, 0, mmSPI_PERFCOUNTER4_LO, mmSPI_PERFCOUNTER4_HI},
    {mmSPI_PERFCOUNTER5_SELECT, 0, mmSPI_PERFCOUNTER5_LO, mmSPI_PERFCOUNTER5_HI}};

/*
 * TCA
 * Four counters.
 */
GpuCounterRegInfo AiTcaCounterRegAddr[] = {
    {mmTCA_PERFCOUNTER0_SELECT, 0, mmTCA_PERFCOUNTER0_LO, mmTCA_PERFCOUNTER0_HI},
    {mmTCA_PERFCOUNTER1_SELECT, 0, mmTCA_PERFCOUNTER1_LO, mmTCA_PERFCOUNTER1_HI},
    {mmTCA_PERFCOUNTER2_SELECT, 0, mmTCA_PERFCOUNTER2_LO, mmTCA_PERFCOUNTER2_HI},
    {mmTCA_PERFCOUNTER3_SELECT, 0, mmTCA_PERFCOUNTER3_LO, mmTCA_PERFCOUNTER3_HI}};

/*
 * TCC
 * Four counters.
 */
GpuCounterRegInfo AiTccCounterRegAddr[] = {
    {mmTCC_PERFCOUNTER0_SELECT, 0, mmTCC_PERFCOUNTER0_LO, mmTCC_PERFCOUNTER0_HI},
    {mmTCC_PERFCOUNTER1_SELECT, 0, mmTCC_PERFCOUNTER1_LO, mmTCC_PERFCOUNTER1_HI},
    {mmTCC_PERFCOUNTER2_SELECT, 0, mmTCC_PERFCOUNTER2_LO, mmTCC_PERFCOUNTER2_HI},
    {mmTCC_PERFCOUNTER3_SELECT, 0, mmTCC_PERFCOUNTER3_LO, mmTCC_PERFCOUNTER3_HI}};

/*
 * TCP
 * Four counters.
 */
GpuCounterRegInfo AiTcpCounterRegAddr[] = {
    {mmTCP_PERFCOUNTER0_SELECT, 0, mmTCP_PERFCOUNTER0_LO, mmTCP_PERFCOUNTER0_HI},
    {mmTCP_PERFCOUNTER1_SELECT, 0, mmTCP_PERFCOUNTER1_LO, mmTCP_PERFCOUNTER1_HI},
    {mmTCP_PERFCOUNTER2_SELECT, 0, mmTCP_PERFCOUNTER2_LO, mmTCP_PERFCOUNTER2_HI},
    {mmTCP_PERFCOUNTER3_SELECT, 0, mmTCP_PERFCOUNTER3_LO, mmTCP_PERFCOUNTER3_HI}};

/*
 * CB
 * Four counters.
 */
GpuCounterRegInfo AiCbCounterRegAddr[] = {
    {mmCB_PERFCOUNTER0_SELECT, 0, mmCB_PERFCOUNTER0_LO, mmCB_PERFCOUNTER0_HI},
    {mmCB_PERFCOUNTER1_SELECT, 0, mmCB_PERFCOUNTER1_LO, mmCB_PERFCOUNTER1_HI},
    {mmCB_PERFCOUNTER2_SELECT, 0, mmCB_PERFCOUNTER2_LO, mmCB_PERFCOUNTER2_HI},
    {mmCB_PERFCOUNTER3_SELECT, 0, mmCB_PERFCOUNTER3_LO, mmCB_PERFCOUNTER3_HI}};

/*
 * DB
 * Four counters.
 */
GpuCounterRegInfo AiDbCounterRegAddr[] = {
    {mmDB_PERFCOUNTER0_SELECT, 0, mmDB_PERFCOUNTER0_LO, mmDB_PERFCOUNTER0_HI},
    {mmDB_PERFCOUNTER1_SELECT, 0, mmDB_PERFCOUNTER1_LO, mmDB_PERFCOUNTER1_HI},
    {mmDB_PERFCOUNTER2_SELECT, 0, mmDB_PERFCOUNTER2_LO, mmDB_PERFCOUNTER2_HI},
    {mmDB_PERFCOUNTER3_SELECT, 0, mmDB_PERFCOUNTER3_LO, mmDB_PERFCOUNTER3_HI}};

/*
 * RLC
 * Two counters.
 */
GpuCounterRegInfo AiRlcCounterRegAddr[] = {
    {mmRLC_PERFCOUNTER0_SELECT, 0, mmRLC_PERFCOUNTER0_LO, mmRLC_PERFCOUNTER0_HI},
    {mmRLC_PERFCOUNTER1_SELECT, 0, mmRLC_PERFCOUNTER1_LO, mmRLC_PERFCOUNTER1_HI}};

/*
 * SC
 * Eight counters, all on the PA_SC register set (PA_SC_PERFCOUNTER0..7).
 * NOTE(review): the first four entries repeat AiPaScCounterRegAddr - confirm
 * which of the two tables the PA_SC block is meant to use.
 */
GpuCounterRegInfo AiScCounterRegAddr[] = {
    {mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
    {mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
    {mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
    {mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI},
    {mmPA_SC_PERFCOUNTER4_SELECT, 0, mmPA_SC_PERFCOUNTER4_LO, mmPA_SC_PERFCOUNTER4_HI},
    {mmPA_SC_PERFCOUNTER5_SELECT, 0, mmPA_SC_PERFCOUNTER5_LO, mmPA_SC_PERFCOUNTER5_HI},
    {mmPA_SC_PERFCOUNTER6_SELECT, 0, mmPA_SC_PERFCOUNTER6_LO, mmPA_SC_PERFCOUNTER6_HI},
    {mmPA_SC_PERFCOUNTER7_SELECT, 0, mmPA_SC_PERFCOUNTER7_LO, mmPA_SC_PERFCOUNTER7_HI}};

/*
 * SX
 * Four counters.
 */
GpuCounterRegInfo AiSxCounterRegAddr[] = {
    {mmSX_PERFCOUNTER0_SELECT, 0, mmSX_PERFCOUNTER0_LO, mmSX_PERFCOUNTER0_HI},
    {mmSX_PERFCOUNTER1_SELECT, 0, mmSX_PERFCOUNTER1_LO, mmSX_PERFCOUNTER1_HI},
    {mmSX_PERFCOUNTER2_SELECT, 0, mmSX_PERFCOUNTER2_LO, mmSX_PERFCOUNTER2_HI},
    {mmSX_PERFCOUNTER3_SELECT, 0, mmSX_PERFCOUNTER3_LO, mmSX_PERFCOUNTER3_HI}};

/*
 * TA
 * Two counters.
 */
GpuCounterRegInfo AiTaCounterRegAddr[] = {
    {mmTA_PERFCOUNTER0_SELECT, 0, mmTA_PERFCOUNTER0_LO, mmTA_PERFCOUNTER0_HI},
    {mmTA_PERFCOUNTER1_SELECT, 0, mmTA_PERFCOUNTER1_LO, mmTA_PERFCOUNTER1_HI}};

/*
 * TD
 * Two counters.
 */
GpuCounterRegInfo AiTdCounterRegAddr[] = {
    {mmTD_PERFCOUNTER0_SELECT, 0, mmTD_PERFCOUNTER0_LO, mmTD_PERFCOUNTER0_HI},
    {mmTD_PERFCOUNTER1_SELECT, 0, mmTD_PERFCOUNTER1_LO, mmTD_PERFCOUNTER1_HI}};

/*
 * GDS
 * Four counters.
 */
GpuCounterRegInfo AiGdsCounterRegAddr[] = {
    {mmGDS_PERFCOUNTER0_SELECT, 0, mmGDS_PERFCOUNTER0_LO, mmGDS_PERFCOUNTER0_HI},
    {mmGDS_PERFCOUNTER1_SELECT, 0, mmGDS_PERFCOUNTER1_LO, mmGDS_PERFCOUNTER1_HI},
    {mmGDS_PERFCOUNTER2_SELECT, 0, mmGDS_PERFCOUNTER2_LO, mmGDS_PERFCOUNTER2_HI},
    {mmGDS_PERFCOUNTER3_SELECT, 0, mmGDS_PERFCOUNTER3_LO, mmGDS_PERFCOUNTER3_HI}};

/*
 * VGT
 * Four counters.
 */
GpuCounterRegInfo AiVgtCounterRegAddr[] = {
    {mmVGT_PERFCOUNTER0_SELECT, 0, mmVGT_PERFCOUNTER0_LO, mmVGT_PERFCOUNTER0_HI},
    {mmVGT_PERFCOUNTER1_SELECT, 0, mmVGT_PERFCOUNTER1_LO, mmVGT_PERFCOUNTER1_HI},
    {mmVGT_PERFCOUNTER2_SELECT, 0, mmVGT_PERFCOUNTER2_LO, mmVGT_PERFCOUNTER2_HI},
    {mmVGT_PERFCOUNTER3_SELECT, 0, mmVGT_PERFCOUNTER3_LO, mmVGT_PERFCOUNTER3_HI}};

/*
 * IA
 * Four counters.
 */
GpuCounterRegInfo AiIaCounterRegAddr[] = {
    {mmIA_PERFCOUNTER0_SELECT, 0, mmIA_PERFCOUNTER0_LO, mmIA_PERFCOUNTER0_HI},
    {mmIA_PERFCOUNTER1_SELECT, 0, mmIA_PERFCOUNTER1_LO, mmIA_PERFCOUNTER1_HI},
    {mmIA_PERFCOUNTER2_SELECT, 0, mmIA_PERFCOUNTER2_LO, mmIA_PERFCOUNTER2_HI},
    {mmIA_PERFCOUNTER3_SELECT, 0, mmIA_PERFCOUNTER3_LO, mmIA_PERFCOUNTER3_HI}};

/*
 * MC
 * NOTE(review): all entries are commented out, so this is a zero-length
 * array (see the note on the DRM table). The AiPmuHwBlocks table still has an
 * active "AI_MC" row - confirm that no code indexes this table for it.
 */
GpuCounterRegInfo AiMcCounterRegAddr[] = {
    /*

    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
    mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
    mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
    mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
    mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}

    */
};

/*
 * SRBM
 * NOTE(review): all entries are commented out, so this is a zero-length
 * array (see the note on the DRM table).
 */
GpuCounterRegInfo AiSrbmCounterRegAddr[] = {
    /*
    {mmSRBM_PERFCOUNTER0_SELECT, 0, mmSRBM_PERFCOUNTER0_LO,
    mmSRBM_PERFCOUNTER0_HI},
    {mmSRBM_PERFCOUNTER1_SELECT, 0, mmSRBM_PERFCOUNTER1_LO,
    mmSRBM_PERFCOUNTER1_HI}
    */
};

/*
 * WD
 * Four counters.
 */
GpuCounterRegInfo AiWdCounterRegAddr[] = {
    {mmWD_PERFCOUNTER0_SELECT, 0, mmWD_PERFCOUNTER0_LO, mmWD_PERFCOUNTER0_HI},
    {mmWD_PERFCOUNTER1_SELECT, 0, mmWD_PERFCOUNTER1_LO, mmWD_PERFCOUNTER1_HI},
    {mmWD_PERFCOUNTER2_SELECT, 0, mmWD_PERFCOUNTER2_LO, mmWD_PERFCOUNTER2_HI},
    {mmWD_PERFCOUNTER3_SELECT, 0, mmWD_PERFCOUNTER3_LO, mmWD_PERFCOUNTER3_HI}};

/*
 * CPG
 * Two counters. The matching "AI_CPG" row of AiPmuHwBlocks is commented out.
 */
GpuCounterRegInfo AiCpgCounterRegAddr[] = {
    {mmCPG_PERFCOUNTER0_SELECT, 0, mmCPG_PERFCOUNTER0_LO, mmCPG_PERFCOUNTER0_HI},
    {mmCPG_PERFCOUNTER1_SELECT, 0, mmCPG_PERFCOUNTER1_LO, mmCPG_PERFCOUNTER1_HI}};

/*
 * CPC
 * Two counters. The matching "AI_CPC" row of AiPmuHwBlocks is commented out.
 */
GpuCounterRegInfo AiCpcCounterRegAddr[] = {
    {mmCPC_PERFCOUNTER0_SELECT, 0, mmCPC_PERFCOUNTER0_LO, mmCPC_PERFCOUNTER0_HI},
    {mmCPC_PERFCOUNTER1_SELECT, 0, mmCPC_PERFCOUNTER1_LO, mmCPC_PERFCOUNTER1_HI}};

GpuPrivCounterBlockId AiBlockIdSq =
{{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}}; +GpuPrivCounterBlockId AiBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}}; +GpuPrivCounterBlockId AiBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}}; +GpuPrivCounterBlockId AiBlockIdKernelDriver = {{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}}; + +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/ai_blockinfo.h b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_blockinfo.h new file mode 100644 index 0000000000..8e2460d7c0 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_blockinfo.h @@ -0,0 +1,252 @@ +#ifndef _AI_BLOCKINFO_H_ +#define _AI_BLOCKINFO_H_ + +#include +#include "rocr_profiler.h" +#include "gpu_enum.h" +#include "gpu_blockinfo.h" + +namespace pm4_profile { + +// MAX Number of block instances for ARCTIC ISLANDS (From Vega10) +// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj + +// @brief Number of block instances. + +// Number of CB block instances per SE +// and number of Perf Cntrs per CB block +#define AI_NUM_CB 4 +#define AI_COUNTER_NUM_PER_CB 4 + +// Number of DB block instances per SE +// and number of Perf Cntrs per DB block +#define AI_NUM_DB 4 +#define AI_COUNTER_NUM_PER_DB 4 + +// Number of TA block instances per SE +// and number of Perf Cntrs per TA block +#define AI_NUM_TA 16 +#define AI_COUNTER_NUM_PER_TA 2 + +// Number of TD block instances per SE +// and number of Perf Cntrs per TD block +#define AI_NUM_TD 16 +#define AI_COUNTER_NUM_PER_TD 2 + +// Number of TCP block instances per SE +// and number of Perf Cntrs per TCP block +#define AI_NUM_TCP 16 +#define AI_COUNTER_NUM_PER_TCP 4 + +// Number of TCA block instances per chip +// and number of Perf Cntrs per TCA block +#define AI_NUM_TCA 2 +#define AI_COUNTER_NUM_PER_TCA 4 + +// Number of TCC block instances per chip +// and number of Perf Cntrs per TCC block +#define AI_NUM_TCC 16 +#define AI_COUNTER_NUM_PER_TCC 4 + +// Number of 
// SDMA block instances per chip
// and number of Perf Cntrs per SDMA block
// (only the instance count is defined here; the per-block counter count for
// SDMA is AI_COUNTER_NUM_PER_DRMDMA below)
#define AI_NUM_SDMA 2

// Number of counter registers per block for arctic islands
#define AI_COUNTER_NUM_PER_DRM 2
#define AI_COUNTER_NUM_PER_DRMDMA 2
#define AI_COUNTER_NUM_PER_IH 2
#define AI_COUNTER_NUM_PER_SRBM 2
#define AI_COUNTER_NUM_PER_CPF 2
#define AI_COUNTER_NUM_PER_GRBM 2
#define AI_COUNTER_NUM_PER_GRBMSE 4
#define AI_COUNTER_NUM_PER_PA_SU 4
#define AI_COUNTER_NUM_PER_RLC 2
#define AI_COUNTER_NUM_PER_PA_SC 8
#define AI_COUNTER_NUM_PER_SPI 6  // [Shucai: To do: double check the value]
#define AI_COUNTER_NUM_PER_SQ 16
#define AI_COUNTER_NUM_PER_SX 4
#define AI_COUNTER_NUM_PER_GDS 4
#define AI_COUNTER_NUM_PER_VGT 4
#define AI_COUNTER_NUM_PER_IA 4
#define AI_COUNTER_NUM_PER_MC 4
#define AI_COUNTER_NUM_PER_TCS 4
#define AI_COUNTER_NUM_PER_WD 4
#define AI_COUNTER_NUM_PER_CPG 2
#define AI_COUNTER_NUM_PER_CPC 2
#define AI_COUNTER_NUM_PER_VM 1
#define AI_COUNTER_NUM_PER_VM_MD 1
#define AI_COUNTER_NUM_PER_PIPESTATS 12

// Compile-time default for the shader-engine column of AiPmuHwBlocks.
// AiPmu::initCounterBlock() overwrites maxShaderEngineCount with num_se_ at
// run time, so this value is only the initial table content.
#define AI_MAX_NUM_SHADER_ENGINES 1

// Enumeration of AI hardware counter blocks.
// Values are assigned sequentially from kHsaAiCounterBlockIdCb0 = 0, so
// commenting an enumerator in or out renumbers everything after it. The
// per-instance ids (Ta0..Ta15, Td0..Td15, Tcp0..Tcp15, ...) are contiguous;
// ProgramTcpCntrs()/ProgramTdCntrs() rely on that when they compute the
// instance index as (block id - first id of the group).
typedef enum HsaAiCounterBlockId {
  kHsaAiCounterBlockIdCb0 = 0,
  kHsaAiCounterBlockIdCb1,
  kHsaAiCounterBlockIdCb2,
  kHsaAiCounterBlockIdCb3,

  // Temp commented for Vega10
  // NOTE(review): despite the comment above, this enumerator is still active.
  kHsaAiCounterBlockIdCpf,

  kHsaAiCounterBlockIdDb0,
  kHsaAiCounterBlockIdDb1,
  kHsaAiCounterBlockIdDb2,
  kHsaAiCounterBlockIdDb3,

  kHsaAiCounterBlockIdGrbm,
  kHsaAiCounterBlockIdGrbmSe,
  kHsaAiCounterBlockIdPaSu,
  kHsaAiCounterBlockIdPaSc,
  kHsaAiCounterBlockIdSpi,

  kHsaAiCounterBlockIdSq,
  kHsaAiCounterBlockIdSqGs,
  kHsaAiCounterBlockIdSqVs,
  kHsaAiCounterBlockIdSqPs,
  kHsaAiCounterBlockIdSqHs,
  kHsaAiCounterBlockIdSqCs,

  kHsaAiCounterBlockIdSx,

  kHsaAiCounterBlockIdTa0,
  kHsaAiCounterBlockIdTa1,
  kHsaAiCounterBlockIdTa2,
  kHsaAiCounterBlockIdTa3,
  kHsaAiCounterBlockIdTa4,
  kHsaAiCounterBlockIdTa5,
  kHsaAiCounterBlockIdTa6,
  kHsaAiCounterBlockIdTa7,
  kHsaAiCounterBlockIdTa8,
  kHsaAiCounterBlockIdTa9,
  kHsaAiCounterBlockIdTa10,
  kHsaAiCounterBlockIdTa11,
  kHsaAiCounterBlockIdTa12,
  kHsaAiCounterBlockIdTa13,
  kHsaAiCounterBlockIdTa14,
  kHsaAiCounterBlockIdTa15,

  kHsaAiCounterBlockIdTca0,
  kHsaAiCounterBlockIdTca1,

  kHsaAiCounterBlockIdTcc0,
  kHsaAiCounterBlockIdTcc1,
  kHsaAiCounterBlockIdTcc2,
  kHsaAiCounterBlockIdTcc3,
  kHsaAiCounterBlockIdTcc4,
  kHsaAiCounterBlockIdTcc5,
  kHsaAiCounterBlockIdTcc6,
  kHsaAiCounterBlockIdTcc7,
  kHsaAiCounterBlockIdTcc8,
  kHsaAiCounterBlockIdTcc9,
  kHsaAiCounterBlockIdTcc10,
  kHsaAiCounterBlockIdTcc11,
  kHsaAiCounterBlockIdTcc12,
  kHsaAiCounterBlockIdTcc13,
  kHsaAiCounterBlockIdTcc14,
  kHsaAiCounterBlockIdTcc15,

  kHsaAiCounterBlockIdTd0,
  kHsaAiCounterBlockIdTd1,
  kHsaAiCounterBlockIdTd2,
  kHsaAiCounterBlockIdTd3,
  kHsaAiCounterBlockIdTd4,
  kHsaAiCounterBlockIdTd5,
  kHsaAiCounterBlockIdTd6,
  kHsaAiCounterBlockIdTd7,
  kHsaAiCounterBlockIdTd8,
  kHsaAiCounterBlockIdTd9,
  kHsaAiCounterBlockIdTd10,
  kHsaAiCounterBlockIdTd11,
  kHsaAiCounterBlockIdTd12,
  kHsaAiCounterBlockIdTd13,
  kHsaAiCounterBlockIdTd14,
  kHsaAiCounterBlockIdTd15,

  kHsaAiCounterBlockIdTcp0,
  kHsaAiCounterBlockIdTcp1,
  kHsaAiCounterBlockIdTcp2,
  kHsaAiCounterBlockIdTcp3,
  kHsaAiCounterBlockIdTcp4,
  kHsaAiCounterBlockIdTcp5,
  kHsaAiCounterBlockIdTcp6,
  kHsaAiCounterBlockIdTcp7,
  kHsaAiCounterBlockIdTcp8,
  kHsaAiCounterBlockIdTcp9,
  kHsaAiCounterBlockIdTcp10,
  kHsaAiCounterBlockIdTcp11,
  kHsaAiCounterBlockIdTcp12,
  kHsaAiCounterBlockIdTcp13,
  kHsaAiCounterBlockIdTcp14,
  kHsaAiCounterBlockIdTcp15,

  kHsaAiCounterBlockIdGds,
  kHsaAiCounterBlockIdVgt,
  kHsaAiCounterBlockIdIa,
  kHsaAiCounterBlockIdMc,

  // Temp commented out for Vega10
  // kHsaAiCounterBlockIdSrbm,

  // NOTE(review): the extern list below says "No Tcs Counter block on AI",
  // yet this id (and AI_COUNTER_NUM_PER_TCS) is still defined - confirm.
  kHsaAiCounterBlockIdTcs,
  kHsaAiCounterBlockIdWd,

  // Temp commented out for Vega10
  // kHsaAiCounterBlockIdCpg,

  // Temp commented for Vega10
  // NOTE(review): this enumerator is still active; what is commented out is
  // the "AI_CPC" row of AiPmuHwBlocks in ai_blockinfo.cpp.
  kHsaAiCounterBlockIdCpc,

  // Counters retrieved by KFD
  kHsaAiCounterBlockIdIommuV2,
  kHsaAiCounterBlockIdKernelDriver,

  kHsaAiCounterBlockIdCpPipeStats,
  kHsaAiCounterBlockIdHwInfo,
  // Aliases for the first and last id. BlocksLast is also the id stored in the
  // empty-named terminator row that ends AiPmuHwBlocks.
  kHsaAiCounterBlockIdBlocksFirst = kHsaAiCounterBlockIdCb0,
  kHsaAiCounterBlockIdBlocksLast = kHsaAiCounterBlockIdHwInfo
} HsaAiCounterBlockId;

// Tables defined in ai_blockinfo.cpp.
// AiPmuHwBlocks is terminated by a row whose block name is empty; the
// register tables carry no terminator, so their element counts must be known
// by the caller (see the AI_COUNTER_NUM_PER_* macros above).
extern GpuBlockInfo AiPmuHwBlocks[];
extern GpuCounterRegInfo AiSqCounterRegAddr[];
extern GpuCounterRegInfo AiCbCounterRegAddr[];
extern GpuCounterRegInfo AiDrmdmaCounterRegAddr[];
extern GpuCounterRegInfo AiIhCounterRegAddr[];
extern GpuCounterRegInfo AiCpfCounterRegAddr[];
extern GpuCounterRegInfo AiCpgCounterRegAddr[];
extern GpuCounterRegInfo AiCpcCounterRegAddr[];
extern GpuCounterRegInfo AiDrmCounterRegAddr[];
extern GpuCounterRegInfo AiGrbmCounterRegAddr[];
extern GpuCounterRegInfo AiGrbmSeCounterRegAddr[];
extern GpuCounterRegInfo AiPaSuCounterRegAddr[];
extern GpuCounterRegInfo AiPaScCounterRegAddr[];
extern GpuCounterRegInfo AiSpiCounterRegAddr[];
extern GpuCounterRegInfo AiTcaCounterRegAddr[];
extern GpuCounterRegInfo AiTccCounterRegAddr[];
extern GpuCounterRegInfo AiTcpCounterRegAddr[];
extern GpuCounterRegInfo AiDbCounterRegAddr[];
extern GpuCounterRegInfo AiRlcCounterRegAddr[];
extern GpuCounterRegInfo AiScCounterRegAddr[];
extern GpuCounterRegInfo AiSxCounterRegAddr[];
extern GpuCounterRegInfo AiTaCounterRegAddr[];
extern GpuCounterRegInfo AiTdCounterRegAddr[];
extern GpuCounterRegInfo AiGdsCounterRegAddr[];
extern GpuCounterRegInfo AiVgtCounterRegAddr[];
extern GpuCounterRegInfo AiIaCounterRegAddr[];
extern GpuCounterRegInfo AiMcCounterRegAddr[];
extern GpuCounterRegInfo AiSrbmCounterRegAddr[];

// No Tcs Counter block on AI
// extern GpuCounterRegInfo AiTcsCounterRegAddr[];
extern GpuCounterRegInfo AiWdCounterRegAddr[];
// NOTE(review): the next two declarations repeat the AiCpg/AiCpc externs
// already made above; harmless, but redundant.
extern GpuCounterRegInfo AiCpgCounterRegAddr[];
extern GpuCounterRegInfo AiCpcCounterRegAddr[];

// Private counter block identifiers defined at the end of ai_blockinfo.cpp.
extern GpuPrivCounterBlockId AiBlockIdSq;
extern GpuPrivCounterBlockId AiBlockIdMc;
extern
GpuPrivCounterBlockId AiBlockIdIommuV2; +extern GpuPrivCounterBlockId AiBlockIdKernelDriver; +} + +#endif // _AI_BLOCKINFO_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/ai_pmu.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_pmu.cpp new file mode 100644 index 0000000000..5fa851a53e --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_pmu.cpp @@ -0,0 +1,1601 @@ +#include "os.h" + +#include "gfxip/gfx9/gfx9_registers.h" +#include "gfxip/gfx9/gfx9_typedef.h" +#include "gfxip/gfx9/gfx9_offset.h" +#include "cmdwriter.h" + +#include "ai_pmu.h" +#include "gpu_countergroup.h" +#include "ai_blockinfo.h" +#include "gpu_enum.h" + +#include +#include + +#include + +using namespace std; +using namespace pm4_profile; +using namespace pm4_profile::gfx9; + +// A flag to indicate the current packet is for copy register value +#define MAX_REG_NUM (100) +#define COPY_DATA_FLAG (0xFFFFFFFF) +#define COPY_DATA_SEL_REG (0x00) ///< Mem-mapped register +#define COPY_DATA_SEL_COUNT_1DW (0x00) ///< Copy 1 word (32 bits) +#define COPY_DATA_SEL_COUNT_2DW (0x01) ///< Copy 2 words (64 bits) + +namespace pm4_profile { + +static char errorString[][64] = {{"No error"}, + {"unknow countergroup id"}, + {"no countergroup id"}, + {"invalid operation"}, + {"counter is not available"}, + {"countegroup error state"}, + {"countegroup is not completed"}}; + +AiPmu::AiPmu() { + // Initialize the number of shader engines + num_se_ = 4; + Init(); +} + +void AiPmu::Init() { + error_code_ = 0; + info_set_ = new InfoSet(); + parameter_set_ = new ParameterSet(); + + // Initialize pointer to stored counter block list to NULL + blk_list_ = NULL; + initCounterBlock(); + + // Initialize the value to use in resetting GRBM + regGRBM_GFX_INDEX grbm_gfx_index; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reset_grbm_ = grbm_gfx_index.u32All; + + 
// Update state of Perf Mgmt Unit + profiler_state_ = ROCR_PMU_STATE_IDLE; +} + +AiPmu::~AiPmu() { + // Remove all counter blocks + RemoveCounterBlocks(); + blk_map_.clear(); + delete parameter_set_; + delete info_set_; + + if (blk_list_) { + free(blk_list_); + blk_list_ = NULL; + } +} + +// Initializes the handle of buffer used to collect PMC data +// @param cmdBufSz Size in terms of bytes +bool AiPmu::setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz) { + // Update counter data buffer addr and size params + pmcDataSz_ = pmcBuffSz; + pmcData_ = (uint32_t*)pmcBuffer; + return true; +} + +// +// The logic is quite simple and is as follows +// +// Issue CsPartialFlush +// Issue Cmd to stop Perf Counters +// Issue Cmd to Disable & Reset Perf Counters +// +void AiPmu::ResetCounterBlocks(pm4_profile::DefaultCmdBuf* cmdBuff, + pm4_profile::CommandWriter* cmdWriter) { + // Waits until all outstanding commands have completed + // by issing CS Partial Flush command + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Program CP Perfmon Cntrl Rgstr to disable and reset counters + regCP_PERFMON_CNTL cp_perfmon_cntl; + cp_perfmon_cntl.u32All = 0; + cp_perfmon_cntl.bits.PERFMON_STATE = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL, cp_perfmon_cntl.u32All); +} + +bool AiPmu::begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter, + bool reset_counter) { + if (profiler_state_ != ROCR_PMU_STATE_IDLE) { + error_code_ = kHsaPmuErrorCodeErrorState; + return false; + } + + // Reset Grbm to its default state - broadcast + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, reset_grbm_); + + // Disable RLC Perfmon Clock Gating + // On Vega this is needed to collect Perf Cntrs + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 1); + + // Collect all the program counter blocks + uint32_t reg_val[MAX_REG_NUM], reg_addr[MAX_REG_NUM], reg_num; + + // Retrieve the list of blocks whose perf counters have 
been enabled + uint32_t blk_cnt = 0; + CounterBlock** blk_list = getAllCounterBlocks(blk_cnt); + + // Iterate through the list of blocks to generate Pm4 commands to + // program corresponding perf counters of each block + for (uint32_t blkIdx = 0; blkIdx < blk_cnt; blkIdx++) { + // Retrieve the list of perf counters and their count + uint32_t counter_num; + Counter** cntr_list; + cntr_list = blk_list[blkIdx]->getEnabledCounters(counter_num); + if (counter_num == 0) { + continue; + } + + // Retrieve the block Id of perf counters + void* p_data; + uint32_t block_id; + uint32_t data_size; + blk_list[blkIdx]->getInfo(GPU_BLK_INFO_ID, data_size, (void**)&p_data); + block_id = *(static_cast(p_data)); + + // Iterate through each enabled perf counter and building + // corresponding Pm4 commands to program the various control + // registers involved + for (uint32_t cntrIdx = 0; cntrIdx < counter_num; cntrIdx++) { + // Build the list of control registers to program which + // varies per perf counter block + reg_num = BuildCounterSelRegister(cntrIdx, reg_addr, reg_val, block_id, cntr_list[cntrIdx]); + + // Build the list of Pm4 commands that support control + // register programming + for (uint32_t regIdx = 0; regIdx < reg_num; regIdx++) { + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, reg_addr[regIdx], reg_val[regIdx]); + } + } + } + + // Reset Grbm to its default state - broadcast + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, reset_grbm_); + + // Program Compute_Perfcount_Enable register to support perf counting + regCOMPUTE_PERFCOUNT_ENABLE cp_perfcount_enable; + cp_perfcount_enable.u32All = 0; + cp_perfcount_enable.bits.PERFCOUNT_ENABLE = 1; + cmdWriter->BuildWriteShRegPacket(cmdBuff, mmCOMPUTE_PERFCOUNT_ENABLE, cp_perfcount_enable.u32All); + + // Reset the counter list + regCP_PERFMON_CNTL cp_perfmon_cntl; + cp_perfmon_cntl.u32All = 0; + cp_perfmon_cntl.bits.PERFMON_STATE = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL, 
cp_perfmon_cntl.u32All); + + // Start the counter list + cp_perfmon_cntl.bits.PERFMON_STATE = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL, cp_perfmon_cntl.u32All); + + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + profiler_state_ = ROCR_PMU_STATE_START; + return true; +} + +bool AiPmu::end(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter) { + if (profiler_state_ != ROCR_PMU_STATE_START) { + error_code_ = kHsaPmuErrorCodeErrorState; + return false; + } + + void* p_data; + regGRBM_GFX_INDEX grbm_gfx_index; + + // Issue CsPartialFlush command to wait for dispatch to complete + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Build PM4 packet for starting counters + regCP_PERFMON_CNTL cp_perfmon_cntl; + cp_perfmon_cntl.u32All = 0; + cp_perfmon_cntl.bits.PERFMON_STATE = 2; + cp_perfmon_cntl.bits.PERFMON_SAMPLE_ENABLE = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL, cp_perfmon_cntl.u32All); + + // Collect all the program counter blocks + uint32_t i, j, k, reg_addr[MAX_REG_NUM], reg_val[MAX_REG_NUM], reg_num, data_size; + + uint32_t blk_cnt = 0; + CounterBlock** blk_list = getAllCounterBlocks(blk_cnt); + + uint32_t counter_num; + Counter** cntr_list; + uint32_t total_counter_num = 0; + for (i = 0; i < blk_cnt; i++) { + // Retrieve all enabled cntr_list in each counter block + cntr_list = blk_list[i]->getEnabledCounters(counter_num); + if (!blk_list[i]->getInfo(GPU_BLK_INFO_CONTROL_METHOD, data_size, &p_data)) { + return false; + } + + CntlMethod method; + method = static_cast(*(static_cast(p_data))); + + // Need to read counter values from each shader engine + if (method == CntlMethodBySe || method == CntlMethodBySeAndInstance) { + counter_num = counter_num * num_se_; + } + + total_counter_num += counter_num; + } + + size_t cntrSize = sizeof(int32_t) * 2 * total_counter_num; + if (cntrSize > pmcDataSz_) { + return false; + } + + // Reset Grbm to its default state - broadcast + 
cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, reset_grbm_); + + // Create PM4 packet to read counter values + total_counter_num = 0; + for (i = 0; i < blk_cnt; i++) { + // Retrieve all enabled cntr_list in each counter block + cntr_list = blk_list[i]->getEnabledCounters(counter_num); + if (counter_num > 0) { + uint32_t block_id; + uint32_t data_size; + if (!blk_list[i]->getInfo(GPU_BLK_INFO_ID, data_size, (void**)&p_data)) { + return false; + } + block_id = *(static_cast(p_data)); + + for (j = 0; j < counter_num; j++) { + // retrieve the registers to be set + reg_num = BuildCounterReadRegisters(j, block_id, reg_addr, reg_val); + for (k = 0; k < reg_num; k++) { + if (reg_val[k] == COPY_DATA_FLAG) { + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_REG, reg_addr[k], 0, + pmcData_ + total_counter_num, COPY_DATA_SEL_COUNT_1DW, + false); + total_counter_num++; + } else { + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, reg_addr[k], reg_val[k]); + } + } + } + } + } + + // Reset Grbm to its default state - broadcast + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, reset_grbm_); + + // Enable RLC Perfmon Clock Gating. 
On Vega this is + // was disabled during Perf Cntrs collection session + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 0); + + profiler_state_ = ROCR_PMU_STATE_STOP; + return true; +} + +bool AiPmu::initCounterBlock() { + for (int i = 0; !(std::string(AiPmuHwBlocks[i].blockName).empty()); i++) { + // Override the value of max number of shader engines + AiPmuHwBlocks[i].maxShaderEngineCount = num_se_; + + // Instantiate a perf counter block and its properties + GpuCounterBlock* cntr_blk = new GpuCounterBlock(); + if (!cntr_blk) { + blk_map_.clear(); + return false; + } + + cntr_blk->setInfo(GPU_BLK_INFO_BLOCK_NAME, GPU_BLOCK_NAME_SIZE, + (void*)AiPmuHwBlocks[i].blockName); + + cntr_blk->setInfo(GPU_BLK_INFO_ID, sizeof(uint32_t), (void*)&AiPmuHwBlocks[i].counterGroupId); + + cntr_blk->setInfo(GPU_BLK_INFO_MAX_SHADER_ENGINE_COUNT, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].maxShaderEngineCount)); + + cntr_blk->setInfo(GPU_BLK_INFO_MAX_SHADER_ARRAY_COUNT, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].maxShaderArrayCount)); + + cntr_blk->setInfo(GPU_BLK_INFO_MAX_INSTANCE_COUNT, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].maxInstanceCount)); + + cntr_blk->setInfo(GPU_BLK_INFO_CONTROL_METHOD, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].method)); + + cntr_blk->setInfo(GPU_BLK_INFO_MAX_EVENT_ID, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].maxEventId)); + + cntr_blk->setInfo(GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].maxSimultaneousCounters)); + + cntr_blk->setInfo(GPU_BLK_INFO_MAX_STREAMING_COUNTERS, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].maxStreamingCounters)); + + cntr_blk->setInfo(GPU_BLK_INFO_SHARED_HW_COUNTERS, sizeof(uint32_t), + (void*)&(AiPmuHwBlocks[i].sharedHWCounters)); + + cntr_blk->setInfo(GPU_BLK_INFO_HAS_FILTERS, sizeof(bool), + (void*)&(AiPmuHwBlocks[i].hasFilters)); + + // TODO: Need to fill in the Threadtrace stuff here + HsaAiCounterBlockId blk_id; + blk_id = 
static_cast(AiPmuHwBlocks[i].counterGroupId); + blk_map_.insert(AiCounterBlockMap::value_type(blk_id, cntr_blk)); + } + + // Initiate the PMU state and error code + error_code_ = 0; + profiler_state_ = ROCR_PMU_STATE_IDLE; + return true; +} + +int AiPmu::getLastError() { return error_code_; } + +std::string AiPmu::getErrorString(int error) { + if ((error >= 0) && (error < kHsaPmuErrorCodeMax)) { + std::string err_string(errorString[error]); + return err_string; + } + return string("Error input code!"); +} + +bool AiPmu::getParameter(uint32_t param, uint32_t& retSize, void** ppData) { + return parameter_set_->getParameter(param, retSize, ppData); +} + +bool AiPmu::setParameter(uint32_t param, uint32_t paramSize, const void* p_data) { + return parameter_set_->setParameter(param, paramSize, p_data); +} + +bool AiPmu::getInfo(uint32_t info, uint32_t& retSize, void** ppData) { + return info_set_->getInfo(info, retSize, ppData); +} + +pm4_profile::CounterBlock* AiPmu::getCounterBlockById(uint32_t id) { + HsaAiCounterBlockId block_id = static_cast(id); + + return blk_map_[block_id]; +} + +pm4_profile::CounterBlock** AiPmu::getAllCounterBlocks(uint32_t& num_blocks) { + size_t block_size = blk_map_.size(); + + if (block_size <= 0) { + error_code_ = kHsaPmuErrorCodeNoCounterBlock; + return NULL; + } + + if (blk_list_) { + free(blk_list_); + blk_list_ = NULL; + } + + blk_list_size_ = (uint32_t)(sizeof(GpuCounterBlock*) * block_size); + blk_list_size_ = ((blk_list_size_ % 4096) != 0) ? 
4096 : blk_list_size_; + blk_list_ = (CounterBlock**)malloc(blk_list_size_); + if (blk_list_ == NULL) { + return NULL; + } + + AiCounterBlockMap::iterator it; + uint32_t blk_cnt = 0; + for (it = blk_map_.begin(); it != blk_map_.end(); it++) { + blk_list_[blk_cnt] = it->second; + blk_cnt++; + } + + num_blocks = blk_cnt; + return blk_list_; +} + +uint32_t AiPmu::ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaAiCounterBlockIdTcp0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regTCP_PERFCOUNTER0_SELECT tcp_perf_counter_select; + tcp_perf_counter_select.u32All = 0; + tcp_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = tcp_perf_counter_select.u32All; + regAddr[regIdx] = AiTcpCounterRegAddr[tcpRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t AiPmu::ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaAiCounterBlockIdTd0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regTD_PERFCOUNTER0_SELECT td_perf_counter_select; + td_perf_counter_select.u32All = 0; + td_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[regIdx] = td_perf_counter_select.u32All; + regAddr[regIdx] = AiTdCounterRegAddr[tdRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t AiPmu::ProgramTccCntrs(uint32_t tccRegIdx, 
uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaAiCounterBlockIdTcc0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regTCC_PERFCOUNTER0_SELECT tcc_perf_counter_select; + tcc_perf_counter_select.u32All = 0; + tcc_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = tcc_perf_counter_select.u32All; + regAddr[regIdx] = AiTccCounterRegAddr[tccRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t AiPmu::ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaAiCounterBlockIdTca0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regTCA_PERFCOUNTER0_SELECT tca_perf_counter_select; + tca_perf_counter_select.u32All = 0; + tca_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = tca_perf_counter_select.u32All; + regAddr[regIdx] = AiTcaCounterRegAddr[tcaRegIdx].counterSelRegAddr; + regIdx++; + return regIdx; +} + +uint32_t AiPmu::ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaAiCounterBlockIdTa0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + 
regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regTA_PERFCOUNTER0_SELECT ta_perf_counter_select; + ta_perf_counter_select.u32All = 0; + ta_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = ta_perf_counter_select.u32All; + regAddr[regIdx] = AiTaCounterRegAddr[taRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t AiPmu::ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + uint32_t regIdx = 0; + + // Program the SQ Counter Select Register + regSQ_PERFCOUNTER0_SELECT sq_cntr_sel; + sq_cntr_sel.u32All = 0; + sq_cntr_sel.bits.SIMD_MASK = 0xF; + sq_cntr_sel.bits.SQC_BANK_MASK = 0xF; + sq_cntr_sel.bits.SQC_CLIENT_MASK = 0xF; + sq_cntr_sel.bits.PERF_SEL = blkCntrIdx; + regVal[regIdx] = sq_cntr_sel.u32All; + regAddr[regIdx] = AiSqCounterRegAddr[sqRegIdx].counterSelRegAddr; + regIdx++; + + // Program the SQ Counter Mask Register + regSQ_PERFCOUNTER_MASK sq_cntr_mask; + sq_cntr_mask.u32All = 0; + sq_cntr_mask.bits.SH0_MASK = 0xFFFF; + sq_cntr_mask.bits.SH1_MASK = 0xFFFF; + regVal[regIdx] = sq_cntr_mask.u32All; + regAddr[regIdx] = mmSQ_PERFCOUNTER_MASK; + regIdx++; + + // Initialize the register content + // Program the SQ Counter Control Register + regSQ_PERFCOUNTER_CTRL sq_cntr_ctrl; + sq_cntr_ctrl.u32All = 0; + if (blkId == kHsaAiCounterBlockIdSq) { + sq_cntr_ctrl.bits.PS_EN = 0x1; + sq_cntr_ctrl.bits.VS_EN = 0x1; + sq_cntr_ctrl.bits.GS_EN = 0x1; + sq_cntr_ctrl.bits.HS_EN = 0x1; + sq_cntr_ctrl.bits.CS_EN = 0x1; + } else if (blkId == kHsaAiCounterBlockIdSqGs) { + sq_cntr_ctrl.bits.GS_EN = 0x1; + } else if (blkId == kHsaAiCounterBlockIdSqVs) { + sq_cntr_ctrl.bits.VS_EN = 0x1; + } else if (blkId == kHsaAiCounterBlockIdSqPs) { + sq_cntr_ctrl.bits.PS_EN = 0x1; + } else if (blkId == kHsaAiCounterBlockIdSqHs) { + sq_cntr_ctrl.bits.HS_EN = 0x1; + } else if (blkId == kHsaAiCounterBlockIdSqCs) { + sq_cntr_ctrl.bits.CS_EN = 0x1; + } + + regVal[regIdx] = sq_cntr_ctrl.u32All; + 
regAddr[regIdx] = AiSqCounterRegAddr[sqRegIdx].counterCntlRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t AiPmu::BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, pm4_profile::Counter* blkCntr) { + void* p_data; + uint32_t data_size; + uint32_t blkCntrIdx; + uint32_t instance_index; + regGRBM_GFX_INDEX grbm_gfx_index; + + // Get the blkCntr selection value + if (!blkCntr->getParameter(HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX, data_size, + (void**)&p_data)) { + return 0; + } + blkCntrIdx = *(static_cast(p_data)); + + uint32_t regIdx = 0; + switch (blkId) { + // Program counters belonging to SQ block + case kHsaAiCounterBlockIdSq: + case kHsaAiCounterBlockIdSqGs: + case kHsaAiCounterBlockIdSqVs: + case kHsaAiCounterBlockIdSqPs: + case kHsaAiCounterBlockIdSqHs: + case kHsaAiCounterBlockIdSqCs: + return ProgramSQCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaAiCounterBlockIdCb0: + case kHsaAiCounterBlockIdCb1: + case kHsaAiCounterBlockIdCb2: + case kHsaAiCounterBlockIdCb3: { + regIdx = 0; + instance_index = blkId - kHsaAiCounterBlockIdCb0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER0_LO; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER0_HI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER1_LO; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER1_HI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER2_LO; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER2_HI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER3_LO; + regIdx++; + + regVal[regIdx] = 0; + 
regAddr[regIdx] = mmCB_PERFCOUNTER3_HI; + regIdx++; + + regCB_PERFCOUNTER0_SELECT cb_perf_counter_select; + cb_perf_counter_select.u32All = 0; + cb_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = cb_perf_counter_select.u32All; + regAddr[regIdx] = AiCbCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx++; + + break; + } + + // Temp commented for Vega10 + /* + case kHsaAiCounterBlockIdCpf: { + regCPF_PERFCOUNTER0_SELECT cpf_perf_counter_select; + cpf_perf_counter_select.u32All = 0; + cpf_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[0] = cpf_perf_counter_select.u32All; + regAddr[0] = AiCpfCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + */ + + case kHsaAiCounterBlockIdDb0: + case kHsaAiCounterBlockIdDb1: + case kHsaAiCounterBlockIdDb2: + case kHsaAiCounterBlockIdDb3: { + instance_index = blkId - kHsaAiCounterBlockIdDb0; + regIdx = 0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER0_LO; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER0_HI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER1_LO; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER1_HI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER2_LO; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER2_HI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER3_LO; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER3_HI; + regIdx++; + + regDB_PERFCOUNTER0_SELECT db_perf_counter_select; + db_perf_counter_select.u32All = 0; + db_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[regIdx] = db_perf_counter_select.u32All; + 
regAddr[regIdx] = AiDbCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx++; + break; + } + + case kHsaAiCounterBlockIdGrbm: { + regGRBM_PERFCOUNTER0_SELECT grbm_perf_counter_select; + grbm_perf_counter_select.u32All = 0; + grbm_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = grbm_perf_counter_select.u32All; + regAddr[0] = AiGrbmCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdGrbmSe: { + regGRBM_SE0_PERFCOUNTER_SELECT grbm_se0_perf_counter_select; + grbm_se0_perf_counter_select.u32All = 0; + grbm_se0_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = grbm_se0_perf_counter_select.u32All; + regAddr[0] = AiGrbmSeCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdPaSu: { + regPA_SU_PERFCOUNTER0_SELECT pa_su_perf_counter_select; + pa_su_perf_counter_select.u32All = 0; + pa_su_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = pa_su_perf_counter_select.u32All; + regAddr[0] = AiPaSuCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdPaSc: { + regPA_SC_PERFCOUNTER0_SELECT pa_sc_perf_counter_select; + pa_sc_perf_counter_select.u32All = 0; + pa_sc_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = pa_sc_perf_counter_select.u32All; + regAddr[0] = AiPaScCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdSpi: { + regSPI_PERFCOUNTER0_SELECT spi_perf_counter_select; + spi_perf_counter_select.u32All = 0; + spi_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = spi_perf_counter_select.u32All; + regAddr[0] = AiSpiCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdSx: { + regIdx = 0; + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER0_LO; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER0_HI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = 
mmSX_PERFCOUNTER1_LO; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER1_HI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER2_LO; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER2_HI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER3_LO; + regIdx++; + + regSX_PERFCOUNTER0_SELECT sx_perf_counter_select; + sx_perf_counter_select.u32All = 0; + sx_perf_counter_select.bits.PERFCOUNTER_SELECT = blkCntrIdx; + regVal[regIdx] = sx_perf_counter_select.u32All; + regAddr[regIdx] = AiSxCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx++; + break; + } + + case kHsaAiCounterBlockIdTa0: + case kHsaAiCounterBlockIdTa1: + case kHsaAiCounterBlockIdTa2: + case kHsaAiCounterBlockIdTa3: + case kHsaAiCounterBlockIdTa4: + case kHsaAiCounterBlockIdTa5: + case kHsaAiCounterBlockIdTa6: + case kHsaAiCounterBlockIdTa7: + case kHsaAiCounterBlockIdTa8: + case kHsaAiCounterBlockIdTa9: + case kHsaAiCounterBlockIdTa10: + case kHsaAiCounterBlockIdTa11: + case kHsaAiCounterBlockIdTa12: + case kHsaAiCounterBlockIdTa13: + case kHsaAiCounterBlockIdTa14: + case kHsaAiCounterBlockIdTa15: + return ProgramTaCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaAiCounterBlockIdTca0: + case kHsaAiCounterBlockIdTca1: + return ProgramTcaCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaAiCounterBlockIdTcc0: + case kHsaAiCounterBlockIdTcc1: + case kHsaAiCounterBlockIdTcc2: + case kHsaAiCounterBlockIdTcc3: + case kHsaAiCounterBlockIdTcc4: + case kHsaAiCounterBlockIdTcc5: + case kHsaAiCounterBlockIdTcc6: + case kHsaAiCounterBlockIdTcc7: + case kHsaAiCounterBlockIdTcc8: + case kHsaAiCounterBlockIdTcc9: + case kHsaAiCounterBlockIdTcc10: + case kHsaAiCounterBlockIdTcc11: + case kHsaAiCounterBlockIdTcc12: + case kHsaAiCounterBlockIdTcc13: + case kHsaAiCounterBlockIdTcc14: + case kHsaAiCounterBlockIdTcc15: + return ProgramTccCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case 
kHsaAiCounterBlockIdTd0: + case kHsaAiCounterBlockIdTd1: + case kHsaAiCounterBlockIdTd2: + case kHsaAiCounterBlockIdTd3: + case kHsaAiCounterBlockIdTd4: + case kHsaAiCounterBlockIdTd5: + case kHsaAiCounterBlockIdTd6: + case kHsaAiCounterBlockIdTd7: + case kHsaAiCounterBlockIdTd8: + case kHsaAiCounterBlockIdTd9: + case kHsaAiCounterBlockIdTd10: + case kHsaAiCounterBlockIdTd11: + case kHsaAiCounterBlockIdTd12: + case kHsaAiCounterBlockIdTd13: + case kHsaAiCounterBlockIdTd14: + case kHsaAiCounterBlockIdTd15: + return ProgramTdCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaAiCounterBlockIdTcp0: + case kHsaAiCounterBlockIdTcp1: + case kHsaAiCounterBlockIdTcp2: + case kHsaAiCounterBlockIdTcp3: + case kHsaAiCounterBlockIdTcp4: + case kHsaAiCounterBlockIdTcp5: + case kHsaAiCounterBlockIdTcp6: + case kHsaAiCounterBlockIdTcp7: + case kHsaAiCounterBlockIdTcp8: + case kHsaAiCounterBlockIdTcp9: + case kHsaAiCounterBlockIdTcp10: + case kHsaAiCounterBlockIdTcp11: + case kHsaAiCounterBlockIdTcp12: + case kHsaAiCounterBlockIdTcp13: + case kHsaAiCounterBlockIdTcp14: + case kHsaAiCounterBlockIdTcp15: + return ProgramTcpCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaAiCounterBlockIdGds: { + regGDS_PERFCOUNTER0_SELECT gds_perf_counter_select; + gds_perf_counter_select.u32All = 0; + gds_perf_counter_select.bits.PERFCOUNTER_SELECT = blkCntrIdx; + regVal[0] = gds_perf_counter_select.u32All; + regAddr[0] = AiGdsCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdVgt: { + regVGT_PERFCOUNTER0_SELECT vgt_perf_counter_select; + vgt_perf_counter_select.u32All = 0; + vgt_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = vgt_perf_counter_select.u32All; + regAddr[0] = AiVgtCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaAiCounterBlockIdIa: { + regIA_PERFCOUNTER0_SELECT ia_perf_counter_select; + ia_perf_counter_select.u32All = 0; + ia_perf_counter_select.bits.PERF_SEL = 
blkCntrIdx; + regVal[0] = ia_perf_counter_select.u32All; + regAddr[0] = AiIaCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + /* + case kHsaAiCounterBlockIdMc: { + // To be investigated later + //regMC_SEQ_PERF_SEQ_CTL mc_perfcounter_select; + //mc_perfcounter_select.u32All = 0; + //mc_perfcounter_select.bits.PERF_SEL = blkCntrIdx; + //regVal[0] = mc_perfcounter_select.u32All; + //regAddr[0] = AiMcCounterRegAddr[cntrIdx].counterSelRegAddr; + //regIdx = 1; + } + break; + */ + + // Temp Commented out for Vega10 + /* + case kHsaAiCounterBlockIdSrbm: { + regSRBM_PERFCOUNTER0_SELECT srbm_perf_counter_select; + srbm_perf_counter_select.u32All = 0; + srbm_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = srbm_perf_counter_select.u32All; + regAddr[0] = AiSrbmCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + */ + + /* + case kHsaAiCounterBlockIdTcs: { + regTCS_PERFCOUNTER0_SELECT__CI tcs_perf_counter_select; + tcs_perf_counter_select.u32All = 0; + tcs_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = tcs_perf_counter_select.u32All; + regAddr[0] = AiTcsCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + */ + + case kHsaAiCounterBlockIdWd: { + regWD_PERFCOUNTER0_SELECT wd_perf_counter_select; + wd_perf_counter_select.u32All = 0; + wd_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = wd_perf_counter_select.u32All; + regAddr[0] = AiWdCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + // Temp commented for Vega10 + /* + case kHsaAiCounterBlockIdCpg: { + regCPG_PERFCOUNTER0_SELECT cpg_perf_counter_select; + cpg_perf_counter_select.u32All = 0; + cpg_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = cpg_perf_counter_select.u32All; + regAddr[0] = AiCpgCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + */ + + // Temp commented for Vega10 + /* + case kHsaAiCounterBlockIdCpc: { + regCPC_PERFCOUNTER0_SELECT cpc_perf_counter_select; 
+ cpc_perf_counter_select.u32All = 0; + cpc_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = cpc_perf_counter_select.u32All; + regAddr[0] = AiCpcCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + */ + + /* + case kHsaAiCounterBlockIdMc: { + AddPriviledgedCountersToList(AiBlockIdMc, blkCntrIdx); + //Num of regs equals to 0 means it is processed by KFD + regIdx = 0; + break; + } + + case kHsaAiCounterBlockIdIommuV2: { + AddPriviledgedCountersToList(AiBlockIdIommuV2, blkCntrIdx); + //Num of regs equals to 0 means it is processed by KFD + regIdx = 0; + break; + } + + case kHsaAiCounterBlockIdKernelDriver: { + AddPriviledgedCountersToList(AiBlockIdKernelDriver, blkCntrIdx); + //Num of regs equals to 0 means it is processed by KFD + regIdx = 0; + break; + } + */ + + default: { + regIdx = 0; + break; + } + } + + return regIdx; +} + +uint32_t AiPmu::BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr, + uint32_t* reg_val) { + uint32_t ii; + uint32_t reg_num = 0; + uint32_t instance_index; + regGRBM_GFX_INDEX grbm_gfx_index; + switch (block_id) { + case kHsaAiCounterBlockIdSq: + case kHsaAiCounterBlockIdSqGs: + case kHsaAiCounterBlockIdSqVs: + case kHsaAiCounterBlockIdSqPs: + case kHsaAiCounterBlockIdSqHs: + case kHsaAiCounterBlockIdSqCs: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiSqCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiSqCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdCb0: + case kHsaAiCounterBlockIdCb1: + case 
kHsaAiCounterBlockIdCb2: + case kHsaAiCounterBlockIdCb3: { + instance_index = block_id - kHsaAiCounterBlockIdCb0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiCbCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiCbCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + // Temp commented for Vega10 + /* + case kHsaAiCounterBlockIdCpf: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiCpfCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiCpfCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + + case kHsaAiCounterBlockIdDb0: + case kHsaAiCounterBlockIdDb1: + case kHsaAiCounterBlockIdDb2: + case kHsaAiCounterBlockIdDb3: { + instance_index = block_id - kHsaAiCounterBlockIdDb0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiDbCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiDbCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdGrbm: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + 
reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiGrbmCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiGrbmCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaAiCounterBlockIdGrbmSe: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiGrbmSeCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiGrbmSeCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaAiCounterBlockIdPaSu: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiPaSuCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiPaSuCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdPaSc: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiPaScCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiPaScCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case 
kHsaAiCounterBlockIdSpi: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiSpiCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiSpiCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdSx: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiSxCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiSxCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdTa0: + case kHsaAiCounterBlockIdTa1: + case kHsaAiCounterBlockIdTa2: + case kHsaAiCounterBlockIdTa3: + case kHsaAiCounterBlockIdTa4: + case kHsaAiCounterBlockIdTa5: + case kHsaAiCounterBlockIdTa6: + case kHsaAiCounterBlockIdTa7: + case kHsaAiCounterBlockIdTa8: + case kHsaAiCounterBlockIdTa9: + case kHsaAiCounterBlockIdTa10: + case kHsaAiCounterBlockIdTa11: + case kHsaAiCounterBlockIdTa12: + case kHsaAiCounterBlockIdTa13: + case kHsaAiCounterBlockIdTa14: + case kHsaAiCounterBlockIdTa15: { + instance_index = block_id - kHsaAiCounterBlockIdTa0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + 
grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiTaCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiTaCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdTca0: + case kHsaAiCounterBlockIdTca1: { + instance_index = block_id - kHsaAiCounterBlockIdTca0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiTcaCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiTcaCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaAiCounterBlockIdTcc0: + case kHsaAiCounterBlockIdTcc1: + case kHsaAiCounterBlockIdTcc2: + case kHsaAiCounterBlockIdTcc3: + case kHsaAiCounterBlockIdTcc4: + case kHsaAiCounterBlockIdTcc5: + case kHsaAiCounterBlockIdTcc6: + case kHsaAiCounterBlockIdTcc7: + case kHsaAiCounterBlockIdTcc8: + case kHsaAiCounterBlockIdTcc9: + case kHsaAiCounterBlockIdTcc10: + case kHsaAiCounterBlockIdTcc11: + case kHsaAiCounterBlockIdTcc12: + case kHsaAiCounterBlockIdTcc13: + case kHsaAiCounterBlockIdTcc14: + case kHsaAiCounterBlockIdTcc15: { + instance_index = block_id - kHsaAiCounterBlockIdTcc0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + 
reg_addr[reg_num] = AiTccCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiTccCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaAiCounterBlockIdTd0: + case kHsaAiCounterBlockIdTd1: + case kHsaAiCounterBlockIdTd2: + case kHsaAiCounterBlockIdTd3: + case kHsaAiCounterBlockIdTd4: + case kHsaAiCounterBlockIdTd5: + case kHsaAiCounterBlockIdTd6: + case kHsaAiCounterBlockIdTd7: + case kHsaAiCounterBlockIdTd8: + case kHsaAiCounterBlockIdTd9: + case kHsaAiCounterBlockIdTd10: + case kHsaAiCounterBlockIdTd11: + case kHsaAiCounterBlockIdTd12: + case kHsaAiCounterBlockIdTd13: + case kHsaAiCounterBlockIdTd14: + case kHsaAiCounterBlockIdTd15: { + instance_index = block_id - kHsaAiCounterBlockIdTd0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiTdCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiTdCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdTcp0: + case kHsaAiCounterBlockIdTcp1: + case kHsaAiCounterBlockIdTcp2: + case kHsaAiCounterBlockIdTcp3: + case kHsaAiCounterBlockIdTcp4: + case kHsaAiCounterBlockIdTcp5: + case kHsaAiCounterBlockIdTcp6: + case kHsaAiCounterBlockIdTcp7: + case kHsaAiCounterBlockIdTcp8: + case kHsaAiCounterBlockIdTcp9: + case kHsaAiCounterBlockIdTcp10: + case kHsaAiCounterBlockIdTcp11: + case kHsaAiCounterBlockIdTcp12: + case kHsaAiCounterBlockIdTcp13: + case kHsaAiCounterBlockIdTcp14: + case kHsaAiCounterBlockIdTcp15: { + instance_index = block_id - 
kHsaAiCounterBlockIdTcp0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiTcpCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiTcpCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdGds: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiGdsCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiGdsCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaAiCounterBlockIdVgt: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = AiVgtCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiVgtCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaAiCounterBlockIdIa: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + 
reg_addr[reg_num] = AiIaCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiIaCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + /* + case kHsaAiCounterBlockIdMc: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiMcCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiMcCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + // Temp Commented out for Vega10 + /* + case kHsaAiCounterBlockIdSrbm: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiSrbmCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiSrbmCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + /* + case kHsaAiCounterBlockIdTcs: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiTcsCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiTcsCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + case kHsaAiCounterBlockIdWd: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiWdCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiWdCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + // Temp commented for Vega10 + /* + case kHsaAiCounterBlockIdCpg: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + 
reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiCpgCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiCpgCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + + // Temp commented for Vega10 + /* + case kHsaAiCounterBlockIdCpc: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = AiCpcCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = AiCpcCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + + // IommuV2, MC, kernel driver counters are retrieved via + // KFD implementation + case kHsaAiCounterBlockIdMc: + case kHsaAiCounterBlockIdIommuV2: + case kHsaAiCounterBlockIdKernelDriver: { + reg_num = 0; + break; + } + + default: { break; } + } + + return reg_num; +} + +// Deletes every CounterBlock stored in blk_map_ and then empties the map, so +// that no dangling pointers remain and a repeated call cannot delete a block +// twice. Always returns HSA_STATUS_SUCCESS. +hsa_status_t AiPmu::RemoveCounterBlocks() { + AiCounterBlockMap::iterator it = blk_map_.begin(); + AiCounterBlockMap::iterator block_end = blk_map_.end(); + + for (; it != block_end; ++it) { + delete it->second; + } + + // The stored pointers are invalid from here on; drop them. + blk_map_.clear(); + + return HSA_STATUS_SUCCESS; +} + + +} /* namespace */ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/ai_pmu.h b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_pmu.h new file mode 100644 index 0000000000..a76ee9df0d --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/ai_pmu.h @@ -0,0 +1,137 @@ +#ifndef _AI_PMU_H_ +#define _AI_PMU_H_ + +#include "hsa.h" +#include "cmdwriter.h" +#include "hsa_perf.h" +#include "info_set.h" +#include "parameter_set.h" +#include "ai_blockinfo.h" +#include "rocr_profiler.h" + +#include +#include +#include + +namespace pm4_profile { +typedef std::map AiCounterBlockMap; + +// This class implements the AI PMU.
It is responsible for setting up +// CounterGroups to represent each AI hardware block which exposes performance +// counters. +class AiPmu : public pm4_profile::Pmu { + public: + AiPmu(); + + ~AiPmu(); + + // Returns number of shader engines per block + // for the blocks featured shader engines instancing + uint32_t getNumSe() { return num_se_; } + + // Initializes the handle of buffer used to collect PMC data + bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz); + + int getLastError(); + + std::string getErrorString(int error); + + virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true); + + virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter); + + // IPMU inherits the IParameterSet and IInfoSet, so we implement it + // through composition and function forwarding + bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data); + + bool setParameter(uint32_t param, uint32_t param_size, const void* p_data); + + bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data); + + pm4_profile::CounterBlock* getCounterBlockById(uint32_t id); + + rocr_pmu_state_t getCurrentState() { return profiler_state_; } + + pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups); + + private: + // Addr of Counter Data Buffer + uint32_t* pmcData_; + + // Size of Counter Data Buffer + uint32_t pmcDataSz_; + + void Init(); + + bool initCounterBlock(); + + bool isResultReady(); + + // Clear CounterBlockMap + void clearCounterBlockMap(); + + // Reset SQ and CB counters + void ResetCounterBlocks(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter); + + // Program SQ block related counters + uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TA block related counters + uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TCA block related counters +
uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TCC block related counters + uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TCP block related counters + uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TD block related counters + uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Build counter selection register, return how many registers are built + uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, pm4_profile::Counter* blkCntr); + + // Build the counter read registers for block_id: fills reg_addr/reg_val + // with GRBM_GFX_INDEX selects and the Lo/Hi read register entries, and + // returns how many entries were written (0 for the Mc, IommuV2 and + // KernelDriver blocks, whose counters are retrieved via KFD) + uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr, + uint32_t* reg_val); + + private: + // Delete counter blocks in the PMU + hsa_status_t RemoveCounterBlocks(); + + private: + // This contains the available counter groups. + AiCounterBlockMap blk_map_; + + // This stores the current profiling state.
+ rocr_pmu_state_t profiler_state_; + + pm4_profile::ParameterSet* parameter_set_; + + pm4_profile::InfoSet* info_set_; + + int error_code_; + + // Pointer used to store counter block list internally + uint32_t blk_list_size_; + pm4_profile::CounterBlock** blk_list_; + + // Indicates the number of Shader Engines Present + uint32_t num_se_; + + // Used to reset GRBM to its default state + uint32_t reset_grbm_; +}; +} + +#endif // _AI_PMU_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_blockinfo.h b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_blockinfo.h new file mode 100644 index 0000000000..acc19908c8 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_blockinfo.h @@ -0,0 +1,101 @@ +#ifndef _GPU_BLOCKINFO_H_ +#define _GPU_BLOCKINFO_H_ + +#include "rocr_profiler.h" +#include "gpu_enum.h" + +#include + +namespace pm4_profile { + +typedef enum CntlMethod { + CntlMethodNone = 0, + CntlMethodByInstance = 1, + CntlMethodBySe = 2, + CntlMethodBySeAndInstance = 3 +} CntlMethod; + +// Size of the GpuBlockInfo::blockName array. +#define GPU_BLOCK_NAME_SIZE 15 + +// Structure which contains information about a specific hardware block for CI. +typedef struct GpuBlockInfo_ { + // Unique string identifier of the block. + const char blockName[GPU_BLOCK_NAME_SIZE]; + + // Numeric identifier of the block's counter group. + uint32_t counterGroupId; + + // Maximum number of shader engines + uint32_t maxShaderEngineCount; + + // Maximum number of shader arrays + uint32_t maxShaderArrayCount; + + // Maximum number of block instances in the group per shader array + uint32_t maxInstanceCount; + + // Counter control method + CntlMethod method; + + // Maximum counter event ID + uint32_t maxEventId; + + // Maximum number of counters that can be enabled at once + uint32_t maxSimultaneousCounters; + + // Maximum number of streaming counters that can be enabled at once + uint32_t maxStreamingCounters; + + // The number of hardware counters that are shared + // between regular and streaming counters.
+ // This is important so that resources are not double-booked + // between the two types of counters. + uint32_t sharedHWCounters; + + // Block counters can be configured with additional filters + bool hasFilters; + + //------------------------------------------ + // Trace specific stuff regarding when they get locked + + // Buffer size in bytes + uint32_t bufferSize; + + // Current write pointer offset from beginning of the buffer + uint32_t wptrOffset; + + // Flag that buffer might have wrapped + bool wrapped; + + // If buffer has wrapped, this could indicate approximate + // total amount of data that was dumped in the trace buffer + uint32_t dataSizeEstimate; + + // Buffer data pointer + void* pData; +} GpuBlockInfo; + +// Register addresses corresponding to each counter +typedef struct GpuCounterRegInfo_ { + // counter select register address + uint32_t counterSelRegAddr; + + // counter control register address + uint32_t counterCntlRegAddr; + + // counter read register address low + uint32_t counterReadRegAddrLo; + + // counter read register address high + uint32_t counterReadRegAddrHi; +} GpuCounterRegInfo; + +// Gpu Privileged Block ID info.
This number should be the same as that +// defined in KFD +typedef struct GpuPrivCounterBlockId_ { + // Block ID consists of 4 dwords + uint32_t items[4]; +} GpuPrivCounterBlockId; + +} // pm4_profile +#endif diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_counter.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_counter.cpp new file mode 100644 index 0000000000..6e46af151e --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_counter.cpp @@ -0,0 +1,73 @@ +#include "gpu_counter.h" + +using namespace pm4_profile; + +namespace pm4_profile { +static char error_string[][64] = { + {"No error"}, {"Counter generic error"}, {"Counter is already set"}, {"Counter not ready"}, +}; + +GpuCounter::GpuCounter() : Counter() { + counter_enabled_ = false; + parameter_set_ = new ParameterSet(); +} + +GpuCounter::~GpuCounter() { delete parameter_set_; } + +bool GpuCounter::getResult(uint64_t* p_result) { + if (!p_result) { + return false; + } + + *p_result = result_; + + return true; +} + +bool GpuCounter::setCounterBlock(pm4_profile::CounterBlock* p_cntr_group) { + if (!p_cntr_group) { + return false; + } + + counter_block_ = p_cntr_group; + + return true; +} + +pm4_profile::CounterBlock* GpuCounter::getCounterBlock() { return counter_block_; } + +bool GpuCounter::setEnable(bool b) { + // TODO: Validate counter + counter_enabled_ = b; + return true; +} + +void GpuCounter::setResult(uint64_t result) { result_ = result; } + +int GpuCounter::getLastError() { return error_code_; } + +std::string GpuCounter::getErrorString(int error) { + if ((error >= 0) && (error < kHsaCounterErrorCodeMax)) { + std::string err_string(error_string[error]); + return err_string; + } + return "Incorrect error index"; +} + +bool GpuCounter::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) { + return parameter_set_->getParameter(param, ret_size, pp_data); +} + +bool GpuCounter::setParameter(uint32_t param, uint32_t param_size, const void* p_data) { + bool 
ret_code; + + error_code_ = kHsaCounterErrorCodeNoError; + + ret_code = parameter_set_->setParameter(param, param_size, p_data); + // Any failure of the underlying parameter set is reported as "already set". + if (ret_code == false) { + error_code_ = kHsaCounterErrorCodeAlreadySet; + } + + return ret_code; +} +} diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_counter.h b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_counter.h new file mode 100644 index 0000000000..f09d2849a7 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_counter.h @@ -0,0 +1,52 @@ +#ifndef _GPU_COUNTER_H_ +#define _GPU_COUNTER_H_ + +#include "hsa_perf.h" +#include "parameter_set.h" + +#include +#include +#include + +namespace pm4_profile { +// @brief This class represents a single GPU performance counter +class GpuCounter : public pm4_profile::Counter { + public: + GpuCounter(); + + virtual ~GpuCounter(); + + virtual int getLastError(); + + virtual std::string getErrorString(int error); + + virtual bool getResult(uint64_t* p_result); + + virtual pm4_profile::CounterBlock* getCounterBlock(); + + virtual bool setEnable(bool b); + + virtual bool isEnabled() { return counter_enabled_; } + + virtual bool isResultReady() { return is_result_ready_; } + + virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data); + + virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data); + + // Records the block that owns this counter; rejects NULL + bool setCounterBlock(pm4_profile::CounterBlock* p_cntr_group); + + // Stores the value later returned by getResult() + void setResult(uint64_t result); + + private: + // State set by setEnable() and reported by isEnabled() + bool counter_enabled_; + // Value reported by isResultReady() + bool is_result_ready_; + // Value stored by setResult() and returned by getResult() + uint64_t result_; + // Backing store that getParameter()/setParameter() forward to; owned + pm4_profile::ParameterSet* parameter_set_; + // Owning block recorded by setCounterBlock(); not deleted by this class + pm4_profile::CounterBlock* counter_block_; + // Last error recorded by setParameter(); returned by getLastError() + uint32_t error_code_; +}; + +typedef std::list GpuCounterList; +} +#endif // _GPU_COUNTER_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_countergroup.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_countergroup.cpp new file mode 100644 index 0000000000..5b4005c4c0 --- /dev/null +++
b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_countergroup.cpp @@ -0,0 +1,215 @@ +#include "gpu_countergroup.h" +#include "gpu_counter.h" +#include "gpu_enum.h" + +using namespace pm4_profile; + +namespace pm4_profile { + +static char error_string[][64] = { + {"No error"}, {"Counter block error"}, {"Max counter reached"}, {"Unkown counter"}}; + +GpuCounterBlock::GpuCounterBlock() : CounterBlock() { + cntr_list_.clear(); + parameter_set_ = new ParameterSet(); + info_set_ = new InfoSet(); + + // Initialize pointer to NULL + pp_cntrs_ = NULL; + + _initCounterBlockType(); +} + +GpuCounterBlock::~GpuCounterBlock() { + GpuCounterList::iterator it = cntr_list_.begin(); + GpuCounterList::iterator it_end = cntr_list_.end(); + + for (; it != it_end; it++) { + if (*it) { + delete (*it); + } + } + cntr_list_.clear(); + + delete parameter_set_; + delete info_set_; + + if (pp_cntrs_) { + free(pp_cntrs_); + pp_cntrs_ = NULL; + } +} + +void GpuCounterBlock::_initCounterBlockType() { + block_type_ = HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC; +} + +Counter* GpuCounterBlock::createCounter() { + if (!_checkMaxNumOfCounters()) { + return NULL; + } + + GpuCounter* p_cntr = new GpuCounter(); + if (!p_cntr) { + return NULL; + } + + cntr_list_.push_back(p_cntr); + + return (Counter*)p_cntr; +} + +bool GpuCounterBlock::destroyCounter(Counter* p_cntr) { + bool ret = false; + + if (!p_cntr) { + return ret; + } + + GpuCounterList::iterator it = cntr_list_.begin(); + GpuCounterList::iterator it_end = cntr_list_.end(); + for (; it != it_end; it++) { + if (*it == p_cntr) { + delete (*it); + cntr_list_.erase(it); + ret = true; + break; + } + } + + return ret; +} + +bool GpuCounterBlock::destroyAllCounters() { + GpuCounterList::iterator it = cntr_list_.begin(); + GpuCounterList::iterator it_end = cntr_list_.end(); + + for (; it != it_end; it++) { + if (*it) { + delete (*it); + } + } + + cntr_list_.clear(); + + return true; +} + +Counter** GpuCounterBlock::getEnabledCounters(uint32_t& num) { + if 
(pp_cntrs_) { + free(pp_cntrs_); + pp_cntrs_ = NULL; + } + + pp_cntrs_ = (Counter**)malloc(sizeof(GpuCounter*) * cntr_list_.size()); + + if (!pp_cntrs_) { + return NULL; + } + + int cnt = 0; + GpuCounterList::iterator it = cntr_list_.begin(); + GpuCounterList::iterator it_end = cntr_list_.end(); + for (; it != it_end; it++) { + GpuCounter* p_cntr = (*it); + bool is_enabled; + is_enabled = p_cntr->isEnabled(); + if (is_enabled) { + *(pp_cntrs_ + cnt) = (Counter*)*it; + cnt++; + } + } + + num = cnt; + if (0 == num) { + return NULL; + } + + return pp_cntrs_; +} + +Counter** GpuCounterBlock::getAllCounters(uint32_t& num) { + if (pp_cntrs_) { + free(pp_cntrs_); + pp_cntrs_ = NULL; + } + + pp_cntrs_ = (Counter**)malloc(sizeof(GpuCounter*) * cntr_list_.size()); + + if (!pp_cntrs_) { + return NULL; + } + + int cnt = 0; + GpuCounterList::iterator it = cntr_list_.begin(); + GpuCounterList::iterator it_end = cntr_list_.end(); + for (; it != it_end; it++, cnt++) { + *(pp_cntrs_ + cnt) = (Counter*)*it; + } + + num = cnt; + if (0 == num) { + return NULL; + } + + return pp_cntrs_; +} + +bool GpuCounterBlock::setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data) { + return info_set_->setInfo(blk_info, size, data); +} + +bool GpuCounterBlock::_checkMaxNumOfCounters() { + uint32_t num_enabled = _getNumOfEnabledCounters(); + + uint32_t* p_num_max = NULL; + uint32_t size = 0; + + if (!getInfo(GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, size, (void**)&p_num_max)) { + return false; + } + + if (num_enabled >= *p_num_max) { + return false; + } + + return true; +} + +uint32_t GpuCounterBlock::_getNumOfEnabledCounters() { + uint32_t cnt = 0; + GpuCounterList::iterator it = cntr_list_.begin(); + GpuCounterList::iterator it_end = cntr_list_.end(); + + for (; it != it_end; it++) { + GpuCounter* p_cntr = (*it); + bool is_enabled; + is_enabled = p_cntr->isEnabled(); + if (is_enabled) { + cnt++; + } + } + + return cnt; +} + +std::string GpuCounterBlock::getErrorString(int error) { + if ((error 
>= 0) && (error < kHsaCounterBlockErrorCodeMaxError)) { + std::string err_string(error_string[error]); + return err_string; + } + return "incorrect error code"; +} + +bool GpuCounterBlock::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) { + return parameter_set_->getParameter(param, ret_size, pp_data); +} + +bool GpuCounterBlock::setParameter(uint32_t param, uint32_t param_size, const void* pData) { + return parameter_set_->setParameter(param, param_size, pData); +} + +bool GpuCounterBlock::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) { + return info_set_->getInfo(info, ret_size, pp_data); +} +} diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_countergroup.h b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_countergroup.h new file mode 100644 index 0000000000..8b1549b076 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_countergroup.h @@ -0,0 +1,70 @@ +#ifndef _GPU_COUNTER_GROUP_H_ +#define _GPU_COUNTER_GROUP_H_ + +// This file contains declaration of Sea Island (CI) CounterBlock class. +#include "hsa_perf.h" +#include "gpu_counter.h" +#include "parameter_set.h" +#include "info_set.h" +#include "gpu_enum.h" + +#include +#include + +namespace pm4_profile { +// This class represents one CI hardware block. Each block contains +// multiple performance counters. +class GpuCounterBlock : public pm4_profile::CounterBlock { + public: + GpuCounterBlock(); + ~GpuCounterBlock(); + + // NOTE [Suravee] : We specify CiPmu as a friend + // because the CiPmu needs to be able to setup info of + // the counter block. 
+ friend class CiPmu; + friend class ViPmu; + friend class AiPmu; + + std::string getErrorString(int error); + + pm4_profile::Counter* createCounter(); + + virtual bool destroyCounter(pm4_profile::Counter* p_cntr); + + virtual bool destroyAllCounters(); + + virtual pm4_profile::Counter** getEnabledCounters(uint32_t& num); + + virtual pm4_profile::Counter** getAllCounters(uint32_t& num); + + virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data); + + virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data); + + virtual bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data); + + protected: + // Sets block_type_ to HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC + void _initCounterBlockType(); + + // Forwards to the block's InfoSet + bool setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data); + + hsa_ext_tools_counter_block_type_t block_type_; + + private: + // Returns true while the number of enabled counters is below the + // GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS info value + bool _checkMaxNumOfCounters(); + + // Counts the counters in cntr_list_ for which isEnabled() is true + uint32_t _getNumOfEnabledCounters(); + + // Backing stores that the parameter and info accessors forward to; owned + pm4_profile::ParameterSet* parameter_set_; + pm4_profile::InfoSet* info_set_; + // Counters created by createCounter(); deleted by this block + GpuCounterList cntr_list_; + uint32_t error_code_; + + // Pointer of buffer to store counter list. The buffer is malloc'ed by + // getEnabledCounters()/getAllCounters() and freed by the next call to + // either of them or by the destructor. + pm4_profile::Counter** pp_cntrs_; +}; + +} // pm4_profile + +#endif // _GPU_COUNTER_GROUP_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_enum.h b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_enum.h new file mode 100644 index 0000000000..085bc6f29b --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/gpu_enum.h @@ -0,0 +1,65 @@ +#ifndef _GPU_ENUM_H_ +#define _GPU_ENUM_H_ + +namespace pm4_profile { + +// Enumeration containing GPU hardware block information +enum GPU_BLK_INFOS { + GPU_BLK_INFO_BLOCK_NAME, + GPU_BLK_INFO_ID, + GPU_BLK_INFO_MAX_SHADER_ENGINE_COUNT, + GPU_BLK_INFO_MAX_SHADER_ARRAY_COUNT, + GPU_BLK_INFO_MAX_INSTANCE_COUNT, + GPU_BLK_INFO_CONTROL_METHOD, + GPU_BLK_INFO_MAX_EVENT_ID, + GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, + GPU_BLK_INFO_MAX_STREAMING_COUNTERS, + GPU_BLK_INFO_SHARED_HW_COUNTERS, + GPU_BLK_INFO_HAS_FILTERS, + + // Trace-specific stuff +
GPU_TRC_BLK_INFO_BUFFER_SIZE, + GPU_TRC_BLK_INFO_BUFFER_WRITE_POINTER_OFFSET, + GPU_TRC_BLK_INFO_BUFFER_WRAPPED, + GPU_TRC_BLK_INFO_DATA_SIZE_ESTIMATE, + GPU_TRC_BLK_INFO_DATA_POINTER, +}; + + +/** + * Trace buffer parameters + */ +enum GPU_BLK_PARAMS { + // Allows user to specify the size of the trace buffers. + GPU_BLK_PARAM_TRACE_BUFFER_SIZE, + + // If we decide to implement this functionality, this will allow the user + // to specify the number of trace buffers to create. + GPU_BLK_PARAM_TRACE_BUFFER_ARRAY, + + // Specifies whether a new trace buffer should be used for each cmd buffer. + // This allows for better correlation of data back to the host application. + // If this is enabled, and the user does not explicitly specify a + // TRACE_BUFFER_ARRAY, then the driver should automatically allocate + // additional buffers as needed so that as much of the application + // can be traced as possible, until the PerfExperiment is ended. + // If a TRACE_BUFFER_ARRAY is specified, then only as many buffers + // as specified should be created. If more cmd buffers get submitted + // than there are trace buffers, then the later cmd buffers should + // not be traced.
+ GPU_BLK_PARAM_TRACE_NEW_BUFFER_ON_SUBMIT, +}; + + +// Enumeration containing GPU counter parameters +enum GPU_CNTR_PARAMS { + GPU_CNTR_PARAM_SHADERENGINE_ID, + GPU_CNTR_PARAM_SHADERARRAY_ID, + GPU_CNTR_PARAM_INSTANCE_ID, + GPU_CNTR_PARAM_EVENT_SELECT_ID, + GPU_CNTR_PARAM_SIMD_MASK, + GPU_CNTR_PARAM_PERF_MODE, + GPU_CNTR_PARAM_TRACE_TYPE, +}; +} +#endif diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/hsa_perf.h b/runtime/hsa-ext-aql-profile/src/perfcounter/hsa_perf.h new file mode 100644 index 0000000000..1611e63faa --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/hsa_perf.h @@ -0,0 +1,436 @@ +#ifndef _HSA_PERF_H_ +#define _HSA_PERF_H_ + +#include "rocr_profiler.h" + +#if !defined(AMD_AMP_HSA_INCLUDES) +#include +#include +#include +#include +#endif + +namespace pm4_profile { +class Pmu; +class Counter; +class CounterBlock; +class TraceGroup; +class CommandWriter; +class DefaultCmdBuf; + + +// @brief This is an abstract class for defining a CounterBlock. Each +// CounterBlock contains a set of Counters that often belong to the +// same functional unit. +// +// For AMD GPU, this can represent blocks of Counters in each HW block +// (e.g. SQ, SQI, CP, etc.). +// For AMD CPU, this can represent blocks of core PMCs, NB PMCs, L2I PMCs +// on each CPU device +// +// Generally, CounterBlocks are created and initialized by the \ref Pmu class. +// Users can query them by calling \ref Pmu::getAllCounterBlocks() or +// \ref Pmu::getCounterBlockById(). A CounterBlock is enabled if it contains +// enabled Counters in the block. +// +// Users can manage Counters in each CounterBlock (e.g. create, destroy, +// enable and disable). To specify a Counter, users simply call \ref +// createCounter. Then it can be enabled or disabled using \ref +// Counter::setEnable. When a Counter is enabled, it is checked against the +// CounterBlock checks to make sure that the enabled-counter is valid and is +// not conflicting with the current Counters in the block.
+class CounterBlock { + public: + typedef enum HsaCounterBlockErrorCode { + // No error + kHsaCounterBlockErrorCodeNoError = 0x0, + + // Generic CounterBlock error + kHsaCounterBlockErrorCodeGenericError, + + // The maximum number of Counters in the block is reached. + kHsaCounterBlockErrorCodeMaxNumCounterReached, + + // The counter does not belong to this block. + kHsaCounterBlockErrorCodeUnknownCounter, + + // Number of error codes (one past the last valid code); not an error. + kHsaCounterBlockErrorCodeMaxError + } HsaCounterBlockErrorCode; + + // Destructor of CounterBlock. + virtual ~CounterBlock() {} + + // Given an error number reported from getLastError or returned from a + // function call, retrieve the corresponding stl string. + // @param[in] error The error corresponding to a call to getLastError + // or a return code from a function call. + // Return An stl string representing a text corresponding to the error + // number. + // If invalid error code is given, the returned string is empty. + // NOTE(review): GpuCounterBlock::getErrorString returns + // "incorrect error code" in that case - confirm the intended contract. + virtual std::string getErrorString(int error) = 0; + + // Create a Counter object and return a pointer to it to the caller. + // Return On success, this function returns a pointer to Counter + // On failure, this function returns NULL + // Possible error codes are: + // kHSAPerfErrorCodesUnmodifiableState + // kHsaCounterBlockErrorCodeMaxNumCounterReached + virtual Counter* createCounter() = 0; + + // Destroy the Counter. The CounterBlock which owns the Counter must be in + // disabled state. + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidAargs + // kHSAPerfErrorCodesUnmodifiableState + // kHsaCounterBlockErrorCodeUnknownCounter + virtual bool destroyCounter(Counter* p_counter) = 0; + + // Destroy all counters in the block. The CounterBlock must be in disabled + // state. + // Return true or false.
+ // Possible error codes are: + // kHSAPerfErrorCodesUnmodifiableState + virtual bool destroyAllCounters() = 0; + + // Get a list of pointers to the enabled Counters in this CounterBlock. + // note The Counter must be created by the same CounterBlock object using + // createCounter(). + // @param[out] num The number of Counter pointers returned. + // Return + // return a list of pointers to the enabled Counters. + // return NULL if no counter is enabled. + virtual Counter** getEnabledCounters(uint32_t& num) = 0; + + // Get a list of pointers to all Counters in this CounterBlock. + // note The Counter must be created by the same CounterBlock object using + // createCounter(). + // @param[out] num The number of Counter pointers returned. + // Return + // return a list of pointers in the CounterBlock. + // return NULL if the block contains no Counters. + virtual Counter** getAllCounters(uint32_t& num) = 0; + + // Query value of the parameter specified by param + // @param[in] param The enumeration of parameter to be queried + // @param[out] return_size The returned size of data + // @param[out] pp_data The pointer to the returned data. The API is + // responsible for managing the memory to store the information as specified + // by return_size. + // + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidParam + // kHSAPerfErrorCodesInvalidParamSize + // kHSAPerfErrorCodesInvalidParamData + virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0; + + // Set value for the parameter specified by param + // @param[in] param The enumeration of parameter to be set + // @param[in] param_size The size of data + // @param[in] p_data The pointer to the data to be set. Users are responsible + // for deallocating the memory of p_data after calling the API.
+ // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesUnmodifiableState + // kHSAPerfErrorCodesInvalidParam + // kHSAPerfErrorCodesInvalidParamSize + // kHSAPerfErrorCodesInvalidParamData + virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0; + + // Query value of the information specified by info + // @param[in] info The enumeration of information to be queried + // @param[out] return_size The returned size of data + // @param[out] pp_data The pointer to the returned data + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidInfo + // kHSAPerfErrorCodesInvalidInfoSize + // kHSAPerfErrorCodesInvalidInfoData + virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0; +}; // class CounterBlock + + +// This is an abstract class for defining a TraceGroup. TraceGroup inherits +// CounterBlock and adds interfaces for managing trace buffers. It also supports +// user-data insertion into the trace. This allows users to insert arbitrary +// data (e.g. markers) into the trace, which can be used to correlate specific +// events with the collected trace data. +class TraceGroup : public CounterBlock { + public: + typedef enum HsaTraceGroupErrorCode { + // Generic TraceGroup error + HsaTraceGroupErrorCodeGenericError = 0x100, + } HsaTraceGroupErrorCode; + + // Destructor of TraceGroup. + virtual ~TraceGroup() {} + + // Obtains the number of buffers which were collected as part of + // the trace. + // Return The number of collected buffers. + virtual uint32_t getCollectedBufferCount() = 0; + + // Locks a trace buffer for host access. + // @param[in] buffer_id The index of the buffer to be locked. + // Return true or false + virtual bool lock(uint32_t buffer_id) = 0; + + // Unlock a trace buffer that was previously locked. + // @param[in] buffer_id The index of the buffer to be unlocked.
+ // Return true or false + virtual bool unlock(uint32_t buffer_id) = 0; + + // Inserts data (e.g. trace marker) into the trace. + // @param[in] type The type of data to be inserted. + // @param[in] p_data The data to be inserted. + // @param[in] data_size The size of data to be inserted. + // Return true or false + virtual bool insertUserData(uint32_t type, void* p_data, uint32_t data_size) = 0; +}; // class TraceGroup + + +// This is an abstract class for defining a performance Counter. +// Users can obtain a Counter from \ref CounterBlock::createCounter(). +// Once obtained, users can set up Counter parameters, and enable it using +// \ref Counter::setEnable(). +// +// There are several types of Counter as defined in \ref +// HsaCounterBlockTypeMask. +// Only the supported Counter type can be added to the CounterBlock. +// +// Each Counter can store Counter-specific parameters. The Counter is used to +// specify types of event to be counted. +class Counter { + public: + typedef enum HsaCounterErrorCode { + // Generic Counter error + kHsaCounterErrorCodeNoError = 0x0, + + // Generic Counter error + kHsaCounterErrorCodeGenericError = 0x1, + + // Counter already error + kHsaCounterErrorCodeAlreadySet = 0x2, + + // Counter result is not ready. + kHsaCounterErrorCodeResultNotReady = 0x3, + + // Max counter error num + kHsaCounterErrorCodeMax, + } HsaCounterErrorCode; + + // Destructor of Counter + virtual ~Counter() {} + + // Retrieve the last error code generated. This should be checked when + // values returned are NULL or void. + // Return an integer corresponding to the last error reported. + virtual int getLastError() = 0; + + // Given and error number reported from getLastError or returned from a + // function call, retreive the corresponding stl string. + // @param[in] error The error corresponding to a call to getLastError + // or a return code from a function call. + // Return An stl string representing a text corresponding to the error + // number. 
If invalid error code is given, the returned string is empty. + virtual std::string getErrorString(int error) = 0; + + // Get the \ref CounterBlock which owns this counter. + // Return + // On success, it returns a pointer to the CounterBlock. + // On failure, it returns NULL. + virtual CounterBlock* getCounterBlock() = 0; + + // Enable or disable the Counter. + // @param[in] b Set to true to enable the Counter, false to disable it. + // Return + // return true when the state was set successfully. + // return false otherwise. + // If the current state is already set to the specified value, + // the API returns true. + // Possible error codes are: + // kHSAPerfErrorCodesUnmodifiableState + virtual bool setEnable(bool b) = 0; + + // Return the current state of the Counter. + // Return true or false + virtual bool isEnabled() = 0; + + // Return whether the result of this Counter is available. + // Return true or false + virtual bool isResultReady() = 0; + + // Query Counter result + // note Must be implemented by derived classes + // @param[out] p_result The pointer containing the returned result. + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidAargs + // kHsaCounterErrorCodeResultNotReady + virtual bool getResult(uint64_t* p_result) = 0; + + // Query value of the parameter specified by param + // @param[in] param The enumeration of parameter to be queried + // @param[out] return_size The returned size of data + // @param[out] pp_data The pointer to the returned data. The API is + // responsible for managing the memory to store the information as + // specified by return_size. 
+ // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidParam + // kHSAPerfErrorCodesInvalidParamSize + // kHSAPerfErrorCodesInvalidParamData + virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0; + + // Set value for the parameter specified by param + // @param[in] param The enumeration of parameter to be set + // @param[in] param_size The size of data + // @param[in] p_data The pointer to the data to be set. Users are responsible + // for deallocating the memory of p_data after calling the API. + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesUnmodifiableState + // kHSAPerfErrorCodesInvalidParam + // kHSAPerfErrorCodesInvalidParamSize + // kHSAPerfErrorCodesInvalidParamData + virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0; +}; // class Counter + +class Pmu { + public: + // Enumeration of Pmu error codes + typedef enum HsaPmuErrorCode { + // No error + kHsaPmuErrorCodeNoError = 0x0, + + // Unknown CounterBlock ID + kHsaPmuErrorCodeUnknownCounterBlockId, + + // No CounterBlock exists + kHsaPmuErrorCodeNoCounterBlock, + + // The previous operation is not valid. This could be due to an + // invalid transition from the current state. + kHsaPmuErrorCodeInvalidOperation, + + // PMU is not currently available (e.g. PMU is currently + // in use by others) + kHsaPmuErrorCodeNotAvailable, + + // PMU is in an error state (NOTE(review): the original comment duplicated + // the NotAvailable text; confirm the exact meaning against implementations) + kHsaPmuErrorCodeErrorState, + + // Waiting for the PMU result timed out + kHsaPmuErrorCodeTimeOut, + + // Number of Pmu error codes (one past the last valid code) + kHsaPmuErrorCodeMax + } HsaPmuErrorCode; + + // Destructor of PMU. + // note This stops the performance counters if running and releases + // any resources used by the PMU. + virtual ~Pmu() {} + + // Retrieve the last error code generated. This should be checked when + // values returned are NULL or void. 
+ // Return an integer corresponding to the last error reported. + virtual int getLastError() = 0; + + // Given an error number reported from getLastError or returned from a + // function call, retrieve the corresponding stl string. + // @param[in] error The error corresponding to a call to getLastError + // or a return code from a function call. + // Return An stl string representing a text corresponding to the error + // number. If invalid error code is given, the returned string is empty. + virtual std::string getErrorString(int error) = 0; + + // Get CounterBlock from Id + // @param[in] id ID of the target CounterBlock + // Return + // On success, it returns a pointer to the specified CounterBlock. + // On failure, it returns NULL. + // Possible error codes are: + // kHsaPmuErrorCodeUnknownCounterBlockId. + virtual CounterBlock* getCounterBlockById(uint32_t id) = 0; + + // Get all available CounterBlocks + // @param[out] num_block The returned number of CounterBlocks + // Return On success, it returns an array of CounterBlock pointers. + // On failure, it returns NULL. + virtual CounterBlock** getAllCounterBlocks(uint32_t& num_block) = 0; + + // Get current PMU profiling state. + // Return The PMU profiling state as defined in \ref rocr_pmu_state_t + virtual rocr_pmu_state_t getCurrentState() = 0; + + // Start profiling on the PMU. + // @param[in] reset Indicates whether to reset the counters before + // recording. Defaults to true (reset the counters). + // note This function must be implemented by children classes. + // Return true or false + // Possible error codes are: + // kHsaPmuErrorCodeInvalidOperation + // kHsaPmuErrorCodeNotAvailable + virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true) = 0; + + // Stop profiling on the PMU. + // note This function must be called after \ref begin(). + // note This function must be implemented by children classes. 
+ // Return true or false + // Possible error codes are: + // kHsaPmuErrorCodeInvalidOperation + virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) = 0; + + // Initializes the handle of the buffer used to collect PMC data + // @param[in] pmcBuffer The buffer pointer + // @param[in] pmcBuffSz Size of the buffer in bytes + virtual bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz) = 0; + + // Query value of the parameter specified by param + // @param[in] param The enumeration of parameter to be queried + // @param[out] return_size The returned size of data + // @param[out] pp_data The pointer to the returned data. The API is + // responsible for managing the memory to store the information as + // specified by return_size. + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidParam + // kHSAPerfErrorCodesInvalidParamSize + // kHSAPerfErrorCodesInvalidParamData + virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0; + + // Set value for the parameter specified by param + // @param[in] param The enumeration of parameter to be set + // @param[in] param_size The size of data + // @param[in] p_data The pointer to the data to be set. Users are responsible + // for deallocating the memory of p_data after calling the API. 
+ // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesUnmodifiableState + // kHSAPerfErrorCodesInvalidParam + // kHSAPerfErrorCodesInvalidParamSize + // kHSAPerfErrorCodesInvalidParamData + virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0; + + // Query value of the information specified by info + // @param[in] info The enumeration of information to be queried + // @param[out] return_size The returned size of data + // @param[out] pp_data The pointer to the returned data + // Return true or false + // Possible error codes are: + // kHSAPerfErrorCodesInvalidInfo + // kHSAPerfErrorCodesInvalidInfoSize + // kHSAPerfErrorCodesInvalidInfoData + virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0; + + // Returns the number of shader engines per block + // for blocks that are instanced per shader engine + virtual uint32_t getNumSe() = 0; + +}; // class Pmu +} // pm4_profile +#endif // _HSA_PERF_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/info_set.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/info_set.cpp new file mode 100644 index 0000000000..f886170eb5 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/info_set.cpp @@ -0,0 +1,74 @@ +#include "info_set.h" +#include "var_data.h" +using namespace std; + +namespace pm4_profile { +InfoSet::InfoSet() { + releaseParameters(); + info_table_.clear(); + p_data_ = NULL; /* no result buffer exists until getInfo allocates one */ +} + +InfoSet::~InfoSet() { + releaseParameters(); /* frees the payload of every table entry */ + info_table_.clear(); + free(p_data_); /* buffer handed out by the last getInfo call, if any */ + p_data_ = NULL; +} + +bool InfoSet::setInfo(uint32_t info, uint32_t info_size, void* p_data) { + if (info_table_.end() != info_table_.find(info)) { + return false; /* an info id can be set only once */ + } + + VarData data; + if (!data.set(info_size, p_data)) { + return false; + } + + info_table_.insert(VarDataMap::value_type(info, data)); /* the table entry is a copy of data; its payload is released later by releaseParameters() */ + return true; +} + +bool InfoSet::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) { + if (!pp_data || (0 == info_table_.size())) { + return 
false; + } + + VarDataMap::iterator it = info_table_.find(info); + if (it == info_table_.end()) { + return false; + } + + uint32_t size = it->second.getSize(); /* unsigned, matching VarData::getSize(), so large sizes are never turned negative */ + if (size == 0) { + return false; + } + + free(p_data_); /* release the buffer handed out by the previous call; free(NULL) is a no-op */ + p_data_ = NULL; + + p_data_ = malloc(size); + if (!p_data_) { + return false; + } + + *pp_data = p_data_; /* the buffer stays owned by this InfoSet */ + + ret_size = it->second.get(size, *pp_data); /* reuse the iterator instead of a second map lookup */ + + return true; +} + +void InfoSet::releaseParameters() { + VarDataMap::iterator it = info_table_.begin(); + VarDataMap::iterator table_end = info_table_.end(); + + for (; it != table_end; it++) { + it->second.clear(); /* frees the payload; the table entry itself is kept */ + } + + return; +} + +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/info_set.h b/runtime/hsa-ext-aql-profile/src/perfcounter/info_set.h new file mode 100644 index 0000000000..8527491908 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/info_set.h @@ -0,0 +1,48 @@ +#ifndef _INFO_SET_H_ +#define _INFO_SET_H_ + +// This file contains declaration of InfoSet class. +#include "hsa_perf.h" +#include "var_data.h" + +#include +#include + +namespace pm4_profile { +// A class defining a container to hold an information data set +// (e.g. PMU info, CounterGroup info, etc.). Each info id can be set only once. +// NOTE(review): setInfo() is declared public, so setting is not limited to derived classes. 
+class InfoSet { + public: + // InfoSet constructor + InfoSet(); + + // InfoSet destructor + virtual ~InfoSet(); + + // Query value of the information specified by info + // @param[in] info The enumeration of information to be queried + // @param[out] ret_size The returned size of data + // @param[out] pp_data The pointer to the returned data; the buffer is owned by this object + // \return true or false + bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data); + + // Set value for the information specified by info + // @param[in] info The enumeration of information to be set + // @param[in] info_size The size of data + // @param[in] p_data The pointer to the data to be set + // \return true or false + bool setInfo(uint32_t info, uint32_t info_size, void* p_data); + + private: + // Release the data held by every entry of the info table + void releaseParameters(); + + // InfoSet property: The info table + VarDataMap info_table_; + + // Pointer to the buffer used in getInfo + void* p_data_; +}; +} +#endif diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/parameter_set.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/parameter_set.cpp new file mode 100644 index 0000000000..f43ac28569 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/parameter_set.cpp @@ -0,0 +1,74 @@ +#include "parameter_set.h" + +using namespace std; + +namespace pm4_profile { +ParameterSet::ParameterSet() { + releaseParameters(); + param_table_.clear(); + p_data_ = NULL; +} + +ParameterSet::~ParameterSet() { + releaseParameters(); + param_table_.clear(); + free(p_data_); + p_data_ = NULL; +} + +bool ParameterSet::setParameter(uint32_t param, uint32_t param_size, const void* p_data) { + if (param_table_.end() != param_table_.find(param)) { + return false; /* a parameter id can be set only once */ + } + + VarData data; + if (!data.set(param_size, p_data)) { + return false; + } + + param_table_.insert(VarDataMap::value_type(param, data)); + return true; +} + +bool ParameterSet::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) { + if 
(!pp_data || (0 == param_table_.size())) { + return false; + } + + VarDataMap::iterator it = param_table_.find(param); + if (it == param_table_.end()) { + return false; + } + + uint32_t size = it->second.getSize(); /* unsigned, matching VarData::getSize(), so large sizes are never turned negative */ + if (size == 0) { + return false; + } + + // for NULL pointer, free does nothing + free(p_data_); + p_data_ = malloc(size); + if (!p_data_) { + return false; + } + + // hand out the buffer; it stays owned by this object and is freed on the next call or on destruction + *pp_data = p_data_; + + ret_size = it->second.get(size, *pp_data); /* reuse the iterator instead of a second map lookup */ + + return true; +} + +bool ParameterSet::releaseParameters() { + VarDataMap::iterator it = param_table_.begin(); + VarDataMap::iterator table_end = param_table_.end(); + + for (; it != table_end; it++) { + it->second.clear(); /* frees the payload; the table entry itself is kept */ + } + + return true; +} + +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/parameter_set.h b/runtime/hsa-ext-aql-profile/src/perfcounter/parameter_set.h new file mode 100644 index 0000000000..2d9d181a98 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/parameter_set.h @@ -0,0 +1,75 @@ +#ifndef _PARAMETER_SET_H_ +#define _PARAMETER_SET_H_ + +/*! + \note This file contains declaration of ParameterSet class. + */ +#include "hsa_perf.h" +#include "var_data.h" + +#include +#include + +namespace pm4_profile { +/*! + A class defining a container to hold parameter data set + (e.g. PMU parameter, CounterGroup parameter, etc.). + */ +class ParameterSet { + public: + /*! + Enumeration containing types of parameters + */ + enum parameter { + PARAM_MAX, + }; + + /*! ParameterSet constructor */ + ParameterSet(); + + /*! ParameterSet destructor */ + virtual ~ParameterSet(); + + /*! + Query value of the parameter specified by param + @param[in] param The enumeration of parameter to be queried + @param[out] ret_size The returned size of data + @param[out] pp_data The pointer to the returned data; the buffer is owned by this object + \return true or false + */ + bool getParameter( + /*in*/ uint32_t param, + /*out*/ uint32_t& ret_size, + /*out*/ void** pp_data); + + /*! 
+ Set value for the parameter specified by param + @param[in] param The enumeration of parameter to be queried + @param[out] param_size The size of data + @param[out] p_data The pointer to the data to be set + /return true or false + */ + bool setParameter( + /*in*/ uint32_t param, + /*in*/ uint32_t param_size, + /*in*/ const void* p_data); + + private: + /*! + Remove all data in the parameter table + */ + bool releaseParameters(); + + /*! + IParameterSet property: The parameter table + */ + VarDataMap param_table_; + + /*! + Pointer to the buffer used in getParameter + */ + void* p_data_; +}; +} + +#endif // _PARAMETER_SET_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/rocr_profiler.h b/runtime/hsa-ext-aql-profile/src/perfcounter/rocr_profiler.h new file mode 100644 index 0000000000..dc2c385413 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/rocr_profiler.h @@ -0,0 +1,254 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef _ROCR_PROFILER_H_ +#define _ROCR_PROFILER_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#if defined _WIN32 || defined __CYGWIN__ +#ifdef __GNUC__ +#define HSA_TOOLS_API __attribute__((dllexport)) +#else +#define HSA_TOOLS_API __declspec(dllexport) // Note: actually gcc seems +// to also supports this +// syntax. +#endif +#ifndef DLL_LOCAL +#define DLL_LOCAL +#endif + +#else // defined _WIN32 || defined __CYGWIN__ +#if __GNUC__ >= 4 +#define HSA_TOOLS_API __attribute__((visibility("default"))) +#ifndef DLL_LOCAL +#define DLL_LOCAL __attribute__((visibility("hidden"))) +#endif +#else +#define HSA_TOOLS_API +#ifndef DLL_LOCAL +#define DLL_LOCAL +#endif +#endif +#endif // defined _WIN32 || defined __CYGWIN__ + +//---------------------------------------------------------------------------// +// @brief Enumeration of various information that is set for a counter. 
// +// @details This enumeration defines the various counter info that could be // +// used in a counter. This is used by a counter object to specify // +// its type and other conditions that are needed to retrieve a // +// counter value. // +//---------------------------------------------------------------------------// +typedef enum hsa_ext_tools_counter_parameter_s { + // Event index of a counter + HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX = 0, + + // Simd mask of a counter + HSA_EXT_TOOLS_COUNTER_PARAMETER_SIMD_MASK = 1, + + // Shader engine mask of a counter + HSA_EXT_TOOLS_COUNTER_PARAMETER_SHADER_MASK = 2, + + // Max counter info index + HSA_EXT_TOOLS_COUNTER_PARAMETER_INFO_MAX +} hsa_ext_tools_counter_parameter_t; + +//---------------------------------------------------------------------------// +// @brief Enumeration of counter block types // +// @details This enumeration defines the values representing the types of // +// counter group supported by HSA. This is used by counter block object to // +// specify its type. NOTE(review): values are sequential, not bit flags. // +//---------------------------------------------------------------------------// +typedef enum hsa_ext_tools_counter_block_type_s { + // Unknown counter block type + HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_UNKNOWN = 0, + + // A CounterBlock of this type can be accessed at any time. + // note Examples are software Counters and CPU Counters. + HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_SYNC = 1, + + // A CounterBlock of this type can be accessed asynchronously. + // It is required that the Counter must be stopped + // before accessing. + HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC = 2, + + // A CounterBlock of this type is used for generating + // trace. 
+ HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_TRACE = 3, + + // Max CounterBlock type + HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_MAX +} hsa_ext_tools_counter_block_type_t; + +//---------------------------------------------------------------------------// +// @brief Enumeration of various information that is set for a counter block.// +// @detail This enumeration defines the various info that could be used // +// in a counter block. This is used by a counter object to specify its type // +// and other conditions that are needed for a counter block. // +//---------------------------------------------------------------------------// +/* +typedef enum hsa_ext_tools_counter_block_info_s { + // Index of a counter block + HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_EVENT_INDEX = 0, + + // Shader bits of a counter block + HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_SHADER_BITS = 1, + + // Simd mask of a counter + HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_CONTROL_METHOD = 2, + + // Max index of counter block info + HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_MAX +} hsa_ext_tools_counter_block_info_t; +*/ + +//---------------------------------------------------------------------------// +// Enumeration for the methods used to index into the correct registers. 
// +//---------------------------------------------------------------------------// +/* +typedef enum hsa_ext_tools_counter_index_method_s { + // No index + HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_NONE = 0, + + // Index by block instance + HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_INSTANCE = 1, + + // Index by shader engine + HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE = 2, + + // Index by shader and instance + HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE_ANDINSTANCE = 3 +} hsa_ext_tools_counter_index_method_t; +*/ + +//---------------------------------------------------------------------------// +// Enumeration for the HSAPerf generic error codes // +//---------------------------------------------------------------------------// +/* +typedef enum hsa_ext_tools_error_codes_s { + // Successful + HSA_EXT_TOOLS_ERROR_CODE_OK = 0, + + // Generic error code + HSA_EXT_TOOLS_ERROR_CODE_ERROR, + + // Generic invalid HSAPerf API arguments + HSA_EXT_TOOLS_ERROR_CODE_INVALID_ARGS, + + // The operation is not permit due to currently in the unmodifiable + // HSAPerf state . + HSA_EXT_TOOLS_ERROR_CODE_UNMODIFIABLE_STATE, + + // The hsa_ext_tools_set_pmu_parameter() or + // hsa_ext_tools_get_pmu_parameter() API contains invalid parameter value. + HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM, + + // The hsa_ext_tools_set_pmu_parameter() or + // hsa_ext_tools_get_pmu_parameter() API contains invalid parameter size + // or return size. + HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_SIZE, + + // The hsa_ext_tools_set_pmu_parameter() or + // hsa_ext_tools_get_pmu_parameter() API contains invalid + // pointer (e.g. NULL). + HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_DATA, + + // The hsa_ext_tools_get_pmu_info() API contains invalid info value. + HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO, + + // The hsa_ext_tools_get_pmu_info() API contains invalid info + // size (e.g. zero). + HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_SIZE, + + // The hsa_ext_tools_get_pmu_info() API contains invalid + // data (e.g. 
NULL). + HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_DATA +} hsa_ext_tools_error_codes_t; +*/ + +//---------------------------------------------------------------------------// +// Enumeration for Pmu profiling state // +//---------------------------------------------------------------------------// +typedef enum rocr_pmu_state_s { + // Profiling idle. In this state, changes can be made to + // the PMU, counter blocks, counters. This state can represent + // the moment prior to calling begin or after calling + // hsa_ext_tools_pmu_wait_for_completion(). + ROCR_PMU_STATE_IDLE, + + // Profiling start. In this state, changes cannot be made to + // the PMU, counter block, counters. The PMU is collecting + // performance counter data. This state represents + // the moment after calling hsa_ext_tools_pmu_begin() and before calling + // hsa_ext_tools_pmu_end() + ROCR_PMU_STATE_START, + + // Profiling stop. In this state, changes cannot be made to + // the PMU, counter blocks, Counters. PMU has stopped the + // performance counter data collection. However, the result + // might not yet be available. This state represents + // the moment after calling hsa_ext_tools_pmu_end() and before the call + // to hsa_ext_tools_pmu_wait_for_completion() has returned success. 
+ ROCR_PMU_STATE_STOP +} rocr_pmu_state_t; + +//---------------------------------------------------------------------------// +// Opaque pointer to HSA performance monitor unit (PMU) // +//---------------------------------------------------------------------------// +// typedef void * hsa_ext_tools_pmu_t; + +//---------------------------------------------------------------------------// +// Opaque pointer to HSA counter block // +//---------------------------------------------------------------------------// +// typedef void * hsa_ext_tools_counter_block_t; + +//---------------------------------------------------------------------------// +// Opaque pointer to HSA counter // +//---------------------------------------------------------------------------// +// typedef void * hsa_ext_tools_counter_t; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // _ROCR_PROFILER_H_ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/var_data.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/var_data.cpp new file mode 100644 index 0000000000..c7ec19e609 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/var_data.cpp @@ -0,0 +1,48 @@ +#include +#include "var_data.h" + +namespace pm4_profile { +VarData::VarData() { + size_ = 0; + p_data_ = NULL; +} + +VarData::~VarData() {} /* NOTE(review): does not free p_data_; the storage is only released by clear() */ + +void VarData::clear() { + size_ = 0; + if (p_data_) { + free(p_data_); + p_data_ = NULL; + } +} + +bool VarData::set(uint32_t size, const void* p_data) { + if (!p_data || (size == 0)) { + return false; /* nothing is stored for a NULL source or a zero size; existing data is kept */ + } + + clear(); /* release any previously stored data before allocating */ + + if (NULL == (p_data_ = malloc(size))) { + return false; /* on allocation failure the object is left empty */ + } + + memcpy(p_data_, p_data, size); /* keep a private copy of the caller's bytes */ + size_ = size; + + return true; +} + +uint32_t VarData::get(uint32_t size, void* p_data) { + if (!p_data || !size || !p_data_ || !size_) { + return 0; + } + + uint32_t ret_size = size < size_ ? 
size : size_; + + memcpy(p_data, p_data_, ret_size); + + return ret_size; +} +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/var_data.h b/runtime/hsa-ext-aql-profile/src/perfcounter/var_data.h new file mode 100644 index 0000000000..e94a6b8b90 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/var_data.h @@ -0,0 +1,65 @@ +#ifndef _VAR_DATA_H_ +#define _VAR_DATA_H_ + +/*! + \note This file contains declaration of VarData class. + */ + +#include "hsa_perf.h" + +#include +#include +#include + +namespace pm4_profile { +/*! + This class implements variable-size storage for information and + parameter + sets. + */ +class VarData { + public: + /*! Constructor for VarData */ + VarData(); + + /*! Destructor for VarData. Does not free the stored data; use clear(). */ + ~VarData(); + + /*! Deallocate the memory and clean up */ + void clear(); + + /*! + Set the data to be stored. The bytes are copied into storage owned by this object. + @param[in] size Size (in bytes) of data to be stored. + @param[in] p_data Pointer to data to be stored. + \return true if the data was stored; false if p_data is NULL, size is 0, or allocation fails + */ + bool set(uint32_t size, const void* p_data); + + /*! + Query the data that was stored. + @param[in] size Size (in bytes) of the memory pointed to by p_data. + This determines maximum size of the returned data. + @param[in,out] p_data Pointer to the result buffer. + \return Size (in bytes) of the returned result which is copied into + the buffer pointed to by p_data. + */ + uint32_t get(uint32_t size, void* p_data); + + /*! + Get size of the current data stored + \return Size (in bytes) of the data stored. + */ + uint32_t getSize() { return size_; } + + private: + /*! Size of data being stored */ + uint32_t size_; + + /*! 
Pointer to the stored data */ + void* p_data_; +}; + +typedef std::map<uint32_t, VarData> VarDataMap; /* keyed by the uint32_t info/parameter id passed to setInfo/setParameter */ +} +#endif diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/vi_blockinfo.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_blockinfo.cpp new file mode 100644 index 0000000000..bc21dd1b7a --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_blockinfo.cpp @@ -0,0 +1,622 @@ +#include "vi_blockinfo.h" +#include "gfxip/gfx8/si_ci_vi_merged_offset.h" + +namespace pm4_profile { +/** + * Table containing CounterGroups which represent VI hardware blocks + * as defined by \ref GpuBlockInfo structure + */ +GpuBlockInfo ViPmuHwBlocks[] = { + // Counter block CB + {"VI_CB0", kHsaViCounterBlockIdCb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB, + CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_CB1", kHsaViCounterBlockIdCb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB, + CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_CB2", kHsaViCounterBlockIdCb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB, + CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_CB3", kHsaViCounterBlockIdCb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB, + CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block CPF + {"VI_CPF", kHsaViCounterBlockIdCpf, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19, + VI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block DB + {"VI_DB0", kHsaViCounterBlockIdDb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB, + CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_DB1", kHsaViCounterBlockIdDb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB, + CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_DB2", kHsaViCounterBlockIdDb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB, + CntlMethodBySeAndInstance, 256, 
VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_DB3", kHsaViCounterBlockIdDb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB, + CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block GRBM + {"VI_GRBM", kHsaViCounterBlockIdGrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33, + VI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block GRBMSE + {"VI_GRBMSE", kHsaViCounterBlockIdGrbmSe, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14, + VI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block PA_SU + {"VI_PA_SU", kHsaViCounterBlockIdPaSu, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152, + VI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block PA_SC + {"VI_PA_SC", kHsaViCounterBlockIdPaSc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396, + VI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block SPI + {"VI_SPI", kHsaViCounterBlockIdSpi, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196, + VI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block SQ + {"VI_SQ", kHsaViCounterBlockIdSq, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_ES", kHsaViCounterBlockIdSqEs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_GS", kHsaViCounterBlockIdSqGs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_VS", kHsaViCounterBlockIdSqVs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_PS", kHsaViCounterBlockIdSqPs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_LS", kHsaViCounterBlockIdSqLs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, 
+ VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_HS", kHsaViCounterBlockIdSqHs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_SQ_CS", kHsaViCounterBlockIdSqCs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298, + VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block SX + {"VI_SX", kHsaViCounterBlockIdSx, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33, + VI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TA + {"VI_TA0", kHsaViCounterBlockIdTa0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA1", kHsaViCounterBlockIdTa1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA2", kHsaViCounterBlockIdTa2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA3", kHsaViCounterBlockIdTa3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA4", kHsaViCounterBlockIdTa4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA5", kHsaViCounterBlockIdTa5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA6", kHsaViCounterBlockIdTa6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA7", kHsaViCounterBlockIdTa7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA8", kHsaViCounterBlockIdTa8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + 
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA9", kHsaViCounterBlockIdTa9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA10", kHsaViCounterBlockIdTa10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA11", kHsaViCounterBlockIdTa11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA12", kHsaViCounterBlockIdTa12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA13", kHsaViCounterBlockIdTa13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA14", kHsaViCounterBlockIdTa14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TA15", kHsaViCounterBlockIdTa15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA, + CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TCA + {"VI_TCA0", kHsaViCounterBlockIdTca0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA, + CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCA1", kHsaViCounterBlockIdTca1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA, + CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TCC + {"VI_TCC0", kHsaViCounterBlockIdTcc0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC1", kHsaViCounterBlockIdTcc1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + 
{"VI_TCC2", kHsaViCounterBlockIdTcc2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC3", kHsaViCounterBlockIdTcc3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC4", kHsaViCounterBlockIdTcc4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC5", kHsaViCounterBlockIdTcc5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC6", kHsaViCounterBlockIdTcc6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC7", kHsaViCounterBlockIdTcc7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC8", kHsaViCounterBlockIdTcc8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC9", kHsaViCounterBlockIdTcc9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC10", kHsaViCounterBlockIdTcc10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC11", kHsaViCounterBlockIdTcc11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC12", kHsaViCounterBlockIdTcc12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC13", kHsaViCounterBlockIdTcc13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, 
false, 0, 0}, + {"VI_TCC14", kHsaViCounterBlockIdTcc14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCC15", kHsaViCounterBlockIdTcc15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC, + CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TD + {"VI_TD0", kHsaViCounterBlockIdTd0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD1", kHsaViCounterBlockIdTd1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD2", kHsaViCounterBlockIdTd2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD3", kHsaViCounterBlockIdTd3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD4", kHsaViCounterBlockIdTd4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD5", kHsaViCounterBlockIdTd5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD6", kHsaViCounterBlockIdTd6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD7", kHsaViCounterBlockIdTd7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD8", kHsaViCounterBlockIdTd8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD9", kHsaViCounterBlockIdTd9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, 
VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD10", kHsaViCounterBlockIdTd10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD11", kHsaViCounterBlockIdTd11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD12", kHsaViCounterBlockIdTd12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD13", kHsaViCounterBlockIdTd13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD14", kHsaViCounterBlockIdTd14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TD15", kHsaViCounterBlockIdTd15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD, + CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block TCP + {"VI_TCP0", kHsaViCounterBlockIdTcp0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP1", kHsaViCounterBlockIdTcp1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP2", kHsaViCounterBlockIdTcp2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP3", kHsaViCounterBlockIdTcp3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP4", kHsaViCounterBlockIdTcp4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP5", kHsaViCounterBlockIdTcp5, 
VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP6", kHsaViCounterBlockIdTcp6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP7", kHsaViCounterBlockIdTcp7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP8", kHsaViCounterBlockIdTcp8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP9", kHsaViCounterBlockIdTcp9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP10", kHsaViCounterBlockIdTcp10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP11", kHsaViCounterBlockIdTcp11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP12", kHsaViCounterBlockIdTcp12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP13", kHsaViCounterBlockIdTcp13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP14", kHsaViCounterBlockIdTcp14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + {"VI_TCP15", kHsaViCounterBlockIdTcp15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP, + CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0}, + + // Counter block GDS + {"VI_GDS", kHsaViCounterBlockIdGds, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120, + 
VI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block VGT
+    {"VI_VGT", kHsaViCounterBlockIdVgt, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
+     VI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block IA
+    {"VI_IA", kHsaViCounterBlockIdIa, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
+     VI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block MC
+    {"VI_MC", kHsaViCounterBlockIdMc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
+     VI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block SRBM
+    {"VI_SRBM", kHsaViCounterBlockIdSrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
+     VI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block WD
+    {"VI_WD", kHsaViCounterBlockIdWd, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
+     VI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block CPG
+    {"VI_CPG", kHsaViCounterBlockIdCpg, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
+     VI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block CPC
+    {"VI_CPC", kHsaViCounterBlockIdCpc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
+     VI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block IOMMUV2
+    {"VI_IOMMUV2", kHsaViCounterBlockIdIommuV2, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
+     8, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Counter block KernelDriver
+    {"VI_KD", kHsaViCounterBlockIdKernelDriver, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
+     0, 0, 0, true, 0, 0, false, 0, 0},
+
+    // Name of the last line should be empty to indicate end of all counter groups
+    {"", kHsaViCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
+     0}};
+
+/*
+ * The following tables contain the register addresses of the performance
+ * counter registers, one table per hardware block (not only SQ).
+ *
+ * Every row is one GpuCounterRegInfo initializer with four values. Judging by
+ * the register names used below, the slots are, in order:
+ *   1. the counter SELECT (or PERFMON_CNTL) register,
+ *   2. a separate control register, 0 where the table supplies none,
+ *   3. the register holding the low half of the result (or the single RESULT
+ *      register),
+ *   4. the register holding the high half of the result, 0 where there is none.
+ * NOTE(review): the field names come from the GpuCounterRegInfo declaration,
+ * which is not part of this file -- confirm the slot meaning there.
+ */
+
+/*
+ * SQ
+ *
+ * Sixteen rows (counters 0-15). Every row carries the same
+ * mmSQ_PERFCOUNTER_CTRL__CI__VI value in its second slot.
+ */
+GpuCounterRegInfo ViSqCounterRegAddr[] = {
+    {mmSQ_PERFCOUNTER0_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER0_LO__CI__VI,
+     mmSQ_PERFCOUNTER0_HI__CI__VI},
+    {mmSQ_PERFCOUNTER1_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER1_LO__CI__VI,
+     mmSQ_PERFCOUNTER1_HI__CI__VI},
+    {mmSQ_PERFCOUNTER2_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER2_LO__CI__VI,
+     mmSQ_PERFCOUNTER2_HI__CI__VI},
+    {mmSQ_PERFCOUNTER3_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER3_LO__CI__VI,
+     mmSQ_PERFCOUNTER3_HI__CI__VI},
+    {mmSQ_PERFCOUNTER4_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER4_LO__CI__VI,
+     mmSQ_PERFCOUNTER4_HI__CI__VI},
+    {mmSQ_PERFCOUNTER5_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER5_LO__CI__VI,
+     mmSQ_PERFCOUNTER5_HI__CI__VI},
+    {mmSQ_PERFCOUNTER6_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER6_LO__CI__VI,
+     mmSQ_PERFCOUNTER6_HI__CI__VI},
+    {mmSQ_PERFCOUNTER7_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER7_LO__CI__VI,
+     mmSQ_PERFCOUNTER7_HI__CI__VI},
+    {mmSQ_PERFCOUNTER8_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER8_LO__CI__VI,
+     mmSQ_PERFCOUNTER8_HI__CI__VI},
+    {mmSQ_PERFCOUNTER9_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER9_LO__CI__VI,
+     mmSQ_PERFCOUNTER9_HI__CI__VI},
+    {mmSQ_PERFCOUNTER10_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER10_LO__CI__VI, mmSQ_PERFCOUNTER10_HI__CI__VI},
+    {mmSQ_PERFCOUNTER11_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER11_LO__CI__VI, mmSQ_PERFCOUNTER11_HI__CI__VI},
+    {mmSQ_PERFCOUNTER12_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER12_LO__CI__VI, mmSQ_PERFCOUNTER12_HI__CI__VI},
+    {mmSQ_PERFCOUNTER13_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER13_LO__CI__VI, mmSQ_PERFCOUNTER13_HI__CI__VI},
+    {mmSQ_PERFCOUNTER14_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER14_LO__CI__VI, mmSQ_PERFCOUNTER14_HI__CI__VI},
+    {mmSQ_PERFCOUNTER15_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
+     mmSQ_PERFCOUNTER15_LO__CI__VI, mmSQ_PERFCOUNTER15_HI__CI__VI}};
+
+/*
+ * DRMDMA
+ *
+ * Four rows: result registers 0 and 1 of SDMA0, then 0 and 1 of SDMA1. The two
+ * rows of one engine share that engine's PERFMON_CNTL register; slots two and
+ * four are 0.
+ */
+GpuCounterRegInfo ViDrmdmaCounterRegAddr[] = {
+    {mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER0_RESULT__VI, 0},
+    {mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER1_RESULT__VI, 0},
+    {mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER0_RESULT__VI, 0},
+    {mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER1_RESULT__VI, 0},
+};
+
+/*
+ * IH
+ *
+ * Two rows sharing mmIH_PERFMON_CNTL__VI; each has a single RESULT register.
+ */
+GpuCounterRegInfo ViIhCounterRegAddr[] = {
+    {mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER0_RESULT__VI, 0},
+    {mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER1_RESULT__VI, 0}};
+
+/*
+ * CPF
+ *
+ * Two rows (counters 0-1), each with its own SELECT / LO / HI registers.
+ */
+GpuCounterRegInfo ViCpfCounterRegAddr[] = {
+    {mmCPF_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER0_LO__CI__VI,
+     mmCPF_PERFCOUNTER0_HI__CI__VI},
+    {mmCPF_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER1_LO__CI__VI,
+     mmCPF_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * DRM
+ *
+ * Two rows. Unlike the other tables the register names are numbered 1 and 2
+ * (not 0 and 1) and carry no __CI__VI suffix.
+ */
+GpuCounterRegInfo ViDrmCounterRegAddr[] = {
+    {mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
+    {mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}};
+
+/*
+ * GRBM
+ *
+ * Two rows (counters 0-1).
+ */
+GpuCounterRegInfo ViGrbmCounterRegAddr[] = {
+    {mmGRBM_PERFCOUNTER0_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER0_LO__CI__VI,
+     mmGRBM_PERFCOUNTER0_HI__CI__VI},
+    {mmGRBM_PERFCOUNTER1_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER1_LO__CI__VI,
+     mmGRBM_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * GRBM_SE
+ *
+ * Four rows; the row index selects the SE0..SE3 register set rather than a
+ * counter number.
+ */
+GpuCounterRegInfo ViGrbmSeCounterRegAddr[] = {
+    {mmGRBM_SE0_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE0_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE0_PERFCOUNTER_HI__CI__VI},
+    {mmGRBM_SE1_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE1_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE1_PERFCOUNTER_HI__CI__VI},
+    {mmGRBM_SE2_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE2_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE2_PERFCOUNTER_HI__CI__VI},
+    {mmGRBM_SE3_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE3_PERFCOUNTER_LO__CI__VI,
+     mmGRBM_SE3_PERFCOUNTER_HI__CI__VI}};
+
+/*
+ * PA_SU
+ */
+// Four rows (counters 0-3), each {SELECT, 0, LO, HI}; the count matches
+// VI_COUNTER_NUM_PER_PA_SU.
+GpuCounterRegInfo ViPaSuCounterRegAddr[] = {
+    {mmPA_SU_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER0_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER0_HI__CI__VI},
+    {mmPA_SU_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER1_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER1_HI__CI__VI},
+    {mmPA_SU_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER2_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER2_HI__CI__VI},
+    {mmPA_SU_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER3_LO__CI__VI,
+     mmPA_SU_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * PA_SC
+ *
+ * Eight rows (counters 0-7), each {SELECT, 0, LO, HI}.
+ *
+ * This table used to stop at counter 3, although vi_blockinfo.h defines
+ * VI_COUNTER_NUM_PER_PA_SC as 8 and the "VI_PA_SC" entry of ViPmuHwBlocks is
+ * initialised with that macro. Any reader that walks this table up to that
+ * count would have run past its end. Rows 4-7 use the same registers that
+ * ViScCounterRegAddr in this file lists for PA_SC counters 4-7.
+ */
+GpuCounterRegInfo ViPaScCounterRegAddr[] = {
+    {mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER0_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER1_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER2_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER3_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER4_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER4_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER4_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER5_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER5_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER5_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER6_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER6_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER6_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER7_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER7_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER7_HI__CI__VI}};
+
+/*
+ * SPI
+ *
+ * Six rows (counters 0-5), each {SELECT, 0, LO, HI}; the count matches
+ * VI_COUNTER_NUM_PER_SPI.
+ */
+GpuCounterRegInfo ViSpiCounterRegAddr[] = {
+    {mmSPI_PERFCOUNTER0_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER0_LO__CI__VI,
+     mmSPI_PERFCOUNTER0_HI__CI__VI},
+    {mmSPI_PERFCOUNTER1_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER1_LO__CI__VI,
+     mmSPI_PERFCOUNTER1_HI__CI__VI},
+    {mmSPI_PERFCOUNTER2_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER2_LO__CI__VI,
+     mmSPI_PERFCOUNTER2_HI__CI__VI},
+    {mmSPI_PERFCOUNTER3_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER3_LO__CI__VI,
+     mmSPI_PERFCOUNTER3_HI__CI__VI},
+    {mmSPI_PERFCOUNTER4_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER4_LO__CI__VI,
+     mmSPI_PERFCOUNTER4_HI__CI__VI},
+    {mmSPI_PERFCOUNTER5_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER5_LO__CI__VI,
+     mmSPI_PERFCOUNTER5_HI__CI__VI}};
+
+/*
+ * TCA
+ */
+GpuCounterRegInfo ViTcaCounterRegAddr[] = {
+    {mmTCA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER0_LO__CI__VI,
+     mmTCA_PERFCOUNTER0_HI__CI__VI},
+    {mmTCA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER1_LO__CI__VI,
+     mmTCA_PERFCOUNTER1_HI__CI__VI},
+
{mmTCA_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER2_LO__CI__VI,
+     mmTCA_PERFCOUNTER2_HI__CI__VI},
+    {mmTCA_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER3_LO__CI__VI,
+     mmTCA_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * TCC
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViTccCounterRegAddr[] = {
+    {mmTCC_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER0_LO__CI__VI,
+     mmTCC_PERFCOUNTER0_HI__CI__VI},
+    {mmTCC_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER1_LO__CI__VI,
+     mmTCC_PERFCOUNTER1_HI__CI__VI},
+    {mmTCC_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER2_LO__CI__VI,
+     mmTCC_PERFCOUNTER2_HI__CI__VI},
+    {mmTCC_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER3_LO__CI__VI,
+     mmTCC_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * TCP
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViTcpCounterRegAddr[] = {
+    {mmTCP_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER0_LO__CI__VI,
+     mmTCP_PERFCOUNTER0_HI__CI__VI},
+    {mmTCP_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER1_LO__CI__VI,
+     mmTCP_PERFCOUNTER1_HI__CI__VI},
+    {mmTCP_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER2_LO__CI__VI,
+     mmTCP_PERFCOUNTER2_HI__CI__VI},
+    {mmTCP_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER3_LO__CI__VI,
+     mmTCP_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * CB
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViCbCounterRegAddr[] = {
+    {mmCB_PERFCOUNTER0_SELECT__CI__VI, 0, mmCB_PERFCOUNTER0_LO__CI__VI,
+     mmCB_PERFCOUNTER0_HI__CI__VI},
+    {mmCB_PERFCOUNTER1_SELECT__CI__VI, 0, mmCB_PERFCOUNTER1_LO__CI__VI,
+     mmCB_PERFCOUNTER1_HI__CI__VI},
+    {mmCB_PERFCOUNTER2_SELECT__CI__VI, 0, mmCB_PERFCOUNTER2_LO__CI__VI,
+     mmCB_PERFCOUNTER2_HI__CI__VI},
+    {mmCB_PERFCOUNTER3_SELECT__CI__VI, 0, mmCB_PERFCOUNTER3_LO__CI__VI,
+     mmCB_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * DB
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViDbCounterRegAddr[] = {
+    {mmDB_PERFCOUNTER0_SELECT__CI__VI, 0, mmDB_PERFCOUNTER0_LO__CI__VI,
+     mmDB_PERFCOUNTER0_HI__CI__VI},
+    {mmDB_PERFCOUNTER1_SELECT__CI__VI, 0, mmDB_PERFCOUNTER1_LO__CI__VI,
+     mmDB_PERFCOUNTER1_HI__CI__VI},
+    {mmDB_PERFCOUNTER2_SELECT__CI__VI, 0, mmDB_PERFCOUNTER2_LO__CI__VI,
+     mmDB_PERFCOUNTER2_HI__CI__VI},
+    {mmDB_PERFCOUNTER3_SELECT__CI__VI, 0, mmDB_PERFCOUNTER3_LO__CI__VI,
+     mmDB_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * RLC
+ *
+ * Two rows (counters 0-1), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViRlcCounterRegAddr[] = {
+    {mmRLC_PERFCOUNTER0_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER0_LO__CI__VI,
+     mmRLC_PERFCOUNTER0_HI__CI__VI},
+    {mmRLC_PERFCOUNTER1_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER1_LO__CI__VI,
+     mmRLC_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * SC
+ *
+ * Eight rows. Despite the table name, every register used here belongs to
+ * the PA_SC family (mmPA_SC_PERFCOUNTER0 .. mmPA_SC_PERFCOUNTER7).
+ */
+GpuCounterRegInfo ViScCounterRegAddr[] = {
+    {mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER0_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER1_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER2_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER3_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER4_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER4_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER4_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER5_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER5_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER5_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER6_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER6_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER6_HI__CI__VI},
+    {mmPA_SC_PERFCOUNTER7_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER7_LO__CI__VI,
+     mmPA_SC_PERFCOUNTER7_HI__CI__VI}};
+
+/*
+ * SX
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViSxCounterRegAddr[] = {
+    {mmSX_PERFCOUNTER0_SELECT__CI__VI, 0, mmSX_PERFCOUNTER0_LO__CI__VI,
+     mmSX_PERFCOUNTER0_HI__CI__VI},
+    {mmSX_PERFCOUNTER1_SELECT__CI__VI, 0, mmSX_PERFCOUNTER1_LO__CI__VI,
+     mmSX_PERFCOUNTER1_HI__CI__VI},
+    {mmSX_PERFCOUNTER2_SELECT__CI__VI, 0, mmSX_PERFCOUNTER2_LO__CI__VI,
+     mmSX_PERFCOUNTER2_HI__CI__VI},
+    {mmSX_PERFCOUNTER3_SELECT__CI__VI, 0, mmSX_PERFCOUNTER3_LO__CI__VI,
+     mmSX_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * TA
+ *
+ * Two rows (counters 0-1), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViTaCounterRegAddr[] = {
+    {mmTA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTA_PERFCOUNTER0_LO__CI__VI,
+     mmTA_PERFCOUNTER0_HI__CI__VI},
+    {mmTA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTA_PERFCOUNTER1_LO__CI__VI,
+     mmTA_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * TD
+ *
+ * Two rows (counters 0-1), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViTdCounterRegAddr[] = {
+    {mmTD_PERFCOUNTER0_SELECT__CI__VI, 0, mmTD_PERFCOUNTER0_LO__CI__VI,
+     mmTD_PERFCOUNTER0_HI__CI__VI},
+    {mmTD_PERFCOUNTER1_SELECT__CI__VI, 0, mmTD_PERFCOUNTER1_LO__CI__VI,
+     mmTD_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * GDS
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViGdsCounterRegAddr[] = {
+    {mmGDS_PERFCOUNTER0_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER0_LO__CI__VI,
+     mmGDS_PERFCOUNTER0_HI__CI__VI},
+    {mmGDS_PERFCOUNTER1_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER1_LO__CI__VI,
+     mmGDS_PERFCOUNTER1_HI__CI__VI},
+    {mmGDS_PERFCOUNTER2_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER2_LO__CI__VI,
+     mmGDS_PERFCOUNTER2_HI__CI__VI},
+    {mmGDS_PERFCOUNTER3_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER3_LO__CI__VI,
+     mmGDS_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * VGT
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViVgtCounterRegAddr[] = {
+    {mmVGT_PERFCOUNTER0_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER0_LO__CI__VI,
+     mmVGT_PERFCOUNTER0_HI__CI__VI},
+    {mmVGT_PERFCOUNTER1_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER1_LO__CI__VI,
+     mmVGT_PERFCOUNTER1_HI__CI__VI},
+    {mmVGT_PERFCOUNTER2_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER2_LO__CI__VI,
+     mmVGT_PERFCOUNTER2_HI__CI__VI},
+    {mmVGT_PERFCOUNTER3_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER3_LO__CI__VI,
+     mmVGT_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * IA
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViIaCounterRegAddr[] = {
+    {mmIA_PERFCOUNTER0_SELECT__CI__VI, 0, mmIA_PERFCOUNTER0_LO__CI__VI,
+     mmIA_PERFCOUNTER0_HI__CI__VI},
+    {mmIA_PERFCOUNTER1_SELECT__CI__VI, 0, mmIA_PERFCOUNTER1_LO__CI__VI,
+     mmIA_PERFCOUNTER1_HI__CI__VI},
+    {mmIA_PERFCOUNTER2_SELECT__CI__VI, 0, mmIA_PERFCOUNTER2_LO__CI__VI,
+     mmIA_PERFCOUNTER2_HI__CI__VI},
+    {mmIA_PERFCOUNTER3_SELECT__CI__VI, 0, mmIA_PERFCOUNTER3_LO__CI__VI,
+     mmIA_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * MC
+ *
+ * Four rows (counters A-D). All rows share mmMC_SEQ_PERF_SEQ_CTL__SI__VI in
+ * the first slot; the third and fourth slots hold that counter's _I0 and _I1
+ * count registers.
+ */
+GpuCounterRegInfo ViMcCounterRegAddr[] = {
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
+    {mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
+     mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}};
+
+/*
+ * SRBM
+ *
+ * Two rows (counters 0-1), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViSrbmCounterRegAddr[] = {
+    {mmSRBM_PERFCOUNTER0_SELECT__VI, 0, mmSRBM_PERFCOUNTER0_LO__VI, mmSRBM_PERFCOUNTER0_HI__VI},
+    {mmSRBM_PERFCOUNTER1_SELECT__VI, 0, mmSRBM_PERFCOUNTER1_LO__VI, mmSRBM_PERFCOUNTER1_HI__VI}};
+
+/*
+ * WD
+ *
+ * Four rows (counters 0-3), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViWdCounterRegAddr[] = {
+    {mmWD_PERFCOUNTER0_SELECT__CI__VI, 0, mmWD_PERFCOUNTER0_LO__CI__VI,
+     mmWD_PERFCOUNTER0_HI__CI__VI},
+    {mmWD_PERFCOUNTER1_SELECT__CI__VI, 0, mmWD_PERFCOUNTER1_LO__CI__VI,
+     mmWD_PERFCOUNTER1_HI__CI__VI},
+    {mmWD_PERFCOUNTER2_SELECT__CI__VI, 0, mmWD_PERFCOUNTER2_LO__CI__VI,
+     mmWD_PERFCOUNTER2_HI__CI__VI},
+    {mmWD_PERFCOUNTER3_SELECT__CI__VI, 0, mmWD_PERFCOUNTER3_LO__CI__VI,
+     mmWD_PERFCOUNTER3_HI__CI__VI}};
+
+/*
+ * CPG
+ *
+ * Two rows (counters 0-1), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViCpgCounterRegAddr[] = {
+    {mmCPG_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER0_LO__CI__VI,
+     mmCPG_PERFCOUNTER0_HI__CI__VI},
+    {mmCPG_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER1_LO__CI__VI,
+     mmCPG_PERFCOUNTER1_HI__CI__VI}};
+
+/*
+ * CPC
+ *
+ * Two rows (counters 0-1), each {SELECT, 0, LO, HI}.
+ */
+GpuCounterRegInfo ViCpcCounterRegAddr[] = {
+    {mmCPC_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER0_LO__CI__VI,
+     mmCPC_PERFCOUNTER0_HI__CI__VI},
+    {mmCPC_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER1_LO__CI__VI,
+     mmCPC_PERFCOUNTER1_HI__CI__VI}};
+
+// Private counter-block identifiers: four 32-bit words each.
+// NOTE(review): these look like 128-bit IDs shared with another component
+// (the header groups IommuV2/KernelDriver under "Counters retrieved by KFD");
+// confirm their origin before changing any value.
+GpuPrivCounterBlockId ViBlockIdSq = {{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}};
+GpuPrivCounterBlockId ViBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}};
+GpuPrivCounterBlockId ViBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}};
+GpuPrivCounterBlockId ViBlockIdKernelDriver =
{{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}};
+
+}  // pm4_profile
diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/vi_blockinfo.h b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_blockinfo.h
new file mode 100644
index 0000000000..2665836988
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_blockinfo.h
@@ -0,0 +1,230 @@
+#ifndef _VI_BLOCKINFO_H_
+#define _VI_BLOCKINFO_H_
+
+// NOTE(review): the header name of the next #include is missing in this copy
+// of the patch (it looks like an angle-bracket include that was stripped);
+// restore it from the original commit.
+#include
+#include "rocr_profiler.h"
+#include "gpu_enum.h"
+#include "gpu_blockinfo.h"
+
+namespace pm4_profile {
+
+// Maximum number of block instances for VOLCANIC ISLANDS (taken from Fiji).
+// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
+
+// @brief Number of block instances.
+
+// We index per SE and instance
+#define VI_NUM_CB 4  // CB has 4 instances per SE
+#define VI_NUM_DB 4  // DB has 4 instances per SE
+
+// For TA, TD and TCP, the values below are the same as the number of CUs
+// per SH. We index per SE and instance
+#define VI_NUM_TA 16   // TA has 16 instances (Ta0..Ta15 in the enum below)
+#define VI_NUM_TD 16   // TD has 16 instances (Td0..Td15 in the enum below)
+#define VI_NUM_TCP 16  // TCP has 16 instances (Tcp0..Tcp15 in the enum below)
+
+// These values are per chip, we index directly per instance
+#define VI_NUM_TCA 2   // TCA has 2 instances per chip
+#define VI_NUM_TCC 16  // TCC has 16 instances per chip
+#define VI_NUM_SDMA 2  // There are two SDMA blocks on VI, exposed as 2
+                       // instances here
+
+// Number of counter registers per block for Volcanic Islands.
+// Each value is expected to equal the number of rows in the matching
+// Vi<Block>CounterRegAddr table defined in vi_blockinfo.cpp.
+#define VI_COUNTER_NUM_PER_DRM 2
+#define VI_COUNTER_NUM_PER_DRMDMA 2
+#define VI_COUNTER_NUM_PER_IH 2
+#define VI_COUNTER_NUM_PER_SRBM 2
+#define VI_COUNTER_NUM_PER_CB 4
+#define VI_COUNTER_NUM_PER_CPF 2
+#define VI_COUNTER_NUM_PER_DB 4
+#define VI_COUNTER_NUM_PER_GRBM 2
+#define VI_COUNTER_NUM_PER_GRBMSE 4
+#define VI_COUNTER_NUM_PER_PA_SU 4
+#define VI_COUNTER_NUM_PER_RLC 2
+#define VI_COUNTER_NUM_PER_PA_SC 8
+#define VI_COUNTER_NUM_PER_SPI 6  // [Shucai: TODO: double-check the value]
+#define VI_COUNTER_NUM_PER_SQ 16
+#define VI_COUNTER_NUM_PER_SX 4
+#define VI_COUNTER_NUM_PER_TA 2
+#define VI_COUNTER_NUM_PER_TCA 4
+#define VI_COUNTER_NUM_PER_TCC 4
+#define VI_COUNTER_NUM_PER_TD 2  // [Shucai: TODO: double-check the value]
+#define VI_COUNTER_NUM_PER_TCP 4
+#define VI_COUNTER_NUM_PER_GDS 4
+#define VI_COUNTER_NUM_PER_VGT 4
+#define VI_COUNTER_NUM_PER_IA 4
+#define VI_COUNTER_NUM_PER_MC 4
+#define VI_COUNTER_NUM_PER_TCS 4
+#define VI_COUNTER_NUM_PER_WD 4
+#define VI_COUNTER_NUM_PER_CPG 2
+#define VI_COUNTER_NUM_PER_CPC 2
+#define VI_COUNTER_NUM_PER_VM 1
+#define VI_COUNTER_NUM_PER_VM_MD 1
+#define VI_COUNTER_NUM_PER_PIPESTATS 12
+
+// NOTE(review): ViPmu's constructor in vi_pmu.cpp sets its shader-engine count
+// to 4 while this maximum is 1 -- confirm the two are meant to differ.
+#define VI_MAX_NUM_SHADER_ENGINES 1
+
+// Enumeration of VI hardware counter blocks.
+// Values are assigned sequentially starting at 0, so inserting or reordering
+// enumerators renumbers everything after the change. Blocks with several
+// instances get one enumerator per instance (for example Ta0..Ta15).
+typedef enum HsaViCounterBlockId {
+  kHsaViCounterBlockIdCb0 = 0,
+  kHsaViCounterBlockIdCb1,
+  kHsaViCounterBlockIdCb2,
+  kHsaViCounterBlockIdCb3,
+
+  kHsaViCounterBlockIdCpf,
+
+  kHsaViCounterBlockIdDb0,
+  kHsaViCounterBlockIdDb1,
+  kHsaViCounterBlockIdDb2,
+  kHsaViCounterBlockIdDb3,
+
+  kHsaViCounterBlockIdGrbm,
+  kHsaViCounterBlockIdGrbmSe,
+  kHsaViCounterBlockIdPaSu,
+  kHsaViCounterBlockIdPaSc,
+  kHsaViCounterBlockIdSpi,
+
+  kHsaViCounterBlockIdSq,
+  kHsaViCounterBlockIdSqEs,
+  kHsaViCounterBlockIdSqGs,
+  kHsaViCounterBlockIdSqVs,
+  kHsaViCounterBlockIdSqPs,
+  kHsaViCounterBlockIdSqLs,
+  kHsaViCounterBlockIdSqHs,
+  kHsaViCounterBlockIdSqCs,
+
+  kHsaViCounterBlockIdSx,
+
+  kHsaViCounterBlockIdTa0,
+  kHsaViCounterBlockIdTa1,
+  kHsaViCounterBlockIdTa2,
+  kHsaViCounterBlockIdTa3,
+  kHsaViCounterBlockIdTa4,
+  kHsaViCounterBlockIdTa5,
+  kHsaViCounterBlockIdTa6,
+  kHsaViCounterBlockIdTa7,
+  kHsaViCounterBlockIdTa8,
+  kHsaViCounterBlockIdTa9,
+  kHsaViCounterBlockIdTa10,
+  kHsaViCounterBlockIdTa11,
+  kHsaViCounterBlockIdTa12,
+  kHsaViCounterBlockIdTa13,
+  kHsaViCounterBlockIdTa14,
+  kHsaViCounterBlockIdTa15,
+
+  kHsaViCounterBlockIdTca0,
+  kHsaViCounterBlockIdTca1,
+
+  kHsaViCounterBlockIdTcc0,
+  kHsaViCounterBlockIdTcc1,
+  kHsaViCounterBlockIdTcc2,
+  kHsaViCounterBlockIdTcc3,
+  kHsaViCounterBlockIdTcc4,
+  kHsaViCounterBlockIdTcc5,
+  kHsaViCounterBlockIdTcc6,
+  kHsaViCounterBlockIdTcc7,
+  kHsaViCounterBlockIdTcc8,
+  kHsaViCounterBlockIdTcc9,
+  kHsaViCounterBlockIdTcc10,
+  kHsaViCounterBlockIdTcc11,
+  kHsaViCounterBlockIdTcc12,
+  kHsaViCounterBlockIdTcc13,
+  kHsaViCounterBlockIdTcc14,
+  kHsaViCounterBlockIdTcc15,
+
+  kHsaViCounterBlockIdTd0,
+  kHsaViCounterBlockIdTd1,
+  kHsaViCounterBlockIdTd2,
+  kHsaViCounterBlockIdTd3,
+  kHsaViCounterBlockIdTd4,
+  kHsaViCounterBlockIdTd5,
+  kHsaViCounterBlockIdTd6,
+  kHsaViCounterBlockIdTd7,
+  kHsaViCounterBlockIdTd8,
+  kHsaViCounterBlockIdTd9,
+  kHsaViCounterBlockIdTd10,
+  kHsaViCounterBlockIdTd11,
+  kHsaViCounterBlockIdTd12,
+  kHsaViCounterBlockIdTd13,
+  kHsaViCounterBlockIdTd14,
+  kHsaViCounterBlockIdTd15,
+
+  kHsaViCounterBlockIdTcp0,
+  kHsaViCounterBlockIdTcp1,
+  kHsaViCounterBlockIdTcp2,
+  kHsaViCounterBlockIdTcp3,
+  kHsaViCounterBlockIdTcp4,
+  kHsaViCounterBlockIdTcp5,
+  kHsaViCounterBlockIdTcp6,
+  kHsaViCounterBlockIdTcp7,
+  kHsaViCounterBlockIdTcp8,
+  kHsaViCounterBlockIdTcp9,
+  kHsaViCounterBlockIdTcp10,
+  kHsaViCounterBlockIdTcp11,
+  kHsaViCounterBlockIdTcp12,
+  kHsaViCounterBlockIdTcp13,
+  kHsaViCounterBlockIdTcp14,
+  kHsaViCounterBlockIdTcp15,
+
+  kHsaViCounterBlockIdGds,
+  kHsaViCounterBlockIdVgt,
+  kHsaViCounterBlockIdIa,
+  kHsaViCounterBlockIdMc,
+  kHsaViCounterBlockIdSrbm,
+
+  kHsaViCounterBlockIdTcs,
+  kHsaViCounterBlockIdWd,
+  kHsaViCounterBlockIdCpg,
+  kHsaViCounterBlockIdCpc,
+
+  // Counters retrieved by KFD
+  kHsaViCounterBlockIdIommuV2,
+  kHsaViCounterBlockIdKernelDriver,
+
+  kHsaViCounterBlockIdCpPipeStats,
+  kHsaViCounterBlockIdHwInfo,
+  // Aliases for the first and last enumerator. BlocksLast equals HwInfo, i.e.
+  // it is the last valid ID rather than a one-past-the-end count.
+  kHsaViCounterBlockIdBlocksFirst = kHsaViCounterBlockIdCb0,
+  kHsaViCounterBlockIdBlocksLast = kHsaViCounterBlockIdHwInfo
+} HsaViCounterBlockId;
+
+// Tables defined in vi_blockinfo.cpp. They are declared with unknown bound, so
+// sizeof cannot be applied to them through this header.
+extern GpuBlockInfo ViPmuHwBlocks[];
+extern GpuCounterRegInfo ViSqCounterRegAddr[];
+extern GpuCounterRegInfo ViCbCounterRegAddr[];
+extern GpuCounterRegInfo ViDrmdmaCounterRegAddr[];
+extern GpuCounterRegInfo
ViIhCounterRegAddr[];
+extern GpuCounterRegInfo ViCpfCounterRegAddr[];
+extern GpuCounterRegInfo ViCpgCounterRegAddr[];
+extern GpuCounterRegInfo ViCpcCounterRegAddr[];
+extern GpuCounterRegInfo ViDrmCounterRegAddr[];
+extern GpuCounterRegInfo ViGrbmCounterRegAddr[];
+extern GpuCounterRegInfo ViGrbmSeCounterRegAddr[];
+extern GpuCounterRegInfo ViPaSuCounterRegAddr[];
+extern GpuCounterRegInfo ViPaScCounterRegAddr[];
+extern GpuCounterRegInfo ViSpiCounterRegAddr[];
+extern GpuCounterRegInfo ViTcaCounterRegAddr[];
+extern GpuCounterRegInfo ViTccCounterRegAddr[];
+extern GpuCounterRegInfo ViTcpCounterRegAddr[];
+extern GpuCounterRegInfo ViDbCounterRegAddr[];
+extern GpuCounterRegInfo ViRlcCounterRegAddr[];
+extern GpuCounterRegInfo ViScCounterRegAddr[];
+extern GpuCounterRegInfo ViSxCounterRegAddr[];
+extern GpuCounterRegInfo ViTaCounterRegAddr[];
+extern GpuCounterRegInfo ViTdCounterRegAddr[];
+extern GpuCounterRegInfo ViGdsCounterRegAddr[];
+extern GpuCounterRegInfo ViVgtCounterRegAddr[];
+extern GpuCounterRegInfo ViIaCounterRegAddr[];
+extern GpuCounterRegInfo ViMcCounterRegAddr[];
+extern GpuCounterRegInfo ViSrbmCounterRegAddr[];
+
+// There is no TCS counter block on VI, so no ViTcsCounterRegAddr table is declared.
+// (ViCpgCounterRegAddr and ViCpcCounterRegAddr are declared once, above; the
+// redundant second pair of declarations that used to follow was removed.)
+extern GpuCounterRegInfo ViWdCounterRegAddr[];
+
+extern GpuPrivCounterBlockId ViBlockIdSq;
+extern GpuPrivCounterBlockId ViBlockIdMc;
+extern GpuPrivCounterBlockId ViBlockIdIommuV2;
+extern GpuPrivCounterBlockId ViBlockIdKernelDriver;
+}  // namespace pm4_profile
+#endif
diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/vi_pmu.cpp b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_pmu.cpp
new file mode 100644
index 0000000000..b877ee31ff
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_pmu.cpp
@@ -0,0 +1,1569 @@
+// NOTE(review): the header names of the next two #include lines are missing in
+// this copy of the patch; restore them from the original commit.
+#include
+#include
+
+#include "os.h"
+
+#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
+#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
+#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
+#include "gfxip/gfx8/si_pm4defs.h"
+#include "cmdwriter.h"
+
+#include "vi_pmu.h"
+#include "gpu_countergroup.h"
+#include "vi_blockinfo.h"
+#include "gpu_enum.h"
+
+using namespace std;
+using namespace pm4_profile;
+
+namespace pm4_profile {
+
+// Human-readable messages for the PMU error codes. Every message has to fit
+// into 64 bytes including the terminating NUL.
+// NOTE(review): presumably indexed by error_code_; the lookup is not in this
+// part of the file, so keep the order of the rows unchanged.
+static char errorString[][64] = {{"No error"},
+                                 {"unknown countergroup id"},
+                                 {"no countergroup id"},
+                                 {"invalid operation"},
+                                 {"counter is not available"},
+                                 {"countergroup error state"},
+                                 {"countergroup is not completed"}};
+
+// Constructs the VI perf-counter manager: sets the shader-engine count, then
+// runs Init() to build the rest of the state.
+ViPmu::ViPmu() {
+  // Initialize the number of shader engines
+  // NOTE(review): vi_blockinfo.h defines VI_MAX_NUM_SHADER_ENGINES as 1 while 4
+  // is hard-coded here -- confirm which value is intended.
+  num_se_ = 4;
+  Init();
+}
+
+// Puts the object into its initial, idle state:
+//  - clears the error code,
+//  - allocates the info and parameter sets (released by the destructor),
+//  - clears the cached block list and calls initCounterBlock(),
+//  - precomputes the GRBM_GFX_INDEX value that selects broadcast writes,
+//  - marks the profiler as idle.
+// The allocations are unconditional, so calling this a second time on the same
+// object would overwrite (and leak) the earlier info_set_ / parameter_set_.
+void ViPmu::Init() {
+  error_code_ = 0;
+  info_set_ = new InfoSet();
+  parameter_set_ = new ParameterSet();
+
+  // Initialize pointer to stored counter block list to NULL
+  blk_list_ = NULL;
+  initCounterBlock();
+
+  // Initialize the value to use in resetting GRBM: all three broadcast bits
+  // (instance, SE, SH) set, every other bit cleared.
+  regGRBM_GFX_INDEX grbm_gfx_index;
+  grbm_gfx_index.u32All = 0;
+  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  reset_grbm_ = grbm_gfx_index.u32All;
+
+  // Update state of Perf Mgmt Unit
+  profiler_state_ = ROCR_PMU_STATE_IDLE;
+}
+
+// Releases everything Init() and later calls set up: the counter blocks, the
+// block map, the parameter and info sets and the cached block list.
+ViPmu::~ViPmu() {
+  // Remove all counter blocks
+  RemoveCounterBlocks();
+  blk_map_.clear();
+  delete parameter_set_;
+  delete info_set_;
+
+  // blk_list_ is released with free(), so it must have been obtained from the
+  // malloc family (the allocation site is outside this part of the file).
+  if (blk_list_) {
+    free(blk_list_);
+    blk_list_ = NULL;
+  }
+}
+
+// Registers the buffer used to collect PMC data.
+// @param pmcBuffer Start of the buffer; stored as a pointer to 32-bit words.
+//                  The buffer is not copied and ownership is not taken.
+// @param pmcBuffSz Size of the buffer in terms of bytes
+// @return Always true; the arguments are not validated.
+bool ViPmu::setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz) {
+  // Update counter data buffer addr and size params
+  pmcDataSz_ = pmcBuffSz;
+  pmcData_ = reinterpret_cast<uint32_t*>(pmcBuffer);
+  return true;
+}
+
+//
+// Appends the commands that stop and reset the perf counters to cmdBuff.
+//
+// The logic is quite simple and is as follows
+//
+//    Issue CsPartialFlush (wait for idle)
+//    Write CP_PERFMON_CNTL with PERFMON_STATE = 0, which disables and
+//    resets the perf counters in a single register write
+//
+// Nothing is submitted here; the packets are only built into cmdBuff.
+//
+void ViPmu::ResetCounterBlocks(pm4_profile::DefaultCmdBuf* cmdBuff,
+                               pm4_profile::CommandWriter* cmdWriter) {
+  // Waits until all outstanding commands have completed
+  // by issuing CS Partial Flush command
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Program CP Perfmon Cntrl Rgstr to disable and reset counters
+  regCP_PERFMON_CNTL cp_perfmon_cntl;
+  cp_perfmon_cntl.u32All = 0;
+  cp_perfmon_cntl.bits.PERFMON_STATE = 0;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL__CI__VI, cp_perfmon_cntl.u32All);
+}
+
+bool ViPmu::begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter,
+                  bool reset_counter) {
+  if (profiler_state_ != ROCR_PMU_STATE_IDLE) {
+    error_code_ = kHsaPmuErrorCodeErrorState;
+    return false;
+  }
+
+  // Reset Grbm to its default state - broadcast
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, reset_grbm_);
+
+  // Program CP Perfmon Cntrl Rgstr to disable and reset counters
+  regCP_PERFMON_CNTL cp_perfmon_cntl;
+  cp_perfmon_cntl.u32All = 0;
+  cp_perfmon_cntl.bits.PERFMON_STATE = 0;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL__CI__VI, cp_perfmon_cntl.u32All);
+
+  // Collect all the program counter blocks
+  uint32_t reg_val[MAX_REG_NUM], reg_addr[MAX_REG_NUM], reg_num;
+
+  // Retrieve the list of blocks whose perf counters have been enabled
+  uint32_t blk_cnt = 0;
+  CounterBlock** blk_list = getAllCounterBlocks(blk_cnt);
+
+  // Iterate through the list of blocks to generate Pm4 commands to
+  // program corresponding perf counters of each block
+  for (uint32_t blkIdx = 0; blkIdx < blk_cnt; blkIdx++) {
+    // Retrieve the list of perf counters and their count
+    uint32_t counter_num;
+    Counter** cntr_list;
+    cntr_list = blk_list[blkIdx]->getEnabledCounters(counter_num);
+    if (counter_num == 0) {
+      continue;
+    }
+
+    // Retrieve the block Id of perf counters
+    void* p_data;
+    uint32_t block_id;
+    uint32_t data_size;
+    blk_list[blkIdx]->getInfo(GPU_BLK_INFO_ID, data_size, (void**)&p_data);
+    block_id = *(static_cast(p_data));
+
pm4_profile::CommandWriter* cmdWriter) {
+  // Waits until all outstanding commands have completed
+  // by issuing CS Partial Flush command
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Program CP Perfmon Cntrl Rgstr to disable and reset counters
+  regCP_PERFMON_CNTL cp_perfmon_cntl;
+  cp_perfmon_cntl.u32All = 0;
+  cp_perfmon_cntl.bits.PERFMON_STATE = 0;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL__CI__VI, cp_perfmon_cntl.u32All);
+}
+
+// Appends to cmdBuff the PM4 packets that program and start the enabled counters.
+//
+// Packet order: GRBM_GFX_INDEX reset to broadcast, CP_PERFMON_CNTL with
+// PERFMON_STATE = 0, the registers returned by BuildCounterSelRegister for every
+// enabled counter of every block, GRBM_GFX_INDEX reset again,
+// COMPUTE_PERFCOUNT_ENABLE, CP_PERFMON_CNTL with PERFMON_STATE = 1, wait-idle.
+//
+// @param cmdBuff        Command buffer the packets are appended to
+// @param cmdWriter      Writer used to encode each packet
+// @param reset_counter  Not read by this implementation
+// @return true on success, in which case the state becomes ROCR_PMU_STATE_START.
+//         false if the PMU is not in ROCR_PMU_STATE_IDLE (error_code_ is set to
+//         kHsaPmuErrorCodeErrorState) or if a block's id cannot be retrieved;
+//         in the latter case packets already appended stay in cmdBuff.
+bool ViPmu::begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter,
+                  bool reset_counter) {
+  if (profiler_state_ != ROCR_PMU_STATE_IDLE) {
+    error_code_ = kHsaPmuErrorCodeErrorState;
+    return false;
+  }
+
+  // Reset Grbm to its default state - broadcast
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, reset_grbm_);
+
+  // Program CP Perfmon Cntrl Rgstr to disable and reset counters
+  regCP_PERFMON_CNTL cp_perfmon_cntl;
+  cp_perfmon_cntl.u32All = 0;
+  cp_perfmon_cntl.bits.PERFMON_STATE = 0;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL__CI__VI, cp_perfmon_cntl.u32All);
+
+  // Scratch space for the (address, value) pairs of one counter
+  uint32_t reg_val[MAX_REG_NUM], reg_addr[MAX_REG_NUM], reg_num;
+
+  // Retrieve the list of blocks; blk_cnt stays 0 when no list is available
+  uint32_t blk_cnt = 0;
+  CounterBlock** blk_list = getAllCounterBlocks(blk_cnt);
+
+  // Iterate through the list of blocks to generate Pm4 commands to
+  // program corresponding perf counters of each block
+  for (uint32_t blkIdx = 0; blkIdx < blk_cnt; blkIdx++) {
+    // Retrieve the list of perf counters and their count
+    uint32_t counter_num = 0;
+    Counter** cntr_list = blk_list[blkIdx]->getEnabledCounters(counter_num);
+    if (counter_num == 0) {
+      continue;
+    }
+
+    // Retrieve the block Id of perf counters. The result is checked, as end()
+    // does, so that p_data is never dereferenced without having been filled in.
+    void* p_data = NULL;
+    uint32_t data_size = 0;
+    if (!blk_list[blkIdx]->getInfo(GPU_BLK_INFO_ID, data_size, &p_data)) {
+      return false;
+    }
+    uint32_t block_id = *(static_cast<uint32_t*>(p_data));
+
+    // Iterate through each enabled perf counter and build the
+    // corresponding Pm4 commands to program the various control
+    // registers involved
+    for (uint32_t cntrIdx = 0; cntrIdx < counter_num; cntrIdx++) {
+      // Build the list of control registers to program which
+      // varies per perf counter block
+      reg_num = BuildCounterSelRegister(cntrIdx, reg_addr, reg_val, block_id, cntr_list[cntrIdx]);
+
+      // Build the list of Pm4 commands that support control
+      // register programming
+      for (uint32_t regIdx = 0; regIdx < reg_num; regIdx++) {
+        cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, reg_addr[regIdx], reg_val[regIdx]);
+      }
+    }
+  }
+
+  // Reset Grbm to its default state - broadcast
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, reset_grbm_);
+
+  // Program Compute_Perfcount_Enable register to support perf counting
+  regCOMPUTE_PERFCOUNT_ENABLE__CI__VI cp_perfcount_enable;
+  cp_perfcount_enable.u32All = 0;
+  cp_perfcount_enable.bits.PERFCOUNT_ENABLE = 1;
+  cmdWriter->BuildWriteShRegPacket(cmdBuff, mmCOMPUTE_PERFCOUNT_ENABLE__CI__VI,
+                                   cp_perfcount_enable.u32All);
+
+  // Start the counter list
+  cp_perfmon_cntl.u32All = 0;
+  cp_perfmon_cntl.bits.PERFMON_STATE = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL__CI__VI, cp_perfmon_cntl.u32All);
+
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  profiler_state_ = ROCR_PMU_STATE_START;
+  return true;
+}
+
+// Appends to cmdBuff the PM4 packets that sample the counters and copy their
+// values into the buffer registered through setPmcDataBuff().
+//
+// Each value flagged COPY_DATA_FLAG by BuildCounterReadRegisters is copied as one
+// DWORD to consecutive slots of pmcData_, in block order then counter order.
+//
+// @param cmdBuff    Command buffer the packets are appended to
+// @param cmdWriter  Writer used to encode each packet
+// @return true on success, in which case the state becomes ROCR_PMU_STATE_STOP.
+//         false if the PMU is not in ROCR_PMU_STATE_START (error_code_ is set to
+//         kHsaPmuErrorCodeErrorState), if block information cannot be retrieved,
+//         or if the data buffer is too small; packets already appended stay in
+//         cmdBuff in the last two cases.
+bool ViPmu::end(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter) {
+  if (profiler_state_ != ROCR_PMU_STATE_START) {
+    error_code_ = kHsaPmuErrorCodeErrorState;
+    return false;
+  }
+
+  // Issue CsPartialFlush command to wait for dispatch to complete
+  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
+
+  // Leave the started state and enable sampling so the values can be read back.
+  // NOTE(review): PERFMON_STATE = 2 is presumably the "stop counting" encoding -
+  // confirm against the register specification.
+  regCP_PERFMON_CNTL cp_perfmon_cntl;
+  cp_perfmon_cntl.u32All = 0;
+  cp_perfmon_cntl.bits.PERFMON_STATE = 2;
+  cp_perfmon_cntl.bits.PERFMON_SAMPLE_ENABLE = 1;
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmCP_PERFMON_CNTL__CI__VI, cp_perfmon_cntl.u32All);
+
+  // Scratch space for the (address, value) pairs of one counter
+  uint32_t reg_addr[MAX_REG_NUM], reg_val[MAX_REG_NUM], reg_num;
+
+  uint32_t blk_cnt = 0;
+  CounterBlock** blk_list = getAllCounterBlocks(blk_cnt);
+
+  // Pass 1: count the 64-bit values expected, so an undersized buffer is
+  // rejected before any copy packet is emitted
+  void* p_data = NULL;
+  uint32_t data_size = 0;
+  uint32_t counter_num = 0;
+  uint32_t total_counter_num = 0;
+  for (uint32_t i = 0; i < blk_cnt; i++) {
+    // Retrieve the number of enabled counters in each counter block
+    blk_list[i]->getEnabledCounters(counter_num);
+    if (!blk_list[i]->getInfo(GPU_BLK_INFO_CONTROL_METHOD, data_size, &p_data)) {
+      return false;
+    }
+
+    CntlMethod method = static_cast<CntlMethod>(*(static_cast<uint32_t*>(p_data)));
+
+    // Need to read counter values from each shader engine
+    if (method == CntlMethodBySe || method == CntlMethodBySeAndInstance) {
+      counter_num = counter_num * num_se_;
+    }
+
+    total_counter_num += counter_num;
+  }
+
+  // Two DWORDs (low and high half) per counter value
+  size_t cntrSize = sizeof(int32_t) * 2 * total_counter_num;
+  if (cntrSize > pmcDataSz_) {
+    return false;
+  }
+
+  // Reset Grbm to its default state - broadcast
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, reset_grbm_);
+
+  // Pass 2: create PM4 packets to read counter values
+  uint32_t out_idx = 0;  // next free DWORD slot in pmcData_
+  for (uint32_t i = 0; i < blk_cnt; i++) {
+    // Retrieve the number of enabled counters in each counter block
+    blk_list[i]->getEnabledCounters(counter_num);
+    if (counter_num == 0) {
+      continue;
+    }
+
+    if (!blk_list[i]->getInfo(GPU_BLK_INFO_ID, data_size, &p_data)) {
+      return false;
+    }
+    uint32_t block_id = *(static_cast<uint32_t*>(p_data));
+
+    for (uint32_t j = 0; j < counter_num; j++) {
+      // retrieve the registers to be set
+      reg_num = BuildCounterReadRegisters(j, block_id, reg_addr, reg_val);
+      for (uint32_t k = 0; k < reg_num; k++) {
+        if (reg_val[k] == COPY_DATA_FLAG) {
+          // The number of copies is decided per block id by
+          // BuildCounterReadRegisters, while pass 1 sized the buffer from the
+          // block's control method; bound every write by the real buffer size.
+          if ((static_cast<size_t>(out_idx) + 1) * sizeof(uint32_t) > pmcDataSz_) {
+            return false;
+          }
+          cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_REG, reg_addr[k], 0,
+                                         pmcData_ + out_idx, COPY_DATA_SEL_COUNT_1DW,
+                                         false);
+          out_idx++;
+        } else {
+          cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, reg_addr[k], reg_val[k]);
+        }
+      }
+    }
+  }
+
+  // Reset Grbm to its default state - broadcast
+  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, reset_grbm_);
+
+  profiler_state_ = ROCR_PMU_STATE_STOP;
+  return true;
+}
+
+bool ViPmu::initCounterBlock() {
+  for (int i = 0; !(std::string(ViPmuHwBlocks[i].blockName).empty()); i++) {
+    // Override the value of max number of shader engines
+    ViPmuHwBlocks[i].maxShaderEngineCount = num_se_;
+
+    // Instantiate a perf counter block and its properties
+    GpuCounterBlock* cntr_blk = new GpuCounterBlock();
+    if (!cntr_blk) {
+      blk_map_.clear();
+      return false;
+    }
+
+    cntr_blk->setInfo(GPU_BLK_INFO_BLOCK_NAME, GPU_BLOCK_NAME_SIZE,
+                      (void*)ViPmuHwBlocks[i].blockName);
+
+    cntr_blk->setInfo(GPU_BLK_INFO_ID, sizeof(uint32_t), (void*)&ViPmuHwBlocks[i].counterGroupId);
+
+    cntr_blk->setInfo(GPU_BLK_INFO_MAX_SHADER_ENGINE_COUNT, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].maxShaderEngineCount));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_MAX_SHADER_ARRAY_COUNT, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].maxShaderArrayCount));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_MAX_INSTANCE_COUNT, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].maxInstanceCount));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_CONTROL_METHOD, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].method));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_MAX_EVENT_ID, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].maxEventId));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].maxSimultaneousCounters));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_MAX_STREAMING_COUNTERS, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].maxStreamingCounters));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_SHARED_HW_COUNTERS, sizeof(uint32_t),
+                      (void*)&(ViPmuHwBlocks[i].sharedHWCounters));
+
+    cntr_blk->setInfo(GPU_BLK_INFO_HAS_FILTERS,
sizeof(bool), + (void*)&(ViPmuHwBlocks[i].hasFilters)); + + // TODO: Need to fill in the Threadtrace stuff here + HsaViCounterBlockId blk_id; + blk_id = static_cast(ViPmuHwBlocks[i].counterGroupId); + blk_map_.insert(ViCounterBlockMap::value_type(blk_id, cntr_blk)); + } + + // Initiate the PMU state and error code + error_code_ = 0; + profiler_state_ = ROCR_PMU_STATE_IDLE; + return true; +} + +int ViPmu::getLastError() { return error_code_; } + +std::string ViPmu::getErrorString(int error) { + if ((error >= 0) && (error < kHsaPmuErrorCodeMax)) { + std::string err_string(errorString[error]); + return err_string; + } + return string("Error input code!"); +} + +bool ViPmu::getParameter(uint32_t param, uint32_t& retSize, void** ppData) { + return parameter_set_->getParameter(param, retSize, ppData); +} + +bool ViPmu::setParameter(uint32_t param, uint32_t paramSize, const void* p_data) { + return parameter_set_->setParameter(param, paramSize, p_data); +} + +bool ViPmu::getInfo(uint32_t info, uint32_t& retSize, void** ppData) { + return info_set_->getInfo(info, retSize, ppData); +} + +pm4_profile::CounterBlock* ViPmu::getCounterBlockById(uint32_t id) { + HsaViCounterBlockId block_id = static_cast(id); + + // Carrizo has only 8 instances of TA, TD, TCP Perf Blocks + /* + if (asic_ == HsaAmdDeviceAsicTypeCZ) { + if ( ((id >= kHsaViCounterBlockIdTa8) && (id <= kHsaViCounterBlockIdTa15)) || + ((id >= kHsaViCounterBlockIdTd8) && (id <= kHsaViCounterBlockIdTd15)) || + ((id >= kHsaViCounterBlockIdTcp8) && (id <= kHsaViCounterBlockIdTcp15))) { + return NULL; + } + } + */ + + return blk_map_[block_id]; +} + +pm4_profile::CounterBlock** ViPmu::getAllCounterBlocks(uint32_t& num_blocks) { + size_t block_size = blk_map_.size(); + + if (block_size <= 0) { + error_code_ = kHsaPmuErrorCodeNoCounterBlock; + return NULL; + } + + if (blk_list_) { + free(blk_list_); + blk_list_ = NULL; + } + + blk_list_size_ = (uint32_t)(sizeof(GpuCounterBlock*) * block_size); + blk_list_size_ = 
((blk_list_size_ % 4096) != 0) ? 4096 : blk_list_size_; + blk_list_ = (CounterBlock**)malloc(blk_list_size_); + if (blk_list_ == NULL) { + return NULL; + } + + ViCounterBlockMap::iterator it; + uint32_t blk_cnt = 0; + for (it = blk_map_.begin(); it != blk_map_.end(); it++) { + blk_list_[blk_cnt] = it->second; + blk_cnt++; + } + + num_blocks = blk_cnt; + return blk_list_; +} + +uint32_t ViPmu::ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaViCounterBlockIdTcp0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX__CI__VI; + regIdx++; + + regTCP_PERFCOUNTER0_SELECT__CI__VI tcp_perf_counter_select; + tcp_perf_counter_select.u32All = 0; + tcp_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = tcp_perf_counter_select.u32All; + regAddr[regIdx] = ViTcpCounterRegAddr[tcpRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t ViPmu::ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + regGRBM_GFX_INDEX grbm_gfx_index; + + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaViCounterBlockIdTd0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX__CI__VI; + regIdx++; + + regTD_PERFCOUNTER0_SELECT td_perf_counter_select; + td_perf_counter_select.u32All = 0; + td_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[regIdx] = td_perf_counter_select.u32All; + regAddr[regIdx] = ViTdCounterRegAddr[tdRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + 
+// Builds the register writes that select an event on one TCC instance.
+//
+// @param tccRegIdx   Index into ViTccCounterRegAddr of the counter slot to program
+// @param regAddr     Output: register addresses, entries 0 and 1 are written
+// @param regVal      Output: value for each address in regAddr
+// @param blkId       Block id; its offset from kHsaViCounterBlockIdTcc0 becomes
+//                    the GRBM instance index
+// @param blkCntrIdx  Event to count, written to PERF_SEL
+// @return Number of entries written to regAddr/regVal (2)
+uint32_t ViPmu::ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal,
+                                uint32_t blkId, uint32_t blkCntrIdx) {
+  // Entry 0: GRBM_GFX_INDEX addressing this instance, with writes broadcast
+  // across shader engines and shader arrays
+  regGRBM_GFX_INDEX gfx_index;
+  gfx_index.u32All = 0;
+  gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaViCounterBlockIdTcc0;
+  gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  regAddr[0] = mmGRBM_GFX_INDEX__CI__VI;
+  regVal[0] = gfx_index.u32All;
+
+  // Entry 1: the counter's select register carrying the event id
+  regTCC_PERFCOUNTER0_SELECT__CI__VI event_select;
+  event_select.u32All = 0;
+  event_select.bits.PERF_SEL = blkCntrIdx;
+  regAddr[1] = ViTccCounterRegAddr[tccRegIdx].counterSelRegAddr;
+  regVal[1] = event_select.u32All;
+
+  return 2;
+}
+
+// Builds the register writes that select an event on one TCA instance.
+//
+// @param tcaRegIdx   Index into ViTcaCounterRegAddr of the counter slot to program
+// @param regAddr     Output: register addresses, entries 0 and 1 are written
+// @param regVal      Output: value for each address in regAddr
+// @param blkId       Block id; its offset from kHsaViCounterBlockIdTca0 becomes
+//                    the GRBM instance index
+// @param blkCntrIdx  Event to count, written to PERF_SEL
+// @return Number of entries written to regAddr/regVal (2)
+uint32_t ViPmu::ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal,
+                                uint32_t blkId, uint32_t blkCntrIdx) {
+  // Entry 0: GRBM_GFX_INDEX addressing this instance, with writes broadcast
+  // across shader engines and shader arrays
+  regGRBM_GFX_INDEX gfx_index;
+  gfx_index.u32All = 0;
+  gfx_index.bitfields.INSTANCE_INDEX = blkId - kHsaViCounterBlockIdTca0;
+  gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  regAddr[0] = mmGRBM_GFX_INDEX__CI__VI;
+  regVal[0] = gfx_index.u32All;
+
+  // Entry 1: the counter's select register carrying the event id
+  regTCA_PERFCOUNTER0_SELECT__CI__VI event_select;
+  event_select.u32All = 0;
+  event_select.bits.PERF_SEL = blkCntrIdx;
+  regAddr[1] = ViTcaCounterRegAddr[tcaRegIdx].counterSelRegAddr;
+  regVal[1] = event_select.u32All;
+
+  return 2;
+}
+
+uint32_t ViPmu::ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal,
+                               uint32_t blkId, uint32_t blkCntrIdx) {
+  regGRBM_GFX_INDEX grbm_gfx_index;
+
+  grbm_gfx_index.u32All = 0;
+  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
+  grbm_gfx_index.bitfields.INSTANCE_INDEX = blkId -
kHsaViCounterBlockIdTa0; + + uint32_t regIdx = 0; + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX__CI__VI; + regIdx++; + + regTA_PERFCOUNTER0_SELECT ta_perf_counter_select; + ta_perf_counter_select.u32All = 0; + ta_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = ta_perf_counter_select.u32All; + regAddr[regIdx] = ViTaCounterRegAddr[taRegIdx].counterSelRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t ViPmu::ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, uint32_t blkCntrIdx) { + uint32_t regIdx = 0; + + // Program the SQ Counter Select Register + regSQ_PERFCOUNTER0_SELECT__CI__VI sq_cntr_sel; + sq_cntr_sel.u32All = 0; + sq_cntr_sel.bits.SIMD_MASK = 0xF; + sq_cntr_sel.bits.SQC_BANK_MASK = 0xF; + sq_cntr_sel.bits.SQC_CLIENT_MASK = 0xF; + sq_cntr_sel.bits.PERF_SEL = blkCntrIdx; + regVal[regIdx] = sq_cntr_sel.u32All; + regAddr[regIdx] = ViSqCounterRegAddr[sqRegIdx].counterSelRegAddr; + regIdx++; + + // Program the SQ Counter Mask Register + regSQ_PERFCOUNTER_MASK__CI__VI sq_cntr_mask; + sq_cntr_mask.u32All = 0; + sq_cntr_mask.bits.SH0_MASK = 0xFFFF; + sq_cntr_mask.bits.SH1_MASK = 0xFFFF; + regVal[regIdx] = sq_cntr_mask.u32All; + regAddr[regIdx] = mmSQ_PERFCOUNTER_MASK__CI__VI; + regIdx++; + + // Initialize the register content + // Program the SQ Counter Control Register + regSQ_PERFCOUNTER_CTRL sq_cntr_ctrl; + sq_cntr_ctrl.u32All = 0; + if (blkId == kHsaViCounterBlockIdSq) { + sq_cntr_ctrl.bits.ES_EN = 0x1; + sq_cntr_ctrl.bits.GS_EN = 0x1; + sq_cntr_ctrl.bits.VS_EN = 0x1; + sq_cntr_ctrl.bits.PS_EN = 0x1; + sq_cntr_ctrl.bits.LS_EN = 0x1; + sq_cntr_ctrl.bits.HS_EN = 0x1; + sq_cntr_ctrl.bits.CS_EN = 0x1; + } else if (blkId == kHsaViCounterBlockIdSqEs) { + sq_cntr_ctrl.bits.ES_EN = 0x1; + } else if (blkId == kHsaViCounterBlockIdSqGs) { + sq_cntr_ctrl.bits.GS_EN = 0x1; + } else if (blkId == kHsaViCounterBlockIdSqVs) { + sq_cntr_ctrl.bits.VS_EN = 0x1; + } else if (blkId == 
kHsaViCounterBlockIdSqPs) { + sq_cntr_ctrl.bits.PS_EN = 0x1; + } else if (blkId == kHsaViCounterBlockIdSqLs) { + sq_cntr_ctrl.bits.LS_EN = 0x1; + } else if (blkId == kHsaViCounterBlockIdSqHs) { + sq_cntr_ctrl.bits.HS_EN = 0x1; + } else if (blkId == kHsaViCounterBlockIdSqCs) { + sq_cntr_ctrl.bits.CS_EN = 0x1; + } + + regVal[regIdx] = sq_cntr_ctrl.u32All; + regAddr[regIdx] = ViSqCounterRegAddr[sqRegIdx].counterCntlRegAddr; + regIdx++; + + return regIdx; +} + +uint32_t ViPmu::BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, pm4_profile::Counter* blkCntr) { + void* p_data; + uint32_t data_size; + uint32_t blkCntrIdx; + uint32_t instance_index; + regGRBM_GFX_INDEX grbm_gfx_index; + + // Get the blkCntr selection value + if (!blkCntr->getParameter(HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX, data_size, + (void**)&p_data)) { + return 0; + } + blkCntrIdx = *(static_cast(p_data)); + + uint32_t regIdx = 0; + switch (blkId) { + // Program counters belonging to SQ block + case kHsaViCounterBlockIdSq: + case kHsaViCounterBlockIdSqEs: + case kHsaViCounterBlockIdSqGs: + case kHsaViCounterBlockIdSqVs: + case kHsaViCounterBlockIdSqPs: + case kHsaViCounterBlockIdSqLs: + case kHsaViCounterBlockIdSqHs: + case kHsaViCounterBlockIdSqCs: + return ProgramSQCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaViCounterBlockIdCb0: + case kHsaViCounterBlockIdCb1: + case kHsaViCounterBlockIdCb2: + case kHsaViCounterBlockIdCb3: { + regIdx = 0; + instance_index = blkId - kHsaViCounterBlockIdCb0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER0_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = 
mmCB_PERFCOUNTER0_HI__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER1_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER1_HI__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER2_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER2_HI__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER3_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmCB_PERFCOUNTER3_HI__CI__VI; + regIdx++; + + regCB_PERFCOUNTER0_SELECT__CI__VI cb_perf_counter_select; + cb_perf_counter_select.u32All = 0; + cb_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[regIdx] = cb_perf_counter_select.u32All; + regAddr[regIdx] = ViCbCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx++; + + break; + } + + case kHsaViCounterBlockIdCpf: { + regCPF_PERFCOUNTER0_SELECT__CI__VI cpf_perf_counter_select; + cpf_perf_counter_select.u32All = 0; + cpf_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + + regVal[0] = cpf_perf_counter_select.u32All; + regAddr[0] = ViCpfCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdDb0: + case kHsaViCounterBlockIdDb1: + case kHsaViCounterBlockIdDb2: + case kHsaViCounterBlockIdDb3: { + instance_index = blkId - kHsaViCounterBlockIdDb0; + regIdx = 0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + + regVal[regIdx] = grbm_gfx_index.u32All; + regAddr[regIdx] = mmGRBM_GFX_INDEX__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER0_LO__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER0_HI__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER1_LO__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = 
mmDB_PERFCOUNTER1_HI__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER2_LO__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER2_HI__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER3_LO__CI__VI; + regIdx++; + regVal[regIdx] = 0; + regAddr[regIdx] = mmDB_PERFCOUNTER3_HI__CI__VI; + regIdx++; + + regDB_PERFCOUNTER0_SELECT db_perf_counter_select; + db_perf_counter_select.u32All = 0; + db_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[regIdx] = db_perf_counter_select.u32All; + regAddr[regIdx] = ViDbCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx++; + break; + } + + case kHsaViCounterBlockIdGrbm: { + regGRBM_PERFCOUNTER0_SELECT grbm_perf_counter_select; + grbm_perf_counter_select.u32All = 0; + grbm_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = grbm_perf_counter_select.u32All; + regAddr[0] = ViGrbmCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdGrbmSe: { + regGRBM_SE0_PERFCOUNTER_SELECT grbm_se0_perf_counter_select; + grbm_se0_perf_counter_select.u32All = 0; + grbm_se0_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = grbm_se0_perf_counter_select.u32All; + regAddr[0] = ViGrbmSeCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdPaSu: { + regPA_SU_PERFCOUNTER0_SELECT pa_su_perf_counter_select; + pa_su_perf_counter_select.u32All = 0; + pa_su_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = pa_su_perf_counter_select.u32All; + regAddr[0] = ViPaSuCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdPaSc: { + regPA_SC_PERFCOUNTER0_SELECT pa_sc_perf_counter_select; + pa_sc_perf_counter_select.u32All = 0; + pa_sc_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = pa_sc_perf_counter_select.u32All; + regAddr[0] = ViPaScCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + 
+ case kHsaViCounterBlockIdSpi: { + regSPI_PERFCOUNTER0_SELECT spi_perf_counter_select; + spi_perf_counter_select.u32All = 0; + spi_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = spi_perf_counter_select.u32All; + regAddr[0] = ViSpiCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdSx: { + regIdx = 0; + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER0_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER0_HI__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER1_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER1_HI__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER2_LO__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER2_HI__CI__VI; + regIdx++; + + regVal[regIdx] = 0; + regAddr[regIdx] = mmSX_PERFCOUNTER3_LO__CI__VI; + regIdx++; + + regSX_PERFCOUNTER0_SELECT sx_perf_counter_select; + sx_perf_counter_select.u32All = 0; + sx_perf_counter_select.bits.PERFCOUNTER_SELECT = blkCntrIdx; + regVal[regIdx] = sx_perf_counter_select.u32All; + regAddr[regIdx] = ViSxCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx++; + break; + } + + case kHsaViCounterBlockIdTa0: + case kHsaViCounterBlockIdTa1: + case kHsaViCounterBlockIdTa2: + case kHsaViCounterBlockIdTa3: + case kHsaViCounterBlockIdTa4: + case kHsaViCounterBlockIdTa5: + case kHsaViCounterBlockIdTa6: + case kHsaViCounterBlockIdTa7: + case kHsaViCounterBlockIdTa8: + case kHsaViCounterBlockIdTa9: + case kHsaViCounterBlockIdTa10: + case kHsaViCounterBlockIdTa11: + case kHsaViCounterBlockIdTa12: + case kHsaViCounterBlockIdTa13: + case kHsaViCounterBlockIdTa14: + case kHsaViCounterBlockIdTa15: + return ProgramTaCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaViCounterBlockIdTca0: + case kHsaViCounterBlockIdTca1: + return ProgramTcaCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case 
kHsaViCounterBlockIdTcc0: + case kHsaViCounterBlockIdTcc1: + case kHsaViCounterBlockIdTcc2: + case kHsaViCounterBlockIdTcc3: + case kHsaViCounterBlockIdTcc4: + case kHsaViCounterBlockIdTcc5: + case kHsaViCounterBlockIdTcc6: + case kHsaViCounterBlockIdTcc7: + case kHsaViCounterBlockIdTcc8: + case kHsaViCounterBlockIdTcc9: + case kHsaViCounterBlockIdTcc10: + case kHsaViCounterBlockIdTcc11: + case kHsaViCounterBlockIdTcc12: + case kHsaViCounterBlockIdTcc13: + case kHsaViCounterBlockIdTcc14: + case kHsaViCounterBlockIdTcc15: + return ProgramTccCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaViCounterBlockIdTd0: + case kHsaViCounterBlockIdTd1: + case kHsaViCounterBlockIdTd2: + case kHsaViCounterBlockIdTd3: + case kHsaViCounterBlockIdTd4: + case kHsaViCounterBlockIdTd5: + case kHsaViCounterBlockIdTd6: + case kHsaViCounterBlockIdTd7: + case kHsaViCounterBlockIdTd8: + case kHsaViCounterBlockIdTd9: + case kHsaViCounterBlockIdTd10: + case kHsaViCounterBlockIdTd11: + case kHsaViCounterBlockIdTd12: + case kHsaViCounterBlockIdTd13: + case kHsaViCounterBlockIdTd14: + case kHsaViCounterBlockIdTd15: + return ProgramTdCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaViCounterBlockIdTcp0: + case kHsaViCounterBlockIdTcp1: + case kHsaViCounterBlockIdTcp2: + case kHsaViCounterBlockIdTcp3: + case kHsaViCounterBlockIdTcp4: + case kHsaViCounterBlockIdTcp5: + case kHsaViCounterBlockIdTcp6: + case kHsaViCounterBlockIdTcp7: + case kHsaViCounterBlockIdTcp8: + case kHsaViCounterBlockIdTcp9: + case kHsaViCounterBlockIdTcp10: + case kHsaViCounterBlockIdTcp11: + case kHsaViCounterBlockIdTcp12: + case kHsaViCounterBlockIdTcp13: + case kHsaViCounterBlockIdTcp14: + case kHsaViCounterBlockIdTcp15: + return ProgramTcpCntrs(cntrIdx, regAddr, regVal, blkId, blkCntrIdx); + + case kHsaViCounterBlockIdGds: { + regGDS_PERFCOUNTER0_SELECT gds_perf_counter_select; + gds_perf_counter_select.u32All = 0; + gds_perf_counter_select.bits.PERFCOUNTER_SELECT = blkCntrIdx; + regVal[0] 
= gds_perf_counter_select.u32All; + regAddr[0] = ViGdsCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdVgt: { + regVGT_PERFCOUNTER0_SELECT__CI__VI vgt_perf_counter_select; + vgt_perf_counter_select.u32All = 0; + vgt_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = vgt_perf_counter_select.u32All; + regAddr[0] = ViVgtCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdIa: { + regIA_PERFCOUNTER0_SELECT__CI__VI ia_perf_counter_select; + ia_perf_counter_select.u32All = 0; + ia_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = ia_perf_counter_select.u32All; + regAddr[0] = ViIaCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + /* + case kHsaViCounterBlockIdMc: { + // To be investigated later + //regMC_SEQ_PERF_SEQ_CTL mc_perfcounter_select; + //mc_perfcounter_select.u32All = 0; + //mc_perfcounter_select.bits.PERF_SEL = blkCntrIdx; + //regVal[0] = mc_perfcounter_select.u32All; + //regAddr[0] = ViMcCounterRegAddr[cntrIdx].counterSelRegAddr; + //regIdx = 1; + } + break; + */ + + case kHsaViCounterBlockIdSrbm: { + regSRBM_PERFCOUNTER0_SELECT srbm_perf_counter_select; + srbm_perf_counter_select.u32All = 0; + srbm_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = srbm_perf_counter_select.u32All; + regAddr[0] = ViSrbmCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + /* + case kHsaViCounterBlockIdTcs: { + regTCS_PERFCOUNTER0_SELECT__CI tcs_perf_counter_select; + tcs_perf_counter_select.u32All = 0; + tcs_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = tcs_perf_counter_select.u32All; + regAddr[0] = ViTcsCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + */ + + case kHsaViCounterBlockIdWd: { + regWD_PERFCOUNTER0_SELECT__CI__VI wd_perf_counter_select; + wd_perf_counter_select.u32All = 0; + wd_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = 
wd_perf_counter_select.u32All; + regAddr[0] = ViWdCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdCpg: { + regCPG_PERFCOUNTER0_SELECT__CI__VI cpg_perf_counter_select; + cpg_perf_counter_select.u32All = 0; + cpg_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = cpg_perf_counter_select.u32All; + regAddr[0] = ViCpgCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + case kHsaViCounterBlockIdCpc: { + regCPC_PERFCOUNTER0_SELECT__CI__VI cpc_perf_counter_select; + cpc_perf_counter_select.u32All = 0; + cpc_perf_counter_select.bits.PERF_SEL = blkCntrIdx; + regVal[0] = cpc_perf_counter_select.u32All; + regAddr[0] = ViCpcCounterRegAddr[cntrIdx].counterSelRegAddr; + regIdx = 1; + break; + } + + /* + case kHsaViCounterBlockIdMc: { + AddPriviledgedCountersToList(ViBlockIdMc, blkCntrIdx); + //Num of regs equals to 0 means it is processed by KFD + regIdx = 0; + break; + } + + case kHsaViCounterBlockIdIommuV2: { + AddPriviledgedCountersToList(ViBlockIdIommuV2, blkCntrIdx); + //Num of regs equals to 0 means it is processed by KFD + regIdx = 0; + break; + } + + case kHsaViCounterBlockIdKernelDriver: { + AddPriviledgedCountersToList(ViBlockIdKernelDriver, blkCntrIdx); + //Num of regs equals to 0 means it is processed by KFD + regIdx = 0; + break; + } + */ + + default: { + regIdx = 0; + break; + } + } + + return regIdx; +} + +uint32_t ViPmu::BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr, + uint32_t* reg_val) { + uint32_t ii; + uint32_t reg_num = 0; + uint32_t instance_index; + regGRBM_GFX_INDEX grbm_gfx_index; + switch (block_id) { + case kHsaViCounterBlockIdSq: + case kHsaViCounterBlockIdSqEs: + case kHsaViCounterBlockIdSqGs: + case kHsaViCounterBlockIdSqVs: + case kHsaViCounterBlockIdSqPs: + case kHsaViCounterBlockIdSqLs: + case kHsaViCounterBlockIdSqHs: + case kHsaViCounterBlockIdSqCs: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + 
grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViSqCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViSqCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdCb0: + case kHsaViCounterBlockIdCb1: + case kHsaViCounterBlockIdCb2: + case kHsaViCounterBlockIdCb3: { + instance_index = block_id - kHsaViCounterBlockIdCb0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViCbCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViCbCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdCpf: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViCpfCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViCpfCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdDb0: + case kHsaViCounterBlockIdDb1: + case kHsaViCounterBlockIdDb2: + case kHsaViCounterBlockIdDb3: { + instance_index = block_id - kHsaViCounterBlockIdDb0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = 
instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViDbCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViDbCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdGrbm: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViGrbmCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViGrbmCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdGrbmSe: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViGrbmSeCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViGrbmSeCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdPaSu: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViPaSuCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViPaSuCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdPaSc: { + for (ii = 0; ii < num_se_; ii++) { 
+ grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViPaScCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViPaScCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdSpi: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViSpiCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViSpiCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdSx: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViSxCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViSxCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdTa0: + case kHsaViCounterBlockIdTa1: + case kHsaViCounterBlockIdTa2: + case kHsaViCounterBlockIdTa3: + case kHsaViCounterBlockIdTa4: + case 
kHsaViCounterBlockIdTa5: + case kHsaViCounterBlockIdTa6: + case kHsaViCounterBlockIdTa7: + case kHsaViCounterBlockIdTa8: + case kHsaViCounterBlockIdTa9: + case kHsaViCounterBlockIdTa10: + case kHsaViCounterBlockIdTa11: + case kHsaViCounterBlockIdTa12: + case kHsaViCounterBlockIdTa13: + case kHsaViCounterBlockIdTa14: + case kHsaViCounterBlockIdTa15: { + instance_index = block_id - kHsaViCounterBlockIdTa0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViTaCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViTaCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdTca0: + case kHsaViCounterBlockIdTca1: { + instance_index = block_id - kHsaViCounterBlockIdTca0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViTcaCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViTcaCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdTcc0: + case kHsaViCounterBlockIdTcc1: + case kHsaViCounterBlockIdTcc2: + case kHsaViCounterBlockIdTcc3: + case kHsaViCounterBlockIdTcc4: + case kHsaViCounterBlockIdTcc5: + case kHsaViCounterBlockIdTcc6: + case kHsaViCounterBlockIdTcc7: + case kHsaViCounterBlockIdTcc8: + case 
kHsaViCounterBlockIdTcc9: + case kHsaViCounterBlockIdTcc10: + case kHsaViCounterBlockIdTcc11: + case kHsaViCounterBlockIdTcc12: + case kHsaViCounterBlockIdTcc13: + case kHsaViCounterBlockIdTcc14: + case kHsaViCounterBlockIdTcc15: { + instance_index = block_id - kHsaViCounterBlockIdTcc0; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViTccCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViTccCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdTd0: + case kHsaViCounterBlockIdTd1: + case kHsaViCounterBlockIdTd2: + case kHsaViCounterBlockIdTd3: + case kHsaViCounterBlockIdTd4: + case kHsaViCounterBlockIdTd5: + case kHsaViCounterBlockIdTd6: + case kHsaViCounterBlockIdTd7: + case kHsaViCounterBlockIdTd8: + case kHsaViCounterBlockIdTd9: + case kHsaViCounterBlockIdTd10: + case kHsaViCounterBlockIdTd11: + case kHsaViCounterBlockIdTd12: + case kHsaViCounterBlockIdTd13: + case kHsaViCounterBlockIdTd14: + case kHsaViCounterBlockIdTd15: { + instance_index = block_id - kHsaViCounterBlockIdTd0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViTdCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViTdCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + 
reg_num++; + } + break; + } + + case kHsaViCounterBlockIdTcp0: + case kHsaViCounterBlockIdTcp1: + case kHsaViCounterBlockIdTcp2: + case kHsaViCounterBlockIdTcp3: + case kHsaViCounterBlockIdTcp4: + case kHsaViCounterBlockIdTcp5: + case kHsaViCounterBlockIdTcp6: + case kHsaViCounterBlockIdTcp7: + case kHsaViCounterBlockIdTcp8: + case kHsaViCounterBlockIdTcp9: + case kHsaViCounterBlockIdTcp10: + case kHsaViCounterBlockIdTcp11: + case kHsaViCounterBlockIdTcp12: + case kHsaViCounterBlockIdTcp13: + case kHsaViCounterBlockIdTcp14: + case kHsaViCounterBlockIdTcp15: { + instance_index = block_id - kHsaViCounterBlockIdTcp0; + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_INDEX = instance_index; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViTcpCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViTcpCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdGds: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViGdsCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViGdsCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdVgt: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = 
ViVgtCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViVgtCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + + case kHsaViCounterBlockIdIa: { + for (ii = 0; ii < num_se_; ii++) { + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_INDEX = ii; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = grbm_gfx_index.u32All; + reg_num++; + + reg_addr[reg_num] = ViIaCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViIaCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + } + break; + } + /* + case kHsaViCounterBlockIdMc: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViMcCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViMcCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + case kHsaViCounterBlockIdSrbm: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViSrbmCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViSrbmCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + /* + case kHsaViCounterBlockIdTcs: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViTcsCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = 
ViTcsCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + */ + case kHsaViCounterBlockIdWd: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViWdCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViWdCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdCpg: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViCpgCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViCpgCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + case kHsaViCounterBlockIdCpc: { + reg_addr[reg_num] = mmGRBM_GFX_INDEX__CI__VI; + reg_val[reg_num] = reset_grbm_; + reg_num++; + + reg_addr[reg_num] = ViCpcCounterRegAddr[reg_index].counterReadRegAddrLo; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + + reg_addr[reg_num] = ViCpcCounterRegAddr[reg_index].counterReadRegAddrHi; + reg_val[reg_num] = COPY_DATA_FLAG; + reg_num++; + break; + } + + // IommuV2, MC, kernel driver counters are retrieved via + // KFD implementation + case kHsaViCounterBlockIdMc: + case kHsaViCounterBlockIdIommuV2: + case kHsaViCounterBlockIdKernelDriver: { + reg_num = 0; + break; + } + + default: { break; } + } + + return reg_num; +} + +hsa_status_t ViPmu::RemoveCounterBlocks() { + // Free every counter block held in blk_map_, then empty the map so that no + // dangling pointers remain and a repeated call cannot double-delete. + for (ViCounterBlockMap::iterator it = blk_map_.begin(); it != blk_map_.end(); ++it) { + delete it->second; + } + blk_map_.clear(); + + return HSA_STATUS_SUCCESS; +} + + +} /* namespace */ diff --git a/runtime/hsa-ext-aql-profile/src/perfcounter/vi_pmu.h b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_pmu.h new file mode 
100644 index 0000000000..cd1a1dfb37 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/perfcounter/vi_pmu.h @@ -0,0 +1,141 @@ +#ifndef _VI_PMU_H_ +#define _VI_PMU_H_ + +#include "hsa.h" +#include "cmdwriter.h" +#include "hsa_perf.h" +#include "info_set.h" +#include "parameter_set.h" +#include "vi_blockinfo.h" +#include "rocr_profiler.h" + +#include +#include +#include + +namespace pm4_profile { +typedef std::map ViCounterBlockMap; + +// This class implements the VI PMU. It is responsible for setting up +// CounterGroups to represent each VI hardware block which exposes performance +// counters. +class ViPmu : public pm4_profile::Pmu { + public: + ViPmu(); + ~ViPmu(); + + // Returns number of shader engines per block + // for the blocks featured shader engines instancing + uint32_t getNumSe() { return num_se_; } + + // Initializes the handle of buffer used to collect PMC data + bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz); + + int getLastError(); + + std::string getErrorString(int error); + + virtual bool begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter, + bool reset = true); + + virtual bool end(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter); + + // IPMU inherits the IParameterSet and IInfoSet, so we implement it + // through composition and function forwarding + bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data); + + bool setParameter(uint32_t param, uint32_t param_size, const void* p_data); + + bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data); + + pm4_profile::CounterBlock* getCounterBlockById(uint32_t id); + + rocr_pmu_state_t getCurrentState() { return profiler_state_; } + + pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups); + + private: + // Addr of Counter Data Buffer + uint32_t* pmcData_; + + // Size of Counter Data Buffer + uint32_t pmcDataSz_; + + void Init(); + + bool initCounterBlock(); + + bool isResultReady(); + + // Clear 
CounterBlockMap + void clearCounterBlockMap(); + + // Reset SQ and CB counters + void ResetCounterBlocks(pm4_profile::DefaultCmdBuf* cmdBuff, + pm4_profile::CommandWriter* cmdWriter); + + // Program SQ block related counters + uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TA block related counters + uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TCA block related counters + uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TCC block related counters + uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TCP block related counters + uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Program TD block related counters + uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId, + uint32_t blkCntrIdx); + + // Build counter selection register, return how many registers are built + uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal, + uint32_t blkId, pm4_profile::Counter* blkCntr); + + // Build counter read registers for a block, return how many registers are built + uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr, + uint32_t* reg_val); + + private: + // Delete counter blocks in the PMU + hsa_status_t RemoveCounterBlocks(); + + private: + // This contains the available counter groups. + ViCounterBlockMap blk_map_; + + // This stores the current profiling state. 
+ rocr_pmu_state_t profiler_state_; + + pm4_profile::ParameterSet* parameter_set_; + + pm4_profile::InfoSet* info_set_; + + int error_code_; + +// A flag to indicate the current packet is for copy register value +#define COPY_DATA_FLAG 0xFFFFFFFF +#define MAX_REG_NUM 100 + + // Pointer used to store counter block list internally + uint32_t blk_list_size_; + pm4_profile::CounterBlock** blk_list_; + + // Indicates the number of Shader Engines Present + uint32_t num_se_; + + // Used to reset GRBM to its default state + uint32_t reset_grbm_; +}; +} +#endif diff --git a/runtime/hsa-ext-aql-profile/src/threadtrace/CMakeLists.txt b/runtime/hsa-ext-aql-profile/src/threadtrace/CMakeLists.txt new file mode 100644 index 0000000000..a35358f0d8 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/CMakeLists.txt @@ -0,0 +1,18 @@ +# +# Source files for Rocr ThreadTrace +# +set ( LIB_SRC thread_trace.cpp ) +set ( LIB_SRC ${LIB_SRC} gfx8_thread_trace.cpp ) +set ( LIB_SRC ${LIB_SRC} gfx9_thread_trace.cpp ) + +# +# Header files include path(s). 
+# +include_directories ( $ENV{ROCR_INC_DIR} ) +include_directories ( ${PROJ_DIR}/commandwriter ) +include_directories ( ${HSA_RUNTIME_OSC_DIR} ) + +# +# Build ThreadTrace as a Static Library object +# +add_library ( ${SQTT_LIB} STATIC ${LIB_SRC} ) diff --git a/runtime/hsa-ext-aql-profile/src/threadtrace/gfx8_thread_trace.cpp b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx8_thread_trace.cpp new file mode 100644 index 0000000000..14412e43c9 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx8_thread_trace.cpp @@ -0,0 +1,360 @@ + +#include +#include +#include +#include +#include + +#include "core/util/os.h" +#include "gfx8_thread_trace.h" + +/// @brief Returns the lower 32-bits of a value +inline uint32_t Low32(uint64_t u) { return (u & 0xFFFFFFFFUL); } + +/// @brief Returns the upper 32-bits of a value +inline uint32_t High32(uint64_t u) { return (u >> 32); } + +namespace pm4_profile { + +Gfx8ThreadTrace::Gfx8ThreadTrace() { + // Initialize the number of shader engines + numSE_ = 4; +} + +Gfx8ThreadTrace::~Gfx8ThreadTrace() {} + +bool Gfx8ThreadTrace::Init(const ThreadTraceConfig* config) { + // Initialize SQTT Configuration and Register objects + if (!ThreadTrace::Init(config)) return false; + InitThreadTraceCfgRegs(); + return true; +} + +void Gfx8ThreadTrace::InitThreadTraceCfgRegs() { + // Indicates the size of buffer to use per Shader Engine instance. + // The size is specified in terms of 4KB blocks + ttCfgRegs_.ttRegSize.u32All = 0; + + // Indicates various attributes of a thread trace session. + // + // MASK_CS: Which shader types should be enabled for data collection + // Enable CS Shader types. 
+ // + // WRAP: How trace buffer should be used as a ring buffer or as a linear + // buffer - Disable WRAP mode i.e use it as a linear buffer + // + // MODE: Enables a thread trace session + // + // CAPTURE_MODE: When thread trace data is collected immediately after MODE + // is enabled or wait until a Thread Trace Start event is received + // + // AUTOFLUSH_EN: Flush thread trace data to buffer often automatically + // + ttCfgRegs_.ttRegMode.u32All = 0; + ttCfgRegs_.ttRegMode.bits.WRAP = 0; + ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0; + ttCfgRegs_.ttRegMode.bits.MASK_CS = 1; + ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1; + ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF; + + // Enable Thread Trace for the VM Id's selected by SetVmId() + // Enable all of the SIMD's of the compute unit + // Enable the Compute Unit (CU) selected by SetCuId() for fine-grained data + // Enable Shader Array (SH) at index Zero to be used for fine-grained data + // + // @note: The REG_STALL_EN, SPI_STALL_EN and SQ_STALL_EN bits are enabled here. + // BeginSession programs the HiWaterMark register when any of them is set. 
+ // + ttCfgRegs_.ttRegMask.u32All = 0; + ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0; + ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF; + ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId(); + ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI = 0x1; + ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI = 0x1; + ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI = 0x1; + ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId(); + + // Override Mask value if a user value is available + uint32_t ttMask = SetMask(); + if (ttMask) { + ttCfgRegs_.ttRegMask.u32All = ttMask; + } + + // Mask of compute units to get thread trace data from + ttCfgRegs_.ttRegPerfMask.u32All = 0; + ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF; + ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF; + + // Indicate the different TT messages/tokens that should be enabled/logged + // Indicate the different TT tokens that specify register operations to be logged + ttCfgRegs_.ttRegTokenMask.u32All = 0; + ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF; + ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF; + ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI = 0x1; + + // Override TokenMask1 value if a user value is available + uint32_t tokenMask1 = SetTokenMask(); + if (tokenMask1) { + ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1; + } + + // Indicate the different TT tokens that specify instruction operations to be logged + // Disabling specifically instruction operations updating Program Counter (PC). 
+ // @note: The field is defined in the spec incorrectly as a 16-bit value + ttCfgRegs_.ttRegTokenMask2.u32All = 0; + ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F; + + // Override TokenMask2 value if a user value is available + uint32_t tokenMask2 = SetTokenMask2(); + if (tokenMask2) { + ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2; + } +} + +void Gfx8ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) { + // Compute the size of buffer available for each shader engine + ttBuffSize_ = sqttBuffSz / numSE_; + + // Populate the sqtt buffer array submitted to device + for (int idx = 0; idx < numSE_; idx++) { + uint64_t sqttSEAddr = uint64_t(sqttBuffer + (ttBuffSize_ * idx)); + devMemList_.push_back(sqttSEAddr); + } + + // Update the size bit-field of sqtt ctrl register + ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT; +} + +void Gfx8ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) { + // Program Grbm to broadcast messages to all shader engines + regGRBM_GFX_INDEX grbm_gfx_index; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Disable RLC Perfmon Clock Gating + // On Vega this is needed to collect Perf Cntrs + // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL__VI, 1); + + // Program the Compute register to indicate SQTT is enabled + /* + regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI enableTT = {0}; + enableTT.bits.THREAD_TRACE_ENABLE = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, + mmCOMPUTE_THREAD_TRACE_ENABLE__CI__VI, + enableTT.u32All); + */ + + // Program the thread trace mask - specifies SH, CU, SIMD and + // VM Id 
masks to apply. Enabling SQ/SPI/REG_STALL_EN bits + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK__VI, + ttCfgRegs_.ttRegMask.u32All); + + // Program the thread trace Perf mask + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK__VI, + ttCfgRegs_.ttRegPerfMask.u32All); + + // Program the thread trace token mask + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK__VI, + ttCfgRegs_.ttRegTokenMask.u32All); + + // Program the thread trace token mask2 to specify the list of instruction + // tokens to record. Disabling INST_PC instruction tokens + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2__VI, + ttCfgRegs_.ttRegTokenMask2.u32All); + + // Program the thread trace mode register + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI, + ttCfgRegs_.ttRegMode.u32All); + + // Program the HiWaterMark register to support stalling + if ((ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI) || + (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI) || + (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI) || + (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI)) { + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER__VI, 0x06); + } + + // Iterate through the list of SE's and program the register + // for carrying address of thread trace buffer which is aligned + // to 4KB per thread trace specification + uint64_t baseAddr = 0; + for (int idx = 0; idx < numSE_; idx++) { + // Program Grbm to direct writes to one SE + grbm_gfx_index.bitfields.SH_INDEX = 0; + grbm_gfx_index.bitfields.SE_INDEX = idx; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All); + + // Program base2 address of buffer to use for thread trace + // Encodes ATC bit, so the correct way to program is to use + // ATC Bit property 
of the device + /* + regSQ_THREAD_TRACE_BASE2__CI__VI sqttBase2 = {}; + sqttBase2.u32All = 0; + sqttBase2.bits.ATC = 0; + sqttBase2.bits.ADDR_HI = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, + mmSQ_THREAD_TRACE_BASE2__VI, + sqttBase2.u32All); + */ + + // Program the base address to use + baseAddr = devMemList_[idx] >> TT_BUFF_ALIGN_SHIFT; + + // Program base address of buffer to use for thread trace + regSQ_THREAD_TRACE_BASE sqttBase = {}; + sqttBase.bits.ADDR = Low32(baseAddr); + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE__VI, sqttBase.u32All); + + // Program the size of thread trace buffer + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI, + ttCfgRegs_.ttRegSize.u32All); + + // Program the thread trace ctrl register + regSQ_THREAD_TRACE_CTRL sqttCtrl = {}; + sqttCtrl.u32All = 0; + sqttCtrl.bits.RESET_BUFFER = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All); + } + + // Reset the GRBM to broadcast mode + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Program the thread trace mode register + ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI, + ttCfgRegs_.ttRegMode.u32All); + ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF; + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + return; +} + +void Gfx8ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) { + // Program Grbm to broadcast messages to all shader engines + regGRBM_GFX_INDEX grbm_gfx_index; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + 
grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Program the thread trace mode register to disable thread trace + // The MODE register is set to disable thread trace by default + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI, + ttCfgRegs_.ttRegMode.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Iterate through the list of SE's and read the Status, Counter and + // Write Pointer registers of Thread Trace subsystem + for (int idx = 0; idx < numSE_; idx++) { + // Program Grbm to direct writes to one SE + grbm_gfx_index.bitfields.SH_INDEX = 0; + grbm_gfx_index.bitfields.SE_INDEX = idx; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All); + + // Issue WaitRegMem command to wait until SQTT event has completed + // NOTE(review): with maskVal 0x40000000 the masked status is either 0 or + // 0x40000000 and can never equal waitVal 0x01 - confirm which compare + // function BuildWaitRegMemCommand selects when funcEq is false. + bool funcEq = false; + bool memSpace = false; + uint32_t waitVal = 0x01; + uint32_t maskVal = 0x40000000L; + uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS__VI - UCONFIG_SPACE_START__CI__VI; + cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal); + + // Retrieve the values from various status registers + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER, + mmSQ_THREAD_TRACE_STATUS__VI, 0, + ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS), + COPY_DATA_SEL_COUNT_1DW, true); + + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER, + mmSQ_THREAD_TRACE_CNTR, 0, + ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_CNTR), + 
COPY_DATA_SEL_COUNT_1DW, true); + + uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR); + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER, + mmSQ_THREAD_TRACE_WPTR__VI, 0, ttStatus_ + wptrIdx, + COPY_DATA_SEL_COUNT_1DW, true); + } + + // Reset the GRBM to broadcast mode + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All); + + // Initialize cache flush request object + FlushCacheOptions flush; + flush.l1 = true; + flush.l2 = true; + flush.icache = true; + flush.kcache = true; + cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0); + + // Program the size of thread trace buffer + regSQ_THREAD_TRACE_SIZE ttRegSize = {0}; + ttRegSize.u32All = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI, ttRegSize.u32All); + + // Program the thread trace ctrl register + regSQ_THREAD_TRACE_CTRL sqttCtrl = {}; + sqttCtrl.u32All = 0; + sqttCtrl.bits.RESET_BUFFER = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All); + + // Program the compute_thread_trace_enable register + /* + regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI disableTT = {0}; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, + mmCOMPUTE_THREAD_TRACE_ENABLE__CI__VI, + disableTT.u32All); + */ + + // Disable RLC Perfmon Clock Gating + // On Vega this is needed to collect Perf Cntrs + // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL__VI, 0); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + return; +} + +bool Gfx8ThreadTrace::Validate() { + // Iterate through the list of SE to verify + for (int idx = 0; idx < numSE_; idx++) { + // Determine if the buffer has wrapped + uint32_t statusIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS); + if (ttStatus_[statusIdx] & 0x80000000) { + 
return false; + } + + // Adjust the value of Write Ptr which is bits [29-0] + uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR); + ttStatus_[wptrIdx] = (ttStatus_[wptrIdx] & TT_WRITE_PTR_MASK); + } + + return true; +} + +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/threadtrace/gfx8_thread_trace.h b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx8_thread_trace.h new file mode 100644 index 0000000000..fd3ca52da8 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx8_thread_trace.h @@ -0,0 +1,101 @@ +#ifndef _GFX8_THREAD_TRACE_H_ +#define _GFX8_THREAD_TRACE_H_ + +#include "gfxip/gfx8/si_ci_vi_merged_typedef.h" +#include "gfxip/gfx8/si_ci_vi_merged_offset.h" +#include "gfxip/gfx8/si_ci_vi_merged_enum.h" +#include "gfxip/gfx8/si_pm4defs.h" +#include "thread_trace.h" + +#include + +namespace pm4_profile { + +typedef struct Gfx8ThreadTraceCfgRegs { + // Size of thread trace buffer + regSQ_THREAD_TRACE_SIZE ttRegSize; + // Thread trace mode + regSQ_THREAD_TRACE_MODE ttRegMode; + // Thread trace wave mask + regSQ_THREAD_TRACE_MASK ttRegMask; + // Thread trace token mask + regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask; + // Thread trace token mask2 + regSQ_THREAD_TRACE_TOKEN_MASK2__VI ttRegTokenMask2; + // Thread trace perf mask + regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask; +} Gfx8ThreadTraceCfgRegs; + +// Encapsulates the various Api and structures used to enable a thread +// trace session and collect its data +class Gfx8ThreadTrace : public ThreadTrace { + public: + Gfx8ThreadTrace(); + + ~Gfx8ThreadTrace(); + + // Initializes various data structures and handles that + // are needed to support a thread trace session + bool Init(const ThreadTraceConfig* config); + + // Builds Pm4 command stream to program hardware registers that + // enable a thread trace session, including the issue of an event + // to begin thread session + void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter); + + // 
Builds Pm4 command stream to program hardware registers that + // disable a thread trace session, including the issue of an event + // to stop currently ongoing thread session + void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter); + + // Validates that thread trace session ran correctly i.e. did not + // encounter any errors. + bool Validate(); + + // Initializes the handle of buffer used to collect SQTT data + void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz); + + // Initializes the handle of buffer used to read control data of SQTT + void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; } + + // Return status info size + uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; } + + // Return number of Shader Engines + uint32_t getNumSe() { return numSE_; } + + private: + // Holds number of Shader Engines present on device + uint32_t numSE_; + + // Thread traces status register indices to determine + // status of thread trace run + typedef enum { + TT_STATUS_IDX_STATUS = 0, + TT_STATUS_IDX_CNTR = 1, + TT_STATUS_IDX_WPTR = 2, + TT_STATUS_IDX_MAX = 3 + } TTStatusReg; + + // A list of tuples of TT_STATUS_IDX_MAX size, + // giving status of thread trace + uint32_t* ttStatus_; + + // Size of thread trace buffer per shader engine + uint32_t ttBuffSize_; + + // Handles of Device memory used for thread trace + std::vector devMemList_; + + // Registers that need to be programmed for Thread Trace + Gfx8ThreadTraceCfgRegs ttCfgRegs_; + + // Initializes thread trace registers with default parameters. 
+ // These are potentially updated based on updates to thread trace + // configuration object by user + void InitThreadTraceCfgRegs(); +}; + +} // pm4_profile + +#endif // _GFX8_THREAD_TRACE_H_ diff --git a/runtime/hsa-ext-aql-profile/src/threadtrace/gfx9_thread_trace.cpp b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx9_thread_trace.cpp new file mode 100644 index 0000000000..2b344cf42a --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx9_thread_trace.cpp @@ -0,0 +1,356 @@ + +#include +#include +#include +#include +#include + +#include "core/util/os.h" +#include "gfx9_thread_trace.h" + +/// @brief Returns the lower 32-bits of a value +inline uint32_t Low32(uint64_t u) { return (u & 0xFFFFFFFFUL); } + +/// @brief Returns the upper 32-bits of a value +inline uint32_t High32(uint64_t u) { return (u >> 32); } + +namespace pm4_profile { + +Gfx9ThreadTrace::Gfx9ThreadTrace() { + // Initialize the number of shader engines + numSE_ = 4; +} + +Gfx9ThreadTrace::~Gfx9ThreadTrace() {} + +bool Gfx9ThreadTrace::Init(const ThreadTraceConfig* config) { + // Initialize SQTT Configuration and Register objects + if (!ThreadTrace::Init(config)) return false; + InitThreadTraceCfgRegs(); + return true; +} + +void Gfx9ThreadTrace::InitThreadTraceCfgRegs() { + // Indicates the size of buffer to use per Shader Engine instance. + // The size is specified in terms of 4KB blocks + ttCfgRegs_.ttRegSize.u32All = 0; + + // Indicates various attributes of a thread trace session. + // + // MASK_CS: Which shader types should be enabled for data collection + // Enable CS Shader types. 
+ // + // WRAP: How trace buffer should be used as a ring buffer or as a linear + // buffer - Disable WRAP mode i.e use it as a linear buffer + // + // MODE: Enables a thread trace session + // + // CAPTURE_MODE: When thread trace data is collected immediately after MODE + // is enabled or wait until a Thread Trace Start event is received + // + // AUTOFLUSH_EN: Flush thread trace data to buffer often automatically + // + ttCfgRegs_.ttRegMode.u32All = 0; + ttCfgRegs_.ttRegMode.bits.WRAP = 0; + ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0; + ttCfgRegs_.ttRegMode.bits.MASK_CS = 1; + ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1; + ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF; + + // Enable Thread Trace for all VM Id's + // Enable all of the SIMD's of the compute unit + // Enable Compute Unit (CU) at index Zero to be used for fine-grained data + // Enable Shader Array (SH) at index Zero to be used for fine-grained data + // + // @note: Not enabling REG_STALL_EN, SPI_STALL_EN and SQ_STALL_EN bits. They + // are useful if we wish to program buffer throttling. 
+ // + ttCfgRegs_.ttRegMask.u32All = 0; + ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0; + ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF; + ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId(); + ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN = 0x1; + ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN = 0x1; + ttCfgRegs_.ttRegMask.bits.REG_STALL_EN = 0x1; + ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId(); + + // Override Mask value if a user value is available + uint32_t ttMask = SetMask(); + if (ttMask) { + ttCfgRegs_.ttRegMask.u32All = ttMask; + } + + // Mask of compute units to get thread trace data from + ttCfgRegs_.ttRegPerfMask.u32All = 0; + ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF; + ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF; + + // Indicate the different TT messages/tokens that should be enabled/logged + // Indicate the different TT tokens that specify register operations to be logged + ttCfgRegs_.ttRegTokenMask.u32All = 0; + ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF; + ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF; + ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL = 0x1; + + // Override TokenMask1 value if a user value is available + uint32_t tokenMask1 = SetTokenMask(); + if (tokenMask1) { + ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1; + } + + // Indicate the different TT tokens that specify instruction operations to be logged + // Disabling specifically instruction operations updating Program Counter (PC). 
+ // @note: The field is defined in the spec incorrectly as a 16-bit value + ttCfgRegs_.ttRegTokenMask2.u32All = 0; + ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F; + + // Override TokenMask2 value if a user value is available + uint32_t tokenMask2 = SetTokenMask2(); + if (tokenMask2) { + ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2; + } +} + +// Splits the caller-supplied SQTT buffer evenly across the numSE_ shader +// engines, records the start address of each slice in devMemList_ and +// programs the SIZE field of the thread trace size register. +// sqttBuffer: start of the buffer that receives thread trace data +// sqttBuffSz: total size of that buffer in bytes, shared by all engines +void Gfx9ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) { + // Compute the size of buffer available for each shader engine + ttBuffSize_ = sqttBuffSz / numSE_; + + // Forget addresses recorded by a previous call; BeginSession reads + // devMemList_[idx] per shader engine and would otherwise see stale entries + devMemList_.clear(); + + // Populate the sqtt buffer array submitted to device + for (uint32_t idx = 0; idx < numSE_; idx++) { + uint64_t sqttSEAddr = uint64_t(sqttBuffer + (ttBuffSize_ * idx)); + devMemList_.push_back(sqttSEAddr); + } + + // Update the size bit-field of sqtt ctrl register + ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT; +} + +void Gfx9ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) { + // Program Grbm to broadcast messages to all shader engines + regGRBM_GFX_INDEX grbm_gfx_index; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Disable RLC Perfmon Clock Gating + // On Vega this is needed to collect Perf Cntrs + // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 1); + + // Program the Compute register to indicate SQTT is enabled + /* + regCOMPUTE_THREAD_TRACE_ENABLE enableTT = {0}; + enableTT.bits.THREAD_TRACE_ENABLE = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, + mmCOMPUTE_THREAD_TRACE_ENABLE, + enableTT.u32All); + */ + + // Program the thread trace mask - specifies SH, CU, SIMD and + // VM Id masks to apply. 
Enabling SQ/SPI/REG_STALL_EN bits + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK, + ttCfgRegs_.ttRegMask.u32All); + + // Program the thread trace Perf mask + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK, + ttCfgRegs_.ttRegPerfMask.u32All); + + // Program the thread trace token mask + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK, + ttCfgRegs_.ttRegTokenMask.u32All); + + // Program the thread trace token mask2 to specify the list of instruction + // tokens to record. Disabling INST_PC instruction tokens + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2, + ttCfgRegs_.ttRegTokenMask2.u32All); + + // Program the thread trace mode register + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE, + ttCfgRegs_.ttRegMode.u32All); + + // Program the HiWaterMark register to support stalling + if ((ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN) || (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN) || + (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN) || + (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL)) { + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER, 0x06); + } + + // Iterate through the list of SE's and program the register + // for carrying address of thread trace buffer which is aligned + // to 4KB per thread trace specification + uint64_t baseAddr = 0; + for (int idx = 0; idx < numSE_; idx++) { + // Program Grbm to direct writes to one SE + grbm_gfx_index.bitfields.SH_INDEX = 0; + grbm_gfx_index.bitfields.SE_INDEX = idx; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All); + + // Program base2 address of buffer to use for thread trace + /* + regSQ_THREAD_TRACE_BASE2 sqttBase2 = {}; + sqttBase2.u32All = 0; + sqttBase2.bits.ADDR_HI = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, + 
mmSQ_THREAD_TRACE_BASE2, + sqttBase2.u32All); + */ + + // Program the base address to use + baseAddr = devMemList_[idx] >> TT_BUFF_ALIGN_SHIFT; + + // Program base address of buffer to use for thread trace + regSQ_THREAD_TRACE_BASE sqttBase = {}; + sqttBase.bits.ADDR = Low32(baseAddr); + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE, sqttBase.u32All); + + // Program the size of thread trace buffer + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE, + ttCfgRegs_.ttRegSize.u32All); + + // Program the thread trace ctrl register + regSQ_THREAD_TRACE_CTRL sqttCtrl = {}; + sqttCtrl.u32All = 0; + sqttCtrl.bits.RESET_BUFFER = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All); + } + + // Reset the GRBM to broadcast mode + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Program the thread trace mode register + ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE, + ttCfgRegs_.ttRegMode.u32All); + ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF; + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + return; +} + +void Gfx9ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) { + // Program Grbm to broadcast messages to all shader engines + regGRBM_GFX_INDEX grbm_gfx_index; + grbm_gfx_index.u32All = 0; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All); + + // Issue a CSPartialFlush cmd 
including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Program the thread trace mode register to disable thread trace + // The MODE register is set to disable thread trace by default + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE, + ttCfgRegs_.ttRegMode.u32All); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + + // Iterate through the list of SE's and read the Status, Counter and + // Write Pointer registers of Thread Trace subsystem + uint64_t baseAddr = 0; + for (int idx = 0; idx < numSE_; idx++) { + // Program Grbm to direct writes to one SE + grbm_gfx_index.bitfields.SH_INDEX = 0; + grbm_gfx_index.bitfields.SE_INDEX = idx; + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All); + + // Issue WaitRegMem command to wait until SQTT event has completed + bool funcEq = false; + bool memSpace = false; + uint32_t waitVal = 0x01; + uint32_t maskVal = 0x40000000L; + uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS - UCONFIG_SPACE_START; + cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal); + + // Retrieve the values from various status registers + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER, + mmSQ_THREAD_TRACE_STATUS, 0, + ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS), + COPY_DATA_SEL_COUNT_1DW, true); + + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER, + mmSQ_THREAD_TRACE_CNTR, 0, + ttStatus_ + ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_CNTR), + COPY_DATA_SEL_COUNT_1DW, true); + + uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR); + cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER, + mmSQ_THREAD_TRACE_WPTR, 0, ttStatus_ + wptrIdx, + COPY_DATA_SEL_COUNT_1DW, true); + } + + 
// Reset the GRBM to broadcast mode + grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1; + grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All); + + // Initialize cache flush request object + FlushCacheOptions flush; + flush.l1 = true; + flush.l2 = true; + flush.icache = true; + flush.kcache = true; + cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0); + + // Program the size of thread trace buffer + regSQ_THREAD_TRACE_SIZE ttRegSize = {0}; + ttRegSize.u32All = 0; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE, ttRegSize.u32All); + + // Program the thread trace ctrl register + regSQ_THREAD_TRACE_CTRL sqttCtrl = {}; + sqttCtrl.u32All = 0; + sqttCtrl.bits.RESET_BUFFER = 1; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All); + + // Program the compute_thread_trace_enable register + /* + regCOMPUTE_THREAD_TRACE_ENABLE disableTT = {0}; + cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, + mmCOMPUTE_THREAD_TRACE_ENABLE, + disableTT.u32All); + */ + + // Disable RLC Perfmon Clock Gating + // On Vega this is needed to collect Perf Cntrs + // cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmRLC_PERFMON_CLK_CNTL, 0); + + // Issue a CSPartialFlush cmd including cache flush + cmdWriter->BuildWriteWaitIdlePacket(cmdBuff); + return; +} + +bool Gfx9ThreadTrace::Validate() { + // Iterate through the list of SE to verify + for (int idx = 0; idx < numSE_; idx++) { + // Determine if the buffer has wrapped + uint32_t statusIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS); + if (ttStatus_[statusIdx] & 0x80000000) { + return false; + } + + // Adjust the value of Write Ptr which is bits [29-0] + uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR); + ttStatus_[wptrIdx] = (ttStatus_[wptrIdx] & TT_WRITE_PTR_MASK); + } + + return true; +} + +} // pm4_profile diff --git 
a/runtime/hsa-ext-aql-profile/src/threadtrace/gfx9_thread_trace.h b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx9_thread_trace.h new file mode 100644 index 0000000000..86e0db8734 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/gfx9_thread_trace.h @@ -0,0 +1,104 @@ +#ifndef _GFX9_THREAD_TRACE_H_ +#define _GFX9_THREAD_TRACE_H_ + +#include "gfxip/gfx9/gfx9_registers.h" +#include "gfxip/gfx9/gfx9_typedef.h" +#include "gfxip/gfx9/gfx9_enum.h" +#include "gfxip/gfx9/gfx9_offset.h" +#include "gfxip/gfx9/gfx9_pm4defs.h" +#include "thread_trace.h" + +#include + +using namespace pm4_profile::gfx9; + +namespace pm4_profile { + +typedef struct Gfx9ThreadTraceCfgRegs { + // Size of thread trace buffer + regSQ_THREAD_TRACE_SIZE ttRegSize; + // Thread trace mode + regSQ_THREAD_TRACE_MODE ttRegMode; + // Thread trace wave mask + regSQ_THREAD_TRACE_MASK ttRegMask; + // Thread trace token mask + regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask; + // Thread trace token mask2 + regSQ_THREAD_TRACE_TOKEN_MASK2 ttRegTokenMask2; + // Thread trace perf mask + regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask; +} Gfx9ThreadTraceCfgRegs; + +// Encapsulates the various Api and structures used to enable a thread +// trace session and collect its data +class Gfx9ThreadTrace : public ThreadTrace { + public: + Gfx9ThreadTrace(); + + ~Gfx9ThreadTrace(); + + // Initializes various data structures and handles that + // are needed to support a thread trace session + bool Init(const ThreadTraceConfig* config); + + // Builds Pm4 command stream to program hardware registers that + // enable a thread trace session, including the issue of an event + // to begin thread session + void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter); + + // Builds Pm4 command stream to program hardware registers that + // disable a thread trace session, including the issue of an event + // to stop currently ongoing thread session + void StopSession(pm4_profile::DefaultCmdBuf* 
cmdBuff, pm4_profile::CommandWriter* cmdWriter); + + // Validates that thread trace session ran correctly i.e. did not + // encounter any errors. + bool Validate(); + + // Initializes the handle of buffer used to collect SQTT data + void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz); + + // Initializes the handle of buffer used to read control data of SQTT + void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; } + + // Return status info size + uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; } + + // Return number of Shader Engines + uint32_t getNumSe() { return numSE_; } + + private: + // Holds number of Shader Engines present on device + uint32_t numSE_; + + // Thread traces status register indices to determine + // status of thread trace run + typedef enum { + TT_STATUS_IDX_STATUS = 0, + TT_STATUS_IDX_CNTR = 1, + TT_STATUS_IDX_WPTR = 2, + TT_STATUS_IDX_MAX = 3 + } TTStatusReg; + + // A list of tuples of TT_STATUS_IDX_MAX size, + // giving status of thread trace + uint32_t* ttStatus_; + + // Size of thread trace buffer per shader engine + uint32_t ttBuffSize_; + + // Handles of Device memory used for thread trace: one 64-bit buffer + // address per shader engine, filled in by setSqttDataBuff + std::vector<uint64_t> devMemList_; + + // Registers that need to be programmed for Thread Trace + Gfx9ThreadTraceCfgRegs ttCfgRegs_; + + // Initializes thread trace registers with default parameters. 
+ // These are potentially updated based on updates to thread trace + // configuration object by user + void InitThreadTraceCfgRegs(); +}; + +} // pm4_profile + +#endif // _GFX9_THREAD_TRACE_H_ diff --git a/runtime/hsa-ext-aql-profile/src/threadtrace/thread_trace.cpp b/runtime/hsa-ext-aql-profile/src/threadtrace/thread_trace.cpp new file mode 100644 index 0000000000..cb1e4ff643 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/thread_trace.cpp @@ -0,0 +1,105 @@ +#include + +#include "core/util/os.h" +#include "thread_trace.h" + +namespace pm4_profile { + +bool ThreadTrace::Init(const ThreadTraceConfig* config) { + if (config) { + ttConfig_ = *config; + } else { + InitThreadTraceConfig(&ttConfig_); + } + return true; +} + +void ThreadTrace::InitThreadTraceConfig(ThreadTraceConfig* config) const { + memset(config, 0, sizeof(ThreadTraceConfig)); + + config->threadTraceTargetCu = 0; + config->threadTraceVmIdMask = 0; + config->threadTraceMask = 0; + config->threadTraceTokenMask = 0; + config->threadTraceTokenMask2 = 0; +} + +uint8_t ThreadTrace::SetCuId() { + uint32_t cuId = ttConfig_.threadTraceTargetCu; + + // Allow users to specify the CU to choose for Target tokens + std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_CU"); + if (var.length() > 0) { + cuId = std::stol(var, nullptr, 16); + std::cout << "Using " << cuId << " as CUID for Thread Trace" << std::endl; + } + + assert((cuId <= 15) && "Cu Id must be between 0 and 15"); + + return cuId; +} + +uint8_t ThreadTrace::SetVmId() { + uint32_t vmId = ttConfig_.threadTraceVmIdMask; + + // Allow users to specify the VMID to choose for Target tokens + std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_VMID"); + if (var.length() > 0) { + vmId = std::stol(var, nullptr, 16); + std::cout << "Using " << vmId << " as VMID for Thread Trace" << std::endl; + } + + assert((vmId <= 2) && "VmId must be between 0 and 2"); + + return vmId; +} + +uint32_t ThreadTrace::SetMask() { + uint32_t ttMask = 
ttConfig_.threadTraceMask; + const uint32_t validMask = 0x00C0D0; + + // Allow users to specify the Mask to choose for configuration parameters + std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_MASK"); + if (var.length() > 0) { + ttMask = std::stol(var, nullptr, 16); + std::cout << "Using " << ttMask << " as Mask for Thread Trace" << std::endl; + } + + assert(((ttMask & validMask) == 0) && "Mask should have bits [4,6,7] set to Zero"); + + return ttMask; +} + +// Returns the token mask to program: the configured threadTraceTokenMask, +// or the hexadecimal value of HSA_THREAD_TRACE_SELECT_TOKEN_MASK1 when that +// environment variable is set. Asserts that none of the bits in validMask +// (despite its name, the set of bits that must stay clear) are set. +uint32_t ThreadTrace::SetTokenMask() { + uint32_t tokenMask = ttConfig_.threadTraceTokenMask; + const uint32_t validMask = 0xFF000000; + + // Allow users to specify the TokenMask to choose for Target tokens + std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK1"); + if (var.length() > 0) { + tokenMask = std::stol(var, nullptr, 16); + std::cout << "Using " << tokenMask << " as TokenMask for Thread Trace" << std::endl; + } + + // 0xFF000000 covers bits [31:24], so the message names that range. + // NOTE(review): the message used to say [31:25]; if bit 24 is meant to be + // user-settable the mask, not the message, is what needs changing - confirm + assert(((tokenMask & validMask) == 0) && "TokenMask should have bits [31:24] set to Zero"); + + return tokenMask; +} + +uint32_t ThreadTrace::SetTokenMask2() { + uint32_t tokenMask2 = ttConfig_.threadTraceTokenMask2; + const uint32_t validMask = 0xFFFF0000; + + // Allow users to specify the TokenMask2 to choose for Target tokens + std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK2"); + if (var.length() > 0) { + tokenMask2 = std::stol(var, nullptr, 16); + std::cout << "Using " << tokenMask2 << " as TokenMask2 for Thread Trace" << std::endl; + } + + assert(((tokenMask2 & validMask) == 0) && "TokenMask2 should have bits [31:16] set to Zero"); + + return tokenMask2; +} + +} // pm4_profile diff --git a/runtime/hsa-ext-aql-profile/src/threadtrace/thread_trace.h b/runtime/hsa-ext-aql-profile/src/threadtrace/thread_trace.h new file mode 100644 index 0000000000..0c1dd1161b --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/threadtrace/thread_trace.h @@ -0,0 +1,104 @@ +#ifndef _THREAD_TRACE_H_ +#define _THREAD_TRACE_H_ + +#include + +#include "cmdwriter.h" 
+ +// Move them as static variables later on +#define TT_WRITE_PTR_MASK (0x3FFFFFFF) +#define TT_DEFAULT_BUFF_SIZE_SCALE (16) +#define TT_DEFAULT_BUFF_SIZE (1024 * 1024 * 8) + +// Size of block in bytes per increment in WPTR +#define TT_WRITE_PTR_BLK (32) + +// Factor by which to shift buffer address +#define TT_BUFF_ALIGN_SHIFT (12) + +// Align address to 64 Kilobytes +#define TT_BUFF_ADDR_ALIGN (0x10000) + +namespace pm4_profile { + +// ThreadTrace config +typedef struct ThreadTraceConfig { + uint32_t threadTraceTargetCu; + uint32_t threadTraceVmIdMask; + uint32_t threadTraceMask; + uint32_t threadTraceTokenMask; + uint32_t threadTraceTokenMask2; +} ThreadTraceConfig; + +// Encapsulates the various Api and structures that are used to enable +// a thread trace session and collect its data. Implementations of this +// interface program device specific registers to realize the functionality +class ThreadTrace { + // Holds Thread Trace configuration information + // @note: Currently not used i.e. is not exposed to users + ThreadTraceConfig ttConfig_; + + public: + // Destructor of the thread trace service handle + virtual ~ThreadTrace(){}; + + // Obtain the CU id to use for thread tracing + uint8_t SetCuId(); + + // Obtain the VM id to use for thread tracing + uint8_t SetVmId(); + + // Obtain the Mask to use for thread tracing + uint32_t SetMask(); + + // Obtain the Token Mask 1 to use for thread tracing + uint32_t SetTokenMask(); + + // Obtain the Token Mask 2 to use for thread tracing + uint32_t SetTokenMask2(); + + // Initializes various data structures and handles that + // are needed to support a thread trace session + virtual bool Init(const ThreadTraceConfig* config); + + // Initializes thread trace configuration object with default + // parameters, that could potentially be overridden by user + // @note: Currently not used i.e. 
is not exposed to users + virtual void InitThreadTraceConfig(ThreadTraceConfig* config) const; + + // Allows user to configure various parameters of a thread trace session + // @note: Currently not used i.e. is not exposed to users + bool Config(uint32_t key, uint32_t value) { return true; }; + + // Builds Pm4 command stream to program hardware registers that + // enable a thread trace session, including the issue of an event + // to begin thread session + virtual void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, + pm4_profile::CommandWriter* cmdWriter) = 0; + + // Builds Pm4 command stream to program hardware registers that + // disable a thread trace session, including the issue of an event + // to stop currently ongoing thread session + virtual void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, + pm4_profile::CommandWriter* cmdWriter) = 0; + + // Validates that thread trace session ran correctly i.e. did not + // encounter any errors. + virtual bool Validate() = 0; + + // Initializes the handle of buffer used to collect SQTT data + virtual void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) = 0; + + // Initializes the handle of buffer used to read control data of SQTT + virtual void setSqttCtrlBuff(uint32_t* ctrlBuff) = 0; + + // Return number of Shader Engines + virtual uint32_t getNumSe() = 0; + + // Return status info size + virtual uint32_t StatusSizeInfo() const = 0; +}; + +} // pm4_profile + +#endif // _THREAD_TRACE_H_ diff --git a/runtime/hsa-ext-aql-profile/src/util/CMakeLists.txt b/runtime/hsa-ext-aql-profile/src/util/CMakeLists.txt new file mode 100644 index 0000000000..38664f65a3 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/src/util/CMakeLists.txt @@ -0,0 +1,17 @@ +# +# Source files for Rocr Utils library +# +set ( MODULE_SRC ${CORE_UTIL_DIR}/lnx/os_linux.cpp ) + +# +# Header files include path(s). 
+# +include_directories ( $ENV{ROCR_INC_DIR} ) +include_directories ( ${HSA_RUNTIME_OSC_DIR} ) +include_directories ( ${CORE_UTIL_DIR} ) + +# +# Build Utils as a Static Library object +# +add_library( ${UTIL_LIB} STATIC ${MODULE_SRC} ) +target_link_libraries( ${UTIL_LIB} c stdc++ dl pthread rt ) diff --git a/runtime/hsa-ext-aql-profile/test/CMakeLists.txt b/runtime/hsa-ext-aql-profile/test/CMakeLists.txt new file mode 100644 index 0000000000..f310202398 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/CMakeLists.txt @@ -0,0 +1,48 @@ +# +# Header files include path(s). +# +include_directories ( $ENV{ROCR_INC_DIR} ) +include_directories ( ${API_DIR} ) +include_directories ( ${PROJ_DIR}/cmdwriter ) +include_directories ( ${PROJ_DIR}/perfcounter ) +include_directories ( ${PROJ_DIR}/threadtrace ) +include_directories ( ${PROJ_DIR}/aqlprofile ) +include_directories ( ${TEST_DIR}/common ) +include_directories ( ${TEST_DIR}/ctrl ) +include_directories ( ${CORE_UTIL_DIR} ) + +# +# Specify the directory containing the libraries of HsaRt +# to be linked against for building a Hsa Perf application +# +LINK_DIRECTORIES($ENV{ROCR_LIB_DIR}) +find_library ( ROCR_LIB NAMES hsa-runtime64 PATHS $ENV{ROCR_LIB_DIR} ) + +# +# Set Name for Common library and build it as a +# static library to be linked with others +# +set ( COMMON_LIB "common${ONLY64STR}" ) +add_subdirectory ( ${TEST_DIR}/common "${PROJECT_BINARY_DIR}/common" ) + +# +# Build the test library +# +set ( TEST_NAME simple_convolution ) +include_directories ( ${TEST_DIR}/${TEST_NAME} ) +set ( LIB_NAME "${TEST_NAME}${ONLY64STR}" ) +add_library ( ${LIB_NAME} STATIC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) +target_link_libraries( ${LIB_NAME} c stdc++ ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) +set ( TEST_LIBS ${LIB_NAME} ) + +# +# Build the test control +# +set ( SRC_LIST ${TEST_DIR}/ctrl/test.cpp ) +set ( SRC_LIST ${SRC_LIST} ${TEST_DIR}/ctrl/test_pmgr.cpp ) +set ( 
SRC_LIST ${SRC_LIST} ${TEST_DIR}/ctrl/test_hsa.cpp ) +set ( LIB_LIST ${TEST_LIBS} ${COMMON_LIB} ${CORE_UTILS_LIB} ${ROCR_LIB} ${TARGET_LIB} ) +set ( EXE_NAME "ctrl" ) +add_executable ( ${EXE_NAME} ${SRC_LIST} ) +target_link_libraries( ${EXE_NAME} ${LIB_LIST} c stdc++ dl pthread rt ) diff --git a/runtime/hsa-ext-aql-profile/test/binary_search/binary_search.cc b/runtime/hsa-ext-aql-profile/test/binary_search/binary_search.cc new file mode 100755 index 0000000000..9d699abbc0 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/binary_search/binary_search.cc @@ -0,0 +1,876 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. 
+ * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +#define RET_IF_HSA_ERR(err) { \ + if ((err) != HSA_STATUS_SUCCESS) { \ + std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \ + __FILE__ << ". Call returned " << err << std::endl; \ + return (err); \ + } \ +} + +static const uint32_t kBinarySearchLength = 512; +static const uint32_t kBinarySearchFindMe = 108; +static const uint32_t kWorkGroupSize = 256; + +// Hold all the info specific to binary search +typedef struct BinarySearch { + // Binary Search parameters + uint32_t length; + uint32_t work_group_size; + uint32_t work_grid_size; + uint32_t num_sub_divisions; + uint32_t find_me; + + // Buffers needed for this application + uint32_t* input; + uint32_t* input_arr; + uint32_t* input_arr_local; + uint32_t* output; + // Keneral argument buffers and addresses + void* kern_arg_buffer; // Begin of allocated memory + // this pointer to be deallocated + void* kern_arg_address; // Properly aligned address to be used in aql + // packet (don't use for deallocation) + + // Kernel code + std::string kernel_file_name; + std::string kernel_name; + uint32_t kernarg_size; + uint32_t kernarg_align; + + // HSA/RocR objects needed for this 
application + hsa_agent_t gpu_dev; + hsa_agent_t cpu_dev; + hsa_signal_t signal; + hsa_queue_t* queue; + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; + + // Other items we need to populate AQL packet + uint64_t kernel_object; + uint32_t group_segment_size; ///< Kernel group seg size + uint32_t private_segment_size; ///< Kernel private seg size +} BinarySearch; + +void InitializeBinarySearch(BinarySearch* bs) { + bs->kernel_file_name = "./binary_search_kernels.hsaco"; + bs->kernel_name = "binarySearch"; + bs->length = 512; + bs->find_me = 108; + bs->work_group_size = 256; + bs->num_sub_divisions = bs->length / bs->work_group_size; +} + +// This function is called by the call-back functions used to find an agent of +// the specified hsa_device_type_t. Note that it cannot be called directly from +// hsa_iterate_agents() as it does not match the prototype of the call-back +// function. It must be wrapped by a function with the correct prototype. +// +// Return values: +// HSA_STATUS_INFO_BREAK -- "agent" is of the specified type (dev_type) +// HSA_STATUS_SUCCESS -- "agent" is not of the specified type +// Other -- Some error occurred +static hsa_status_t FindAgent(hsa_agent_t agent, void* data, + hsa_device_type_t dev_type) { + if (data == nullptr) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + // See if the provided agent matches the input type (dev_type) + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, + &hsa_device_type); + RET_IF_HSA_ERR(hsa_error_code); + + if (hsa_device_type == dev_type) { + *(reinterpret_cast(data)) = agent; + return HSA_STATUS_INFO_BREAK; + } + + return HSA_STATUS_SUCCESS; +} + +// This is the call-back function used to find a GPU type agent. 
Note that the +// prototype of this function is dictated by the HSA specification +hsa_status_t FindGPUDevice(hsa_agent_t agent, void* data) { + return FindAgent(agent, data, HSA_DEVICE_TYPE_GPU); +} + +// This is the call-back function used to find a CPU type agent. Note that the +// prototype of this function is dictated by the HSA specification +hsa_status_t FindCPUDevice(hsa_agent_t agent, void* data) { + return FindAgent(agent, data, HSA_DEVICE_TYPE_CPU); +} + +// Find the CPU and GPU agents we need to run this sample, and save them in the +// BinarySearch structure for later use. +hsa_status_t FindDevices(BinarySearch* bs) { + hsa_status_t err; + + // Note that hsa_iterate_agents iterate through all known agents until + // HSA_STATUS_SUCCESS is not returned. The call-backs are implemented such + // that HSA_STATUS_INFO_BREAK means we found an agent of the specified type. + // This value is returned by hsa_iterate_agents. + bs->gpu_dev.handle = 0; + err = hsa_iterate_agents(FindGPUDevice, &bs->gpu_dev); + + if (err != HSA_STATUS_INFO_BREAK) { + return HSA_STATUS_ERROR; + } + + bs->cpu_dev.handle = 0; + err = hsa_iterate_agents(FindCPUDevice, &bs->cpu_dev); + + if (err != HSA_STATUS_INFO_BREAK) { + return HSA_STATUS_ERROR; + } + + if (0 == bs->gpu_dev.handle) { + std::cout << "GPU Device is not Created properly!" << std::endl; + RET_IF_HSA_ERR(HSA_STATUS_ERROR); + } + + if (0 == bs->cpu_dev.handle) { + std::cout << "CPU Device is not Created properly!" << std::endl; + RET_IF_HSA_ERR(HSA_STATUS_ERROR); + } + + return HSA_STATUS_SUCCESS; +} + +// This function checks to see if the provided +// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, +// the function adds an additional requirement that the pool have the +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false, +// pools must NOT have this property. +// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is +// returned. 
HSA_STATUS_SUCCESS is returned if no errors were encountered, but +// no pool was found meeting the requirements. If an error is encountered, we +// return that error. + +// Note that this function does not match the required prototype for the +// hsa_amd_agent_iterate_memory_pools call back function, and therefore must be +// wrapped by a function with the correct prototype. +static hsa_status_t +FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) { + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + if (nullptr == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, + &segment); + RET_IF_HSA_ERR(err); + + if (HSA_AMD_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + err = hsa_amd_memory_pool_get_info(pool, + HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + RET_IF_HSA_ERR(err); + + uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + + if ((karg_st == 0 && kern_arg) || + (karg_st != 0 && !kern_arg)) { + return HSA_STATUS_SUCCESS; + } + + *(reinterpret_cast(data)) = pool; + return HSA_STATUS_INFO_BREAK; +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, false); +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, true); +} + +// Find memory pools that we will need to allocate from for this sample +// application. 
We will need memory associated with the host CPU, the GPU +// executing the kernels, and for kernel arguments. This function will +// save the found pools to the BinarySearch structure for use elsewhere +// in this program. +hsa_status_t FindPools(BinarySearch* bs) { + hsa_status_t err; + + err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool, + &bs->cpu_pool); + + if (err != HSA_STATUS_INFO_BREAK) { + return HSA_STATUS_ERROR; + } + + err = hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool, + &bs->gpu_pool); + + if (err != HSA_STATUS_INFO_BREAK) { + return HSA_STATUS_ERROR; + } + + err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, + FindKernArgPool, &bs->kern_arg_pool); + + if (err != HSA_STATUS_INFO_BREAK) { + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +// Once the needed memory pools have been found and the BinarySearch structure +// has been updated with these handles, this function is then used to allocate +// memory from those pools. +// Devices with which a pool is associated already have access to the pool. +// However, other devices may also need to read or write to that memory. Below, +// we see how we can grant access to other devices to address this issue. +hsa_status_t AllocateAndInitBuffers(BinarySearch* bs) { + hsa_status_t err; + uint32_t out_length = 4 * sizeof(uint32_t); + uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t); + + // In all of these examples, we want both the cpu and gpu to have access to + // the buffer in question. We use the array of agents below in the susequent + // calls to hsa_amd_agents_allow_access() for this purpose. 
+ hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev}; + + err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0, + reinterpret_cast(&bs->input)); + RET_IF_HSA_ERR(err); + err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input); + RET_IF_HSA_ERR(err); + (void)memset(bs->input, 0, in_length); + + err = hsa_amd_memory_pool_allocate(bs->cpu_pool, out_length, 0, + reinterpret_cast(&bs->output)); + RET_IF_HSA_ERR(err); + err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->output); + RET_IF_HSA_ERR(err); + (void)memset(bs->input, 0, in_length); + + err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0, + reinterpret_cast(&bs->input_arr)); + RET_IF_HSA_ERR(err); + err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr); + RET_IF_HSA_ERR(err); + (void)memset(bs->input, 0, in_length); + + err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0, + reinterpret_cast(&bs->input_arr_local)); + RET_IF_HSA_ERR(err); + err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local); + RET_IF_HSA_ERR(err); + + // Binary-search application specific code... + // Initialize input buffer with random values in an increasing order + uint32_t max = bs->length * 20; + bs->input[0] = 0; + + uint32_t seed = (unsigned int)time(NULL); + srand(seed); + + for (uint32_t i = 1; i < bs->length; ++i) { + bs->input[i] = bs->input[i - 1] + + static_cast(max * rand_r(&seed) / static_cast(RAND_MAX)); + } + +// #define VERBOSE 1 +#ifdef VERBOSE + std::cout << "Input array values:" << std::endl; + + for (uint32_t i = 0; i < bs->length; ++i) { + std::cout << "input[" << i << "] = " << bs->input[i] << " "; + + if (i % 4 == 0) { + std::cout << std::endl; + } + } + + std::cout << std::endl; +#endif + + return err; +} + +// The code in this function illustrates how to load a kernel from +// pre-compiled code. The goal is to get a handle that can be later +// used in an AQL packet and also to extract information about kernel +// that we will need. 
All of the information hand kernel handle will +// be saved to the BinarySearch structure. It will be used when we +// populate the AQL packet. +hsa_status_t LoadKernelFromObjFile(BinarySearch* bs) { + hsa_status_t err; + hsa_code_object_reader_t code_obj_rdr = {0}; + hsa_executable_t executable = {0}; + + hsa_file_t file_handle = open(bs->kernel_file_name.c_str(), O_RDONLY); + + if (file_handle == -1) { + std::cout << "failed to open " << bs->kernel_file_name.c_str() << + " at line " << __LINE__ << ", errno: " << errno << std::endl; + return HSA_STATUS_ERROR; + } + + err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + RET_IF_HSA_ERR(err); + close(file_handle); + + err = hsa_executable_create_alt(HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable); + RET_IF_HSA_ERR(err); + + err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev, + code_obj_rdr, NULL, NULL); + RET_IF_HSA_ERR(err); + + err = hsa_executable_freeze(executable, NULL); + RET_IF_HSA_ERR(err); + + hsa_executable_symbol_t kern_sym; + err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(), + bs->gpu_dev, 0, &kern_sym); + RET_IF_HSA_ERR(err); + + err = hsa_executable_symbol_get_info(kern_sym, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &bs->kernel_object); + RET_IF_HSA_ERR(err); + + err = hsa_executable_symbol_get_info(kern_sym, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &bs->private_segment_size); + RET_IF_HSA_ERR(err); + + err = hsa_executable_symbol_get_info(kern_sym, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &bs->group_segment_size); + RET_IF_HSA_ERR(err); + + err = hsa_executable_symbol_get_info(kern_sym, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &bs->kernarg_size); + RET_IF_HSA_ERR(err); + + err = hsa_executable_symbol_get_info(kern_sym, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, + &bs->kernarg_align); + RET_IF_HSA_ERR(err); + + return err; +} + 
+// This function shows how to do an asynchronous copy. We have to create a
+// signal and use the signal to notify us when the copy has completed.
+// Copies `size` bytes from `src` (owned by src_ag) to `dst` (owned by dst_ag)
+// and blocks until the copy is done. The temporary signal is destroyed on
+// every path, including failures.
+hsa_status_t AgentMemcpy(void* dst, const void* src,
+                         size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
+  hsa_signal_t s;
+  hsa_status_t err;
+
+  err = hsa_signal_create(1, 0, NULL, &s);
+  RET_IF_HSA_ERR(err);
+
+  err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
+
+  // Only wait when the copy was actually submitted.
+  if (err == HSA_STATUS_SUCCESS &&
+      hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
+                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
+    err = HSA_STATUS_ERROR;
+    std::cout << "Async copy signal error" << std::endl;
+  }
+
+  // Release the signal before reporting any error so it is never leaked.
+  hsa_status_t destroy_err = hsa_signal_destroy(s);
+
+  RET_IF_HSA_ERR(err);
+  RET_IF_HSA_ERR(destroy_err);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// AlignDown and AlignUp are 2 utility functions we use to find an aligned
+// boundary either below or above a given value (address). The function will
+// return a value that has the specified alignment.
+// `alignment` is used as a bit mask, so it is expected to be a power of two.
+static intptr_t
+AlignDown(intptr_t value, size_t alignment) {
+  return (intptr_t) (value & ~(alignment - 1));
+}
+static void*
+AlignUp(void* value, size_t alignment) {
+  // Bump the address past the next boundary, then round down to it.
+  const uintptr_t bumped = reinterpret_cast<uintptr_t>(value) + alignment - 1;
+  return reinterpret_cast<void*>(
+      AlignDown(static_cast<intptr_t>(bumped), alignment));
+}
+
+// This function populates the AQL packet with the information
+// we have collected and stored in the BinarySearch structure thus far.
+void PopulateAQLPacket(BinarySearch const* bs,
+                       hsa_kernel_dispatch_packet_t* aql) {
+  aql->header = 0;  // Dummy val. for now. Set this right before doorbell ring
+  aql->setup = 1;
+  aql->workgroup_size_x = bs->work_group_size;
+  aql->workgroup_size_y = 1;
+  aql->workgroup_size_z = 1;
+  aql->grid_size_x = bs->work_grid_size;
+  aql->grid_size_y = 1;
+  aql->grid_size_z = 1;
+  aql->private_segment_size = bs->private_segment_size;
+  aql->group_segment_size = bs->group_segment_size;
+  aql->kernel_object = bs->kernel_object;
+  aql->kernarg_address = bs->kern_arg_address;
+  aql->completion_signal = bs->signal;
+
+  return;
+}
+/*
+ * Write everything in the provided AQL packet to the queue except the first 32
+ * bits which include the header and setup fields. That should be done
+ * last.
+ *
+ * This reserves one slot by advancing the queue's write index; the slot used
+ * is the old index masked by (queue size - 1).
+ */
+void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql,
+                     hsa_queue_t* q) {
+  void* queue_base = q->base_address;
+  const uint32_t queue_mask = q->size - 1;
+  uint64_t que_idx = hsa_queue_add_write_index_relaxed(q, 1);
+
+  // The cast target was missing in this copy; the queue is indexed as an
+  // array of dispatch packets.
+  hsa_kernel_dispatch_packet_t* queue_aql_packet =
+      &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))
+          [que_idx & queue_mask];
+
+  queue_aql_packet->workgroup_size_x = in_aql->workgroup_size_x;
+  queue_aql_packet->workgroup_size_y = in_aql->workgroup_size_y;
+  queue_aql_packet->workgroup_size_z = in_aql->workgroup_size_z;
+  queue_aql_packet->grid_size_x = in_aql->grid_size_x;
+  queue_aql_packet->grid_size_y = in_aql->grid_size_y;
+  queue_aql_packet->grid_size_z = in_aql->grid_size_z;
+  queue_aql_packet->private_segment_size = in_aql->private_segment_size;
+  queue_aql_packet->group_segment_size = in_aql->group_segment_size;
+  queue_aql_packet->kernel_object = in_aql->kernel_object;
+  queue_aql_packet->kernarg_address = in_aql->kernarg_address;
+  queue_aql_packet->completion_signal = in_aql->completion_signal;
+}
+
+// This function allocates memory from the kern_arg pool we already found, and
+// then sets the argument values needed by the kernel code.
+// `args` / `arg_size` describe the host-side argument block to copy.
+// On success bs->kern_arg_buffer holds the raw allocation (the pointer to
+// free), bs->kern_arg_address the aligned start of the arguments, and
+// *aql_buf_ptr receives that aligned address. On failure nothing stays
+// allocated.
+hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args,
+                                 size_t arg_size, void** aql_buf_ptr) {
+  void* kern_arg_buf = nullptr;
+  hsa_status_t err;
+  size_t buf_size;
+  size_t req_align;
+
+  // The kernel arguments must be written to memory at the correct alignment.
+  // We already queried the executable to get the correct alignment, which is
+  // stored in bs->kernarg_align. In case the memory returned from
+  // hsa_amd_memory_pool is not of the correct alignment, we request a little
+  // more than what we need in case we need to adjust.
+  req_align = bs->kernarg_align;
+  // Allocate enough extra space for alignment adjustments if necessary
+  buf_size = arg_size + (req_align << 1);
+
+  err = hsa_amd_memory_pool_allocate(bs->kern_arg_pool, buf_size, 0,
+                                     reinterpret_cast<void**>(&kern_arg_buf));
+  RET_IF_HSA_ERR(err);
+
+  // Address of the allocated buffer
+  bs->kern_arg_buffer = kern_arg_buf;
+
+  // Addr. of kern arg start.
+  bs->kern_arg_address = AlignUp(kern_arg_buf, req_align);
+
+  assert(arg_size >= bs->kernarg_size);
+  assert(((uintptr_t)bs->kern_arg_address + arg_size) <
+         ((uintptr_t)bs->kern_arg_buffer + buf_size));
+
+  (void)memcpy(bs->kern_arg_address, args, arg_size);
+
+  // Make sure both the CPU and GPU can access the kernel arguments
+  hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->kern_arg_buffer);
+
+  if (err != HSA_STATUS_SUCCESS) {
+    // Do not leave a half-initialized buffer behind; the free status is
+    // ignored because the access error is the one worth reporting.
+    (void)hsa_amd_memory_pool_free(bs->kern_arg_buffer);
+    bs->kern_arg_buffer = nullptr;
+    bs->kern_arg_address = nullptr;
+  }
+  RET_IF_HSA_ERR(err);
+
+  // Hand the aligned address back to the caller.
+  *aql_buf_ptr = bs->kern_arg_address;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// This wrapper atomically writes the provided header and setup to the
+// provided AQL packet. The provided AQL packet address should be in the
+// queue memory space.
+inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup, + hsa_kernel_dispatch_packet_t* queue_packet) { + __atomic_store_n(reinterpret_cast(queue_packet), + header | (setup << 16), __ATOMIC_RELEASE); +} + +// Once all the required data for kernel execution is collected (in this +// application it is stored in the BinarySearch structure) we can put it in +// an AQL packet and ring the queue door bell to tell the command processor to +// execute it. +hsa_status_t Run(BinarySearch* bs) { + hsa_status_t err; + + std::cout << "Executing kernel " << bs->kernel_name << std::endl; + + // Adjust the size of workgroup + // This is mostly application specific. + if (bs->work_group_size > 64) { + bs->work_group_size = 64; + bs->num_sub_divisions = bs->length / bs->work_group_size; + + if (bs->num_sub_divisions < bs->work_group_size) { + bs->num_sub_divisions = bs->work_group_size; + } + + bs->work_grid_size = bs->num_sub_divisions; + } + + // Explanation of BinarySearch algorithm. + /* + * Since a plain binary search on the GPU would not achieve much benefit + * over the GPU we are doing an N'ary search. We split the array into N + * segments every pass and therefore get log (base N) passes instead of log + * (base 2) passes. + * + * In every pass, only the thread that can potentially have the element we + * are looking for writes to the output array. For ex: if we are looking to + * find 4567 in the array and every thread is searching over a segment of + * 1000 values and the input array is 1, 2, 3, 4,... then the first thread + * is searching in 1 to 1000, the second one from 1001 to 2000, etc. The + * first one does not write to the output. The second one doesn't either. + * The fifth one however is from 4001 to 5000. So it can potentially have + * the element 4567 which lies between them. + * + * This particular thread writes to the output the lower bound, upper bound + * and whether the element equals the lower bound element. 
So, it would be + * 4001, 5000, 0 + * + * The next pass would subdivide 4001 to 5000 into smaller segments and + * continue the same process from there. + * + * When a pass returns 1 in the third element, it means the element has been + * found and we can stop executing the kernel. If the element is not found, + * then the execution stops after looking at segment of size 1. + */ + + uint32_t global_lower_bound = 0; + uint32_t global_upper_bound = bs->length - 1; + uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) / + bs->num_sub_divisions; + + if ((bs->input[0] > bs->find_me) || + (bs->input[bs->length - 1] < bs->find_me)) { + bs->output[0] = 0; + bs->output[1] = bs->length - 1; + bs->output[2] = 0; + std::cout << "Returning too early" << std::endl; + return HSA_STATUS_SUCCESS; + } + + bs->output[3] = 1; + + // Setup the kernel args + // See the meta-data for the compiled OpenCL kernel code to ascertain + // the sizes, padding and alignment required for kernel arguments. + // This can be seen by executing + // $ amdgcn-amd-amdhsa-readelf -aw ./binary_search_kernels.hsaco + // The kernel code will expect the following arguments aligned as shown. 
+ typedef uint32_t uint2[2]; + typedef uint32_t uint4[4]; + struct __attribute__((aligned(16))) local_args_t { + uint4* outputArray; + uint2* sortedArray; + uint32_t findMe; + uint32_t pad; + uint64_t global_offset_x; + uint64_t global_offset_y; + uint64_t global_offset_z; + } local_args; + + local_args.outputArray = reinterpret_cast(bs->output); + local_args.sortedArray = reinterpret_cast(bs->input_arr_local); + local_args.findMe = bs->find_me; + local_args.global_offset_x = 0; + local_args.global_offset_y = 0; + local_args.global_offset_z = 0; + + // Copy the kernel args structure into kernel arg memory + err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args), + &bs->kern_arg_address); + RET_IF_HSA_ERR(err); + + // Populate an AQL packet with the info we've gathered + hsa_kernel_dispatch_packet_t aql; + PopulateAQLPacket(bs, &aql); + + uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t); + + while ((sub_div_size > 1) && (bs->output[3] != 0)) { + for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) { + int idx1 = i * sub_div_size; + int idx2 = ((i + 1) * sub_div_size) - 1; + bs->input_arr[2 * i] = bs->input[idx1]; + bs->input_arr[2 * i + 1] = bs->input[idx2]; + } + + // Copy kernel parameter from system memory to local memory + err = AgentMemcpy(reinterpret_cast(bs->input_arr_local), + reinterpret_cast(bs->input_arr), + in_length, bs->gpu_dev, bs->cpu_dev); + + RET_IF_HSA_ERR(err); + + // Reset output buffer to zero + bs->output[3] = 0; + + // Dispatch kernel with global work size, work group size with ONE dimesion + // and wait for kernel to complete + + // Compute the write index of queue and copy Aql packet into it + uint64_t que_idx = hsa_queue_load_write_index_relaxed(bs->queue); + + const uint32_t mask = bs->queue->size - 1; + + // This function simply copies the data we've collected so far into our + // local AQL packet, except the the setup and header fields. 
+ WriteAQLToQueue(&aql, bs->queue); + + uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql_header |= HSA_FENCE_SCOPE_SYSTEM << + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + aql_header |= HSA_FENCE_SCOPE_SYSTEM << + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + + // Set the packet's type, acquire and release fences. This should be done + // atomically after all the other fields have been set, using release + // memory ordering to ensure all the fields are set when the door bell + // signal is activated. + void* q_base = bs->queue->base_address; + + AtomicSetPacketHeader(aql_header, aql.setup, + &(reinterpret_cast + (q_base))[que_idx & mask]); + + // Increment the write index and ring the doorbell to dispatch kernel. + hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1)); + hsa_signal_store_relaxed(bs->queue->doorbell_signal, que_idx); + + // Wait on the dispatch signal until the kernel is finished. + // Modify the wait condition to HSA_WAIT_STATE_ACTIVE (instead of + // HSA_WAIT_STATE_BLOCKED) if polling is needed instead of blocking, as we + // have below. + // The call below will block until the condition is met. Below we have said + // the condition is that the signal value (initiailzed to 1) associated with + // the queue is less than 1. When the kernel associated with the queued AQL + // packet has completed execution, the signal value is automatically + // decremented by the packet processor. + hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal, + HSA_SIGNAL_CONDITION_LT, 1, + UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // value should be 0, or we timed-out + if (value) { + std::cout << "Timed out waiting for kernel to complete?" << std::endl; + RET_IF_HSA_ERR(HSA_STATUS_ERROR); + } + + // Reset the signal to its initial value for the next iteration + hsa_signal_store_screlease(bs->signal, 1); + + // Binary search algorithm stuff... 
+ global_lower_bound = bs->output[0] * sub_div_size; + global_upper_bound = global_lower_bound + sub_div_size - 1; + sub_div_size = (global_upper_bound - global_lower_bound + 1) / + bs->num_sub_divisions; + } + + uint32_t element_index = UINT_MAX; + + for (uint32_t i = global_lower_bound; i <= global_upper_bound; i++) { + if (bs->input[i] == bs->find_me) { + element_index = i; + bs->output[0] = i; + bs->output[1] = i + 1; + bs->output[2] = 1; + break; + } + + // Element is not found in region specified + // by global lower bound to global upper bound + bs->output[2] = 0; + } + + uint32_t is_elem_found = bs->output[2]; + + std::cout << "Lower bound = " << global_lower_bound << std::endl; + std::cout << "Upper bound = " << global_upper_bound << std::endl; + std::cout << "Element search for = " << bs->find_me << std::endl; + + + if (is_elem_found == 1) { + std::cout << "Element found at index " << element_index << std::endl; + } else { + std::cout << "Element value " << bs->find_me << " not found" << std::endl; + } + + return HSA_STATUS_SUCCESS; +} + +// Release all the RocR resources we have acquired in this application. +hsa_status_t CleanUp(BinarySearch* bs) { + hsa_status_t err; + + err = hsa_amd_memory_pool_free(bs->input); + RET_IF_HSA_ERR(err); + + err = hsa_amd_memory_pool_free(bs->output); + RET_IF_HSA_ERR(err); + + err = hsa_amd_memory_pool_free(bs->input_arr); + RET_IF_HSA_ERR(err); + + err = hsa_amd_memory_pool_free(bs->kern_arg_buffer); + RET_IF_HSA_ERR(err); + + err = hsa_queue_destroy(bs->queue); + RET_IF_HSA_ERR(err); + + err = hsa_signal_destroy(bs->signal); + RET_IF_HSA_ERR(err); + + err = hsa_shut_down(); + RET_IF_HSA_ERR(err); + + return HSA_STATUS_SUCCESS; +} + +int main(int argc, char* argv[]) { + // This BinarySearch structure (bs) below holds all of the appl. specific + // info we need to run the sample. This includes algorithm specific + // information as well as handles to RocR/HSA objects. 
+ + // The basic structure of this sample is to fill in this structure with the + // required RocR/HSA handles to RocR resources (e.g., agents, memory pools, + // queues, etc.) and then dispatch the packets to the queue, and examine the + // output. + + BinarySearch bs; + hsa_status_t err; + + // Set some working values specific to this application + InitializeBinarySearch(&bs); + + // hsa_init() initializes internal data structures and causes devices + // (agents), memory pools and other resources to be discovered. + err = hsa_init(); + RET_IF_HSA_ERR(err); + + // Find the agents needed for the sample + err = FindDevices(&bs); + RET_IF_HSA_ERR(err); + + // Create the completion signal used when dispatching a packet + err = hsa_signal_create(1, 0, NULL, &bs.signal); + RET_IF_HSA_ERR(err); + + // Create a queue to submit our binary search AQL packets + err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, &bs.queue); + RET_IF_HSA_ERR(err); + + // Find the HSA memory pools we need to run this sample + err = FindPools(&bs); + RET_IF_HSA_ERR(err); + + // Allocate memory from the correct memory pool, and initialize them as + // neeeded for the algorihm. + err = AllocateAndInitBuffers(&bs); + RET_IF_HSA_ERR(err); + + // Create a kernel object from the pre-compiled kernel, and read some + // attributes associated with the kernel that we will need. + err = LoadKernelFromObjFile(&bs); + RET_IF_HSA_ERR(err); + + // Fill in the AQL packet, assign the kernel arguments, enqueue the packet, + // "ring" the doorbell, and wait for completion. + err = Run(&bs); + RET_IF_HSA_ERR(err); + + // Release all the RocR resources we've acquired and shutdown HSA. 
+ err = CleanUp(&bs); + + return 0; +} + + +#undef RET_IF_HSA_ERR diff --git a/runtime/hsa-ext-aql-profile/test/binary_search/binary_search_kernels.cl b/runtime/hsa-ext-aql-profile/test/binary_search/binary_search_kernels.cl new file mode 100755 index 0000000000..eb3cca6c86 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/binary_search/binary_search_kernels.cl @@ -0,0 +1,127 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +/** + * One instance of this kernel call is a thread. + * Each thread finds out the segment in which it should look for the element. + * After that, it checks if the element is between the lower bound and upper + * bound of its segment. If yes, then this segment becomes the total + * searchspace for the next pass. + * + * To achieve this, it writes the lower bound and upper bound to the output + * array. In case the element at the left end (lower bound) matches the element + * we are looking for, that is marked in the output and we no longer need to + * look any further. 
+ */ + +__kernel void +binarySearch(__global uint4 * outputArray, + __const __global uint2 * sortedArray, + const unsigned int findMe) { + unsigned int tid = get_global_id(0); + + // Then we find the elements for this thread + uint2 element = sortedArray[tid]; + + + // If the element to be found does not lie between + // them, then nothing left to do in this thread + if((element.x > findMe) || (element.y < findMe)) { + return; + } else { + // However, if the element does lie between the lower + // and upper bounds of this thread's searchspace + // we need to narrow down the search further in this + // search space + // The search space for this thread is marked in the + // output as being the total search space for the next pass + outputArray[0].x = tid; + outputArray[0].w = 1; + } +} + + +__kernel void +binarySearch_mulkeys(__global int *keys, + __global uint *input, + const unsigned int numKeys, + __global int *output) { + + int gid = get_global_id(0); + int lBound = gid * 256; + int uBound = lBound + 255; + + for(int i = 0; i < numKeys; i++) { + if(keys[i] >= input[lBound] && keys[i] <= input[uBound]) + output[i]=lBound; + } + +} + + +__kernel void +binarySearch_mulkeysConcurrent(__global uint *keys, + __global uint *input, + const unsigned int inputSize, // num. 
of inputs + const unsigned int numSubdivisions, + __global int *output) { + + int lBound = (get_global_id(0) % numSubdivisions) * (inputSize / numSubdivisions); + int uBound = lBound + inputSize / numSubdivisions; + int myKey = keys[get_global_id(0) / numSubdivisions]; + int mid; + + while(uBound >= lBound) { + mid = (lBound + uBound) / 2; + if(input[mid] == myKey) { + output[get_global_id(0) / numSubdivisions] = mid; + return; + } else if(input[mid] > myKey) { + uBound = mid - 1; + } else { + lBound = mid + 1; + } + } +} diff --git a/runtime/hsa-ext-aql-profile/test/common/CMakeLists.txt b/runtime/hsa-ext-aql-profile/test/common/CMakeLists.txt new file mode 100644 index 0000000000..a96d647976 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# Source files for Rocr Utils library +# +file( GLOB MODULE_SRC "*.cpp" ) + +# +# Header files include path(s). +# +include_directories ( $ENV{ROCR_INC_DIR} ) + +# +# Build Utils as a Static Library object +# +add_library( ${COMMON_LIB} STATIC ${MODULE_SRC} ) +target_link_libraries( ${COMMON_LIB} c stdc++ dl pthread rt ) diff --git a/runtime/hsa-ext-aql-profile/test/common/common.cpp b/runtime/hsa-ext-aql-profile/test/common/common.cpp new file mode 100644 index 0000000000..19d7383407 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/common.cpp @@ -0,0 +1,45 @@ +#include "common.hpp" + +void ErrorCheck(hsa_status_t hsa_error_code) { + if (hsa_error_code != HSA_STATUS_SUCCESS) { + std::cerr << "HSA reported error!" 
<< std::endl; + exit(EXIT_FAILURE); + } +} + +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t hsa_device_type; + hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (hsa_device_type == HSA_DEVICE_TYPE_GPU) { + *((hsa_agent_t*)data) = agent; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t FindHostRegion(hsa_region_t region, void* data) { + if (data == NULL) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + bool is_host_region = false; + hsa_status_t hsa_error_code = hsa_region_get_info( + region, (hsa_region_info_t)HSA_AMD_REGION_INFO_HOST_ACCESSIBLE, &is_host_region); + if (hsa_error_code != HSA_STATUS_SUCCESS) { + return hsa_error_code; + } + + if (is_host_region) { + *((hsa_region_t*)data) = region; + } + + return HSA_STATUS_SUCCESS; +} diff --git a/runtime/hsa-ext-aql-profile/test/common/common.hpp b/runtime/hsa-ext-aql-profile/test/common/common.hpp new file mode 100644 index 0000000000..ef9713831a --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/common.hpp @@ -0,0 +1,27 @@ +#ifndef COMMON_COMMON_HPP +#define COMMON_COMMON_HPP + +#include +#include + +#include "hsa.h" +#include "hsa_ext_finalize.h" +#include "hsa_ext_amd.h" + +#if defined(_MSC_VER) +#define ALIGNED_(x) __declspec(align(x)) +#else +#if defined(__GNUC__) +#define ALIGNED_(x) __attribute__((aligned(x))) +#endif // __GNUC__ +#endif // _MSC_VER + +#define MULTILINE(...) 
#__VA_ARGS__ + +void ErrorCheck(hsa_status_t hsa_error_code); + +hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data); + +hsa_status_t FindHostRegion(hsa_region_t region, void* data); + +#endif // COMMON_COMMON_HPP diff --git a/runtime/hsa-ext-aql-profile/test/common/helper_funcs.cpp b/runtime/hsa-ext-aql-profile/test/common/helper_funcs.cpp new file mode 100644 index 0000000000..71f7d1cd33 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/helper_funcs.cpp @@ -0,0 +1,262 @@ +/********************************************************************** +Copyright 2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+********************************************************************/ + +#include "helper_funcs.hpp" + +#ifndef _WIN32 +#include +#endif + + +/* + * Prints no more than 256 elements of the given array. + * Prints full array if length is less than 256. + * Prints Array name followed by elements. + */ +template +void printArray(const std::string header, const T* data, const int width, const int height) { + std::cout << header << " :\n"; + for (int i = 0; i < height; i++) { + std::cout << "> "; + for (int j = 0; j < width; j++) { + std::cout << data[i * width + j] << " "; + } + std::cout << "\n"; + } +} + +template +int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax, + unsigned int seed) { + if (!arrayPtr) { + error("Cannot fill array. NULL pointer."); + return HSA_SDK_FAILURE; + } + + if (!seed) seed = (unsigned int)time(NULL); + + srand(seed); + double range = double(rangeMax - rangeMin) + 1.0; + + /* random initialisation of input */ + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int index = i * width + j; + arrayPtr[index] = rangeMin + T(range * rand() / (RAND_MAX + 1.0)); + } + + return HSA_SDK_SUCCESS; +} + +template int fillPos(T* arrayPtr, const int width, const int height) { + if (!arrayPtr) { + error("Cannot fill array. NULL pointer."); + return HSA_SDK_FAILURE; + } + + /* initialisation of input with positions*/ + for (T i = 0; i < height; i++) + for (T j = 0; j < width; j++) { + T index = i * width + j; + arrayPtr[index] = index; + } + + return HSA_SDK_SUCCESS; +} + +template +int fillConstant(T* arrayPtr, const int width, const int height, const T val) { + if (!arrayPtr) { + error("Cannot fill array. 
NULL pointer."); + return HSA_SDK_FAILURE; + } + + /* initialisation of input with constant value*/ + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int index = i * width + j; + arrayPtr[index] = val; + } + + return HSA_SDK_SUCCESS; +} + +template T roundToPowerOf2(T val) { + int bytes = sizeof(T); + + val--; + for (int i = 0; i < bytes; i++) val |= val >> (1 << i); + val++; + + return val; +} + +template int isPowerOf2(T val) { + long long _val = val; + if ((_val & (-_val)) - _val == 0 && _val != 0) + return HSA_SDK_SUCCESS; + else + return HSA_SDK_FAILURE; +} + + +template bool checkVal(T input, T reference, std::string message, bool isAPIerror) { + if (input == reference) { + return true; + } else { + error(message); + return false; + } +} + + +template std::string toString(T t, std::ios_base& (*r)(std::ios_base&)) { + std::ostringstream output; + output << r << t; + return output.str(); +} + + +bool compare(const float* refData, const float* data, const int length, const float epsilon) { + float error = 0.0f; + float ref = 0.0f; + + for (int i = 1; i < length; ++i) { + float diff = refData[i] - data[i]; + error += diff * diff; + ref += refData[i] * refData[i]; + } + + float normRef = ::sqrtf((float)ref); + if (::fabs((float)ref) < 1e-7f) { + return false; + } + float normError = ::sqrtf((float)error); + error = normError / normRef; + + return error < epsilon; +} + +bool compare(const double* refData, const double* data, const int length, const double epsilon) { + double error = 0.0; + double ref = 0.0; + + for (int i = 1; i < length; ++i) { + double diff = refData[i] - data[i]; + error += diff * diff; + ref += refData[i] * refData[i]; + } + + double normRef = ::sqrt((double)ref); + if (::fabs((double)ref) < 1e-7) { + return false; + } + double normError = ::sqrt((double)error); + error = normError / normRef; + + return error < epsilon; +} + +void error(const char* errorMsg) { std::cout << "Error: " << errorMsg << std::endl; } + +void 
error(std::string errorMsg) { std::cout << "Error: " << errorMsg << std::endl; } + +void expectedError(const char* errorMsg) { + std::cout << "Expected Error: " << errorMsg << std::endl; +} + +void expectedError(std::string errorMsg) { + std::cout << "Expected Error: " << errorMsg << std::endl; +} + + +///////////////////////////////////////////////////////////////// +// Template Instantiations +///////////////////////////////////////////////////////////////// +template void printArray(const std::string, const short*, int, int); +template void printArray(const std::string, const unsigned char*, int, int); +template void printArray(const std::string, const unsigned int*, int, int); +template void printArray(const std::string, const int*, int, int); +template void printArray(const std::string, const long*, int, int); +template void printArray(const std::string, const float*, int, int); +template void printArray(const std::string, const double*, int, int); + +template int fillRandom(unsigned char* arrayPtr, const int width, const int height, + unsigned char rangeMin, unsigned char rangeMax, + unsigned int seed); +template int fillRandom(unsigned int* arrayPtr, const int width, const int height, + unsigned int rangeMin, unsigned int rangeMax, + unsigned int seed); +template int fillRandom(int* arrayPtr, const int width, const int height, int rangeMin, + int rangeMax, unsigned int seed); +template int fillRandom(long* arrayPtr, const int width, const int height, long rangeMin, + long rangeMax, unsigned int seed); +template int fillRandom(float* arrayPtr, const int width, const int height, float rangeMin, + float rangeMax, unsigned int seed); +template int fillRandom(double* arrayPtr, const int width, const int height, + double rangeMin, double rangeMax, unsigned int seed); + +template short roundToPowerOf2(short val); +template unsigned int roundToPowerOf2(unsigned int val); +template int roundToPowerOf2(int val); +template long roundToPowerOf2(long val); + +template 
int isPowerOf2(short val); +template int isPowerOf2(unsigned int val); +template int isPowerOf2(int val); +template int isPowerOf2(long val); + +template <> int fillPos(short* arrayPtr, const int width, const int height); +template <> int fillPos(unsigned int* arrayPtr, const int width, const int height); +template <> int fillPos(int* arrayPtr, const int width, const int height); +template <> int fillPos(long* arrayPtr, const int width, const int height); + +template <> +int fillConstant(short* arrayPtr, const int width, const int height, const short val); +template <> +int fillConstant(unsigned int* arrayPtr, const int width, const int height, const unsigned int val); +template <> int fillConstant(int* arrayPtr, const int width, const int height, const int val); +template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val); +template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val); +template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val); + + +template bool checkVal(char input, char reference, std::string message, bool isAPIerror); +template bool checkVal(bool input, bool reference, std::string message, bool isAPIerror); +template bool checkVal(std::string input, std::string reference, std::string message, + bool isAPIerror); +template bool checkVal(short input, short reference, std::string message, bool isAPIerror); +template bool checkVal(unsigned int input, unsigned int reference, + std::string message, bool isAPIerror); +template bool checkVal(int input, int reference, std::string message, bool isAPIerror); +template bool checkVal(long input, long reference, std::string message, bool isAPIerror); + + +template std::string toString(char t, std::ios_base& (*r)(std::ios_base&)); +template std::string toString(short t, std::ios_base& (*r)(std::ios_base&)); +template std::string toString(unsigned int t, std::ios_base& (*r)(std::ios_base&)); +template 
std::string toString(int t, std::ios_base& (*r)(std::ios_base&)); +template std::string toString(long t, std::ios_base& (*r)(std::ios_base&)); +template std::string toString(float t, std::ios_base& (*r)(std::ios_base&)); +template std::string toString(double t, std::ios_base& (*r)(std::ios_base&)); diff --git a/runtime/hsa-ext-aql-profile/test/common/helper_funcs.hpp b/runtime/hsa-ext-aql-profile/test/common/helper_funcs.hpp new file mode 100644 index 0000000000..c5d0e7ca80 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/helper_funcs.hpp @@ -0,0 +1,141 @@ +/********************************************************************** +Copyright 2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+********************************************************************/ +#ifndef HELPER_FUNCS_HPP_ +#define HELPER_FUNCS_HPP_ + +#define HSA_SDK_SUCCESS 0 +#define HSA_SDK_FAILURE 1 +#define HSA_SDK_EXPECTED_FAILURE 2 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * error + * constant function, Prints error messages + * @param errorMsg char* message + */ +void error(const char* errorMsg); + +/** + * error + * constant function, Prints error messages + * @param errorMsg std::string message + */ +void error(std::string errorMsg); + +/** + * expectedError + * constant function, Prints error messages + * @param errorMsg char* message + */ +void expectedError(const char* errorMsg); + +/** + * expectedError + * constant function, Prints error messages + * @param errorMsg string message + */ +void expectedError(std::string errorMsg); + +/** + * compare template version + * compare data to check error + * @param refData templated input + * @param data templated input + * @param length number of values to compare + * @param epsilon errorWindow + */ +bool compare(const float* refData, const float* data, const int length, + const float epsilon = 1e-6f); +bool compare(const double* refData, const double* data, const int length, + const double epsilon = 1e-6); + +/** + * printArray + * displays a array on std::out + */ +template +void printArray(const std::string header, const T* data, const int width, const int height); + + +/** + * fillRandom + * fill array with random values + */ +template +int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax, + unsigned int seed = 123); + +/** + * fillPos + * fill the specified positions + */ +template int fillPos(T* arrayPtr, const int width, const int height); + +/** + * fillConstant + * fill the array with constant value + */ +template int fillConstant(T* arrayPtr, const int width, const int height, const T 
val); + + +/** + * roundToPowerOf2 + * rounds to a power of 2 + */ +template T roundToPowerOf2(T val); + +/** + * isPowerOf2 + * checks if input is a power of 2 + */ +template int isPowerOf2(T val); + +/** + * checkVal + * Set default(isAPIerror) parameter to false + * if checkVaul is used to check otherthan OpenCL API error code + */ +template +bool checkVal(T input, T reference, std::string message, bool isAPIerror = true); + +/** + * toString + * convert a T type to string + */ +template std::string toString(T t, std::ios_base& (*r)(std::ios_base&)); + + +#endif diff --git a/runtime/hsa-ext-aql-profile/test/common/hsa_perf_cntrs.cpp b/runtime/hsa-ext-aql-profile/test/common/hsa_perf_cntrs.cpp new file mode 100644 index 0000000000..93e103ae9a --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/hsa_perf_cntrs.cpp @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "hsa.h" +#include "hsa_ext_profiler.h" +#include "amd_hsa_tools_interfaces.h" + +#include "hsa_perf_cntrs.hpp" + +using namespace std; + +void PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) { + assert((dispParam->pre_dispatch) && "Pre Dispatch Callback Param is Malformed"); + + hsa_ext_tools_pmu_t* perfMgr = reinterpret_cast(usrArg); + hsa_status_t status = + hsa_ext_tools_pmu_begin(*perfMgr, dispParam->queue, dispParam->aql_translation_handle, true); + assert((status == HSA_STATUS_SUCCESS) && "Error in beginning Perf Cntr Session"); +} + +void PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) { + assert((!dispParam->pre_dispatch) && "Post Dispatch Callback Param is Malformed"); + + hsa_ext_tools_pmu_t* perfMgr = reinterpret_cast(usrArg); + hsa_status_t status = + hsa_ext_tools_pmu_end(*perfMgr, dispParam->queue, dispParam->aql_translation_handle); + assert((status == HSA_STATUS_SUCCESS) && "Error in endning Perf Cntr Session"); +} + +// Constructor of the class 
+RocrPerfCntrApp::RocrPerfCntrApp() : perfMgr_(NULL) {} + +// Destructor of the class. Ideally it should delete the +// PMU and its counters +RocrPerfCntrApp::~RocrPerfCntrApp() {} + +// Return the number of perf counters +uint32_t RocrPerfCntrApp::GetNumPerfCntrs() { return uint32_t(cntrList_.size()); } + +// Return the handle of perf counter at specified index +CntrInfo* RocrPerfCntrApp::GetPerfCntr(uint32_t idx) { return cntrList_[idx]; } + +// Print the various fields of Perf Cntrs being programmed +bool RocrPerfCntrApp::PrintCntrs() { + CntrInfo* info; + int size = uint32_t(cntrList_.size()); + for (int idx = 0; idx < size; idx++) { + info = cntrList_[idx]; + std::cout << std::endl; + std::cout << "Rocr Perf Cntr Id: " << info->cntrId << std::endl; + std::cout << "Rocr Perf Cntr Name: " << info->cntrName << std::endl; + std::cout << "Rocr Perf Cntr Blk Id: " << info->blkId << std::endl; + std::cout << "Rocr Perf Cntr Value: " << info->cntrResult << std::endl; + std::cout << "Rocr Perf Cntr Validation: " << info->cnfType << std::endl; + std::cout << std::endl; + } + return true; +} + +// Initialize the list of perf counters +// block id of kHsaAiCounterBlockSQ = 14 == 0x0E +hsa_status_t RocrPerfCntrApp::Init(hsa_agent_t agent) { + // Initialize the list of Perf Cntrs + // Add SQ counter for number of waves + CntrInfo* info = NULL; + cntrList_.reserve(23); + + // Event for number of Waves + info = new CntrInfo(0x4, "SQ_SQ_PERF_SEL_WAVES", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF, + CntrValCnf_Exact); + cntrList_.push_back(info); + + // Event for number of Threads + info = new CntrInfo(0xE, "SQ_SQ_PERF_SEL_ITEMS", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF, + CntrValCnf_Exact); + cntrList_.push_back(info); + + + // Create an instance of Perf Mgr + hsa_status_t status; + status = hsa_ext_tools_create_pmu(agent, &perfMgr_); + assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr Mgr"); + + // Process each counter from the list as necessary + // each counter 
descriptor with its perf block handle + // and create an instance of counter in that block + uint32_t size = GetNumPerfCntrs(); + for (uint32_t idx = 0; idx < size; idx++) { + info = GetPerfCntr(idx); + + // Obtain the handle of perf block + if (info->blkHndl == NULL) { + status = hsa_ext_tools_get_counter_block_by_id(perfMgr_, info->blkId, &info->blkHndl); + assert((status == HSA_STATUS_SUCCESS) && "Error in getting Perf Cntr Blk Hndl"); + } + + // Create an instance of counter in the perf block + status = hsa_ext_tools_create_counter(info->blkHndl, &info->cntrHndl); + assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr in Perf Blk"); + + // Update the Event Index property of counter + uint32_t cntrProp = HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX; + status = hsa_ext_tools_set_counter_parameter(info->cntrHndl, cntrProp, sizeof(uint32_t), + (void*)&info->cntrId); + assert((status == HSA_STATUS_SUCCESS) && "Error in updating Perf Cntr Property Event Index"); + + // Enable the updated perf counter + status = hsa_ext_tools_set_counter_enabled(info->cntrHndl, true); + assert((status == HSA_STATUS_SUCCESS) && "Error in enabing Perf Cntr"); + } + + return status; +} + +// Register Pre and Post dispatch callbacks +void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) { + hsa_status_t status; + status = hsa_ext_tools_set_callback_functions(queue, PreDispatchCallback, PostDispatchCallback); + assert((status == HSA_STATUS_SUCCESS) && "Error in registering Pre & Post Dispatch Callbacks"); + status = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_); + assert((status == HSA_STATUS_SUCCESS) && + "Error in registering Pre & Post Dispatch Callback Params"); + return; +} + +// Wait for perf counter collection to complete +hsa_status_t RocrPerfCntrApp::Wait() { + hsa_status_t status; + status = hsa_ext_tools_pmu_wait_for_completion(perfMgr_, 5000); + assert((status == HSA_STATUS_SUCCESS) && "Error in Waiting for Perf Cntr Completion"); + 
return status; +} + +// Validate perf counter values +hsa_status_t RocrPerfCntrApp::Validate() { + // Retrieve the results of the different Perf Cntrs + // and validate them as configured + CntrInfo* info = NULL; + hsa_status_t status = HSA_STATUS_SUCCESS; + uint32_t size = GetNumPerfCntrs(); + for (uint32_t idx = 0; idx < size; idx++) { + info = GetPerfCntr(idx); + status = hsa_ext_tools_get_counter_result(info->cntrHndl, &info->cntrResult); + std::cout << "Value of Perf Cntr is: " << info->cntrResult << std::endl; + } + + return status; +} diff --git a/runtime/hsa-ext-aql-profile/test/common/hsa_perf_cntrs.hpp b/runtime/hsa-ext-aql-profile/test/common/hsa_perf_cntrs.hpp new file mode 100644 index 0000000000..3a2a2fbd42 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/hsa_perf_cntrs.hpp @@ -0,0 +1,110 @@ +#ifndef ROCR_PERF_CNTR_APP_H_ +#define ROCR_PERF_CNTR_APP_H_ + +#include +#include +#include +#include + +#include +#include +#include + +#include "hsa.h" +#include "hsa_ext_profiler.h" + +typedef enum CntrValCnfType { + + ///< no counter value validation should be performed + CntrValCnf_None, + + ///< counter value should be an exact match to expectedResult + CntrValCnf_Exact, + + ///< counter value should be greater than expectedResult + CntrValCnf_GreaterThan, + + ///< counter value should be less than expectedResult + CntrValCnf_LessThan + +} CntrValCnfType; + +/// Struct used to encapsulate Counter Info +typedef struct CntrInfo { + ///< Id of counter in hardware block + uint32_t cntrId; + + ///< Name of counter + char cntrName[72]; + + ///< Handle of perf counter + hsa_ext_tools_counter_t cntrHndl; + + ///< Id of hardware block containing the counter + uint32_t blkId; + + ///< Handle of counter block + hsa_ext_tools_counter_block_t blkHndl; + + ///< Expected value of perf counte + uint64_t expectedResult; + + ///< Value of perf counter expected + uint64_t cntrResult; + + ///< Type of validation upon completion of dispatch + CntrValCnfType cnfType; 
+ + CntrInfo(uint32_t cntrId, char* cntrName, void* cntrHndl, uint32_t blkId, void* blkHndl, + uint64_t expResult, uint64_t result, CntrValCnfType cnfType) { + this->cntrId = cntrId; + this->cntrHndl = cntrHndl; + this->blkId = blkId; + this->blkHndl = blkHndl; + this->expectedResult = expResult; + this->cntrResult = result; + this->cnfType = cnfType; + memcpy(this->cntrName, cntrName, strlen(cntrName)); + } + +} CntrInfo; + +class RocrPerfCntrApp { + public: + // Constructor of the class. Will initialize the list of perf counters + // that will be used to program the device + RocrPerfCntrApp(); + + // Destructor of the class + ~RocrPerfCntrApp(); + + // Return the number of perf counters + uint32_t GetNumPerfCntrs(); + + // Return the handle of perf counter at specified index + CntrInfo* GetPerfCntr(uint32_t idx); + + // Print the list of perf counters + bool PrintCntrs(); + + // Initialize the list of perf counters + hsa_status_t Init(hsa_agent_t agent); + + // Register Pre and Post dispatch callbacks + void RegisterCallbacks(hsa_queue_t* queue); + + // Wait for perf counter collection to complete + hsa_status_t Wait(); + + // Validate perf counter values + hsa_status_t Validate(); + + private: + // Number of queues to create + std::vector cntrList_; + + // Handle of Perf Cntr Manager + hsa_ext_tools_pmu_t perfMgr_; +}; + +#endif // ROCR_PERF_CNTR_APP_H_ diff --git a/runtime/hsa-ext-aql-profile/test/common/hsa_rsrc_factory.cpp b/runtime/hsa-ext-aql-profile/test/common/hsa_rsrc_factory.cpp new file mode 100644 index 0000000000..75c2614495 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/hsa_rsrc_factory.cpp @@ -0,0 +1,476 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "hsa.h" +#include "hsa_rsrc_factory.hpp" +#include "hsa_ext_finalize.h" +#include "hsa_ext_profiler.h" + +#include "common.hpp" + +using namespace std; + +// Provide access to command line arguments passed in by user +uint32_t 
hsa_cmdline_arg_cnt; +char** hsa_cmdline_arg_list; + +// Callback function to find and bind kernarg region of an agent +static hsa_status_t find_memregions(hsa_region_t region, void* data) { + hsa_region_global_flag_t flags; + hsa_region_segment_t segment_id; + + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); + if (segment_id != HSA_REGION_SEGMENT_GLOBAL) { + return HSA_STATUS_SUCCESS; + } + + AgentInfo* agent_info = (AgentInfo*)data; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { + agent_info->coarse_region = region; + } + + if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { + agent_info->kernarg_region = region; + } + + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the number of agents +static hsa_status_t get_hsa_agents(hsa_agent_t agent, void* data) { + // Copy handle of agent and increment number of agents reported + HsaRsrcFactory* rsrcFactory = reinterpret_cast(data); + + // Determine if device is a Gpu agent + hsa_status_t status; + hsa_device_type_t type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + if (type == HSA_DEVICE_TYPE_DSP) { + return HSA_STATUS_SUCCESS; + } + + if (type == HSA_DEVICE_TYPE_CPU) { + AgentInfo* agent_info = reinterpret_cast(malloc(sizeof(AgentInfo))); + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + rsrcFactory->AddAgentInfo(agent_info, false); + return HSA_STATUS_SUCCESS; + } + + // Device is a Gpu agent, build an instance of AgentInfo + AgentInfo* agent_info = reinterpret_cast(malloc(sizeof(AgentInfo))); + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + agent_info->max_wave_size = 0; + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + agent_info->max_queue_size = 0; + hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, 
&agent_info->max_queue_size); + agent_info->profile = hsa_profile_t(108); + hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + + // Initialize memory regions to zero + agent_info->kernarg_region.handle = 0; + agent_info->coarse_region.handle = 0; + + // Find and Bind Memory regions of the Gpu agent + hsa_agent_iterate_regions(agent, find_memregions, agent_info); + + // Save the instance of AgentInfo + rsrcFactory->AddAgentInfo(agent_info, true); + return HSA_STATUS_SUCCESS; +} + +// Definitions for Static Data members of the class +char* HsaRsrcFactory::brig_path_ = NULL; +uint32_t HsaRsrcFactory::num_cus_ = 4; +uint32_t HsaRsrcFactory::num_waves_; +uint32_t HsaRsrcFactory::num_workitems_; +uint32_t HsaRsrcFactory::kernel_loop_count_; +bool HsaRsrcFactory::print_debug_info_ = false; + +char* HsaRsrcFactory::num_cus_key_ = "num_cus"; +char* HsaRsrcFactory::brig_path_key_ = "brig_path"; +char* HsaRsrcFactory::num_waves_key_ = "waves_per_cu"; +char* HsaRsrcFactory::num_workitems_key_ = "workitems_per_wave"; +char* HsaRsrcFactory::print_debug_key_ = "print_debug"; +char* HsaRsrcFactory::kernel_loop_count_key_ = "kernel_loop_count"; + +// Constructor of the class +HsaRsrcFactory::HsaRsrcFactory() { + // Initialize the Hsa Runtime + hsa_status_t status = hsa_init(); + check("Error in hsa_init", status); + + // Discover the set of Gpu devices available on the platform + status = hsa_iterate_agents(get_hsa_agents, this); + check("Error Calling hsa_iterate_agents", status); + + // Process command line arguments + ProcessCmdline(); +} + +// Destructor of the class +HsaRsrcFactory::~HsaRsrcFactory() {} + +// Get the count of Hsa Gpu Agents available on the platform +// +// @return uint32_t Number of Gpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } + +// Get the count of Hsa Cpu Agents available on the platform +// +// @return uint32_t Number of Cpu agents on platform +// +uint32_t 
HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } + +// Get the AgentInfo handle of a Gpu device +// +// @param idx Gpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(gpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = gpu_list_[idx]; + return true; +} + +// Get the AgentInfo handle of a Cpu device +// +// @param idx Cpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(cpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = cpu_list_[idx]; + return true; +} + +// Create a Queue object and return its handle. The queue object is expected +// to support user requested number of Aql dispatch packets. +// +// @param agent_info Gpu Agent on which to create a queue object +// +// @param num_Pkts Number of packets to be held by queue +// +// @param queue Output parameter updated with handle of queue object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { + hsa_status_t status; + status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, queue); + return (status == HSA_STATUS_SUCCESS); +} + +// Create a Signal object and return its handle. 
+// +// @param value Initial value of signal object +// +// @param signal Output parameter updated with handle of signal object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { + hsa_status_t status; + status = hsa_signal_create(value, 0, NULL, signal); + return (status == HSA_STATUS_SUCCESS); +} + +// Allocate memory for use by a kernel of specified size in specified +// agent's memory region. Currently supports Global segment whose Kernarg +// flag set. +// +// @param agent_info Agent from whose memory region to allocate +// +// @param size Size of memory in terms of bytes +// +// @return uint8_t* Pointer to buffer, null if allocation fails. +// +uint8_t* HsaRsrcFactory::AllocateLocalMemory(AgentInfo* agent_info, size_t size) { + hsa_status_t status; + uint8_t* buffer = NULL; + + if (agent_info->coarse_region.handle != 0) { + // Allocate in local memory if it is available + status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW); + } + } else { + // Allocate in system memory if local memory is not available + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + } + + return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; +} + +// Allocate memory tp pass kernel parameters. +// +// @param agent_info Agent from whose memory region to allocate +// +// @param size Size of memory in terms of bytes +// +// @return uint8_t* Pointer to buffer, null if allocation fails. +// +uint8_t* HsaRsrcFactory::AllocateSysMemory(AgentInfo* agent_info, size_t size) { + hsa_status_t status; + uint8_t* buffer = NULL; + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + return (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; +} + +bool HsaRsrcFactory::TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length, + bool host_to_dev) { + hsa_status_t status; + status = hsa_memory_copy(dest_buff, src_buff, length); + return (status == HSA_STATUS_SUCCESS); +} + +// Fake method for compilation steps only +uint8_t* HsaRsrcFactory::AllocateMemory(AgentInfo* agent_info, size_t size) { + hsa_status_t status; + uint8_t* buffer = NULL; + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// +// @param agent_info Gpu device for which to finalize +// +// @param brig_path File path of the Assembled Brig file +// +// @param kernel_name Name of the kernel to finalize +// +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, + char* kernel_name, hsa_executable_symbol_t* code_desc) { + // Finalize the Hsail object into code object + hsa_status_t status; + hsa_code_object_t code_object; + + // Build the code object filename + std::string filename(brig_path); + std::cout << "Code object filename: " << filename << std::endl; + + // Open the file containing code object + std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate); + if (!codeStream) { + std::cout << "Error: failed to load " << filename << std::endl; + assert(false); + return false; + } + + // Allocate memory to read in code object from file + size_t size = std::string::size_type(codeStream.tellg()); + char* codeBuff = (char*)AllocateSysMemory(agent_info, size); + if (!codeBuff) { + std::cout << "Error: failed to allocate memory for code object." 
<< std::endl; + assert(false); + return false; + } + + // Read the code object into allocated memory + codeStream.seekg(0, std::ios::beg); + std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), codeBuff); + + // De-Serialize the code object that has been read into memory + status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object); + if (status != HSA_STATUS_SUCCESS) { + std::cout << "Failed to deserialize code object" << std::endl; + return false; + } + + // Create executable. + hsa_executable_t hsaExecutable; + // status = hsa_executable_create(agent_info->profile, + status = + hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable); + check("Error in creating executable object", status); + + // Load code object. + status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, ""); + check("Error in loading executable object", status); + + // Freeze executable. + status = hsa_executable_freeze(hsaExecutable, ""); + check("Error in freezing executable object", status); + + // Get symbol handle. 
+ hsa_executable_symbol_t kernelSymbol; + status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + check("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Add an instance of AgentInfo representing a Hsa Gpu agent +void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) { + // Add input to Gpu list + if (gpu) { + gpu_list_.push_back(agent_info); + return; + } + + // Add input to Cpu list + cpu_list_.push_back(agent_info); +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << header << " :" << std::endl; + + AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::cout << "> agent[" << idx << "] :" << std::endl; + std::cout << ">> Name : " << agent_info->name << std::endl; + std::cout << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::cout << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::cout << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; + } + return true; +} + +// Returns the file path where brig files is located. Value is +// available only after an instance has been built. +char* HsaRsrcFactory::GetBrigPath() { return HsaRsrcFactory::brig_path_; } + +// Returns the number of compute units present on platform +// Value is available only after an instance has been built. +uint32_t HsaRsrcFactory::GetNumOfCUs() { return HsaRsrcFactory::num_cus_; } + +// Returns the maximum number of waves that can be launched +// per compute unit. The actual number that can be launched +// is affected by resource availability +// +// Value is available only after an instance has been built. 
+uint32_t HsaRsrcFactory::GetNumOfWavesPerCU() { return HsaRsrcFactory::num_waves_; } + +// Returns the number of work-items that can execute per wave +// Value is available only after an instance has been built. +uint32_t HsaRsrcFactory::GetNumOfWorkItemsPerWave() { return HsaRsrcFactory::num_workitems_; } + +// Returns the number of times kernel loop body should execute. +// Value is available only after an instance has been built. +uint32_t HsaRsrcFactory::GetKernelLoopCount() { return HsaRsrcFactory::kernel_loop_count_; } + +// Returns boolean flag to indicate if debug info should be printed +// Value is available only after an instance has been built. +uint32_t HsaRsrcFactory::GetPrintDebugInfo() { return HsaRsrcFactory::print_debug_info_; } + +// Process command line arguments. The method will capture +// various user command line parameters for tests to use +void HsaRsrcFactory::ProcessCmdline() { + // Command line arguments are given + uint32_t idx; + uint32_t arg_idx; + for (idx = 1; idx < hsa_cmdline_arg_cnt; idx += 2) { + arg_idx = GetArgIndex((char*)hsa_cmdline_arg_list[idx]); + switch (arg_idx) { + case 0: + HsaRsrcFactory::brig_path_ = hsa_cmdline_arg_list[idx + 1]; + break; + case 1: + HsaRsrcFactory::num_cus_ = atoi(hsa_cmdline_arg_list[idx + 1]); + break; + case 2: + HsaRsrcFactory::num_waves_ = atoi(hsa_cmdline_arg_list[idx + 1]); + break; + case 3: + HsaRsrcFactory::num_workitems_ = atoi(hsa_cmdline_arg_list[idx + 1]); + break; + case 4: + HsaRsrcFactory::kernel_loop_count_ = atoi(hsa_cmdline_arg_list[idx + 1]); + break; + case 5: + HsaRsrcFactory::print_debug_info_ = true; + break; + } + } +} + +uint32_t HsaRsrcFactory::GetArgIndex(char* arg_value) { + // Map Brig file path to index zero + if (!strcmp(HsaRsrcFactory::brig_path_key_, arg_value)) { + return 0; + } + + // Map Number of Compute Units to index one + if (!strcmp(HsaRsrcFactory::num_cus_key_, arg_value)) { + return 1; + } + + // Map Number of Waves per CU to index two + if 
(!strcmp(HsaRsrcFactory::num_waves_key_, arg_value)) { + return 2; + } + + // Map Number of Workitems per Wave to index three + if (!strcmp(HsaRsrcFactory::num_workitems_key_, arg_value)) { + return 3; + } + + // Map Kernel Loop Count to index four + if (!strcmp(HsaRsrcFactory::kernel_loop_count_key_, arg_value)) { + return 4; + } + + // Map print debug info parameter + if (!strcmp(HsaRsrcFactory::print_debug_key_, arg_value)) { + return 5; + } + + return 108; +} + +void HsaRsrcFactory::PrintHelpMsg() { + std::cout << "Key for passing Brig filepath: " << HsaRsrcFactory::brig_path_key_ << std::endl; + std::cout << "Key for passing Number of Compute Units: " << HsaRsrcFactory::num_cus_key_ + << std::endl; + std::cout << "Key for passing Number of Waves per CU: " << HsaRsrcFactory::num_waves_key_ + << std::endl; + std::cout << "Key for passing Number of Workitems per Wave: " + << HsaRsrcFactory::num_workitems_key_ << std::endl; + std::cout << "Key for passing Kernel Loop Count: " << HsaRsrcFactory::kernel_loop_count_key_ + << std::endl; +} diff --git a/runtime/hsa-ext-aql-profile/test/common/hsa_rsrc_factory.hpp b/runtime/hsa-ext-aql-profile/test/common/hsa_rsrc_factory.hpp new file mode 100644 index 0000000000..5bab803783 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/hsa_rsrc_factory.hpp @@ -0,0 +1,262 @@ +#ifndef HSA_RSRC_FACTORY_H_ +#define HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include + +#include +#include +#include + +#include "hsatimer.h" +#include "hsa.h" +#include "hsa_ext_finalize.h" + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define check(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + exit(1); \ + } + +#define check_build(msg, status) \ + if (status != STATUS_SUCCESS) { \ + printf("%s\n", msg); \ + exit(1); \ + } + +// Provide access to command line arguments passed in by user +extern uint32_t hsa_cmdline_arg_cnt; +extern char** hsa_cmdline_arg_list; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +typedef struct { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // Memory region supporting kernel parameters + hsa_region_t coarse_region; + + // Memory region supporting kernel arguments + hsa_region_t kernarg_region; + +} AgentInfo; + +class HsaRsrcFactory { + public: + // Constructor of the class. 
Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + HsaRsrcFactory(); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Get the count of Hsa Gpu Agents available on the platform + // + // @return uint32_t Number of Gpu agents on platform + // + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // + // @return uint32_t Number of Cpu agents on platform + // + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // + // @param idx Gpu Agent at specified index + // + // @param agent_info Output parameter updated with AgentInfo + // + // @return bool true if successful, false otherwise + // + bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // + // @param idx Cpu Agent at specified index + // + // @param agent_info Output parameter updated with AgentInfo + // + // @return bool true if successful, false otherwise + // + bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // + // @param agent_info Gpu Agent on which to create a queue object + // + // @param num_Pkts Number of packets to be held by queue + // + // @param queue Output parameter updated with handle of queue object + // + // @return bool true if successful, false otherwise + // + bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. 
+ // + // @param value Initial value of signal object + // + // @param signal Output parameter updated with handle of signal object + // + // @return bool true if successful, false otherwise + // + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate memory for use by a kernel of specified size in specified + // agent's memory region. Currently supports Global segment whose Kernarg + // flag set. + // + // @param agent_info Agent from whose memory region to allocate + // + // @param size Size of memory in terms of bytes + // + // @return uint8_t* Pointer to buffer, null if allocation fails. + // + uint8_t* AllocateLocalMemory(AgentInfo* agent_info, size_t size); + uint8_t* AllocateMemory(AgentInfo* agent_info, size_t size); + + bool TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length, bool host_to_dev); + + // Allocate memory tp pass kernel parameters. + // + // @param agent_info Agent from whose memory region to allocate + // + // @param size Size of memory in terms of bytes + // + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ // + uint8_t* AllocateSysMemory(AgentInfo* agent_info, size_t size); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // + // @param agent_info Gpu device for which to finalize + // + // @param brig_path File path of the Assembled Brig file + // + // @param kernel_name Name of the kernel to finalize + // + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // + // @return bool true if successful, false otherwise + // + bool LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, char* kernel_name, + hsa_executable_symbol_t* code_desc); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + void AddAgentInfo(AgentInfo* agent_info, bool gpu); + + // Returns the file path where brig files is located + static char* GetBrigPath(); + + // Returns the number of compute units present on platform + static uint32_t GetNumOfCUs(); + + // Returns the maximum number of waves that can be launched + // per compute unit. The actual number that can be launched + // is affected by resource availability + static uint32_t GetNumOfWavesPerCU(); + + // Returns the number of work-items that can execute per wave + static uint32_t GetNumOfWorkItemsPerWave(); + + // Returns the number of times kernel loop body should execute. 
+ static uint32_t GetKernelLoopCount(); + + // Returns boolean flag to indicate if debug info should be printed + static uint32_t GetPrintDebugInfo(); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + private: + // Number of queues to create + uint32_t num_queues_; + + // Used to maintain a list of Hsa Queue handles + std::vector queue_list_; + + // Number of Signals to create + uint32_t num_signals_; + + // Used to maintain a list of Hsa Signal handles + std::vector signal_list_; + + // Number of agents reported by platform + uint32_t num_agents_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + + // Records the file path where Brig file is located. + // Value is available only after an instance has been built. + static char* brig_path_; + static char* brig_path_key_; + + // Records the number of Compute units present on system. + // Value is available only after an instance has been built. + static uint32_t num_cus_; + static char* num_cus_key_; + + // Records the number of waves that can be launched per Compute unit + // Value is available only after an instance has been built. + static uint32_t num_waves_; + static char* num_waves_key_; + + // Records the number of work-items that can be packed into a wave + // Value is available only after an instance has been built. + static uint32_t num_workitems_; + static char* num_workitems_key_; + + // Records the number of times kernel loop body should run. Value + // is available only after an instance has been built. + static uint32_t kernel_loop_count_; + static char* kernel_loop_count_key_; + + // Records the number of times kernel loop body should run. Value + // is available only after an instance has been built. + static bool print_debug_info_; + static char* print_debug_key_; + + // Process command line arguments. 
The method will capture + // various user command line parameters for tests to use + static void ProcessCmdline(); + + // Prints the help banner on user arg keys + static void PrintHelpMsg(); + + // Maps an index for the user argument + static uint32_t GetArgIndex(char* arg_value); +}; + +#endif // HSA_RSRC_FACTORY_H_ diff --git a/runtime/hsa-ext-aql-profile/test/common/hsatimer.cpp b/runtime/hsa-ext-aql-profile/test/common/hsatimer.cpp new file mode 100644 index 0000000000..fafaa77ef1 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/hsatimer.cpp @@ -0,0 +1,168 @@ +#include "hsatimer.h" + +PerfTimer::PerfTimer() { freq_in_100mhz = MeasureTSCFreqHz(); } + +PerfTimer::~PerfTimer() { + while (!_timers.empty()) { + Timer* temp = _timers.back(); + _timers.pop_back(); + delete temp; + } +} + +// a new cretaed timer instantance index will be returned +int PerfTimer::CreateTimer() { + Timer* newTimer = new Timer; + newTimer->_start = 0; + newTimer->_clocks = 0; + +#ifdef _WIN32 + QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->_freq); +#else + newTimer->_freq = (long long)1.0E3; +#endif + + /* Push back the address of new Timer instance created */ + _timers.push_back(newTimer); + return (int)(_timers.size() - 1); +} + +int PerfTimer::StartTimer(int index) { + if (index >= (int)_timers.size()) { + Error("Cannot reset timer. 
Invalid handle."); + return HSA_FAILURE; + } + +#ifdef _WIN32 +// General Windows timing method +#ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart)); + _timers[index]->_start = (double)tmpStart; +#else +// AMD Windows timing method + +#endif + +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + _timers[index]->_start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3)); +#else + + // AMD timing method + + unsigned int unused; + _timers[index]->_start = __rdtscp(&unused); + +#endif + +#endif + + return HSA_SUCCESS; +} + + +int PerfTimer::StopTimer(int index) { + double n = 0; + if (index >= (int)_timers.size()) { + Error("Cannot reset timer. Invalid handle."); + return HSA_FAILURE; + } +#ifdef _WIN32 +#ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER*)&(n1)); + n = (double)n1; +#else + +// AMD Window Timing + +#endif + +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3); +#else + // AMD Linux timing + + unsigned int unused; + n = __rdtscp(&unused); +#endif + +#endif + + n -= _timers[index]->_start; + _timers[index]->_start = 0; + +#ifndef _AMD + _timers[index]->_clocks += n; +#else + //_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns + _timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms + cout << "_AMD is enabled!!!" << endl; +#endif + + return HSA_SUCCESS; +} + +void PerfTimer::Error(string str) { cout << str << endl; } + + +double PerfTimer::ReadTimer(int index) { + if (index >= (int)_timers.size()) { + Error("Cannot read timer. 
Invalid handle."); + return HSA_FAILURE; + } + + double reading = double(_timers[index]->_clocks); + + reading = double(reading / _timers[index]->_freq); + + return reading; +} + + +uint64_t PerfTimer::CoarseTimestampUs() { +#ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER*)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; +#endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() { + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do { + tscTicksEnd = __rdtscp(&unused); + } while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. 
+ uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} diff --git a/runtime/hsa-ext-aql-profile/test/common/hsatimer.h b/runtime/hsa-ext-aql-profile/test/common/hsatimer.h new file mode 100644 index 0000000000..a8dd66d54f --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/common/hsatimer.h @@ -0,0 +1,68 @@ +#ifndef __MYTIME__ +#define __MYTIME__ + +// Will use AMD timer and general Linux timer based on users' need --> compilation flag +// need to consider platform is Windows or Linux + +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +#if defined(_MSC_VER) +#include +#include +#include +#else +#if defined(__GNUC__) +#include +#include +#endif // __GNUC__ +#endif //_MSC_VER + +#define HSA_FAILURE 1 +#define HSA_SUCCESS 0 + +class PerfTimer { + private: + struct Timer { + string name; /* < name name of time object*/ + long long _freq; /* < _freq frequency*/ + double _clocks; /* < _clocks number of ticks at end*/ + double _start; /* < _start start point ticks*/ + }; + + std::vector _timers; /*< _timers vector to Timer objects */ + double freq_in_100mhz; + + public: + PerfTimer(); + ~PerfTimer(); + + private: + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + // General Linux timing method + + public: + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + + public: + // retrieve time + double ReadTimer(int index); + // write into a file + double WriteTimer(int index); + + public: + void Error(string str); +}; + +#endif diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test.cpp b/runtime/hsa-ext-aql-profile/test/ctrl/test.cpp new file mode 100644 index 0000000000..5da9952fd1 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test.cpp @@ -0,0 +1,91 @@ 
+/****************************************************************************** + +Copyright 2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*******************************************************************************/ + +#include +#include "simple_convolution.h" +#include "test_hsa.h" +#include "test_pgen_pmc.h" +#include "test_pgen_sqtt.h" + +int main(int argc, char* argv[]) { +#if defined(NDEBUG) + clog.rdbuf(NULL); +#endif + + bool ret_val = true; + + // Create SimpleConvolution test object + TestKernel* test_kernel = new SimpleConvolution(); + TestAql* test_aql = new TestHSA(test_kernel); + + const bool pmc_enable = (getenv("ROCR_ENABLE_PMC") != NULL); + const bool sqtt_enable = (getenv("ROCR_ENABLE_SQTT") != NULL); + if (pmc_enable) + test_aql = new TestPGenPMC(test_aql); + else if (sqtt_enable) + test_aql = new TestPGenSQTT(test_aql); + assert(test_aql != NULL); + if (test_aql == NULL) return 1; + + // Initialization of Hsa Runtime + ret_val = test_aql->initialize(argc, argv); + if (ret_val == false) { + std::cout << "Error in the test initialization" << std::endl; + assert(ret_val); + return 1; + } + + // Setup Hsa resources needed for execution + ret_val = test_aql->setup(); + if (ret_val == false) { + std::cout << "Error in creating hsa resources" << std::endl; + assert(ret_val); + return 1; + } + + // Run SimpleConvolution kernel + ret_val = test_aql->run(); + if (ret_val == false) { + std::cout << "Error in running the test kernel" << std::endl; + assert(ret_val); + return 1; + } + + // Verify the results of the execution + ret_val = test_aql->verify_results(); + if (ret_val) { + std::cout << "Test : Passed" << std::endl; + } else { + std::cout << "Test : Failed" << std::endl; + } + + // Print time taken by sample + test_aql->print_time(); + test_aql->cleanup(); + + return (ret_val) ? 
0 : 1; +} diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_aql.h b/runtime/hsa-ext-aql-profile/test/ctrl/test_aql.h new file mode 100644 index 0000000000..a7ac177e20 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_aql.h @@ -0,0 +1,87 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*******************************************************************************/ + +#ifndef _TESTAQL_H_ +#define _TESTAQL_H_ + +#include "hsa.h" +#include "hsa_rsrc_factory.hpp" +#include "hsa_ext_amd_aql_profile.h" + +#define test_assert(cond) \ + { \ + if (cond) { \ + std::cout << "ASSERT FAILED: " << #cond << " : " << __FILE__ << "(" << __LINE__ << ")" \ + << std::endl; \ + abort(); \ + } \ + } + +// Test AQL interface +class TestAql { + TestAql* const test_aql; + + public: + TestAql(TestAql* t = 0) : test_aql(t) {} + virtual ~TestAql() {} + + TestAql* testAql() { return test_aql; } + virtual AgentInfo* getAgentInfo() { return (test_aql) ? test_aql->getAgentInfo() : 0; } + virtual hsa_queue_t* getQueue() { return (test_aql) ? test_aql->getQueue() : 0; } + virtual HsaRsrcFactory* getRsrcFactory() { return (test_aql) ? test_aql->getRsrcFactory() : 0; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + virtual bool initialize(int argc, char** argv) { + return (test_aql) ? test_aql->initialize(argc, argv) : true; + } + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + virtual bool setup() { return (test_aql) ? test_aql->setup() : true; } + + // Run the kernel + // @return bool true on success and false on failure + virtual bool run() { return (test_aql) ? test_aql->run() : true; } + + // Verify results + // @return bool true on success and false on failure + virtual bool verify_results() { return (test_aql) ? test_aql->verify_results() : true; } + + // Print to console the time taken to execute kernel + virtual void print_time() { + if (test_aql) test_aql->print_time(); + } + + // Release resources e.g. memory allocations + // @return bool true on success and false on failure + virtual bool cleanup() { return (test_aql) ? 
test_aql->cleanup() : true; } +}; + +#endif // _TESTAQL_H_ diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_hsa.cpp b/runtime/hsa-ext-aql-profile/test/ctrl/test_hsa.cpp new file mode 100644 index 0000000000..06c589b70d --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_hsa.cpp @@ -0,0 +1,234 @@ +/****************************************************************************** + +Copyright 2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*******************************************************************************/ + +#include "os.h" +#include "helper_funcs.hpp" +#include "hsa_rsrc_factory.hpp" +#include "test_hsa.h" + +bool TestHSA::initialize(int arg_cnt, char** arg_list) { + std::cout << "TestHSA::initialize :" << std::endl; + // Initialize command line arguments + hsa_cmdline_arg_cnt = arg_cnt; + hsa_cmdline_arg_list = arg_list; + + // Instantiate a Timer object + setup_timer_idx_ = hsa_timer_.CreateTimer(); + dispatch_timer_idx_ = hsa_timer_.CreateTimer(); + + // Instantiate an instance of Hsa Resources Factory + hsa_rsrc_ = new HsaRsrcFactory(); + + // Print properties of the agents + hsa_rsrc_->PrintGpuAgents("> GPU agents"); + + // Create an instance of Gpu agent + const char* p = getenv("ROCR_AGENT_IND"); + const uint32_t agent_ind = (p == NULL) ? 0 : atol(p); + if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) { + std::cout << "> error: agent[" << agent_ind << "] is not found" << std::endl; + return false; + } + std::cout << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + uint32_t num_pkts = 128; + hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_); + + // Obtain handle of signal + hsa_rsrc_->CreateSignal(1, &hsa_signal_); + + // Obtain the code object file name + std::string agentName(agent_info_->name); + if (agentName.compare(0, 4, "gfx8") == 0) { + brig_path_obj_.append("gfx8"); + } else if (agentName.compare(0, 4, "gfx9") == 0) { + brig_path_obj_.append("gfx9"); + } else { + assert(false); + return false; + } + brig_path_obj_.append("_" + name_ + ".hsaco"); + + return true; +} + +bool TestHSA::setup() { + std::cout << "TestHSA::setup :" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(setup_timer_idx_); + + mem_map_t& mem_map = test_->get_mem_map(); + for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { + mem_descr_t& des = it->second; + void* ptr = (des.local) ? 
hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size) + : hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + des.ptr = ptr; + assert(ptr != NULL); + if (ptr == NULL) return false; + } + test_->init(); + + // Load and Finalize Kernel Code Descriptor + char* brig_path = (char*)brig_path_obj_.c_str(); + const bool ret_val = + hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, strdup(name_.c_str()), &kernel_code_desc_); + if (ret_val == false) { + std::cout << "Error in loading and finalizing Kernel" << std::endl; + return ret_val; + } + + // Stop the timer object + hsa_timer_.StopTimer(setup_timer_idx_); + setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_); + total_time_taken_ = setup_time_taken_; + + return true; +} + +bool TestHSA::run() { + std::cout << "TestHSA::run :" << std::endl; + + const uint32_t work_group_size = 64; + const uint32_t work_grid_size = test_->get_elements_count(); + uint32_t group_segment_size = 0; + uint32_t private_segment_size = 0; + const size_t kernarg_segment_size = test_->get_kernarg_size(); + uint64_t code_handle = 0; + + // Retrieve the amount of group memory needed + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + + // Retrieve the amount of private memory needed + hsa_executable_symbol_get_info(kernel_code_desc_, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size); + + // Check the kernel args size + size_t size_info = 0; + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info); + assert(kernarg_segment_size == size_info); + if (kernarg_segment_size != size_info) return false; + + // Retrieve handle of the code block + hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &code_handle); + + // Initialize the dispatch packet. 
+ hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + // Set the packet's type, acquire and release fences + aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + // Populate Aql packet with default values + aql.setup = 1; + aql.grid_size_x = work_grid_size; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.workgroup_size_x = work_group_size; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + // Bind the kernel code descriptor and arguments + aql.kernel_object = code_handle; + aql.kernarg_address = test_->get_kernarg_ptr(); + aql.group_segment_size = group_segment_size; + aql.private_segment_size = private_segment_size; + // Initialize Aql packet with handle of signal + aql.completion_signal = hsa_signal_; + + // Compute the write index of queue and copy Aql packet into it + const uint64_t que_idx = hsa_queue_load_write_index_relaxed(hsa_queue_); + const uint32_t mask = hsa_queue_->size - 1; + + std::cout << "> Executing kernel: \"" << name_ << "\"" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(dispatch_timer_idx_); + + // Disable packet so that submission to HW is complete + const auto header = aql.header; + const uint8_t packet_type_mask = (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; + aql.header &= (~packet_type_mask) << HSA_PACKET_HEADER_TYPE; + aql.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; + + // Copy Aql packet into queue buffer + ((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask] = aql; + + // After AQL packet is fully copied into queue buffer + // update packet header from invalid state to valid state + std::atomic_thread_fence(std::memory_order_release); + ((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask].header = header; + + // Increment the write index and ring the doorbell to dispatch the 
kernel. + hsa_queue_store_write_index_relaxed(hsa_queue_, (que_idx + 1)); + hsa_signal_store_relaxed(hsa_queue_->doorbell_signal, que_idx); + + std::cout << "> Waiting on kernel dispatch signal" << std::endl; + + // Wait on the dispatch signal until the kernel is finished. + // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling + hsa_signal_value_t value = hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, + (uint64_t)-1, HSA_WAIT_STATE_BLOCKED); + + // Stop the timer object + hsa_timer_.StopTimer(dispatch_timer_idx_); + dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_); + total_time_taken_ += dispatch_time_taken_; + + // Copy kernel buffers from local memory into system memory + hsa_rsrc_->TransferData((uint8_t*)test_->get_output_ptr(), (uint8_t*)test_->get_local_ptr(), + test_->get_output_size(), false); + test_->print_output(); + + return true; +} + +bool TestHSA::verify_results() { + // Compare the results and see if they match + const int32_t cmp_val = + memcmp(test_->get_output_ptr(), test_->get_refout_ptr(), test_->get_output_size()); + return (cmp_val == 0); +} + +void TestHSA::print_time() { + std::cout << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_ + << std::endl; + std::cout << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_ + << std::endl; + std::cout << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_ + << std::endl; +} + +bool TestHSA::cleanup() { + // shutdown Hsa Runtime system + hsa_status_t ret_val = hsa_shut_down(); + return (HSA_STATUS_SUCCESS == ret_val); +} diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_hsa.h b/runtime/hsa-ext-aql-profile/test/ctrl/test_hsa.h new file mode 100644 index 0000000000..342d4be8bf --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_hsa.h @@ -0,0 +1,115 @@ +/****************************************************************************** + +Copyright ©2013 Advanced 
Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*******************************************************************************/ + +#ifndef _TEST_HSA_H_ +#define _TEST_HSA_H_ + +#include "test_aql.h" +#include "test_kernel.h" +#include "hsa_rsrc_factory.hpp" + +// Class implements HSA test +class TestHSA : public TestAql { + public: + // Constructor + TestHSA(TestKernel* test) : test_(test), name_(test->Name()) { + total_time_taken_ = 0; + setup_time_taken_ = 0; + dispatch_time_taken_ = 0; + } + + // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + AgentInfo* getAgentInfo() { return agent_info_; } + hsa_queue_t* getQueue() { return hsa_queue_; } + HsaRsrcFactory* getRsrcFactory() { return hsa_rsrc_; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + bool initialize(int argc, char** argv); + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + bool setup(); + + // Run the BinarySearch kernel + // @return bool true on success and false on failure + bool run(); + + // Verify against reference implementation + // @return bool true on success and false on failure + bool verify_results(); + + // Print to console the time taken to execute kernel + void print_time(); + + // Release resources e.g. 
memory allocations + // @return bool true on success and false on failure + bool cleanup(); + + private: + typedef TestKernel::mem_descr_t mem_descr_t; + typedef TestKernel::mem_map_t mem_map_t; + typedef TestKernel::mem_it_t mem_it_t; + + // Test object + TestKernel* test_; + + // Path of Brig file + std::string brig_path_obj_; + + // Used to track time taken to run the sample + double total_time_taken_; + double setup_time_taken_; + double dispatch_time_taken_; + + // Handle to an Hsa Gpu Agent + AgentInfo* agent_info_; + + // Handle to an Hsa Queue + hsa_queue_t* hsa_queue_; + + // Handle of signal + hsa_signal_t hsa_signal_; + + // Handle of Kernel Code Descriptor + hsa_executable_symbol_t kernel_code_desc_; + + // Instance of timer object + uint32_t setup_timer_idx_; + uint32_t dispatch_timer_idx_; + PerfTimer hsa_timer_; + + // Instance of Hsa Resources Factory + HsaRsrcFactory* hsa_rsrc_; + + // Test kernel name + std::string name_; +}; + +#endif // _TEST_HSA_H_ diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_kernel.h b/runtime/hsa-ext-aql-profile/test/ctrl/test_kernel.h new file mode 100644 index 0000000000..7af51e3c1f --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_kernel.h @@ -0,0 +1,105 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. 
#ifndef _TEST_KERNEL_H_
#define _TEST_KERNEL_H_

#include <stdint.h>

#include <cstddef>
#include <map>
#include <string>

// Class implements Kernel test.
// A concrete kernel test registers its buffers as memory descriptors keyed by
// the IDs below; the pointers are filled in later by whoever allocates the
// memory through the map returned by get_mem_map().
class TestKernel {
 public:
  // Memory descriptors IDs
  enum { INPUT_DES_ID, OUTPUT_DES_ID, LOCAL_DES_ID, MASK_DES_ID, KERNARG_DES_ID, REFOUT_DES_ID };

  // Memory descriptor: buffer address, size and whether local memory is requested
  struct mem_descr_t {
    void* ptr;
    uint32_t size;
    bool local;
  };

  // Memory map declaration: descriptor ID -> descriptor
  typedef std::map<uint32_t, mem_descr_t> mem_map_t;
  typedef mem_map_t::iterator mem_it_t;
  typedef mem_map_t::const_iterator mem_const_it_t;

  // Virtual destructor: the class is used polymorphically
  virtual ~TestKernel() {}

  // Initialize method
  virtual void init() = 0;

  // Return kernel memory map
  mem_map_t& get_mem_map() { return mem_map_; }

  // Return NULL descriptor (null pointer, zero size, not local)
  static mem_descr_t null_descriptor() {
    const mem_descr_t des = {NULL, 0, false};
    return des;
  }

  // Methods to get the kernel attributes.
  // A descriptor that was never registered yields a null pointer / zero size.
  void* get_kernarg_ptr() const { return get_descr(KERNARG_DES_ID).ptr; }
  uint32_t get_kernarg_size() const { return get_descr(KERNARG_DES_ID).size; }
  void* get_output_ptr() const { return get_descr(OUTPUT_DES_ID).ptr; }
  uint32_t get_output_size() const { return get_descr(OUTPUT_DES_ID).size; }
  void* get_local_ptr() const { return get_descr(LOCAL_DES_ID).ptr; }
  void* get_refout_ptr() const { return get_descr(REFOUT_DES_ID).ptr; }
  virtual uint32_t get_elements_count() const = 0;

  // Print output
  virtual void print_output() const = 0;

  // Return name
  virtual std::string Name() const = 0;

 protected:
  // Set system memory descriptor
  // @return bool false when a descriptor with this ID is already registered
  bool set_sys_descr(const uint32_t& id, const uint32_t& size) {
    return set_mem_descr(id, size, false);
  }

  // Set local memory descriptor
  // @return bool false when a descriptor with this ID is already registered
  bool set_local_descr(const uint32_t& id, const uint32_t& size) {
    return set_mem_descr(id, size, true);
  }

  // Get memory descriptor, the NULL descriptor when the ID is unknown
  mem_descr_t get_descr(const uint32_t& id) const {
    mem_const_it_t it = mem_map_.find(id);
    return (it != mem_map_.end()) ? it->second : null_descriptor();
  }

 private:
  // Set memory descriptor; an existing entry is left untouched
  bool set_mem_descr(const uint32_t& id, const uint32_t& size, const bool& local) {
    const mem_descr_t des = {NULL, size, local};
    auto ret = mem_map_.insert(mem_map_t::value_type(id, des));
    return ret.second;
  }

  // Kernel memory map object
  mem_map_t mem_map_;
};

#endif  // _TEST_KERNEL_H_
+ +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef _TEST_PGEN_H_ +#define _TEST_PGEN_H_ + +#include "test_pmgr.h" +#include "hsa_ext_amd_aql_profile.h" + +// SimpleConvolution: Class implements OpenCL SimpleConvolution sample +class TestPGen : public TestPMgr { + typedef hsa_ext_amd_aql_pm4_packet_t packet_t; + + protected: + packet_t* PrePacket() { return reinterpret_cast(&prePacket); } + packet_t* PostPacket() { return reinterpret_cast(&postPacket); } + + public: + TestPGen(TestAql* t) : TestPMgr(t) {} +}; + +#endif // _TEST_PGEN_H_ diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_pgen_pmc.h b/runtime/hsa-ext-aql-profile/test/ctrl/test_pgen_pmc.h new file mode 100644 index 0000000000..b3f5239c8b --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_pgen_pmc.h @@ -0,0 +1,142 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*******************************************************************************/ + +#ifndef _TEST_PGEN_PMC_H_ +#define _TEST_PGEN_PMC_H_ + +#include "test_pgen.h" + +hsa_status_t TestPGenPMC_Callback(hsa_ext_amd_aql_profile_info_type_t info_type, + hsa_ext_amd_aql_profile_info_data_t* info_data, + void* callback_data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + typedef std::vector passed_data_t; + reinterpret_cast(callback_data)->push_back(*info_data); + return status; +} + +// SimpleConvolution: Class implements OpenCL SimpleConvolution sample +class TestPGenPMC : public TestPGen { + const static uint32_t buffer_alignment = 0x1000; // 4K + + hsa_agent_t agent; + hsa_ext_amd_aql_profile_profile_t profile; + hsa_ext_amd_aql_profile_event_t events[2]; + + bool buildPackets() { return true; } + + bool dumpData() { + std::cout << "TestPGenPMC::dumpData :" << std::endl; + + typedef std::vector callback_data_t; + + callback_data_t data; + hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenPMC_Callback, &data); + for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) { + std::cout << "> sample(" << dec << it->sample_id << ") block(" + << it->pmc_data.event.block_name << "_" << it->pmc_data.event.block_index + << ") result(" << hex << it->pmc_data.result << ")" << std::endl; + } + + return true; + } + + public: + TestPGenPMC(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen PMC" << std::endl; } + + bool initialize(int arg_cnt, char** arg_list) { + if (!TestPMgr::initialize(arg_cnt, arg_list)) return false; + + hsa_status_t status; + hsa_agent_t agent; + uint32_t command_buffer_alignment; + uint32_t command_buffer_size; + uint32_t output_buffer_alignment; + uint32_t output_buffer_size; + + // GPU identificator + agent = getAgentInfo()->dev_id; + + // Instantiation of the profile object + // ////////////////////////////////////////////////////////////// + // Set the event fields + events[0].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ; + 
events[0].block_index = 0; + events[0].counter_id = 0x4; // SQ_SQ_PERF_SEL_WAVES + events[1].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ; + events[1].block_index = 0; + events[1].counter_id = 0xe; // SQ_SQ_PERF_SEL_ITEMS + + // Initialization the profile + memset(&profile, 0, sizeof(profile)); + profile.agent = agent; + profile.type = HSA_EXT_AQL_PROFILE_EVENT_PMC; + + // set enabled events list + profile.events = events; + profile.event_count = 2; + + // Profile buffers attributes + command_buffer_alignment = buffer_alignment; + status = hsa_ext_amd_aql_profile_get_info( + &profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size); + assert(status == HSA_STATUS_SUCCESS); + + output_buffer_alignment = buffer_alignment; + status = hsa_ext_amd_aql_profile_get_info(&profile, HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE, + &output_buffer_size); + assert(status == HSA_STATUS_SUCCESS); + + // Application is allocating the command buffer + // Allocate(command_buffer_alignment, command_buffer_size, + // MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA) + profile.command_buffer.ptr = + getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size); + profile.command_buffer.size = command_buffer_size; + + // Application is allocating the output buffer + // Allocate(output_buffer_alignment, output_buffer_size, + // MODE_HOST_ACC|MODE_DEV_ACC) + profile.output_buffer.ptr = + getRsrcFactory()->AllocateSysMemory(getAgentInfo(), output_buffer_size); + profile.output_buffer.size = output_buffer_size; + memset(profile.output_buffer.ptr, 0x77, output_buffer_size); + + // Populating the AQL start packet + status = hsa_ext_amd_aql_profile_start(&profile, PrePacket()); + assert(status == HSA_STATUS_SUCCESS); + if (status != HSA_STATUS_SUCCESS) return false; + + // Populating the AQL stop packet + status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket()); + assert(status == HSA_STATUS_SUCCESS); + + return (status == HSA_STATUS_SUCCESS); + } +}; + +#endif // 
_TEST_PGEN_PMC_H_ diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_pgen_sqtt.h b/runtime/hsa-ext-aql-profile/test/ctrl/test_pgen_sqtt.h new file mode 100644 index 0000000000..c4728258c7 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_pgen_sqtt.h @@ -0,0 +1,160 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*******************************************************************************/ + +#ifndef _TEST_PGEN_SQTT_H_ +#define _TEST_PGEN_SQTT_H_ + +#include +#include +#include + +#include "test_pgen.h" + +hsa_status_t TestPGenSQTT_Callback(hsa_ext_amd_aql_profile_info_type_t info_type, + hsa_ext_amd_aql_profile_info_data_t* info_data, + void* callback_data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + typedef std::vector passed_data_t; + reinterpret_cast(callback_data)->push_back(*info_data); + return status; +} + +// SimpleConvolution: Class implements OpenCL SimpleConvolution sample +class TestPGenSQTT : public TestPGen { + const static uint32_t buffer_alignment = 0x1000; // 4K + const static uint32_t buffer_size = 0x2000000; // 32M + + hsa_agent_t agent; + hsa_ext_amd_aql_profile_profile_t profile; + + bool buildPackets() { return true; } + + bool dumpData() { + std::cout << "TestPGenSQTT::dumpData :" << std::endl; + + typedef std::vector callback_data_t; + + callback_data_t data; + hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenSQTT_Callback, &data); + for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) { + std::cout << "> sample(" << dec << it->sample_id << ") ptr(" << hex << it->sqtt_data.ptr + << ") size(" << dec << it->sqtt_data.size << ")" << std::endl; + + void* sys_buf = getRsrcFactory()->AllocateSysMemory(getAgentInfo(), it->sqtt_data.size); + assert(sys_buf != NULL); + if (sys_buf == NULL) return HSA_STATUS_ERROR; + + hsa_status_t status = hsa_memory_copy(sys_buf, it->sqtt_data.ptr, it->sqtt_data.size); + assert(status == HSA_STATUS_SUCCESS); + if (status != HSA_STATUS_SUCCESS) return status; + + std::string file_name; + file_name.append("sqtt_dump_"); + file_name.append(std::to_string(it->sample_id)); + file_name.append(".txt"); + std::ofstream out_file; + out_file.open(file_name); + + // Write the buffer in terms of shorts (16 bits) + short* sqtt_data = (short*)sys_buf; + for (int i = 0; i < (it->sqtt_data.size / 
sizeof(short)); ++i) { + out_file << std::setw(4) << std::setfill('0') << std::hex << sqtt_data[i] << "\n"; + } + + out_file.close(); + } + + return true; + } + + public: + TestPGenSQTT(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen SQTT" << std::endl; } + + bool initialize(int arg_cnt, char** arg_list) { + if (!TestPMgr::initialize(arg_cnt, arg_list)) return false; + + hsa_status_t status; + hsa_agent_t agent; + uint32_t command_buffer_alignment; + uint32_t command_buffer_size; + uint32_t output_buffer_alignment; + uint32_t output_buffer_size; + + // GPU identificator + agent = getAgentInfo()->dev_id; + + // Instantiation of the profile object + // ////////////////////////////////////////////////////////////// + // Set the parameters + // parameters = ....; + + // Initialization the profile + memset(&profile, 0, sizeof(profile)); + profile.agent = agent; + profile.type = HSA_EXT_AQL_PROFILE_EVENT_SQTT; + + // set parameters + // profile.parameters = &event; + // profile.parameter_count = 1; + + // Profile buffers attributes + command_buffer_alignment = buffer_alignment; + status = hsa_ext_amd_aql_profile_get_info( + &profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size); + assert(status == HSA_STATUS_SUCCESS); + + output_buffer_alignment = buffer_alignment; + output_buffer_size = buffer_size; + + // Application is allocating the command buffer + // AllocateSystem(command_buffer_alignment, command_buffer_size, + // MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA) + profile.command_buffer.ptr = + getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size); + profile.command_buffer.size = command_buffer_size; + + // Application is allocating the output buffer + // AllocateLocal(output_buffer_alignment, output_buffer_size, + // MODE_DEV_ACC) + profile.output_buffer.ptr = + getRsrcFactory()->AllocateLocalMemory(getAgentInfo(), output_buffer_size); + profile.output_buffer.size = output_buffer_size; + + // Populating the AQL start 
packet + status = hsa_ext_amd_aql_profile_start(&profile, PrePacket()); + assert(status == HSA_STATUS_SUCCESS); + if (status != HSA_STATUS_SUCCESS) return false; + + // Populating the AQL stop packet + status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket()); + assert(status == HSA_STATUS_SUCCESS); + + return (status == HSA_STATUS_SUCCESS); + } +}; + +#endif // _TEST_PGEN_SQTT_H_ diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_pmgr.cpp b/runtime/hsa-ext-aql-profile/test/ctrl/test_pmgr.cpp new file mode 100644 index 0000000000..33c071bc17 --- /dev/null +++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_pmgr.cpp @@ -0,0 +1,98 @@ +/****************************************************************************** + +Copyright 2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+// NOTE(review): the header names on the two system include lines were missing;
+// they are restored to the headers this file visibly needs (assert() and
+// std::atomic_thread_fence) - confirm against the upstream file.
+#include <assert.h>
+#include <atomic>
+
+#include "test_pmgr.h"
+
+// Copies 'packet' into the queue slot at the current write index, publishes it
+// by rewriting its header, advances the write index and rings the doorbell.
+// The packet is first stored with an INVALID type so that a partially copied
+// packet is never seen with a valid header.
+// @param packet  packet to submit; it is copied, the caller's object is not modified
+// @return always true
+bool TestPMgr::addPacket(const packet_t* packet) {
+  const auto queue = getQueue();
+  packet_t aql_packet = *packet;
+
+  // Read the current write index and locate the slot the packet goes into.
+  // NOTE(review): the '& mask' wrap assumes queue->size is a power of two, and
+  // nothing here checks that the slot is free (queue not full) - confirm.
+  const uint64_t que_idx = hsa_queue_load_write_index_relaxed(queue);
+  const uint32_t mask = queue->size - 1;
+  packet_t* const slot = &((packet_t*)(queue->base_address))[que_idx & mask];
+
+  // Header value that is published last: only the vendor-specific type is set
+  const auto header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
+
+  // Mark the local copy INVALID. The type field mask is shifted into place
+  // BEFORE it is inverted; inverting first and shifting afterwards only gives
+  // the right mask when HSA_PACKET_HEADER_TYPE is 0.
+  const auto type_mask = ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1) << HSA_PACKET_HEADER_TYPE;
+  aql_packet.header &= ~type_mask;
+  aql_packet.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
+
+  // Copy the (still invalid) packet into the queue buffer
+  *slot = aql_packet;
+
+  // Only after the packet body is fully copied switch the header from the
+  // invalid state to the valid one
+  std::atomic_thread_fence(std::memory_order_release);
+  slot->header = header;
+
+  // Increment the write index and ring the doorbell to dispatch the packet.
+  hsa_queue_store_write_index_relaxed(queue, (que_idx + 1));
+  hsa_signal_store_relaxed(queue->doorbell_signal, que_idx);
+
+  return true;
+}
+
+// Runs the wrapped test, bracketed by the pre/post packets when
+// buildPackets() reports that there is something to submit.
+// @return always true
+// NOTE(review): the results of testAql()->run() and dumpData() are discarded;
+// propagate them once their contracts are confirmed.
+bool TestPMgr::run() {
+  // Build Aql Pkts
+  const bool active = buildPackets();
+  if (active) {
+    // Submit Pre-Dispatch Aql packet
+    addPacket(&prePacket);
+  }
+
+  testAql()->run();
+
+  if (active) {
+    // Re-arm the completion signal. It is created with value 1 and is waited
+    // on until it drops below 1, so without this store a second run() would
+    // return from the wait immediately, before the post packet completed.
+    hsa_signal_store_relaxed(postSignal, 1);
+
+    // Set post packet completion signal
+    postPacket.completion_signal = postSignal;
+
+    // Submit Post-Dispatch Aql packet
+    addPacket(&postPacket);
+
+    // Wait for Post-Dispatch packet to complete
+    hsa_signal_wait_acquire(postSignal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
+                            HSA_WAIT_STATE_BLOCKED);
+
+    // Dumping profiling data
+    dumpData();
+  }
+
+  return true;
+}
+
+// Initializes the base test and creates the post-packet completion signal
+// with an initial value of 1.
+// @return true if the signal was created
+// NOTE(review): the result of TestAql::initialize() is discarded - its return
+// type is not visible from this file; check it here once confirmed.
+// NOTE(review): no matching hsa_signal_destroy() for postSignal is visible in
+// this file.
+bool TestPMgr::initialize(int argc, char** argv) {
+  TestAql::initialize(argc, argv);
+  hsa_status_t status = hsa_signal_create(1, 0, NULL, &postSignal);
+  assert(status == HSA_STATUS_SUCCESS);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Wraps the test 't'; both signals start with a zero handle until
+// initialize() creates the real post signal.
+TestPMgr::TestPMgr(TestAql* t) : TestAql(t) {
+  dummySignal.handle = 0;
+  postSignal = dummySignal;
+}
diff --git a/runtime/hsa-ext-aql-profile/test/ctrl/test_pmgr.h b/runtime/hsa-ext-aql-profile/test/ctrl/test_pmgr.h
new file mode 100644
index 0000000000..274fa9ab47
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/test/ctrl/test_pmgr.h
@@ -0,0 +1,57 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _TEST_PMGR_H_
+#define _TEST_PMGR_H_
+
+#include "test_aql.h"
+#include "amd_aql_pm4_ib_packet.h"
+
+// TestPMgr: test wrapper that submits one packet before and one packet after
+// the wrapped test runs. Derived classes fill the two packets in
+// buildPackets() and consume the collected data in dumpData().
+class TestPMgr : public TestAql {
+ public:
+  // Packet type used for the pre/post packets
+  typedef amd_aql_pm4_ib_packet_t packet_t;
+
+ private:
+  // Copies the packet into the queue and rings the doorbell
+  bool addPacket(const packet_t* packet);
+
+ protected:
+  // Packet submitted before the wrapped test runs
+  packet_t prePacket;
+  // Packet submitted after the wrapped test has run
+  packet_t postPacket;
+  // Signal with a zero handle
+  hsa_signal_t dummySignal;
+  // Completion signal of the post packet
+  hsa_signal_t postSignal;
+
+  // Fills prePacket/postPacket; returning false disables packet submission.
+  // The default implementation builds nothing and returns false.
+  virtual bool buildPackets() { return false; }
+  // Called after the post packet has completed.
+  // The default implementation does nothing and returns false.
+  virtual bool dumpData() { return false; }
+  // Initializes the base test and creates postSignal
+  virtual bool initialize(int argc, char** argv);
+
+ public:
+  // Wraps the test 't'
+  TestPMgr(TestAql* t);
+  // Runs the wrapped test bracketed by the pre/post packets
+  bool run();
+};
+
+#endif  // _TEST_PMGR_H_
diff --git a/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.cl b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.cl
new file mode 100644
index 0000000000..23db8cc84f
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.cl
@@ -0,0 +1,81 @@
+/******************************************************************************
+
+Copyright 2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+********************************************************************************/
+
+/**
+ * SimpleConvolution is where each pixel of the output image
+ * is the weighted sum of the neighborhood pixels of the input image.
+ * The neighborhood is defined by the dimensions of the mask and the
+ * weight of each neighbor is defined by the mask itself.
+ * Each work-item computes one output element; work-items whose global id
+ * is not below width * height do nothing.
+ * @param  output           Output matrix after performing convolution
+ * @param  input            Input matrix on which convolution is to be performed
+ * @param  mask             Mask matrix with which the convolution is performed
+ * @param  inputDimensions  Dimensions of the input matrix (x = width, y = height)
+ * @param  maskDimensions   Dimensions of the mask matrix (x = width, y = height)
+ */
+__kernel void simpleConvolution(__global  uint  * output,
+                                __global  uint  * input,
+                                __global  float * mask,
+                                const     uint2   inputDimensions,
+                                const     uint2   maskDimensions) {
+
+    uint tid = get_global_id(0);
+
+    uint width  = inputDimensions.x;
+    uint height = inputDimensions.y;
+
+    // Work-items past the end of the image have nothing to compute; this also
+    // avoids the division by width below when the image is empty
+    if (tid >= width * height) {
+        return;
+    }
+
+    uint x = tid % width;
+    uint y = tid / width;
+
+    uint maskWidth  = maskDimensions.x;
+    uint maskHeight = maskDimensions.y;
+
+    // Half extents of the mask: hstep along x (columns), vstep along y (rows)
+    uint hstep = (maskWidth  - 1) / 2;
+    uint vstep = (maskHeight - 1) / 2;
+
+    // find the left, right, top and bottom indices such that
+    // the indices do not go beyond image boundaries
+    uint left   = (x < hstep) ? 0 : (x - hstep);
+    uint right  = ((x + hstep) >= width) ? width - 1 : (x + hstep);
+    uint top    = (y < vstep) ? 0 : (y - vstep);
+    uint bottom = ((y + vstep) >= height) ? height - 1 : (y + vstep);
+
+    // initializing weighted sum value
+    float sumFX = 0;
+
+    for (uint i = left; i <= right; ++i) {
+        for (uint j = top; j <= bottom; ++j) {
+            // performing weighted sum within the mask boundaries;
+            // (j + vstep - y) and (i + hstep - x) are the row/column inside the
+            // mask, written so that no intermediate value wraps below zero
+            uint maskIndex = (j + vstep - y) * maskWidth + (i + hstep - x);
+            uint index     = j * width + i;
+            sumFX += ((float)input[index] * mask[maskIndex]);
+        }
+    }
+
+    // To round to the nearest integer
+    sumFX += 0.5f;
+    output[tid] = (uint)sumFX;
+}
diff --git a/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.cpp b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.cpp
new file mode 100644
index 0000000000..e4e5704337
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.cpp
@@ -0,0 +1,157 @@
+/******************************************************************************
+
+Copyright 2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#include "helper_funcs.hpp"
+#include "simple_convolution.h"
+
+// Chooses the image and mask dimensions and registers the size of every
+// buffer the test uses (kernel arguments, input, output, local, mask and
+// reference output).
+SimpleConvolution::SimpleConvolution() {
+  width_ = 64;
+  height_ = 64;
+  mask_width_ = 3;
+  mask_height_ = mask_width_;
+
+  // Image dimensions are rounded to a power of two
+  if (!isPowerOf2(width_)) {
+    width_ = roundToPowerOf2(width_);
+  }
+
+  if (!isPowerOf2(height_)) {
+    height_ = roundToPowerOf2(height_);
+  }
+
+  // Mask dimensions are forced to be odd so that the mask has a center cell
+  if (!(mask_width_ % 2)) {
+    mask_width_++;
+  }
+
+  if (!(mask_height_ % 2)) {
+    mask_height_++;
+  }
+
+  // Images with fewer than 256 elements fall back to 64x64
+  if (width_ * height_ < 256) {
+    width_ = 64;
+    height_ = 64;
+  }
+
+  const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t);
+  const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float);
+
+  set_sys_descr(KERNARG_DES_ID, sizeof(kernel_args_t));
+  set_sys_descr(INPUT_DES_ID, input_size_bytes);
+  set_sys_descr(OUTPUT_DES_ID, input_size_bytes);
+  set_local_descr(LOCAL_DES_ID, input_size_bytes);
+  set_sys_descr(MASK_DES_ID, mask_size_bytes);
+  set_sys_descr(REFOUT_DES_ID, input_size_bytes);
+}
+
+// Fills the input image and the mask, writes the kernel arguments and
+// computes the reference output on the CPU.
+// NOTE(review): assumes every descriptor returned by get_descr() already has
+// its buffer allocated (non-null ptr) - confirm against TestKernel.
+void SimpleConvolution::init() {
+  std::cout << "SimpleConvolution::init :" << std::endl;
+
+  mem_descr_t input_des = get_descr(INPUT_DES_ID);
+  mem_descr_t local_des = get_descr(LOCAL_DES_ID);
+  mem_descr_t mask_des = get_descr(MASK_DES_ID);
+  mem_descr_t refout_des = get_descr(REFOUT_DES_ID);
+  mem_descr_t kernarg_des = get_descr(KERNARG_DES_ID);
+
+  uint32_t* input = (uint32_t*)input_des.ptr;
+  uint32_t* output_local = (uint32_t*)local_des.ptr;
+  float* mask = (float*)mask_des.ptr;
+  kernel_args_t* kernel_args = (kernel_args_t*)kernarg_des.ptr;
+
+  // random initialisation of input
+  fillRandom(input, width_, height_, 0, 255);
+
+  // Fill a blur filter: a cross made of the middle row and the middle column.
+  // The cross has mask_width_ + mask_height_ - 1 non-zero cells (the center is
+  // shared), so this weight makes the mask sum to one for any mask shape.
+  const float val = 1.0f / (mask_width_ + mask_height_ - 1.0f);
+  for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) {
+    mask[i] = 0;
+  }
+  for (uint32_t i = 0; i < mask_width_; i++) {
+    uint32_t y = mask_height_ / 2;
+    mask[y * mask_width_ + i] = val;
+  }
+  for (uint32_t i = 0; i < mask_height_; i++) {
+    uint32_t x = mask_width_ / 2;
+    mask[i * mask_width_ + x] = val;
+  }
+
+  // Print the first row of the INPUT array and the whole mask.
+  printArray("> Input[0]", input, width_, 1);
+  printArray("> Mask", mask, mask_width_, mask_height_);
+
+  // Fill the kernel args
+  kernel_args->arg1 = output_local;
+  kernel_args->arg2 = input;
+  kernel_args->arg3 = mask;
+  kernel_args->arg4 = width_;
+  kernel_args->arg41 = height_;
+  kernel_args->arg5 = mask_width_;
+  kernel_args->arg51 = mask_height_;
+
+  // Calculate the reference output
+  memset(refout_des.ptr, 0, refout_des.size);
+  reference_impl((uint32_t*)refout_des.ptr, input, mask, width_, height_, mask_width_,
+                 mask_height_);
+}
+
+// Prints the first row of the output buffer
+void SimpleConvolution::print_output() const {
+  printArray("> Output[0]", (uint32_t*)get_output_ptr(), width_, 1);
+}
+
+// Reference CPU implementation of the convolution.
+// @param output       Output matrix, width * height elements
+// @param input        Input matrix, width * height elements
+// @param mask         Mask matrix, mask_width * mask_height elements
+// @param width        Width of the input matrix
+// @param height       Height of the input matrix
+// @param mask_width   Width of the mask matrix
+// @param mask_height  Height of the mask matrix
+// @return always true
+bool SimpleConvolution::reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
+                                       const uint32_t width, const uint32_t height,
+                                       const uint32_t mask_width, const uint32_t mask_height) {
+  // Half extents of the mask: hstep along x (columns), vstep along y (rows)
+  const uint32_t hstep = (mask_width - 1) / 2;
+  const uint32_t vstep = (mask_height - 1) / 2;
+
+  // for each pixel in the input
+  for (uint32_t x = 0; x < width; x++) {
+    for (uint32_t y = 0; y < height; y++) {
+      // find the left, right, top and bottom indices such that
+      // the indices do not go beyond image boundaries
+      const uint32_t left = (x < hstep) ? 0 : (x - hstep);
+      const uint32_t right = ((x + hstep) >= width) ? width - 1 : (x + hstep);
+      const uint32_t top = (y < vstep) ? 0 : (y - vstep);
+      const uint32_t bottom = ((y + vstep) >= height) ? height - 1 : (y + vstep);
+
+      // initializing weighted sum value
+      float sum_fx = 0;
+      for (uint32_t i = left; i <= right; ++i) {
+        for (uint32_t j = top; j <= bottom; ++j) {
+          // performing weighted sum within the mask boundaries;
+          // (j + vstep - y) and (i + hstep - x) are the row/column inside the
+          // mask, written so that no intermediate value wraps below zero
+          const uint32_t mask_idx = (j + vstep - y) * mask_width + (i + hstep - x);
+          const uint32_t index = j * width + i;
+
+          sum_fx += ((float)input[index] * mask[mask_idx]);
+        }
+      }
+      // to round to the nearest integer
+      sum_fx += 0.5f;
+      output[y * width + x] = uint32_t(sum_fx);
+    }
+  }
+
+  return true;
+}
diff --git a/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.h b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.h
new file mode 100644
index 0000000000..27f3271428
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.h
@@ -0,0 +1,90 @@
+/******************************************************************************
+
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*******************************************************************************/
+
+#ifndef _SIMPLE_CONVOLUTION_H_
+#define _SIMPLE_CONVOLUTION_H_
+
+// NOTE(review): the header names on the two system include lines were missing;
+// they are restored to the headers this file visibly needs (uint32_t and
+// std::string) - confirm against the upstream file.
+#include <stdint.h>
+#include <string>
+
+#include "test_kernel.h"
+
+// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
+class SimpleConvolution : public TestKernel {
+ public:
+  // Constructor
+  SimpleConvolution();
+
+  // Initialize method
+  void init();
+
+  // Return number of compute elements (one per input element)
+  uint32_t get_elements_count() const { return width_ * height_; }
+
+  // Print output
+  void print_output() const;
+
+  // Return name
+  std::string Name() const { return std::string("simpleConvolution"); }
+
+ private:
+  // Local kernel arguments declaration; init() fills the fields as noted
+  struct kernel_args_t {
+    void* arg1;      // output buffer
+    void* arg2;      // input buffer
+    void* arg3;      // mask buffer
+    uint32_t arg4;   // input width
+    uint32_t arg41;  // input height
+    uint32_t arg5;   // mask width
+    uint32_t arg51;  // mask height
+  };
+
+  // Width of the Input array
+  uint32_t width_;
+
+  // Height of the Input array
+  uint32_t height_;
+
+  // Width of the mask
+  uint32_t mask_width_;
+
+  // Height of the mask
+  uint32_t mask_height_;
+
+  // Reference CPU implementation of Simple Convolution
+  // @param output       Output matrix after performing convolution
+  // @param input        Input matrix on which convolution is to be performed
+  // @param mask         Mask matrix with which the convolution is performed
+  // @param width        Width of the input matrix
+  // @param height       Height of the input matrix
+  // @param mask_width   Width of the mask matrix
+  // @param mask_height  Height of the mask matrix
+  // @return bool true on success and false on failure
+  bool reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
+                      const uint32_t width, const uint32_t height, const uint32_t mask_width,
+                      const uint32_t mask_height);
+};
+
+#endif  // _SIMPLE_CONVOLUTION_H_
diff --git a/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.hsail b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.hsail
new file mode 100644
index 0000000000..223ef8eddb
--- /dev/null
+++ b/runtime/hsa-ext-aql-profile/test/simple_convolution/simple_convolution.hsail
@@ -0,0 +1,154 @@
+module &m:1:0:$full:$large:$default;
+extension "amd:gcn";
+extension "IMAGE";
+
+decl prog function &abort()();
+
+// Kernel &__OpenCL_SimpleConvolution.
+// Kernel arguments, in order: global offset, output pointer, input pointer,
+// mask pointer, inputDimensions[2], maskDimensions[2].
+// NOTE(review): the basic-block comments (%entry, %for.body35, ...) suggest
+// this is compiler output for the simple convolution kernel - confirm before
+// editing by hand.
+prog kernel &__OpenCL_SimpleConvolution(kernarg_u64 %__global_offset_0,
+    kernarg_u64 %output,
+    kernarg_u64 %input,
+    kernarg_u64 %mask,
+    kernarg_u32 %inputDimensions[2],
+    kernarg_u32 %maskDimensions[2]) {
+
+    pragma "AMD RTI", "ARGSTART:__OpenCL_SimpleConvolution";
+    pragma "AMD RTI", "version:3:1:104";
+    pragma "AMD RTI", "device:generic";
+    pragma "AMD RTI", "uniqueid:1024";
+    pragma "AMD RTI", "memory:private:0";
+    pragma "AMD RTI", "memory:region:0";
+    pragma "AMD RTI", "memory:local:0";
+    pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0";
+    pragma "AMD RTI", "pointer:output:u32:1:1:96:uav:7:4:RW:0:0:0";
+    pragma "AMD RTI", "pointer:input:u32:1:1:112:uav:7:4:RW:0:0:0";
+    pragma "AMD RTI", "pointer:mask:float:1:1:128:uav:7:4:RW:0:0:0";
+    pragma "AMD RTI", "value:inputDimensions:u32:2:1:144";
+    pragma "AMD RTI", "constarg:4:inputDimensions";
+    pragma "AMD RTI", "value:maskDimensions:u32:2:1:160";
+    pragma "AMD RTI", "constarg:5:maskDimensions";
+    pragma "AMD RTI", "function:1:0";
+    pragma "AMD RTI", "memory:64bitABI";
+    pragma "AMD RTI", "privateid:8";
+    pragma "AMD RTI", "enqueue_kernel:0";
+    pragma "AMD RTI", "kernel_index:0";
+    pragma "AMD RTI", "reflection:0:size_t";
+    pragma "AMD RTI", "reflection:1:uint*";
+    pragma "AMD RTI", "reflection:2:uint*";
+    pragma "AMD RTI", "reflection:3:float*";
+    pragma "AMD RTI", "reflection:4:uint2";
+    pragma "AMD RTI", "reflection:5:uint2";
+    pragma "AMD RTI", "ARGEND:__OpenCL_SimpleConvolution";
+
+    @__OpenCL_SimpleConvolution_Entry:
+
+    // BB#0: // %entry
+
+    workitemabsid_u32 $s6, 0;
+    cvt_u64_u32 $d0, $s6;
+    ld_kernarg_align(8)_width(all)_u64 $d4, [%__global_offset_0];
+    add_u64 $d0, $d0, $d4;
+    cvt_u32_u64 $s5, $d0;
+    ld_v2_kernarg_align(4)_width(all)_u32 ($s0, $s4), [%inputDimensions];
+    ld_v2_kernarg_align(4)_width(all)_u32 ($s1, $s9), [%maskDimensions];
+    rem_u32 $s7, $s5, $s0;
+    add_u32 $s2, $s1, 4294967295;
+    shr_u32 $s8, $s2, 1;
+    add_u32 $s2, $s7, $s8;
+    add_u32 $s3, $s0, 4294967295;
+    cmp_ge_b1_u32 $c0, $s2, $s0;
+    cmov_b32 $s2, $c0, $s3, $s2;
+    sub_u32 $s3, $s7, $s8;
+    cmp_lt_b1_u32 $c0, $s7, $s8;
+    cmov_b32 $s3, $c0, 0, $s3;
+    ld_kernarg_align(8)_width(all)_u64 $d1, [%output];
+    cmp_le_b1_u32 $c0, $s3, $s2;
+    cbr_b1 $c0, @BB0_2;
+
+    // BB#1:
+
+    mov_b32 $s6, 0;
+    br @BB0_6;
+
+    // @BB0_2: // %for.cond32.preheader.lr.ph
+
+    @BB0_2:
+
+    div_u32 $s5, $s5, $s0;
+    add_u32 $s9, $s9, 4294967295;
+    shr_u32 $s9, $s9, 1;
+    add_u32 $s10, $s5, $s9;
+    add_u32 $s11, $s4, 4294967295;
+    cmp_ge_b1_u32 $c0, $s10, $s4;
+    cmov_b32 $s4, $c0, $s11, $s10;
+    sub_u32 $s10, $s5, $s9;
+    cmp_lt_b1_u32 $c0, $s5, $s9;
+    cmov_b32 $s5, $c0, 0, $s10;
+    ld_kernarg_align(8)_width(all)_u64 $d2, [%mask];
+    ld_kernarg_align(8)_width(all)_u64 $d3, [%input];
+    cvt_u64_u32 $d5, $s6;
+    add_u64 $d4, $d4, $d5;
+    cvt_u32_u64 $s6, $d4;
+    div_u32 $s6, $s6, $s0;
+    max_u32 $s10, $s9, $s6;
+    sub_u32 $s12, $s10, $s6;
+    max_u32 $s11, $s7, $s8;
+    mov_b32 $s6, 0;
+    mad_u32 $s12, $s1, $s12, $s11;
+    sub_u32 $s7, $s12, $s7;
+    sub_u32 $s9, $s10, $s9;
+    mad_u32 $s9, $s0, $s9, $s11;
+    sub_u32 $s8, $s9, $s8;
+
+    // @BB0_3: // %for.cond32.preheader
+
+    @BB0_3:
+
+    cmp_gt_b1_u32 $c0, $s5, $s4;
+    mov_b32 $s9, $s7;
+    mov_b32 $s10, $s8;
+    mov_b32 $s11, $s5;
+    cbr_b1 $c0, @BB0_5;
+
+    // @BB0_4: // %for.body35
+
+    @BB0_4:
+
+    cvt_u64_u32 $d4, $s9;
+    shl_u64 $d4, $d4, 2;
+    add_u64 $d4, $d2, $d4;
+    ld_global_align(4)_f32 $s12, [$d4];
+    cvt_u64_u32 $d4, $s10;
+    shl_u64 $d4, $d4, 2;
+    add_u64 $d4, $d3, $d4;
+    ld_global_align(4)_u32 $s13, [$d4];
+    cvt_f32_u32 $s13, $s13;
+    mul_ftz_f32 $s12, $s13, $s12;
+    add_u32 $s9, $s9, $s1;
+    add_u32 $s10, $s10, $s0;
+    add_u32 $s11, $s11, 1;
+    add_ftz_f32 $s6, $s6, $s12;
+    cmp_le_b1_u32 $c0, $s11, $s4;
+    cbr_b1 $c0, @BB0_4;
+
+    // @BB0_5: // %for.inc48
+
+    @BB0_5:
+
+    add_u32 $s7, $s7, 1;
+    add_u32 $s8, $s8, 1;
+    add_u32 $s3, $s3, 1;
+    cmp_le_b1_u32 $c0, $s3, $s2;
+    cbr_b1 $c0, @BB0_3;
+
+    // @BB0_6: // %for.end50
+
+    @BB0_6:
+
+    and_b64 $d0, $d0, 4294967295;
+    shl_u64 $d0, $d0, 2;
+    add_u64 $d0, $d1, $d0;
+    add_ftz_f32 $s0, $s6, 0F3f000000;
+    cvt_ftz_u32_f32 $s0, $s0;
+    st_global_align(4)_u32 $s0, [$d0];
+    ret;
+};