Adding HSA extension AMD AQL profile library, see Readme.txt
Change-Id: Icbc1e0fb0185642eabbab411a2138ea030d22be8
Этот коммит содержится в:
коммит произвёл
Evgeny Shcherbakov
родитель
da831502ab
Коммит
25035b8d09
@@ -0,0 +1,28 @@
|
||||
#
|
||||
# Minimum version of cmake required
|
||||
#
|
||||
cmake_minimum_required ( VERSION 3.5.0 )
|
||||
|
||||
#
|
||||
# Setup flag to be verbose or not
|
||||
#
|
||||
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
|
||||
|
||||
set ( ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
|
||||
set ( PROJ_DIR ${ROOT_DIR}/src )
|
||||
set ( TEST_DIR ${ROOT_DIR}/test )
|
||||
|
||||
#
|
||||
# Build sources
|
||||
#
|
||||
include ( ${PROJ_DIR}/CMakeLists.txt )
|
||||
|
||||
#
|
||||
# Build tests
|
||||
#
|
||||
add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test )
|
||||
|
||||
#
|
||||
# Style format
|
||||
#
|
||||
# Run clang-format in place over every C/C++ source and header under ROOT_DIR.
# The -name tests are grouped with '(' ... ')' because find's implicit AND binds
# tighter than -o; without the grouping -exec applies to the last test ('*.h') only.
execute_process ( COMMAND sh -xc "/usr/bin/find ${ROOT_DIR} '(' -name '*.cpp' -o -name '*.hpp' -o -name '*.h' ')' -exec /usr/bin/clang-format -i -style=file \{\} \;" )
|
||||
@@ -0,0 +1,40 @@
|
||||
HSA extension AMD AQL profile library.
|
||||
Provides AQL packets helper methods for
|
||||
perfcounters (PMC) and SQ threadtraces (SQTT).
|
||||
|
||||
Current library implementation supports only GFX9.
|
||||
The library source tree:
|
||||
- doc - Documentation, the API specification and the presentation
|
||||
- inc - Public API
|
||||
- hsa_ext_amd_aql_profile.h - AMD AQL profile library public API
|
||||
- amd_aql_pm4_ib_packet.h - AQL PM4 IB packet type
|
||||
- src - AMD AQL profile library sources
|
||||
- aqlprofile - AMD AQL profile library
|
||||
- commandwriter - PM4 command writer originated from 'hsa-runtime/tools'
|
||||
- perfcounter - PM4 perfcounter manager originated from 'hsa-runtime/tools'
|
||||
- threadtrace - PM4 threadtrace manager originated from 'hsa-runtime/tools'
|
||||
- util - core/utils library build based on 'hsa-runtime/core/util'
|
||||
- test - the library test suite
|
||||
- ctrl - Test control
|
||||
- common - Test common utils
|
||||
- SimpleConvolution - Simple convolution test
|
||||
|
||||
To build the library:
|
||||
|
||||
$ cd ..../hsa-ext-aql-profile
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake ..
|
||||
$ make
|
||||
|
||||
To run the test:
|
||||
|
||||
$ cd ..../hsa-ext-aql-profile/build
|
||||
$ cp ../test/SimpleConvolution/gfx9_SimpleConvolution.hsaco .
|
||||
$ test/SimpleConvolution
|
||||
|
||||
to enable PMC profiling:
|
||||
export ROCR_ENABLE_PMC=1
|
||||
|
||||
to enable SQTT profiling:
|
||||
export ROCR_ENABLE_SQTT=1
|
||||
@@ -0,0 +1,66 @@
|
||||
#
|
||||
# Compiler Preprocessor definitions.
|
||||
#
|
||||
add_definitions ( -D__linux__ )
|
||||
add_definitions ( -DUNIX_OS )
|
||||
add_definitions ( -DLINUX )
|
||||
add_definitions ( -D__AMD64__ )
|
||||
add_definitions ( -D__x86_64__ )
|
||||
add_definitions ( -DAMD_INTERNAL_BUILD )
|
||||
add_definitions ( -DLITTLEENDIAN_CPU=1 )
|
||||
add_definitions ( -DHSA_LARGE_MODEL= )
|
||||
add_definitions ( -DHSA_DEPRECATED= )
|
||||
|
||||
#
|
||||
# Linux Compiler options
|
||||
#
|
||||
# Append to, rather than overwrite, the incoming flags so that options supplied
# by the user or a toolchain file (CXXFLAGS, -DCMAKE_CXX_FLAGS=...) are kept
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=sign-compare" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=enum-compare" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment " )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pointer-arith" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-comment" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pointer-arith" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-write-strings" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-conversion-null" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" )
|
||||
|
||||
#
|
||||
# Extend Compiler flags based on build type
|
||||
#
|
||||
set ( CMAKE_BUILD_TYPE ${BUILD_TYPE} )
|
||||
if ( "${CMAKE_BUILD_TYPE}" STREQUAL Debug )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" )
|
||||
endif ()
|
||||
|
||||
#
|
||||
# Extend Compiler flags based on Processor architecture
|
||||
#
|
||||
if ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
|
||||
elseif ( CMAKE_SYSTEM_PROCESSOR STREQUAL "x86" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" )
|
||||
endif ()
|
||||
|
||||
#
|
||||
# Basic Tool Chain Information
|
||||
#
|
||||
message ( "-------------IS64BIT: " ${IS64BIT} )
|
||||
message ( "-----------BuildType: " ${BUILD_TYPE} )
|
||||
message ( " -----------Compiler: " ${CMAKE_CXX_COMPILER} )
|
||||
message ( " ------------Version: " ${CMAKE_CXX_COMPILER_VERSION} )
|
||||
message ( " ------------ProjDir: " ${PROJ_DIR} )
|
||||
# Report the test directory (was printing PROJ_DIR under the TestDir label)
message ( " ------------TestDir: " ${TEST_DIR} )
|
||||
message ( "------HSA-RuntimeDir: " ${HSA_RUNTIME_DIR} )
|
||||
message ( " -----------CoreUtil: " ${CORE_UTIL_DIR} )
|
||||
@@ -0,0 +1,52 @@
|
||||
#
|
||||
# Build is not supported on Windows platform
|
||||
#
|
||||
if ( WIN32 )
|
||||
message ( FATAL_ERROR "Windows build is not supported." )
|
||||
endif ()
|
||||
|
||||
#
|
||||
# External dependencies for Rocr Header files
|
||||
#
|
||||
#
# Both the ROCR header directory (ROCR_INC_DIR) and the ROCR library
# directory (ROCR_LIB_DIR) must be provided through the environment.
# FATAL_ERROR stops processing, so nothing needs to follow the message.
#
foreach ( rocr_env_name ROCR_INC_DIR ROCR_LIB_DIR )
  if ( NOT DEFINED ENV{${rocr_env_name}} )
    message ( FATAL_ERROR "ERROR: Environment variable ${rocr_env_name} is not set" )
  endif ()
endforeach ()
|
||||
|
||||
#
|
||||
# Process Env to determine build type
|
||||
#
|
||||
# Release is the default; a case-insensitive "debug" value of the
# ROCR_BLD_TYPE environment variable selects a Debug build instead
set ( ISDEBUG 0 )
set ( BUILD_TYPE "Release" )
string ( TOLOWER "$ENV{ROCR_BLD_TYPE}" type )
if ( "${type}" STREQUAL debug )
  set ( ISDEBUG 1 )
  set ( BUILD_TYPE "Debug" )
endif ()
|
||||
|
||||
#
|
||||
# Determine build is 32-bit or 64-bit
|
||||
# @note: By default it is not set
|
||||
#
|
||||
# 64-bit is the default; ROCR_BLD_BITS=32 in the environment selects 32-bit.
# ONLY64STR is the name suffix used when composing library names.
set ( ONLY64STR "64" )
set ( IS64BIT 1 )
if ( "$ENV{ROCR_BLD_BITS}" STREQUAL 32 )
  set ( ONLY64STR "" )
  set ( IS64BIT 0 )
endif ()
|
||||
|
||||
#
|
||||
# Build information
|
||||
#
|
||||
message ( "---------ROCR-HdrDir: " $ENV{ROCR_INC_DIR} )
|
||||
message ( "---------ROCR-LibDir: " $ENV{ROCR_LIB_DIR} )
|
||||
Двоичный файл не отображается.
Двоичный файл не отображается.
@@ -0,0 +1,67 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Copyright 2017 ADVANCED MICRO DEVICES, INC.
|
||||
//
|
||||
// AMD is granting you permission to use this software and documentation(if any)
|
||||
// (collectively, the "Materials") pursuant to the terms and conditions of the
|
||||
// Software License Agreement included with the Materials.If you do not have a
|
||||
// copy of the Software License Agreement, contact your AMD representative for a
|
||||
// copy.
|
||||
//
|
||||
// You agree that you will not reverse engineer or decompile the Materials, in
|
||||
// whole or in part, except as allowed by applicable law.
|
||||
//
|
||||
// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF
|
||||
// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY,
|
||||
// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE
|
||||
// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM
|
||||
// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF
|
||||
// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
|
||||
// of implied warranties, so the above exclusion may not apply to You.
|
||||
//
|
||||
// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT,
|
||||
// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT,
|
||||
// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF
|
||||
// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN
|
||||
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total
|
||||
// liability to You for all damages, losses, and causes of action (whether in
|
||||
// contract, tort (including negligence) or otherwise) exceed the amount of $100
|
||||
// USD. You agree to defend, indemnify and hold harmless AMD and its licensors,
|
||||
// and any of their directors, officers, employees, affiliates or agents from
|
||||
// and against any and all loss, damage, liability and other expenses (including
|
||||
// reasonable attorneys' fees), resulting from Your use of the Software or
|
||||
// violation of the terms and conditions of this Agreement.
|
||||
//
|
||||
// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
|
||||
// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
|
||||
// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
|
||||
// 7013, et seq., or its successor.Use of the Materials by the Government
|
||||
// constitutes acknowledgement of AMD's proprietary rights in them.
|
||||
//
|
||||
// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
|
||||
// stated in the Software License Agreement.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _AMD_AQL_PM4_IB_PACKET_H_
|
||||
#define _AMD_AQL_PM4_IB_PACKET_H_
|
||||
|
||||
// Value of 'pm4_ib_format' field of amd_aql_pm4_ib_packet_t packet
|
||||
const static uint32_t AMD_AQL_PM4_IB_FORMAT = 1;
|
||||
// Value of 'dw_count_remain' field of amd_aql_pm4_ib_packet_t packet
|
||||
const static uint32_t AMD_AQL_PM4_IB_DW_COUNT_REMAIN = 10;
|
||||
// Size of 'reserved' array of amd_aql_pm4_ib_packet_t packet
|
||||
const static uint32_t AMD_AQL_PM4_IB_RESERVED_COUNT = 8;
|
||||
|
||||
// AQL Vendor Specific Packet which carry PM4 IB command
|
||||
typedef struct {
|
||||
uint16_t header;
|
||||
uint16_t pm4_ib_format;
|
||||
uint32_t pm4_ib_command[4];
|
||||
uint32_t dw_count_remain;
|
||||
uint32_t reserved[AMD_AQL_PM4_IB_RESERVED_COUNT];
|
||||
hsa_signal_t completion_signal;
|
||||
} amd_aql_pm4_ib_packet_t;
|
||||
|
||||
#endif // _AMD_AQL_PM4_IB_PACKET_H_
|
||||
@@ -0,0 +1,262 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Copyright 2017 ADVANCED MICRO DEVICES, INC.
|
||||
//
|
||||
// AMD is granting you permission to use this software and documentation(if any)
|
||||
// (collectively, the "Materials") pursuant to the terms and conditions of the
|
||||
// Software License Agreement included with the Materials.If you do not have a
|
||||
// copy of the Software License Agreement, contact your AMD representative for a
|
||||
// copy.
|
||||
//
|
||||
// You agree that you will not reverse engineer or decompile the Materials, in
|
||||
// whole or in part, except as allowed by applicable law.
|
||||
//
|
||||
// WARRANTY DISCLAIMER : THE SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF
|
||||
// ANY KIND.AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY,
|
||||
// INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON - INFRINGEMENT, THAT THE
|
||||
// SOFTWARE WILL RUN UNINTERRUPTED OR ERROR - FREE OR WARRANTIES ARISING FROM
|
||||
// CUSTOM OF TRADE OR COURSE OF USAGE.THE ENTIRE RISK ASSOCIATED WITH THE USE OF
|
||||
// THE SOFTWARE IS ASSUMED BY YOU.Some jurisdictions do not allow the exclusion
|
||||
// of implied warranties, so the above exclusion may not apply to You.
|
||||
//
|
||||
// LIMITATION OF LIABILITY AND INDEMNIFICATION : AMD AND ITS LICENSORS WILL NOT,
|
||||
// UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY PUNITIVE, DIRECT,
|
||||
// INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM USE OF
|
||||
// THE SOFTWARE OR THIS AGREEMENT EVEN IF AMD AND ITS LICENSORS HAVE BEEN
|
||||
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.In no event shall AMD's total
|
||||
// liability to You for all damages, losses, and causes of action (whether in
|
||||
// contract, tort (including negligence) or otherwise) exceed the amount of $100
|
||||
// USD. You agree to defend, indemnify and hold harmless AMD and its licensors,
|
||||
// and any of their directors, officers, employees, affiliates or agents from
|
||||
// and against any and all loss, damage, liability and other expenses (including
|
||||
// reasonable attorneys' fees), resulting from Your use of the Software or
|
||||
// violation of the terms and conditions of this Agreement.
|
||||
//
|
||||
// U.S.GOVERNMENT RESTRICTED RIGHTS : The Materials are provided with
|
||||
// "RESTRICTED RIGHTS." Use, duplication, or disclosure by the Government is
|
||||
// subject to the restrictions as set forth in FAR 52.227 - 14 and DFAR252.227 -
|
||||
// 7013, et seq., or its successor.Use of the Materials by the Government
|
||||
// constitutes acknowledgement of AMD's proprietary rights in them.
|
||||
//
|
||||
// EXPORT RESTRICTIONS: The Materials may be subject to export restrictions as
|
||||
// stated in the Software License Agreement.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _HSA_EXT_AMD_AQL_PROFILE_H_
|
||||
#define _HSA_EXT_AMD_AQL_PROFILE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <hsa.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// Library API:
|
||||
// The library provides helper methods for instantiation of
|
||||
// the profile context object and for populating of the start
|
||||
// and stop AQL packets. The profile object contains a profiling
|
||||
// events list and needed for profiling buffers descriptors,
|
||||
// a command buffer and an output data buffer. To check if there
|
||||
// was an error the library methods return a status code. Also
|
||||
// the library provides methods for querying required buffers
|
||||
// attributes, to validate the event attributes and to get profiling
|
||||
// output data.
|
||||
//
|
||||
// Returned status:
|
||||
// hsa_status_t – HSA status codes are used from hsa.h header
|
||||
//
|
||||
// Supported profiling features:
|
||||
//
|
||||
// Supported profiling events
|
||||
typedef enum {
|
||||
HSA_EXT_AQL_PROFILE_EVENT_PMC,
|
||||
HSA_EXT_AQL_PROFILE_EVENT_SQTT
|
||||
} hsa_ext_amd_aql_profile_event_type_t;
|
||||
|
||||
// Supported performance counters (PMC) blocks
|
||||
// The block ID is the same for a block instances set, for example
|
||||
// each block instance from the TCC block set, TCC0, TCC1, …, TCCN
|
||||
// will have the same block ID HSA_EXT_AQL_PROFILE_BLOCK_TCC.
|
||||
typedef enum {
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_CB,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_CPF,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_DB,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_GRBM,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_GRBMSE,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_PASU,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_PASC,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SPI,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SQ,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SQGS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SQVS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SQPS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SQHS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SQCS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_SX,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_TA,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_TCA,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_TCC,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_TD,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_TCP,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_GDS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_VGT,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_IA,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_MC,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_TCS,
|
||||
HSA_EXT_AQL_PROFILE_BLOCK_WD,
|
||||
HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER
|
||||
} hsa_ext_amd_aql_profile_block_name_t;
|
||||
|
||||
// PMC event object structure
|
||||
// ‘counter_id’ value is specified in GFXIPs perfcounter user guides
|
||||
// which is the counters select value, “Performance Counters Selection”
|
||||
// chapter.
|
||||
typedef struct {
|
||||
hsa_ext_amd_aql_profile_block_name_t block_name;
|
||||
uint32_t block_index;
|
||||
uint32_t counter_id;
|
||||
} hsa_ext_amd_aql_profile_event_t;
|
||||
|
||||
// Check if event is valid for the specific GPU
|
||||
hsa_status_t hsa_ext_amd_aql_profile_validate_event(
|
||||
hsa_agent_t agent, // HSA handle for the profiling GPU
|
||||
const hsa_ext_amd_aql_profile_event_t* event, // Pointer on validated event
|
||||
bool* result); // True if the event valid, False otherwise
|
||||
|
||||
// Profiling parameters
|
||||
// All parameters are generic and if not applicable for a specific
|
||||
// profile configuration then error status will be returned.
|
||||
typedef enum {
|
||||
// SQTT applicable parameters
|
||||
HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET,
|
||||
HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK,
|
||||
HSA_EXT_AQL_PROFILE_PARAM_MASK,
|
||||
HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK,
|
||||
HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2
|
||||
} hsa_ext_amd_aql_profile_parameter_name_t;
|
||||
|
||||
// Profile parameter object
|
||||
typedef struct {
|
||||
hsa_ext_amd_aql_profile_parameter_name_t parameter_name;
|
||||
uint32_t value;
|
||||
} hsa_ext_amd_aql_profile_parameters_t;
|
||||
|
||||
//
|
||||
// Profile context object:
|
||||
// The library provides a profile object structure which contains
|
||||
// the events array, a buffer for the profiling start/stop commands
|
||||
// and a buffer for the output data.
|
||||
// The buffers are specified by the buffer descriptors and allocated
|
||||
// by the application. The buffers allocation attributes, the command
|
||||
// buffer size, the PMC output buffer size as well as profiling output
|
||||
// data can be obtained using the generic get profile info helper _get_info.
|
||||
//
|
||||
// Buffer descriptor
|
||||
typedef struct {
|
||||
void* ptr;
|
||||
uint32_t size;
|
||||
} hsa_ext_amd_aql_profile_descriptor_t;
|
||||
|
||||
// Profile context object structure, contains profiling events list and
|
||||
// needed for profiling buffers descriptors, a command buffer and
|
||||
// an output data buffer
|
||||
typedef struct {
|
||||
hsa_agent_t agent; // GFXIP handle
|
||||
hsa_ext_amd_aql_profile_event_type_t type; // Events type
|
||||
const hsa_ext_amd_aql_profile_event_t* events; // Events array
|
||||
uint32_t event_count; // Events count
|
||||
const hsa_ext_amd_aql_profile_parameters_t* parameters; // Parameters array
|
||||
uint32_t parameter_count; // Parameters count
|
||||
hsa_ext_amd_aql_profile_descriptor_t output_buffer; // Output buffer
|
||||
hsa_ext_amd_aql_profile_descriptor_t command_buffer; // PM4 commands
|
||||
} hsa_ext_amd_aql_profile_profile_t;
|
||||
|
||||
//
|
||||
// AQL packets populating methods:
|
||||
// The helper methods to populate provided by the application START and
|
||||
// STOP AQL packets which the application is required to submit before and
|
||||
// after profiled GPU task packets respectively.
|
||||
//
|
||||
// AQL Vendor Specific packet which carries a PM4 command
|
||||
typedef struct {
|
||||
uint16_t header;
|
||||
uint16_t pm4_command[27];
|
||||
hsa_signal_t completion_signal;
|
||||
} hsa_ext_amd_aql_pm4_packet_t;
|
||||
|
||||
// Method to populate the provided AQL packet with profiling start commands
|
||||
// Only 'pm4_command' fields of the packet are set and the application
|
||||
// is responsible to set Vendor Specific header type and a completion signal
|
||||
hsa_status_t hsa_ext_amd_aql_profile_start(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
|
||||
hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet
|
||||
|
||||
// Method to populate the provided AQL packet with profiling stop commands
|
||||
// Only 'pm4_command' fields of the packet are set and the application
|
||||
// is responsible to set Vendor Specific header type and a completion signal
|
||||
hsa_status_t hsa_ext_amd_aql_profile_stop(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
|
||||
hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet
|
||||
|
||||
// Legacy PM4 profiling packet size
|
||||
// 'static' gives the constant internal linkage in C as well as in C++, so the
// header can be included from several translation units without duplicate symbols
static const unsigned HSA_EXT_AQL_PROFILE_LEGACY_PM4_PACKET_SIZE = 64;
|
||||
// Converting of the profiling AQL packet to PM4 packet, GFX8 support
|
||||
hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4(
|
||||
const hsa_ext_amd_aql_pm4_packet_t* aql_packet, // AQL packet
|
||||
void* pm4); // PM4 packet blob
|
||||
|
||||
//
|
||||
// Get profile info:
|
||||
// Generic method for getting various profile info including profile buffers
|
||||
// attributes like the command buffer size and the profiling PMC results.
|
||||
// It’s implied that all counters are 64bit values.
|
||||
//
|
||||
// Profile generic output data:
|
||||
typedef struct {
|
||||
uint32_t sample_id; // PMC sample of SQTT buffer index
|
||||
union {
|
||||
struct {
|
||||
hsa_ext_amd_aql_profile_event_t event; // PMC event
|
||||
uint64_t result; // PMC result
|
||||
} pmc_data;
|
||||
hsa_ext_amd_aql_profile_descriptor_t sqtt_data; // SQTT output data descriptor
|
||||
};
|
||||
} hsa_ext_amd_aql_profile_info_data_t;
|
||||
|
||||
// Profile attributes
|
||||
typedef enum {
|
||||
HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, // get_info returns uint32_t value
|
||||
HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE, // get_info returns uint32_t value
|
||||
HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, // get_info returns PMC uint64_t value
|
||||
// in info_data object
|
||||
HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA // get_info returns SQTT buffer ptr/size
|
||||
// in info_data object
|
||||
} hsa_ext_amd_aql_profile_info_type_t;
|
||||
|
||||
// Definition of output data iterator callback
|
||||
typedef hsa_status_t (*hsa_ext_amd_aql_profile_data_callback_t)(
|
||||
hsa_ext_amd_aql_profile_info_type_t info_type, // [in] data type, PMC or SQTT data
|
||||
hsa_ext_amd_aql_profile_info_data_t* info_data, // [in] info_data object
|
||||
void* callback_data); // [in/out] data passed to the callback
|
||||
|
||||
// Method for getting the profile info
|
||||
hsa_status_t hsa_ext_amd_aql_profile_get_info(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
|
||||
hsa_ext_amd_aql_profile_info_type_t attribute, // [in] requested profile attribute
|
||||
void* value); // [in/out] returned value
|
||||
|
||||
// Method for iterating the events output data
|
||||
hsa_status_t hsa_ext_amd_aql_profile_iterate_data(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, // [in] profile context object
|
||||
hsa_ext_amd_aql_profile_data_callback_t callback, // [in] callback to iterate the output data
|
||||
void* data); // [in/out] data passed to the callback
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // _HSA_EXT_AMD_AQL_PROFILE_H_
|
||||
@@ -0,0 +1,72 @@
|
||||
#
|
||||
# Minimum version of cmake required
|
||||
#
|
||||
cmake_minimum_required ( VERSION 3.5.0 )
|
||||
|
||||
#
|
||||
# Setup flag to be verbose or not
|
||||
#
|
||||
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
|
||||
|
||||
#
|
||||
# Set name for the project
|
||||
# @note: Must come before adding any sub-directories
|
||||
#
|
||||
set ( TARGET_NAME "aqlprofile" )
|
||||
project ( ${TARGET_NAME} )
|
||||
|
||||
if ( NOT DEFINED PROJ_DIR )
|
||||
set ( PROJ_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
|
||||
set ( ROOT_DIR ${PROJ_DIR}/.. )
|
||||
endif ()
|
||||
|
||||
set ( API_DIR ${ROOT_DIR}/inc )
|
||||
set ( HSA_RUNTIME_DIR ${PROJ_DIR}/../../.. )
|
||||
set ( HSA_RUNTIME_OSC_DIR ${HSA_RUNTIME_DIR}/opensrc/hsa-runtime )
|
||||
set ( CORE_UTIL_DIR ${HSA_RUNTIME_OSC_DIR}/core/util )
|
||||
include_directories ( ${ROOT_DIR} )
|
||||
|
||||
#
|
||||
# Validate required build environment is setup correctly
|
||||
#
|
||||
include ( ${ROOT_DIR}/cmake_modules/validateBldEnv.cmake )
|
||||
|
||||
#
|
||||
# Setup tool chain flags - preprocessor, compiler and linker
|
||||
#
|
||||
include ( ${ROOT_DIR}/cmake_modules/exportToolFlags.cmake )
|
||||
|
||||
#
|
||||
# Set Name for Utils library and build it as a
|
||||
# static library to be linked with others
|
||||
#
|
||||
set ( UTIL_LIB "util${ONLY64STR}" )
|
||||
add_subdirectory ( ${PROJ_DIR}/util "${PROJECT_BINARY_DIR}/util" )
|
||||
|
||||
#
|
||||
# Set Name for Cmdwriter library and build it as a
|
||||
# static library to be linked with others
|
||||
#
|
||||
set ( CMDWRITER_LIB "commandwriter${ONLY64STR}" )
|
||||
add_subdirectory ( ${PROJ_DIR}/commandwriter "${PROJECT_BINARY_DIR}/commandwriter" )
|
||||
|
||||
#
|
||||
# Set Name for ThreadTrace library and build it as a
|
||||
# static library to be linked with others
|
||||
#
|
||||
set ( SQTT_LIB "sqtt${ONLY64STR}" )
|
||||
add_subdirectory ( ${PROJ_DIR}/threadtrace "${PROJECT_BINARY_DIR}/threadtrace" )
|
||||
|
||||
#
|
||||
# Set Name for Profiler library and build it as a
|
||||
# static library to be linked with others
|
||||
#
|
||||
set ( PMC_LIB "pmc${ONLY64STR}" )
|
||||
add_subdirectory ( ${PROJ_DIR}/perfcounter "${PROJECT_BINARY_DIR}/perfcounter" )
|
||||
|
||||
#
|
||||
# Build the library and link it with other static
|
||||
# libraries that have been built in this regard
|
||||
#
|
||||
set ( TARGET_LIB "${TARGET_NAME}${ONLY64STR}" )
|
||||
add_subdirectory ( ${PROJ_DIR}/${TARGET_NAME} "${PROJECT_BINARY_DIR}/${TARGET_NAME}" )
|
||||
@@ -0,0 +1,20 @@
|
||||
#
|
||||
# Source files for the AQL profile library
|
||||
#
|
||||
set ( LIB_SRC aql_profile.cpp populate_aql.cpp gfx8_factory.cpp gfx9_factory.cpp )
|
||||
|
||||
#
|
||||
# Header files include path(s).
|
||||
#
|
||||
include_directories ( $ENV{ROCR_INC_DIR} )
|
||||
include_directories ( ${PROJ_DIR}/perfcounter )
|
||||
include_directories ( ${PROJ_DIR}/threadtrace )
|
||||
include_directories ( ${PROJ_DIR}/commandwriter )
|
||||
include_directories ( ${API_DIR} )
|
||||
|
||||
#
|
||||
# Build the AQL profile library as a dynamic Library object
|
||||
#
|
||||
set ( LIB_LIST ${PMC_LIB} ${SQTT_LIB} ${CMDWRITER_LIB} ${UTIL_LIB} )
|
||||
add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} )
|
||||
target_link_libraries( ${TARGET_LIB} ${LIB_LIST} c stdc++ dl pthread rt )
|
||||
@@ -0,0 +1,398 @@
|
||||
#include <string>
|
||||
|
||||
#include "aql_profile.h"
|
||||
#include "pm4_factory.h"
|
||||
#include "cmdwriter.h" // commandwriter
|
||||
#include "hsa_perf.h" // perfcounter
|
||||
#include "thread_trace.h" // threadtrace
|
||||
#include "gpu_enum.h"
|
||||
#include "gpu_blockinfo.h"
|
||||
|
||||
#define PUBLIC_API __attribute__((visibility("default")))
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
// Command buffer partitioning manager
|
||||
// Supports Pre/Post commands partitioning
|
||||
// and postfix control partition
|
||||
class CommandBufferMgr {
|
||||
const static uint32_t align_size = 0x100;
|
||||
const static uint32_t align_mask = align_size - 1;
|
||||
|
||||
struct info_t {
|
||||
uint32_t precmds_size;
|
||||
uint32_t postcmds_size;
|
||||
};
|
||||
|
||||
descriptor_t buffer;
|
||||
uint32_t postfix_size;
|
||||
info_t* info;
|
||||
|
||||
uint32_t align(const uint32_t& size) { return (size + align_mask) & ~align_mask; }
|
||||
|
||||
public:
|
||||
CommandBufferMgr(const profile_t* profile)
|
||||
: buffer(profile->command_buffer), postfix_size(0), info(NULL) {
|
||||
info = (info_t*)setPostfix(sizeof(info_t));
|
||||
}
|
||||
|
||||
uint32_t getSize() { return buffer.size; }
|
||||
|
||||
void* setPostfix(const uint32_t& size) {
|
||||
if (size > postfix_size) {
|
||||
const uint32_t delta = size - postfix_size;
|
||||
postfix_size = size;
|
||||
buffer.size -= (delta < buffer.size) ? delta : buffer.size;
|
||||
}
|
||||
return (buffer.size != 0) ? buffer.ptr + buffer.size : NULL;
|
||||
}
|
||||
|
||||
bool setPreSize(const uint32_t& size) {
|
||||
bool suc = (size <= buffer.size);
|
||||
if (suc) info->precmds_size = size;
|
||||
return suc;
|
||||
}
|
||||
|
||||
uint32_t getPostOffset() { return align(info->precmds_size); }
|
||||
|
||||
bool checkTotalSize(const uint32_t& size) {
|
||||
bool suc = (size <= buffer.size);
|
||||
if (suc) suc = (size >= info->precmds_size);
|
||||
if (suc) {
|
||||
info->postcmds_size = size - info->precmds_size;
|
||||
suc = ((getPostOffset() + info->postcmds_size) <= buffer.size);
|
||||
}
|
||||
return suc;
|
||||
}
|
||||
|
||||
descriptor_t getPreDescr() {
|
||||
descriptor_t descr;
|
||||
descr.ptr = buffer.ptr;
|
||||
descr.size = info->precmds_size;
|
||||
return descr;
|
||||
}
|
||||
|
||||
descriptor_t getPostDescr() {
|
||||
descriptor_t descr;
|
||||
descr.ptr = buffer.ptr + getPostOffset();
|
||||
descr.size = info->postcmds_size;
|
||||
return descr;
|
||||
}
|
||||
};
|
||||
|
||||
static inline bool is_event_match(const event_t& event1, const event_t& event2) {
|
||||
return (event1.block_name == event2.block_name) && (event1.block_index == event2.block_index) &&
|
||||
(event1.counter_id == event2.counter_id);
|
||||
}
|
||||
|
||||
// Default PMC data callback.
// 'callback_data' points to an info_data object holding the requested event
// and sample id. A sample id of UINT32_MAX accumulates the results of all
// matching records; otherwise the result of the record with the same sample
// id is copied and HSA_STATUS_INFO_BREAK is returned.
hsa_status_t default_pmcdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                      hsa_ext_amd_aql_profile_info_data_t* info_data,
                                      void* callback_data) {
  // Records of other kinds are passed over
  if (info_type != HSA_EXT_AQL_PROFILE_INFO_PMC_DATA) return HSA_STATUS_SUCCESS;

  hsa_ext_amd_aql_profile_info_data_t* const request =
      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data);

  if (!is_event_match(info_data->pmc_data.event, request->pmc_data.event)) {
    return HSA_STATUS_SUCCESS;
  }

  if (request->sample_id == UINT32_MAX) {
    // Accumulate over all samples and keep iterating
    request->pmc_data.result += info_data->pmc_data.result;
    return HSA_STATUS_SUCCESS;
  }

  if (request->sample_id == info_data->sample_id) {
    // Requested sample found
    request->pmc_data.result = info_data->pmc_data.result;
    return HSA_STATUS_INFO_BREAK;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// SQTT control record: three consecutive 32-bit words.
// NOTE(review): the meaning of the fields is not visible in this part of the
// file - confirm against the code that reads/writes this record.
struct sqtt_ctrl_t {
  uint32_t status;    // first word
  uint32_t counter;   // second word
  uint32_t writePtr;  // third word
};
|
||||
|
||||
// Default SQTT data callback.
// 'callback_data' points to an info_data object holding the requested sample
// id. When an SQTT record with that sample id is seen, its data descriptor is
// copied and HSA_STATUS_INFO_BREAK is returned.
hsa_status_t default_sqttdata_callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                       hsa_ext_amd_aql_profile_info_data_t* info_data,
                                       void* callback_data) {
  // Records of other kinds are passed over
  if (info_type != HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA) return HSA_STATUS_SUCCESS;

  hsa_ext_amd_aql_profile_info_data_t* const request =
      reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(callback_data);

  if (info_data->sample_id != request->sample_id) return HSA_STATUS_SUCCESS;

  // Requested sample found
  request->sqtt_data = info_data->sqtt_data;
  return HSA_STATUS_INFO_BREAK;
}
|
||||
|
||||
} // aql_profile
|
||||
|
||||
extern "C" {
|
||||
|
||||
// Check if event is valid for the specific GPU
|
||||
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_validate_event(
|
||||
hsa_agent_t agent, const hsa_ext_amd_aql_profile_event_t* event, bool* result) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Method to populate the provided AQL packet with profiling start commands
|
||||
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_start(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_start_packet) {
|
||||
|
||||
aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile);
|
||||
if (pm4_factory == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter();
|
||||
if (cmdWriter == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
pm4_profile::DefaultCmdBuf commands;
|
||||
aql_profile::CommandBufferMgr cmdBufMgr(profile);
|
||||
if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;
|
||||
|
||||
if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) {
|
||||
pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr();
|
||||
if (pmcMgr == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
pmcMgr->setPmcDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size);
|
||||
|
||||
for (const hsa_ext_amd_aql_profile_event_t* p = profile->events;
|
||||
p < profile->events + profile->event_count; ++p) {
|
||||
pm4_profile::CounterBlock* block =
|
||||
pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p));
|
||||
if (block == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
pm4_profile::Counter* counter = block->createCounter();
|
||||
if (counter == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
counter->setParameter(HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX, sizeof(uint32_t),
|
||||
&(p->counter_id));
|
||||
counter->setEnable(true);
|
||||
}
|
||||
|
||||
// Generate start commands
|
||||
pmcMgr->begin(&commands, cmdWriter);
|
||||
cmdBufMgr.setPreSize(commands.Size());
|
||||
// Generate stop commands
|
||||
pmcMgr->end(&commands, cmdWriter);
|
||||
} else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) {
|
||||
pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr();
|
||||
if (sqttMgr == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
pm4_profile::ThreadTraceConfig sqtt_config;
|
||||
sqttMgr->InitThreadTraceConfig(&sqtt_config);
|
||||
if (profile->parameters) {
|
||||
for (const hsa_ext_amd_aql_profile_parameters_t* p = profile->parameters;
|
||||
p < (profile->parameters + profile->parameter_count); ++p) {
|
||||
switch (p->parameter_name) {
|
||||
case HSA_EXT_AQL_PROFILE_PARAM_COMPUTE_UNIT_TARGET:
|
||||
sqtt_config.threadTraceTargetCu = p->value;
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_PARAM_VM_ID_MASK:
|
||||
sqtt_config.threadTraceVmIdMask = p->value;
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_PARAM_MASK:
|
||||
sqtt_config.threadTraceMask = p->value;
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK:
|
||||
sqtt_config.threadTraceTokenMask = p->value;
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_PARAM_TOKEN_MASK2:
|
||||
sqtt_config.threadTraceTokenMask2 = p->value;
|
||||
break;
|
||||
default:
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
sqttMgr->Init(&sqtt_config);
|
||||
|
||||
sqttMgr->setSqttDataBuff((uint8_t*)profile->output_buffer.ptr, profile->output_buffer.size);
|
||||
|
||||
const uint32_t status_size = sqttMgr->StatusSizeInfo();
|
||||
void* status_ptr = cmdBufMgr.setPostfix(status_size);
|
||||
if (status_ptr == NULL) return HSA_STATUS_ERROR;
|
||||
// Control buffer registering
|
||||
sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr);
|
||||
|
||||
// Generate start commands
|
||||
sqttMgr->BeginSession(&commands, cmdWriter);
|
||||
cmdBufMgr.setPreSize(commands.Size());
|
||||
// Generate stop commands
|
||||
sqttMgr->StopSession(&commands, cmdWriter);
|
||||
} else
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
if (!cmdBufMgr.checkTotalSize(commands.Size())) return HSA_STATUS_ERROR;
|
||||
|
||||
const aql_profile::descriptor_t pre_descr = cmdBufMgr.getPreDescr();
|
||||
const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr();
|
||||
memcpy(pre_descr.ptr, commands.Base(), pre_descr.size);
|
||||
memcpy(post_descr.ptr, commands.Base() + pre_descr.size, post_descr.size);
|
||||
// Populate start aql packet
|
||||
aql_profile::populateAql(pre_descr.ptr, pre_descr.size, cmdWriter, aql_start_packet);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Method to populate the provided AQL packet with profiling stop commands
|
||||
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_stop(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, aql_profile::packet_t* aql_stop_packet) {
|
||||
|
||||
aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile);
|
||||
if (pm4_factory == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
pm4_profile::CommandWriter* cmdWriter = pm4_factory->getCommandWriter();
|
||||
if (cmdWriter == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
aql_profile::CommandBufferMgr cmdBufMgr(profile);
|
||||
if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;
|
||||
|
||||
const aql_profile::descriptor_t post_descr = cmdBufMgr.getPostDescr();
|
||||
// Populate stop aql packet
|
||||
aql_profile::populateAql(post_descr.ptr, post_descr.size, cmdWriter, aql_stop_packet);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Converting of the profiling AQL packet to PM4 packet, GFX8 support
|
||||
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_legacy_get_pm4(
|
||||
const aql_profile::packet_t* aql_packet, void* pm4) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// Method for getting the profile info
|
||||
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_get_info(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile, hsa_ext_amd_aql_profile_info_type_t attribute,
|
||||
void* value) {
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
|
||||
switch (attribute) {
|
||||
case HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE:
|
||||
*(uint32_t*)value = 0x1000; // a current approximation as 4K is big enaugh
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE:
|
||||
*(uint32_t*)value = 0x1000; // a current approximation as 4K is big enaugh
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_INFO_PMC_DATA:
|
||||
reinterpret_cast<hsa_ext_amd_aql_profile_info_data_t*>(value)->pmc_data.result = 0;
|
||||
status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_pmcdata_callback,
|
||||
value);
|
||||
break;
|
||||
case HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA:
|
||||
status = hsa_ext_amd_aql_profile_iterate_data(profile, aql_profile::default_sqttdata_callback,
|
||||
value);
|
||||
break;
|
||||
default:
|
||||
status = HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Method for iterating the events output data
|
||||
PUBLIC_API hsa_status_t hsa_ext_amd_aql_profile_iterate_data(
|
||||
const hsa_ext_amd_aql_profile_profile_t* profile,
|
||||
hsa_ext_amd_aql_profile_data_callback_t callback, void* data) {
|
||||
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
aql_profile::Pm4Factory * pm4_factory = aql_profile::Pm4Factory::Create(profile);
|
||||
if (pm4_factory == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_PMC) {
|
||||
uint32_t info_size = 0;
|
||||
void* info_data;
|
||||
uint64_t* samples = (uint64_t*)profile->output_buffer.ptr;
|
||||
const uint32_t sample_count = profile->output_buffer.size / sizeof(uint64_t);
|
||||
uint32_t sample_index = 0;
|
||||
|
||||
pm4_profile::Pmu* pmcMgr = pm4_factory->getPmcMgr();
|
||||
if (pmcMgr == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
for (const hsa_ext_amd_aql_profile_event_t* p = profile->events;
|
||||
p < (profile->events + profile->event_count); ++p) {
|
||||
pm4_profile::CounterBlock* block =
|
||||
pmcMgr->getCounterBlockById(pm4_factory->getBlockId(p));
|
||||
if (block == NULL) return HSA_STATUS_ERROR;
|
||||
if (!block->getInfo(pm4_profile::GPU_BLK_INFO_CONTROL_METHOD, info_size, &info_data)) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
const pm4_profile::CntlMethod method =
|
||||
static_cast<pm4_profile::CntlMethod>(*(static_cast<uint32_t*>(info_data)));
|
||||
// A perfcounter data sample per ShaderEngine
|
||||
const uint32_t block_samples_count = (method == pm4_profile::CntlMethodBySe ||
|
||||
method == pm4_profile::CntlMethodBySeAndInstance)
|
||||
? pmcMgr->getNumSe()
|
||||
: 1;
|
||||
for (uint32_t i = 0; i < block_samples_count; ++i) {
|
||||
assert(sample_index < sample_count);
|
||||
if (sample_index >= sample_count) return HSA_STATUS_ERROR;
|
||||
|
||||
hsa_ext_amd_aql_profile_info_data_t sample_info;
|
||||
sample_info.sample_id = i;
|
||||
sample_info.pmc_data.event = *p;
|
||||
sample_info.pmc_data.result = samples[sample_index];
|
||||
status = callback(HSA_EXT_AQL_PROFILE_INFO_PMC_DATA, &sample_info, data);
|
||||
if (status == HSA_STATUS_INFO_BREAK) {
|
||||
status = HSA_STATUS_SUCCESS;
|
||||
break;
|
||||
}
|
||||
if (status != HSA_STATUS_SUCCESS) break;
|
||||
++sample_index;
|
||||
}
|
||||
}
|
||||
} else if (profile->type == HSA_EXT_AQL_PROFILE_EVENT_SQTT) {
|
||||
pm4_profile::ThreadTrace* sqttMgr = pm4_factory->getSqttMgr();
|
||||
if (sqttMgr == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
aql_profile::CommandBufferMgr cmdBufMgr(profile);
|
||||
if (cmdBufMgr.getSize() == 0) return HSA_STATUS_ERROR;
|
||||
|
||||
const uint32_t status_size = sqttMgr->StatusSizeInfo();
|
||||
// Control buffer was allocated as the CmdBuffer postfix partition
|
||||
void* status_ptr = cmdBufMgr.setPostfix(status_size);
|
||||
if (status_ptr == NULL) return HSA_STATUS_ERROR;
|
||||
// Control buffer registering
|
||||
sqttMgr->setSqttCtrlBuff((uint32_t*)status_ptr);
|
||||
// Validate SQTT status and normalize WRPTR
|
||||
if (sqttMgr->Validate() == false) return HSA_STATUS_ERROR;
|
||||
|
||||
const uint32_t se_number = sqttMgr->getNumSe();
|
||||
// Casting status pointer to SQTT control per ShaderEngine array
|
||||
aql_profile::sqtt_ctrl_t* sqtt_ctrl = (aql_profile::sqtt_ctrl_t*)status_ptr;
|
||||
assert(status_size == sizeof(aql_profile::sqtt_ctrl_t) * se_number);
|
||||
if (status_size != sizeof(aql_profile::sqtt_ctrl_t) * se_number) {
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
// SQTT output buffer and capacity per ShaderEngine
|
||||
void* sample_ptr = profile->output_buffer.ptr;
|
||||
const uint32_t sample_capacity = profile->output_buffer.size / se_number;
|
||||
// The samples sizes are returned in the control buffer
|
||||
for (int i = 0; i < se_number; ++i) {
|
||||
// WPTR specifies the index in thread trace buffer where next token will be
|
||||
// written by hardware. The index is incremented by size of 32 bytes.
|
||||
uint32_t sample_size = sqtt_ctrl[i].writePtr * TT_WRITE_PTR_BLK;
|
||||
|
||||
hsa_ext_amd_aql_profile_info_data_t sample_info;
|
||||
sample_info.sample_id = i;
|
||||
sample_info.sqtt_data.ptr = sample_ptr;
|
||||
sample_info.sqtt_data.size = sample_size;
|
||||
status = callback(HSA_EXT_AQL_PROFILE_INFO_SQTT_DATA, &sample_info, data);
|
||||
if (status == HSA_STATUS_INFO_BREAK) {
|
||||
status = HSA_STATUS_SUCCESS;
|
||||
break;
|
||||
}
|
||||
if (status != HSA_STATUS_SUCCESS) break;
|
||||
|
||||
sample_ptr += sample_capacity;
|
||||
}
|
||||
} else {
|
||||
status = HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
#ifndef _AQL_PROFILE_H_
|
||||
#define _AQL_PROFILE_H_
|
||||
|
||||
#include "hsa_ext_amd_aql_profile.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
class CommandWriter;
|
||||
}
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
typedef hsa_ext_amd_aql_profile_descriptor_t descriptor_t;
|
||||
typedef hsa_ext_amd_aql_profile_profile_t profile_t;
|
||||
typedef hsa_ext_amd_aql_profile_info_type_t info_type_t;
|
||||
typedef hsa_ext_amd_aql_profile_data_callback_t data_callback_t;
|
||||
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
|
||||
typedef hsa_ext_amd_aql_profile_event_t event_t;
|
||||
|
||||
void populateAql(void* cmdBuffer, uint32_t cmdSz, pm4_profile::CommandWriter* cmdWriter,
|
||||
packet_t* aqlPkt);
|
||||
}
|
||||
|
||||
#endif // _AQL_PROFILE_H_
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "pm4_factory.h"
|
||||
// Commandwriter includes
|
||||
#include "gfx8_cmdwriter.h"
|
||||
// PMC includes
|
||||
#include "vi_pmu.h"
|
||||
// SQTT includes
|
||||
#include "gfx8_thread_trace.h"
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
// GFX9 block ID mapping table
|
||||
uint32_t Gfx8Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
|
||||
pm4_profile::kHsaViCounterBlockIdCb0, pm4_profile::kHsaViCounterBlockIdCpf,
|
||||
pm4_profile::kHsaViCounterBlockIdDb0, pm4_profile::kHsaViCounterBlockIdGrbm,
|
||||
pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
|
||||
pm4_profile::kHsaViCounterBlockIdPaSc, pm4_profile::kHsaViCounterBlockIdSpi,
|
||||
pm4_profile::kHsaViCounterBlockIdSq, pm4_profile::kHsaViCounterBlockIdSqGs,
|
||||
pm4_profile::kHsaViCounterBlockIdSqVs, pm4_profile::kHsaViCounterBlockIdSqPs,
|
||||
pm4_profile::kHsaViCounterBlockIdSqHs, pm4_profile::kHsaViCounterBlockIdSqCs,
|
||||
pm4_profile::kHsaViCounterBlockIdSx, pm4_profile::kHsaViCounterBlockIdTa0,
|
||||
pm4_profile::kHsaViCounterBlockIdTca0, pm4_profile::kHsaViCounterBlockIdTcc0,
|
||||
pm4_profile::kHsaViCounterBlockIdTd0, pm4_profile::kHsaViCounterBlockIdTcp0,
|
||||
pm4_profile::kHsaViCounterBlockIdGds, pm4_profile::kHsaViCounterBlockIdVgt,
|
||||
pm4_profile::kHsaViCounterBlockIdIa, pm4_profile::kHsaViCounterBlockIdMc,
|
||||
pm4_profile::kHsaViCounterBlockIdTcs, pm4_profile::kHsaViCounterBlockIdWd};
|
||||
|
||||
// Allocates a new GFX8 command writer and hands the raw pointer to the caller
pm4_profile::CommandWriter* Gfx8Factory::getCommandWriter() {
  pm4_profile::CommandWriter* writer = new pm4_profile::gfx8::Gfx8CmdWriter(false, true);
  return writer;
}
|
||||
|
||||
// Allocates a new VI (GFX8) perfcounter manager and hands the raw pointer to the caller
pm4_profile::Pmu* Gfx8Factory::getPmcMgr() {
  pm4_profile::Pmu* pmc_mgr = new pm4_profile::ViPmu();
  return pmc_mgr;
}
|
||||
|
||||
// Allocates a new GFX8 thread trace manager and hands the raw pointer to the caller
pm4_profile::ThreadTrace* Gfx8Factory::getSqttMgr() {
  pm4_profile::ThreadTrace* sqtt_mgr = new pm4_profile::Gfx8ThreadTrace();
  return sqtt_mgr;
}
|
||||
|
||||
uint32_t Gfx8Factory::getBlockId(const event_t* event) {
|
||||
return block_id_table[event->block_name] + event->block_index;
|
||||
}
|
||||
|
||||
} // aql_profile
|
||||
@@ -0,0 +1,70 @@
|
||||
#include "pm4_factory.h"
|
||||
// Commandwriter includes
|
||||
#include "gfx8_cmdwriter.h"
|
||||
#include "gfx9_cmdwriter.h"
|
||||
// PMC includes
|
||||
#include "vi_pmu.h"
|
||||
#include "ai_pmu.h"
|
||||
// SQTT includes
|
||||
#include "gfx8_thread_trace.h"
|
||||
#include "gfx9_thread_trace.h"
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
// GFX8 block ID mapping table
|
||||
uint32_t gfx8_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
|
||||
pm4_profile::kHsaViCounterBlockIdCb0, pm4_profile::kHsaViCounterBlockIdCpf,
|
||||
pm4_profile::kHsaViCounterBlockIdDb0, pm4_profile::kHsaViCounterBlockIdGrbm,
|
||||
pm4_profile::kHsaViCounterBlockIdGrbmSe, pm4_profile::kHsaViCounterBlockIdPaSu,
|
||||
pm4_profile::kHsaViCounterBlockIdPaSc, pm4_profile::kHsaViCounterBlockIdSpi,
|
||||
pm4_profile::kHsaViCounterBlockIdSq, pm4_profile::kHsaViCounterBlockIdSqGs,
|
||||
pm4_profile::kHsaViCounterBlockIdSqVs, pm4_profile::kHsaViCounterBlockIdSqPs,
|
||||
pm4_profile::kHsaViCounterBlockIdSqHs, pm4_profile::kHsaViCounterBlockIdSqCs,
|
||||
pm4_profile::kHsaViCounterBlockIdSx, pm4_profile::kHsaViCounterBlockIdTa0,
|
||||
pm4_profile::kHsaViCounterBlockIdTca0, pm4_profile::kHsaViCounterBlockIdTcc0,
|
||||
pm4_profile::kHsaViCounterBlockIdTd0, pm4_profile::kHsaViCounterBlockIdTcp0,
|
||||
pm4_profile::kHsaViCounterBlockIdGds, pm4_profile::kHsaViCounterBlockIdVgt,
|
||||
pm4_profile::kHsaViCounterBlockIdIa, pm4_profile::kHsaViCounterBlockIdMc,
|
||||
pm4_profile::kHsaViCounterBlockIdTcs, pm4_profile::kHsaViCounterBlockIdWd};
|
||||
|
||||
// GFX9 block ID mapping table
|
||||
uint32_t gfx9_block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
|
||||
pm4_profile::kHsaAiCounterBlockIdCb0, pm4_profile::kHsaAiCounterBlockIdCpf,
|
||||
pm4_profile::kHsaAiCounterBlockIdDb0, pm4_profile::kHsaAiCounterBlockIdGrbm,
|
||||
pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu,
|
||||
pm4_profile::kHsaAiCounterBlockIdPaSc, pm4_profile::kHsaAiCounterBlockIdSpi,
|
||||
pm4_profile::kHsaAiCounterBlockIdSq, pm4_profile::kHsaAiCounterBlockIdSqGs,
|
||||
pm4_profile::kHsaAiCounterBlockIdSqVs, pm4_profile::kHsaAiCounterBlockIdSqPs,
|
||||
pm4_profile::kHsaAiCounterBlockIdSqHs, pm4_profile::kHsaAiCounterBlockIdSqCs,
|
||||
pm4_profile::kHsaAiCounterBlockIdSx, pm4_profile::kHsaAiCounterBlockIdTa0,
|
||||
pm4_profile::kHsaAiCounterBlockIdTca0, pm4_profile::kHsaAiCounterBlockIdTcc0,
|
||||
pm4_profile::kHsaAiCounterBlockIdTd0, pm4_profile::kHsaAiCounterBlockIdTcp0,
|
||||
pm4_profile::kHsaAiCounterBlockIdGds, pm4_profile::kHsaAiCounterBlockIdVgt,
|
||||
pm4_profile::kHsaAiCounterBlockIdIa, pm4_profile::kHsaAiCounterBlockIdMc,
|
||||
pm4_profile::kHsaAiCounterBlockIdTcs, pm4_profile::kHsaAiCounterBlockIdWd};
|
||||
|
||||
// Allocates the command writer matching the GPU generation selected by 'is_gfx9'.
// Each branch converts its own concrete type to the returned base pointer.
pm4_profile::CommandWriter* Pm4Factory::getCommandWriter() {
  if (is_gfx9 == true) {
    return new pm4_profile::gfx9::Gfx9CmdWriter(false, true);
  }
  return new pm4_profile::gfx8::Gfx8CmdWriter(false, true);
}
|
||||
|
||||
// Allocates the perfcounter manager matching the GPU generation selected by 'is_gfx9'.
// Each branch converts its own concrete type to the returned base pointer.
pm4_profile::Pmu* Pm4Factory::getPmcMgr() {
  if (is_gfx9 == true) {
    return new pm4_profile::AiPmu();
  }
  return new pm4_profile::ViPmu();
}
|
||||
|
||||
// Allocates the thread trace manager matching the GPU generation selected by 'is_gfx9'.
// Each branch converts its own concrete type to the returned base pointer.
pm4_profile::ThreadTrace* Pm4Factory::getSqttMgr() {
  if (is_gfx9 == true) {
    return new pm4_profile::Gfx9ThreadTrace();
  }
  return new pm4_profile::Gfx8ThreadTrace();
}
|
||||
|
||||
uint32_t Pm4Factory::getBlockId(const event_t* event) {
|
||||
return (is_gfx9 == true) ?
|
||||
gfx9_block_id_table[event->block_name] + event->block_index :
|
||||
gfx8_block_id_table[event->block_name] + event->block_index :
|
||||
}
|
||||
|
||||
} // aql_profile
|
||||
@@ -0,0 +1,43 @@
|
||||
#include "pm4_factory.h"
|
||||
// Commandwriter includes
|
||||
#include "gfx9_cmdwriter.h"
|
||||
// PMC includes
|
||||
#include "ai_pmu.h"
|
||||
// SQTT includes
|
||||
#include "gfx9_thread_trace.h"
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
// GFX9 block ID mapping table
|
||||
uint32_t Gfx9Factory::block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER] = {
|
||||
pm4_profile::kHsaAiCounterBlockIdCb0, pm4_profile::kHsaAiCounterBlockIdCpf,
|
||||
pm4_profile::kHsaAiCounterBlockIdDb0, pm4_profile::kHsaAiCounterBlockIdGrbm,
|
||||
pm4_profile::kHsaAiCounterBlockIdGrbmSe, pm4_profile::kHsaAiCounterBlockIdPaSu,
|
||||
pm4_profile::kHsaAiCounterBlockIdPaSc, pm4_profile::kHsaAiCounterBlockIdSpi,
|
||||
pm4_profile::kHsaAiCounterBlockIdSq, pm4_profile::kHsaAiCounterBlockIdSqGs,
|
||||
pm4_profile::kHsaAiCounterBlockIdSqVs, pm4_profile::kHsaAiCounterBlockIdSqPs,
|
||||
pm4_profile::kHsaAiCounterBlockIdSqHs, pm4_profile::kHsaAiCounterBlockIdSqCs,
|
||||
pm4_profile::kHsaAiCounterBlockIdSx, pm4_profile::kHsaAiCounterBlockIdTa0,
|
||||
pm4_profile::kHsaAiCounterBlockIdTca0, pm4_profile::kHsaAiCounterBlockIdTcc0,
|
||||
pm4_profile::kHsaAiCounterBlockIdTd0, pm4_profile::kHsaAiCounterBlockIdTcp0,
|
||||
pm4_profile::kHsaAiCounterBlockIdGds, pm4_profile::kHsaAiCounterBlockIdVgt,
|
||||
pm4_profile::kHsaAiCounterBlockIdIa, pm4_profile::kHsaAiCounterBlockIdMc,
|
||||
pm4_profile::kHsaAiCounterBlockIdTcs, pm4_profile::kHsaAiCounterBlockIdWd};
|
||||
|
||||
// Allocates a new GFX9 command writer and hands the raw pointer to the caller
pm4_profile::CommandWriter* Gfx9Factory::getCommandWriter() {
  pm4_profile::CommandWriter* writer = new pm4_profile::gfx9::Gfx9CmdWriter(false, true);
  return writer;
}
|
||||
|
||||
// Allocates a new AI (GFX9) perfcounter manager and hands the raw pointer to the caller
pm4_profile::Pmu* Gfx9Factory::getPmcMgr() {
  pm4_profile::Pmu* pmc_mgr = new pm4_profile::AiPmu();
  return pmc_mgr;
}
|
||||
|
||||
// Allocates a new GFX9 thread trace manager and hands the raw pointer to the caller
pm4_profile::ThreadTrace* Gfx9Factory::getSqttMgr() {
  pm4_profile::ThreadTrace* sqtt_mgr = new pm4_profile::Gfx9ThreadTrace();
  return sqtt_mgr;
}
|
||||
|
||||
uint32_t Gfx9Factory::getBlockId(const event_t* event) {
|
||||
return block_id_table[event->block_name] + event->block_index;
|
||||
}
|
||||
|
||||
} // aql_profile
|
||||
@@ -0,0 +1,62 @@
|
||||
#ifndef _PM4_FACTORY_H_
|
||||
#define _PM4_FACTORY_H_
|
||||
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "aql_profile.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
class CommandWriter;
|
||||
class Pmu;
|
||||
class ThreadTrace;
|
||||
}
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
class Pm4Factory {
|
||||
public:
|
||||
static Pm4Factory* Create(const hsa_ext_amd_aql_profile_profile_t* profile);
|
||||
virtual pm4_profile::CommandWriter* getCommandWriter() = 0;
|
||||
virtual pm4_profile::Pmu* getPmcMgr() = 0;
|
||||
virtual pm4_profile::ThreadTrace* getSqttMgr() = 0;
|
||||
virtual uint32_t getBlockId(const event_t* event) = 0;
|
||||
};
|
||||
|
||||
class Gfx8Factory : public Pm4Factory {
|
||||
public:
|
||||
pm4_profile::CommandWriter* getCommandWriter();
|
||||
pm4_profile::Pmu* getPmcMgr();
|
||||
pm4_profile::ThreadTrace* getSqttMgr();
|
||||
uint32_t getBlockId(const event_t* event);
|
||||
|
||||
private:
|
||||
static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER];
|
||||
};
|
||||
|
||||
class Gfx9Factory : public Pm4Factory {
|
||||
public:
|
||||
pm4_profile::CommandWriter* getCommandWriter();
|
||||
pm4_profile::Pmu* getPmcMgr();
|
||||
pm4_profile::ThreadTrace* getSqttMgr();
|
||||
uint32_t getBlockId(const event_t* event);
|
||||
|
||||
private:
|
||||
static uint32_t block_id_table[HSA_EXT_AQL_PROFILE_BLOCKS_NUMBER];
|
||||
};
|
||||
|
||||
// Creates the factory matching the agent of 'profile'.
//
// The agent name is queried from the HSA runtime; names starting with "gfx8" select
// Gfx8Factory and names starting with "gfx9" select Gfx9Factory.
//
// @param profile  profile whose agent selects the factory
// @return a newly allocated factory, or NULL when 'profile' is NULL, the agent name cannot
//         be queried, or the agent is not supported
inline Pm4Factory* Pm4Factory::Create(const hsa_ext_amd_aql_profile_profile_t* profile) {
  if (profile == NULL) return NULL;

  // Zero-filled so the comparisons below never read indeterminate bytes
  char agent_name[64] = {0};
  const hsa_status_t status =
      hsa_agent_get_info(profile->agent, HSA_AGENT_INFO_NAME, agent_name);
  if (status != HSA_STATUS_SUCCESS) return NULL;

  if (strncmp(agent_name, "gfx8", 4) == 0) return new Gfx8Factory();
  if (strncmp(agent_name, "gfx9", 4) == 0) return new Gfx9Factory();
  return NULL;
}
|
||||
|
||||
} // aql_profile
|
||||
|
||||
#endif // _PM4_FACTORY_H_
|
||||
@@ -0,0 +1,41 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
#include "aql_profile.h"
|
||||
#include "cmdwriter.h"
|
||||
#include "amd_aql_pm4_ib_packet.h"
|
||||
|
||||
namespace aql_profile {
|
||||
|
||||
// Populates the relevant fields of an AQL PM4 IB packet from an indirect buffer command.
//
// The four dwords of 'ib_packet' are copied into the packet, the format and remaining dword
// count fields are set and the reserved fields are zeroed. Header and completion signal
// are not set. The resulting packet is then dumped to std::clog as 16 hex dwords.
//
// @param ib_packet   indirect buffer command, at least four dwords
// @param aql_packet  packet to populate, accessed as amd_aql_pm4_ib_packet_t
void populateAql(uint32_t* ib_packet, packet_t* aql_packet) {
  // Size of IB pkt is four DWords
  const int ib_dword_count = 4;

  amd_aql_pm4_ib_packet_t* aql_pm4_ib = reinterpret_cast<amd_aql_pm4_ib_packet_t*>(aql_packet);
  aql_pm4_ib->pm4_ib_format = AMD_AQL_PM4_IB_FORMAT;
  for (int i = 0; i < ib_dword_count; ++i) {
    aql_pm4_ib->pm4_ib_command[i] = ib_packet[i];
  }
  aql_pm4_ib->dw_count_remain = AMD_AQL_PM4_IB_DW_COUNT_REMAIN;
  for (int i = 0; i < AMD_AQL_PM4_IB_RESERVED_COUNT; ++i) {
    aql_pm4_ib->reserved[i] = 0;
  }

  // Debug dump of the packet. The stream format state is saved and restored because the
  // manipulators used below are sticky and would otherwise affect later std::clog output.
  const std::ios_base::fmtflags saved_flags = std::clog.flags();
  const char saved_fill = std::clog.fill();

  const uint32_t* words = (const uint32_t*)aql_packet;
  std::clog << std::setw(40) << std::left << "AQL 'IB' size(16)"
            << ":";
  // Right alignment is required for the zero fill to produce leading zeros
  std::clog << std::right << std::hex << std::setfill('0');
  for (int idx = 0; idx < 16; idx++) {
    std::clog << " " << std::setw(8) << words[idx];
  }
  std::clog << std::setfill(' ') << std::endl;

  std::clog.flags(saved_flags);
  std::clog.fill(saved_fill);
}
|
||||
|
||||
void populateAql(void* cmd_buffer, uint32_t cmd_size,
|
||||
pm4_profile::CommandWriter* cmd_writer, packet_t* ppt_packet) {
|
||||
pm4_profile::DefaultCmdBuf ib_buffer;
|
||||
cmd_writer->BuildIndirectBufferCmd(&ib_buffer, cmd_buffer, (size_t)cmd_size);
|
||||
uint32_t* ib_cmds = (uint32_t*)ib_buffer.Base();
|
||||
populateAql(ib_cmds, ppt_packet);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
#
# Rocr Cmdwriter static library
#

#
# Header files include path(s), read from the ROCR_INC_DIR environment variable
# at configure time.
#
include_directories ( $ENV{ROCR_INC_DIR} )

#
# Source files: one command writer per supported GFX generation
#
set ( CmdWriterSrcs
  gfx8_cmdwriter.cpp
  gfx9_cmdwriter.cpp
)

#
# Build Cmdwriter as a Static Library object.
# CMDWRITER_LIB is not set in this file and is expected to be defined beforehand.
#
add_library ( ${CMDWRITER_LIB} STATIC ${CmdWriterSrcs} )
|
||||
@@ -0,0 +1,515 @@
|
||||
// cmdwriter.h
|
||||
// Header file for CommandWriter and CmdBuf interfaces
|
||||
|
||||
#ifndef _CMDWRITER_H_
|
||||
#define _CMDWRITER_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// User defined options for flushing cache.
// One flag per cache (or cache variant); every flag starts cleared.
typedef struct FlushCacheOptions_ {
  bool l1, l2;
  bool icache, kcache;
  bool l1_vol, l2_vol, kcache_vol;

  FlushCacheOptions_()
      : l1(false),
        l2(false),
        icache(false),
        kcache(false),
        l1_vol(false),
        l2_vol(false),
        kcache_vol(false) {}
} FlushCacheOptions;
|
||||
|
||||
/// @brief Interface to build a list of Gpu commands into a byte
/// buffer. Classes implementing this interface are used to translate
/// various Gpu commands as byte stream.
///
/// @note: The Api does not require implementations to be thread safe.
/// Users are therefore required to access it in a serialized manner.
class CmdBuf {
 public:
  /// Default destructor.
  virtual ~CmdBuf() {}

  /// @brief Resets the command buffer object. All of the commands
  /// previously packed into the buffer are lost i.e. the number of
  /// bytes in command stream is reset.
  ///
  /// @note: This convenience Api is provided to allow reuse of the
  /// command buffer object.
  ///
  /// @return bool true if successful, false otherwise.
  virtual bool Reset(void) = 0;

  /// @brief Appends input command into a buffer that could
  /// be queried for its size and other properties. The append
  /// does not verify the contents.
  ///
  /// @param cmd Buffer containing one or more instances of Gpu commands
  ///
  /// @param size size of the Gpu commands in bytes.
  ///
  /// @return void
  virtual void AppendCommand(const void* cmd, uint32_t size) = 0;

  /// @brief Returns the total size (in bytes) of the accumulated commands.
  ///
  /// @return size_t size of Gpu commands in bytes
  virtual size_t Size() const = 0;

 private:
  /// Indexes the command buffer by dwords. Allows accessing constants
  /// in an assembled command buffer.
  virtual uint32_t& operator[](size_t index) = 0;

  friend class CommandWriter;
};

/// @brief Implements the interface CmdBuf and thus can be used to
/// translate various Gpu commands as byte stream. The commands are
/// stored in a vector of dwords.
///
/// @note: The Api does not require implementations to be thread safe.
/// Users are therefore required to access it in a serialized manner.
class DefaultCmdBuf : public CmdBuf {
 public:
  /// @brief Append the command into the underlying buffer. Storage is
  /// reserved in whole dwords: a size which is not a multiple of the
  /// dword size is rounded up and the extra bytes are zero. Nothing is
  /// appended when cmd is NULL or size is zero.
  ///
  /// @param cmd Buffer containing one or more instances of Gpu commands
  ///
  /// @param size Size of Gpu command(s) in bytes
  ///
  /// @return void
  virtual void AppendCommand(const void* cmd, uint32_t size) {
    if (cmd == NULL || size == 0) return;
    memcpy(ReserveCmdbufSpace(size), cmd, size);
  }

  /// @brief Resets the Gpu command buffer
  bool Reset() {
    cmdbuf_.clear();
    return true;
  }

  /// Size of Gpu commands in bytes in the underlying buffer
  size_t Size() const { return cmdbuf_.size() * sizeof(StorageType); }

  /// Address of the start of accumulated commands, NULL when the
  /// buffer holds no commands.
  const void* Base() const { return cmdbuf_.empty() ? NULL : &cmdbuf_[0]; }

 private:
  /// @brief Returns reference to the value of Gpu command buffer
  /// at specified index
  ///
  /// @param index Specifies the buffer index whose value is needed
  ///
  /// @return uint32_t & Reference of the value being returned
  uint32_t& operator[](size_t index) { return cmdbuf_[index]; }

  /// @brief Increase Gpu command buffer by specified size
  ///
  /// @param size Size in bytes by which command buffer should
  /// be resized; rounded up to a whole number of dwords so that the
  /// reserved space is never smaller than requested.
  ///
  /// @return void * Pointer into the buffer where the next
  /// command can be written, NULL when size is zero
  void* ReserveCmdbufSpace(std::size_t size) {
    const size_t len = cmdbuf_.size();
    const size_t dwords = (size + sizeof(StorageType) - 1) / sizeof(StorageType);
    if (dwords == 0) return NULL;
    cmdbuf_.resize(len + dwords);
    return &cmdbuf_[len];
  }

  /// @brief Defines Gpu command buffer as a vector of StorageType
  typedef uint32_t StorageType;
  std::vector<StorageType> cmdbuf_;
};
|
||||
|
||||
/// @brief Specifies the public interface of CommandWriter for use by
|
||||
/// clients to build Gpu command streams.
|
||||
class CommandWriter {
|
||||
public:
|
||||
/// @brief These enums specify the operation to perform in the packet
|
||||
/// generated by BuildAtomicPacket. The commenting for each enum uses
|
||||
/// the arguments to the function BuildAtomicPacket to express the
|
||||
/// resulting operation.
|
||||
enum AtomicType {
|
||||
|
||||
/// *destination = *destination + 1;
|
||||
kAtomicTypeIncrement,
|
||||
|
||||
/// *destination = *destination - 1;
|
||||
kAtomicTypeDecrement,
|
||||
|
||||
/// if (*destination == compare) *destination = value;
|
||||
kAtomicTypeCompareAndSwap,
|
||||
|
||||
/// while (*destination != compare);
|
||||
/// *destination = value;
|
||||
kAtomicTypeBlockingCompareAndSwap,
|
||||
|
||||
/// *destination = *destination + value;
|
||||
kAtomicAdd,
|
||||
|
||||
/// *destination = *destination - value;
|
||||
kAtomicSubtract,
|
||||
|
||||
/// *destination = value;
|
||||
kAtomicSwap
|
||||
};
|
||||
|
||||
/// @brief These enums specify the VGT EVENT TYPE to issue and wait for.
|
||||
/// Command Processor (CP) uses these events to communicate with SPI to
|
||||
/// learn about outstanding waves and determine kernel completion.
|
||||
enum VgtEventType {
|
||||
|
||||
/// Enable Performance Counters
|
||||
kPerfCntrsStart,
|
||||
|
||||
/// Disable Performance Counters
|
||||
kPerfCntrsStop,
|
||||
|
||||
/// Read Performance Counters
|
||||
kPerfCntrsSample,
|
||||
|
||||
/// Enable a Thread Trace session
|
||||
kThrdTraceStart,
|
||||
|
||||
/// Disable a Thread Trace session
|
||||
kThrdTraceStop,
|
||||
|
||||
/// Enable flushing of thread trace buffers
|
||||
kThrdTraceFlush,
|
||||
|
||||
/// Enables resetting of BASE register to its last value
|
||||
/// including flushing of thread trace buffers. This could
|
||||
/// be used to toggle between two buffers so as to allow
|
||||
/// collection of large token data
|
||||
kThrdTraceFinish
|
||||
};
|
||||
|
||||
/// @brief Returns the Dword that encodes a No-Op for the CP
|
||||
///
|
||||
/// @return uint32_t Dword that can be used to populate a Pm4
|
||||
/// command queue.
|
||||
///
|
||||
virtual uint32_t GetNoOpCmd() = 0;
|
||||
|
||||
/// @brief Build an instance of Barrier command and copy it into
|
||||
/// the input commmand buffer
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer which is updated with
|
||||
/// an instance of Barrier command.
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildBarrierCommand(CmdBuf* cmdbuf) = 0;
|
||||
|
||||
/// @brief Builds the Gpu command to reference indirectly a stream
|
||||
/// of other Gpu commands. The launch command is then copied into
|
||||
/// the command buffer parameter.
|
||||
///
|
||||
/// @param cmdBuf command buffer to be appended with launch command
|
||||
///
|
||||
/// @param cmd_addr Address of command buffer carrying command stream
|
||||
///
|
||||
/// @param cmd_size Size of dispatch command stream in bytes
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
|
||||
std::size_t cmd_size) = 0;
|
||||
|
||||
/// @brief Build a Gpu command that triggers an event whose type
|
||||
/// is specified by input parameter. It then copies it into the input
|
||||
/// command buffer
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param event Id of Event to be triggered by Gpu
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) = 0;
|
||||
|
||||
/// @brief Builds a Gpu command to wait until condition is realized
|
||||
///
|
||||
/// @param cmdbuf command buffer to be appended with launch command
|
||||
///
|
||||
/// @param mem_space if the address is in memory or is a register offset
|
||||
///
|
||||
/// @param wait_addr address to wait on
|
||||
///
|
||||
/// @param func_eq true means equal, false means not-equal
|
||||
///
|
||||
/// @param mask_val Mask to apply on value from addr in comparison
|
||||
///
|
||||
/// @param wait_val value to apply for the func given above
|
||||
virtual void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
|
||||
bool func_eq, uint32_t mask_val, uint32_t wait_val) = 0;
|
||||
|
||||
virtual void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) = 0;
|
||||
|
||||
/// @brief Build CP command to program a Gpu register
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
/// @param addr Register to be programmed
|
||||
/// @param value Value to write into register
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;
|
||||
|
||||
/// @brief Build and copy WriteShReg command
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param addr Offset of the register
|
||||
///
|
||||
/// @param value Value to write into register
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;
|
||||
|
||||
/// @brief Builds a Gpu command to flush Gpu caches and write a
|
||||
/// user defined value at a configurable location that is Gpu
|
||||
/// accessible.
|
||||
///
|
||||
/// @param cmdbuf Command buffer to be appended with bottom of pipe
|
||||
/// notification command
|
||||
///
|
||||
/// @param write_addr Address into which Gpu should write
|
||||
///
|
||||
/// @param write_val Value to write into user provided address
|
||||
///
|
||||
/// @param intrpt True if Gpu should raise an interrupt upon writing
|
||||
/// the user value
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
|
||||
bool intrpt) = 0;
|
||||
|
||||
|
||||
/// @brief Build a Gpu command that copies data from a specified
|
||||
/// source to destination
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param src_sel Selects whether values are being read from a
|
||||
/// Register or a memory location
|
||||
///
|
||||
/// @param src_addr_lo Low 32-bit Source address of the data to read from
|
||||
///
|
||||
/// @param src_addr_hi High 32-bit Source address of the data to read from
|
||||
///
|
||||
/// @param dst_addr Destination address for the data to be written to
|
||||
///
|
||||
/// @param size Size of the data to be written
|
||||
///
|
||||
/// @param wait True if Gpu command should confirm the write operation
|
||||
/// has completed successfully
|
||||
///
|
||||
/// @return void
|
||||
///
|
||||
/// @NOTE Change interface to use void* for Src and void* for Dest
|
||||
virtual void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
|
||||
uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
|
||||
bool wait) = 0;
|
||||
|
||||
/// @brief Build and copy a WaitIdle Gpu command into command buffer
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) = 0;
|
||||
|
||||
// Will issue a VGT event including a cache flush later on
|
||||
virtual void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) = 0;
|
||||
|
||||
/// @brief Build and copy a WriteRegister Gpu command into command buffer
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param addr Register into which to write
|
||||
///
|
||||
/// @param value Value to write into register
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) = 0;
|
||||
|
||||
/// @brief Build and copy a Gpu command to query the status of a
|
||||
/// WriteEvent into command buffer
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param event Id of Event whose status is to be queried
|
||||
///
|
||||
/// @param addr Address to update the status of WriteEvent operation
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteEventQueryPacket(CmdBuf* cmdBuf, uint32_t event, uint32_t* addr) = 0;
|
||||
|
||||
/// @brief Builds and copies a Gpu command to perform user specified
|
||||
/// operation atomically. The various atomic operations on integers
|
||||
/// that are supported include: increment, decrement, add, subtract,
|
||||
/// compare-and-swap and swap. The operation to perform is specified
|
||||
/// by the enum AtomicType.
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param atomic_op Id of the atomic operation to perform
|
||||
///
|
||||
/// @param addr Pointer to the memory block where atomic operation
|
||||
/// would be performed
|
||||
///
|
||||
/// @param value New value to write if atomic operation can be performed
|
||||
///
|
||||
/// @param compare Value to compare if atomic operation is a compare-and-swap
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
|
||||
uint32_t value = 0, uint32_t compare = 0) = 0;
|
||||
|
||||
/// @brief Builds and copies a Gpu command to perform user specified
|
||||
/// operation atomically. The various atomic operations on integers
|
||||
/// that are supported include: increment, decrement, add, subtract,
|
||||
/// compare-and-swap and swap. The operation to perform is specified
|
||||
/// by the enum AtomicType.
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param atomic_op Id of the atomic operation to perform
|
||||
///
|
||||
/// @param addr Pointer to the memory block where atomic operation
|
||||
/// would be performed
|
||||
///
|
||||
/// @param value New value to write if atomic operation can be performed
|
||||
///
|
||||
/// @param compare Value to compare if atomic operation is a compare-and-swap
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
|
||||
uint64_t value = 0, uint64_t compare = 0) = 0;
|
||||
|
||||
/// @brief Returns the size of an atomic packet
|
||||
///
|
||||
/// @return size_t Size of atomic packet
|
||||
virtual size_t SizeOfAtomicPacket() const = 0;
|
||||
|
||||
/// @brief Build and copy a Gpu command that will tell command processor
|
||||
/// to conditionally execute or skip the next sequence of packets.
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param signal Pointer to an integer that tells the command processor
|
||||
/// whether to skip or execute the next block of packets. If it is set
|
||||
/// to 0 the following packets will be skipped, else it will execute the
|
||||
/// following packets
|
||||
///
|
||||
/// @param count The number of dwords in the following packet stream
|
||||
/// that will be conditionally executed
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) = 0;
|
||||
|
||||
/// @brief Builds a CP command to write user specified value
|
||||
/// at a user specified address. The command is then copied
|
||||
/// into the command buffer for submission to a device queue.
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param write_addr Address into which CP will write the user
|
||||
/// specified value
|
||||
///
|
||||
/// @param write_value Value to write into the user specified address
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
|
||||
uint32_t write_value) = 0;
|
||||
|
||||
/// @brief Builds a CP command to write user specified value
|
||||
/// at a user specified address. The command is then copied
|
||||
/// into the command buffer for submission to a device queue.
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
///
|
||||
/// @param write_addr Address into which CP will write the user
|
||||
/// specified value
|
||||
///
|
||||
/// @param write_value Value to write into the user specified address
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
|
||||
uint64_t write_value) = 0;
|
||||
|
||||
/// @brief Writes into input buffer Gpu commands to flush its cache. It is
|
||||
/// necessary that the buffer provided for flush commands is large
|
||||
/// enough to accommodate the full set of commands. It should be at
|
||||
/// least 512 bytes.
|
||||
///
|
||||
/// @param cmdbuf Buffer to write commands to.
/// @param options Options for the flush (see FlushCacheOptions).
|
||||
/// @param writeAddr Registered address into which GPU should write
|
||||
/// a user provided value upon executing the flush commands.
|
||||
/// @param writeVal User provided value written by GPU at user provided
|
||||
/// address, upon executing the flush commands.
|
||||
///
|
||||
/// @return void
|
||||
virtual void BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options, uint32_t* writeAddr,
|
||||
uint32_t writeVal) = 0;
|
||||
|
||||
/// @brief Builds Gpu command to copy data from source to destination
|
||||
/// buffer using DMA engine.
|
||||
///
|
||||
/// @param cmdbuf Buffer updated with Gpu copy command
|
||||
/// @param srcAddrLo Address of source buffer
|
||||
/// @param dstAddr Address of destination buffer
|
||||
/// @param copySize Size of data to copy in bytes
|
||||
/// @param waitForCompletion if command should wait for copying to complete
|
||||
virtual void BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddrLo, uint32_t* dstAddr,
|
||||
uint32_t copySize, bool waitForCompletion) = 0;
|
||||
|
||||
/// @brief Release resources used by CommandWriter
|
||||
virtual ~CommandWriter(){};
|
||||
|
||||
protected:
|
||||
/// @brief Return the reference to a value in the command buffer
|
||||
uint32_t& IndexBuffer(CmdBuf* cmdbuf, uint32_t index) { return (*cmdbuf)[index]; }
|
||||
};
|
||||
|
||||
/// @brief Rounds @p u up to a multiple of @p r using bit masking.
///
/// The result is (u + (r - 1)) with the bits of (r - 1) cleared, which is
/// only a multiple of @p r when @p r is a power of two. The addition is
/// ordinary 32-bit unsigned arithmetic and wraps on overflow.
inline uint32_t RoundUp(uint32_t u, uint32_t r) {
  const uint32_t low_bits = r - 1;
  const uint32_t bumped = u + low_bits;
  return bumped & ~low_bits;
}
|
||||
|
||||
/// @brief Returns bits 0..31 of a 64-bit value
inline uint32_t Low32(uint64_t u) {
  // Converting to a 32-bit unsigned type keeps exactly the low 32 bits.
  return static_cast<uint32_t>(u);
}
|
||||
|
||||
/// @brief Returns bits 32..63 of a 64-bit value
inline uint32_t High32(uint64_t u) {
  const uint64_t upper = u >> 32;
  return static_cast<uint32_t>(upper);
}
|
||||
|
||||
/// @brief Returns bits 8..39 of an address as a 32-bit value.
///
/// Note that this is not the low 32 bits of the pointer: the low 8 bits are
/// dropped. In builds with assertions enabled the address must have bits 0..7
/// clear (256-byte aligned) and no bits set above bit 47.
inline uint32_t Ptr48Low32(const void* p) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
  assert((address & 0xFFFFFFFFFF00ULL) == address);
  return static_cast<uint32_t>((address >> 8) & 0xFFFFFFFFULL);
}
|
||||
|
||||
/// @brief Returns bits 40..47 of an address, i.e. the top byte of a
/// 48-bit address
inline uint8_t Ptr48High8(const void* p) {
  // Widen to 64 bits first so the 40-bit shift is well defined even where
  // uintptr_t is narrower than that.
  const uint64_t address = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint8_t>((address >> 40) & 0xFFULL);
}
|
||||
|
||||
/// @brief Returns bits 0..31 of an address
inline uint32_t PtrLow32(const void* p) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
  return static_cast<uint32_t>(address & 0xFFFFFFFFu);
}
|
||||
|
||||
/// @brief Returns the upper 32-bits of an address
|
||||
inline uint32_t PtrHigh32(const void* p) {
|
||||
uint32_t hi_32 = 0;
|
||||
#ifdef HSA_LARGE_MODEL
|
||||
hi_32 = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32);
|
||||
static_assert(sizeof(void*) == 8, "HSA_LARGE_MODEL is not set properly here!");
|
||||
#else
|
||||
static_assert(sizeof(void*) == 4, "HSA_LARGE_MODEL is not set properly here!");
|
||||
#endif
|
||||
return hi_32;
|
||||
}
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _CMDWRITER_H_
|
||||
@@ -0,0 +1,161 @@
|
||||
#ifndef _GFX8_CMDS_H_
|
||||
#define _GFX8_CMDS_H_
|
||||
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_mask.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_registers.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_pm4_it_opcodes.h"
|
||||
#include "gfxip/gfx8/si_pm4defs.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
namespace gfx8 {
|
||||
|
||||
// Desc: Defines the Gpu command to dispatch a kernel. It embeds
// various Gpu hardware specific data structures for initialization
// and configuration before a dispatch begins to run.
//
// The four register groups each start with a PM4CMDSETDATA member named
// cmd_set_data followed by the register values, which is the member that
// GenerateSetShRegHeader / GenerateSetConfigRegHeader fill in.
struct DispatchTemplate {
  // Desc: Structure used to initialize the group dimensions
  // of a kernel dispatch and if performance counters are enabled
  struct DispatchDimensionRegs {
    PM4CMDSETDATA cmd_set_data;
    regCOMPUTE_START_X compute_start_x;
    regCOMPUTE_START_Y compute_start_y;
    regCOMPUTE_START_Z compute_start_z;
    regCOMPUTE_NUM_THREAD_X compute_num_thread_x;
    regCOMPUTE_NUM_THREAD_Y compute_num_thread_y;
    regCOMPUTE_NUM_THREAD_Z compute_num_thread_z;
    regCOMPUTE_PIPELINESTAT_ENABLE__CI__VI compute_pipelinestat_enable;
  } dimension_regs;

  // Desc: Structure used to initialize kernel Isa, trap
  // handler, trap handler buffer, number of SGPR and VGPR
  // registers needed, amount of Group memory and LDS needed,
  // Rounding mode for Floating point numbers, etc.
  struct DispatchProgramRegs {
    PM4CMDSETDATA cmd_set_data;
    regCOMPUTE_PGM_LO compute_pgm_lo;
    regCOMPUTE_PGM_HI compute_pgm_hi;
    regCOMPUTE_TBA_LO compute_tba_lo;
    regCOMPUTE_TBA_HI compute_tba_hi;
    regCOMPUTE_TMA_LO compute_tma_lo;
    regCOMPUTE_TMA_HI compute_tma_hi;
    regCOMPUTE_PGM_RSRC1 compute_pgm_rsrc1;
    regCOMPUTE_PGM_RSRC2 compute_pgm_rsrc2;
  } program_regs;

  // Desc: Structure used to initialize parameters related to
  // thread management i.e. number of waves to issue and number
  // of Compute Units to use
  struct DispatchResourceRegs {
    PM4CMDSETDATA cmd_set_data;
    regCOMPUTE_RESOURCE_LIMITS compute_resource_limits;
    regCOMPUTE_STATIC_THREAD_MGMT_SE0 compute_static_thread_mgmt_se0;
    regCOMPUTE_STATIC_THREAD_MGMT_SE1 compute_static_thread_mgmt_se1;
    regCOMPUTE_TMPRING_SIZE compute_tmpring_size;
    regCOMPUTE_STATIC_THREAD_MGMT_SE2__CI__VI compute_static_thread_mgmt_se2;
    regCOMPUTE_STATIC_THREAD_MGMT_SE3__CI__VI compute_static_thread_mgmt_se3;
    regCOMPUTE_RESTART_X__CI__VI compute_restart_x;
    regCOMPUTE_RESTART_Y__CI__VI compute_restart_y;
    regCOMPUTE_RESTART_Z__CI__VI compute_restart_z;
    regCOMPUTE_THREAD_TRACE_ENABLE__CI__VI compute_thread_trace_enable;
  } resource_regs;

  // Desc: Structure used to pass handles of the Aql dispatch
  // packet, Aql queue, Kernel argument address block, Scratch
  // buffer
  struct DispatchComputeUserDataRegs {
    PM4CMDSETDATA cmd_set_data;
    // Sixteen 32-bit user data slots.
    uint32_t compute_user_data[16];
  } compute_user_data_regs;

  // Desc: Structure used to configure Cache flush policy
  // and dimensions of total work size
  PM4CMDDISPATCHDIRECT dispatch_direct;
};
|
||||
|
||||
// Desc: Structure used to issue a Gpu Barrier command
struct BarrierTemplate {
  PM4CMDEVENTWRITE event_write;
};

// Desc: Structure used to configure the flushing
// of various caches - instruction, constants, L1
// and L2
struct AcquireMemTemplate {
  PM4CMDACQUIREMEM acquire_mem;
};

// Desc: Structure used to reference another Gpu command
// indirectly. Generally used to reference a list of Gpu
// commands (dispatch cmds) indirectly
struct LaunchTemplate {
  PM4CMDINDIRECTBUFFER indirect_buffer;
};

// Desc: Structure used to determine the end of
// a kernel including cache flushes and writing to
// a user configurable memory location
struct EndofKernelNotifyTemplate {
  PM4CMDRELEASEMEM release_mem;
};

// Desc: Structure used to perform various atomic
// operations - add, subtract, increment, etc
struct AtomicTemplate {
  PM4CMDATOMIC atomic;
};

// Desc: Structure used to conditionalize the execution
// of a Gpu command stream
struct ConditionalExecuteTemplate {
  PM4CMDCONDEXEC_CI conditional;
};

// Desc: PM4 command to write a 32-bit value into a memory
// location accessible to Gpu. The value to write is carried
// directly after the command.
struct WriteDataTemplate {
  PM4CMDWRITEDATA write_data;
  uint32_t write_data_value;
};

// Desc: PM4 command to write a 64-bit value into a memory
// location accessible to Gpu. The value to write is carried
// directly after the command.
struct WriteData64Template {
  PM4CMDWRITEDATA write_data;
  uint64_t write_data_value;
};

// Desc: PM4 command to wait for a certain event before proceeding
// to process another command on the queue
struct WaitRegMemTemplate {
  PM4CMDWAITREGMEM wait_reg_mem;
};
|
||||
|
||||
// Desc: Initializer for commands that set shader registers
|
||||
template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
|
||||
pm4->cmd_set_data.header.u32All =
|
||||
PM4_TYPE_3_HDR(IT_SET_SH_REG, sizeof(T) / sizeof(uint32_t), ShaderCompute, 0);
|
||||
pm4->cmd_set_data.regOffset = reg_addr - PERSISTENT_SPACE_START;
|
||||
}
|
||||
|
||||
// Desc: Initializer for various Gpu command headers
|
||||
template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) {
|
||||
pm4->header.u32All = PM4_TYPE_3_HDR(op_code, sizeof(T) / sizeof(uint32_t), ShaderCompute, 0);
|
||||
}
|
||||
|
||||
// Desc: Initializer for commands that set configuration registers
|
||||
template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) {
|
||||
pm4->cmd_set_data.header.u32All =
|
||||
PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, sizeof(T) / sizeof(uint32_t), ShaderCompute, 0);
|
||||
pm4->cmd_set_data.regOffset = reg_addr - CONFIG_SPACE_START;
|
||||
}
|
||||
|
||||
|
||||
} // gfx8
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GFX8_CMDS_H_
|
||||
@@ -0,0 +1,768 @@
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "gfx8_cmdwriter.h"
|
||||
#include "gfxip/gfx8/gfx8_utils.h"
|
||||
|
||||
// RELEASE MEM DST SEL Definitions
// Values assigned to the dstSel field of a release-mem packet.
#define RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER 0
#define RELEASE_MEM_DST_SEL_TC_L2 1

// RELEASE MEM CACHE POLICY Definitions
// Values assigned to the cachePolicy field of a release-mem packet.
#define RELEASE_MEM_CACHE_POLICY_LRU 0
#define RELEASE_MEM_CACHE_POLICY_STREAM 1
#define RELEASE_MEM_CACHE_POLICY_BYPASS 2
|
||||
|
||||
// Debug helper: dumps a PM4 packet to std::clog as one line of the form
//   '<name>' size(<dwords>)   : xxxxxxxx xxxxxxxx ...
// where the label is left-aligned in a 40 character column and every dword
// of the packet is printed as 8 zero-padded lowercase hex digits.
//
// The line is assembled in a local stream and written to std::clog in one
// piece, so none of the formatting flags used here stay set on std::clog.
// Compiles to a no-op when NDEBUG is defined.
//
// command - packet to dump; it is read as sizeof(command) / 4 dwords
// name    - label printed in front of the dwords
template <class T>
static void PrintPm4Packet(const T& command, const char* name) {
#if !defined(NDEBUG)
  const uint32_t* dwords = reinterpret_cast<const uint32_t*>(&command);
  const uint32_t dword_count = sizeof(command) / sizeof(uint32_t);

  std::ostringstream label;
  label << "'" << name << "' size(" << std::dec << dword_count << ")";

  std::ostringstream line;
  line << std::setw(40) << std::left << label.str() << ":";

  // The dwords must be right-aligned: with left alignment still active the
  // '0' fill would be appended after the digits (0x1 -> "10000000").
  line << std::right << std::hex << std::setfill('0');
  for (uint32_t idx = 0; idx < dword_count; idx++) {
    line << " " << std::setw(8) << dwords[idx];
  }

  std::clog << line.str() << std::endl;
#else
  // Nothing is printed in release builds; silence unused-parameter warnings.
  (void)command;
  (void)name;
#endif
}
|
||||
|
||||
// Logs a packet through PrintPm4Packet (labelled with the calling function's
// name) and then appends it to the command buffer with AppendCommand.
//
// The two statements are wrapped in do { } while (0) so the macro expands to
// a single statement and stays correct inside an unbraced if/else. Invoke it
// with a trailing semicolon: APPEND_COMMAND_WRAPPER(cmdbuf, packet);
#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \
  do {                                          \
    PrintPm4Packet(command, __FUNCTION__);      \
    AppendCommand(cmdbuf, command);             \
  } while (0)
|
||||
|
||||
namespace pm4_profile {
|
||||
namespace gfx8 {
|
||||
|
||||
// Copies one packet into the command buffer by handing its address and its
// size in bytes to CmdBuf::AppendCommand.
template <class T> void Gfx8CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) {
  const T* packet = &command;
  cmdbuf->AppendCommand(packet, sizeof(T));
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeAtomicTemplate() {
|
||||
memset(&atomic_template_.atomic, 0, sizeof(atomic_template_));
|
||||
GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM__CI);
|
||||
|
||||
if (atc_support_) {
|
||||
const uint32_t kAtcShift = 24;
|
||||
atomic_template_.atomic.ordinal2 |= 1 << kAtcShift;
|
||||
}
|
||||
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeConditionalTemplate() {
|
||||
memset(&conditional_template_.conditional, 0, sizeof(conditional_template_));
|
||||
gfx8::GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC);
|
||||
|
||||
if (atc_support_) {
|
||||
const uint32_t kAtcShift = 24;
|
||||
conditional_template_.conditional.ordinal4 |= 1 << kAtcShift;
|
||||
}
|
||||
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeLaunchTemplate() {
|
||||
memset(&launch_template_, 0, sizeof(launch_template_));
|
||||
|
||||
GenerateCmdHeader(&launch_template_.indirect_buffer, IT_INDIRECT_BUFFER);
|
||||
launch_template_.indirect_buffer.CI.valid = true;
|
||||
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeWriteDataTemplate() {
|
||||
// Set the header of write data command
|
||||
memset(&write_data_template_, 0, sizeof(write_data_template_));
|
||||
|
||||
// Initialize the header of command packet
|
||||
PM4CMDWRITEDATA* command = &(write_data_template_.write_data);
|
||||
uint32_t cmd_size = sizeof(write_data_template_) / sizeof(uint32_t);
|
||||
command->ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, cmd_size, ShaderCompute, 0);
|
||||
|
||||
// Set the ATC bit of command template - specifies if the address
|
||||
// belongs to system memory
|
||||
write_data_template_.write_data.atc__CI = (atc_support_) ? 1 : 0;
|
||||
|
||||
// Set the bit to confirm the write operation and cache policy
|
||||
write_data_template_.write_data.wrConfirm = 1;
|
||||
write_data_template_.write_data.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS;
|
||||
|
||||
// Specify the module that will execute the write data command
|
||||
write_data_template_.write_data.engineSel = WRITE_DATA_ENGINE_ME;
|
||||
|
||||
// Specify the class to which the write destination belongs
|
||||
write_data_template_.write_data.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC;
|
||||
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeWriteData64Template() {
|
||||
// Set the header of write data command
|
||||
memset(&write_data64_template_, 0, sizeof(write_data64_template_));
|
||||
|
||||
// Initialize the header of command packet
|
||||
PM4CMDWRITEDATA* command = &(write_data64_template_.write_data);
|
||||
uint32_t cmd_size = sizeof(write_data64_template_) / sizeof(uint32_t);
|
||||
command->ordinal1 = PM4_TYPE_3_HDR(IT_WRITE_DATA, cmd_size, ShaderCompute, 0);
|
||||
|
||||
// Set the ATC bit of command template - specifies if the address
|
||||
// belongs to system memory
|
||||
write_data64_template_.write_data.atc__CI = (atc_support_) ? 1 : 0;
|
||||
|
||||
// Set the bit to confirm the write operation and cache policy
|
||||
write_data64_template_.write_data.wrConfirm = 1;
|
||||
write_data64_template_.write_data.cachePolicy__CI = WRITE_DATA_CACHE_POLICY_BYPASS;
|
||||
|
||||
// Specify the module that will execute the write data command
|
||||
write_data64_template_.write_data.engineSel = WRITE_DATA_ENGINE_ME;
|
||||
|
||||
// Specify the class to which the write destination belongs
|
||||
// write_data64_template_.write_data.dstSel = WRITE_DATA_DST_SEL_TCL2;
|
||||
// TODO: For Hawaii bring up only.
|
||||
write_data64_template_.write_data.dstSel = WRITE_DATA_DST_SEL_MEMORY_ASYNC;
|
||||
}
|
||||
|
||||
// Prepares the reusable barrier packet: an IT_EVENT_WRITE packet carrying
// the CS_PARTIAL_FLUSH event and its index from EventTypeToIndexTable.
void Gfx8CmdWriter::InitializeBarrierTemplate() {
  memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_));

  auto& packet = pending_dispatch_template_.event_write;
  gfx8::GenerateCmdHeader(&packet, IT_EVENT_WRITE);
  packet.eventType = CS_PARTIAL_FLUSH;
  packet.eventIndex = EventTypeToIndexTable[CS_PARTIAL_FLUSH];
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeAcquireMemTemplate() {
|
||||
memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_));
|
||||
|
||||
gfx8::GenerateCmdHeader(&invalidate_cache_template_.acquire_mem, IT_ACQUIRE_MEM__CI__VI);
|
||||
invalidate_cache_template_.acquire_mem.cpCoherBase.u32All = 0x00;
|
||||
invalidate_cache_template_.acquire_mem.cpCoherBaseHi.u32All = 0x00;
|
||||
invalidate_cache_template_.acquire_mem.cpCoherSize.u32All = 0xFFFFFFFF;
|
||||
invalidate_cache_template_.acquire_mem.cpCoherSizeHi.u32All = 0xFF;
|
||||
invalidate_cache_template_.acquire_mem.pollInterval = 0;
|
||||
}
|
||||
|
||||
void Gfx8CmdWriter::InitializeWaitRegMemTemplate() {
|
||||
memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_));
|
||||
|
||||
gfx8::GenerateCmdHeader(&wait_reg_mem_template_.wait_reg_mem, IT_WAIT_REG_MEM);
|
||||
wait_reg_mem_template_.wait_reg_mem.atc__CI = (atc_support_) ? 1 : 0;
|
||||
wait_reg_mem_template_.wait_reg_mem.cachePolicy__CI = 2; // bypass
|
||||
wait_reg_mem_template_.wait_reg_mem.pollInterval = 0;
|
||||
wait_reg_mem_template_.wait_reg_mem.engine = WAIT_REG_MEM_ENGINE_ME;
|
||||
}
|
||||
|
||||
// Records the ATC / PCIe-atomic capabilities and then builds every reusable
// packet template. The capability flags are stored first because several of
// the template initializers read atc_support_.
Gfx8CmdWriter::Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support) {
  atc_support_ = atc_support;
  pcie_atomic_support_ = pcie_atomic_support;

  typedef void (Gfx8CmdWriter::*TemplateInitializer)();
  static const TemplateInitializer kInitializers[] = {
      &Gfx8CmdWriter::InitializeLaunchTemplate,
      &Gfx8CmdWriter::InitializeAtomicTemplate,
      &Gfx8CmdWriter::InitializeConditionalTemplate,
      &Gfx8CmdWriter::InitializeWriteDataTemplate,
      &Gfx8CmdWriter::InitializeWriteData64Template,
      &Gfx8CmdWriter::InitializeBarrierTemplate,
      &Gfx8CmdWriter::InitializeAcquireMemTemplate,
      &Gfx8CmdWriter::InitializeWaitRegMemTemplate,
  };

  const size_t count = sizeof(kInitializers) / sizeof(kInitializers[0]);
  for (size_t i = 0; i < count; ++i) {
    (this->*kInitializers[i])();
  }
}
|
||||
|
||||
// Appends a wait-reg-mem packet built from the prepared template.
//
// cmdbuf    - command buffer to be appended
// mem_space - true when wait_addr is a memory address, false when it is a
//             register offset
// wait_addr - address or register to poll; memory addresses must be 4 byte
//             aligned (checked by assert)
// func_eq   - true to wait for equality, false for inequality
// mask_val  - mask applied to the polled value
// wait_val  - reference value for the comparison
void Gfx8CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
                                           bool func_eq, uint32_t mask_val, uint32_t wait_val) {
  gfx8::WaitRegMemTemplate wait_cmd = wait_reg_mem_template_;
  PM4CMDWAITREGMEM& packet = wait_cmd.wait_reg_mem;

  packet.memSpace = mem_space ? WAIT_REG_MEM_SPACE_MEMORY : WAIT_REG_MEM_SPACE_REGISTER;
  packet.function = func_eq ? WAIT_REG_MEM_FUNC_EQUAL : WAIT_REG_MEM_FUNC_NOT_EQUAL;
  packet.mask = mask_val;
  packet.reference = wait_val;

  // The low half is always written; the high half only exists for memory
  // addresses and keeps the template's value for registers.
  packet.pollAddressLo = Low32(wait_addr);
  if (mem_space) {
    assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned");
    packet.pollAddressHi = High32(wait_addr);
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, wait_cmd);
}
|
||||
|
||||
// Appends a command that stores value at addr: an atomic swap when PCIe
// atomics are supported, otherwise a 64-bit write-data command.
void Gfx8CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) {
  if (!pcie_atomic_support_) {
    BuildWriteData64Command(cmdbuf, addr, value);
    return;
  }

  volatile uint64_t* target = addr;
  BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, target, value);
}
|
||||
|
||||
// Appends an indirect-buffer packet that points at the command stream at
// cmd_addr. cmd_size is given in bytes and stored in the packet as a dword
// count.
void Gfx8CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
                                           std::size_t cmd_size) {
  gfx8::LaunchTemplate launch = launch_template_;
  PM4CMDINDIRECTBUFFER& packet = launch.indirect_buffer;

  packet.ibBaseLo = PtrLow32(cmd_addr);
  packet.ibBaseHi = PtrHigh32(cmd_addr);
  packet.CI.ibSize = cmd_size / sizeof(uint32_t);

  APPEND_COMMAND_WRAPPER(cmdbuf, launch);
}
|
||||
|
||||
// Appends a release-mem packet for the BOTTOM_OF_PIPE_TS event that writes
// back and invalidates L2 and then writes a 32-bit value to memory.
//
// cmdbuf     - command buffer to be appended
// write_addr - address the Gpu writes to
// write_val  - 32-bit value written to write_addr
// interrupt  - true to request an interrupt on write confirmation, false
//              for no interrupt
void Gfx8CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
                                      bool interrupt) {
  // Initialize the command including its header
  gfx8::EndofKernelNotifyTemplate eopCmd;
  memset(&eopCmd, 0, sizeof(eopCmd));
  gfx8::GenerateCmdHeader(&eopCmd.release_mem, IT_RELEASE_MEM__CI__VI);

  // Program CP to wait until following event is notified by SPI
  eopCmd.release_mem.eventType = BOTTOM_OF_PIPE_TS;
  eopCmd.release_mem.eventIndex = EventTypeToIndexTable[BOTTOM_OF_PIPE_TS];

  // Program CP to perform various cache operations
  // which complete before Write operation commences
  eopCmd.release_mem.atc = atc_support_;
  eopCmd.release_mem.l2Invlidate = true;
  eopCmd.release_mem.l2WriteBack = true;

  // Set destination as Memory with Write bypassing Cache
  eopCmd.release_mem.cachePolicy = RELEASE_MEM_CACHE_POLICY_BYPASS;
  eopCmd.release_mem.dstSel = RELEASE_MEM_DST_SEL_MEMORY_CONTROLLER;

  // Program CP to write user specified value to user specified address.
  // The address is split with the pointer helpers (as done for indirect
  // buffers) rather than by casting the pointer to a 64-bit integer, whose
  // result is implementation-defined when pointers are narrower than 64 bits.
  eopCmd.release_mem.ordinal4 = PtrLow32(write_addr);
  eopCmd.release_mem.addrHi = PtrHigh32(write_addr);
  eopCmd.release_mem.dataLo = write_val;
  eopCmd.release_mem.dataHi = 0;  // only a 32-bit value is sent
  eopCmd.release_mem.dataSel = EVENTWRITEEOP_DATA_SEL_SEND_DATA32;

  // Determine if host will poll or wait for interrupt
  eopCmd.release_mem.intSel =
      interrupt ? EVENTWRITEEOP_INT_SEL_SEND_INT_ON_CONFIRM : EVENTWRITEEOP_INT_SEL_NONE;

  APPEND_COMMAND_WRAPPER(cmdbuf, eopCmd);
}
|
||||
|
||||
|
||||
// Appends an acquire-mem packet, based on the prepared template, with the
// TC action and TC write-back action bits enabled in its coherency control.
void Gfx8CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) {
  gfx8::AcquireMemTemplate invalidate_src_caches = invalidate_cache_template_;
  PM4CMDACQUIREMEM& packet = invalidate_src_caches.acquire_mem;

  // wbINVL2 by default writes-back and invalidates both L1 and L2
  packet.coherCntl =
      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI;

  APPEND_COMMAND_WRAPPER(cmdbuf, invalidate_src_caches);
}
|
||||
|
||||
// PM4 packet for profilers
// PM4_PACKET3 is the base value of the header dword; the command is placed
// at bit 8 and the count at bit 16.
#define PM4_PACKET3 (0xC0000000)
#define PM4_PACKET3_CMD_SHIFT 8
#define PM4_PACKET3_COUNT_SHIFT 16

// Builds a header dword from a command and a count. The count is stored as
// (count - 1); neither argument is masked, so callers must pass values that
// fit their fields.
#define PACKET3(cmd, count) \
  (PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | ((cmd) << PM4_PACKET3_CMD_SHIFT))
|
||||
|
||||
// Raw three-dword packet. The register-write builders in this file fill it
// as: item[0] header, item[1] register offset, item[2] value.
typedef struct WriteRegPacket_ { uint32_t item[3]; } WriteRegPacket;

// Structure to store the event PM4 packet (seven raw dwords)
typedef struct WriteEventPacket_ { uint32_t item[7]; } WriteEventPacket;
|
||||
|
||||
/// @brief Appends an EVENT_WRITE packet for one of the perf counter
/// events (start, stop or sample).
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param event One of kPerfCntrsStart, kPerfCntrsStop or kPerfCntrsSample.
/// Any other value trips an assert; when asserts are compiled out the
/// packet is emitted with the Reserved_0x00 event type.
///
/// @return void
void Gfx8CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) {
  // Map the profiler event id onto the VGT event type
  VGT_EVENT_TYPE vgt_event = Reserved_0x00;
  switch (event) {
    case kPerfCntrsStart:
      vgt_event = PERFCOUNTER_START;
      break;
    case kPerfCntrsStop:
      vgt_event = PERFCOUNTER_STOP;
      break;
    case kPerfCntrsSample:
      vgt_event = PERFCOUNTER_SAMPLE;
      break;
    default:
      assert(false && "Illegal VGT Event Id");
  }

  // Header first, then clear the second dword before filling its fields
  PM4CMDEVENTWRITE event_packet;
  event_packet.ordinal1 = PACKET3(IT_EVENT_WRITE, 1);
  event_packet.ordinal2 = 0;
  event_packet.eventType = vgt_event;
  event_packet.eventIndex = EventTypeToIndexTable[vgt_event];

  APPEND_COMMAND_WRAPPER(cmdbuf, event_packet);
}
|
||||
|
||||
/// @brief Appends a SET_UCONFIG_REG packet, with a ShaderGraphics header,
/// that writes one register.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param addr Register address; encoded relative to UCONFIG_SPACE_START__CI__VI
/// @param value Value to write into the register
///
/// @return void
void Gfx8CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  WriteRegPacket reg_write;

  // dword 0: header, dword 1: register offset, dword 2: payload
  reg_write.item[2] = value;
  reg_write.item[1] = (addr - UCONFIG_SPACE_START__CI__VI);
  reg_write.item[0] = (PM4_TYPE_3_HDR(IT_SET_UCONFIG_REG__CI__VI, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS,
                                      ShaderGraphics, 0));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
/// @brief Appends a SET_UCONFIG_REG packet, with a ShaderCompute header,
/// that writes one register.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param addr Register address; encoded relative to UCONFIG_SPACE_START__CI__VI
/// @param value Value to write into the register
///
/// @return void
void Gfx8CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  WriteRegPacket reg_write;

  // dword 0: header, dword 1: register offset, dword 2: payload
  reg_write.item[2] = value;
  reg_write.item[1] = (addr - UCONFIG_SPACE_START__CI__VI);
  reg_write.item[0] = (PM4_TYPE_3_HDR(IT_SET_UCONFIG_REG__CI__VI, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS,
                                      ShaderCompute, 0));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
/// @brief Appends a SET_SH_REG packet, with a ShaderCompute header,
/// that writes one register.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param addr Register address; encoded relative to PERSISTENT_SPACE_START
/// @param value Value to write into the register
///
/// @return void
void Gfx8CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  WriteRegPacket reg_write;

  // dword 0: header, dword 1: register offset, dword 2: payload
  reg_write.item[2] = value;
  reg_write.item[1] = (addr - PERSISTENT_SPACE_START);
  reg_write.item[0] =
      (PM4_TYPE_3_HDR(IT_SET_SH_REG, 1 + PM4_CMD_SET_SH_REG_DWORDS, ShaderCompute, 0));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
/// @brief Appends a COPY_DATA packet that copies from the given source
/// selection into memory at dst_addr.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param src_sel Value stored in the packet's srcSel field
/// @param src_addr_lo Low 32 bits of the source address
/// @param src_addr_hi High 32 bits of the source address
/// @param dst_addr Destination address
/// @param size Value stored in the packet's countSel field
/// @param wait Value stored in the packet's wrConfirm field
///
/// @return void
void Gfx8CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
                                        uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
                                        bool wait) {
  // Start from an all-zero packet and fill in the header
  PM4CMDCOPYDATA copy_packet;
  memset(&copy_packet, 0, sizeof(copy_packet));
  copy_packet.header.u32All = PACKET3(IT_COPY_DATA, 5);

  // Source description
  copy_packet.srcAtc__CI = atc_support_;
  copy_packet.srcCachePolicy__CI = COPY_DATA_SRC_CACHE_POLICY_BYPASS;
  copy_packet.srcSel = src_sel;
  copy_packet.srcAddressLo = src_addr_lo;
  copy_packet.srcAddressHi = src_addr_hi;

  // Destination description
  copy_packet.dstAtc__CI = atc_support_;
  copy_packet.dstSel = COPY_DATA_SEL_DST_ASYNC_MEMORY;
  copy_packet.dstCachePolicy__CI = COPY_DATA_DST_CACHE_POLICY_BYPASS;
  copy_packet.dstAddressLo = PtrLow32(dst_addr);
  copy_packet.dstAddressHi = PtrHigh32(dst_addr);

  // Transfer control
  copy_packet.countSel = size;
  copy_packet.wrConfirm = wait;
  copy_packet.engineSel = COPY_DATA_ENGINE_ME;

  APPEND_COMMAND_WRAPPER(cmdbuf, copy_packet);
}
|
||||
|
||||
/// @brief Appends a hand-encoded seven-dword ACQUIRE_MEM packet.
///
/// @param cmdbuf Command buffer the packet is appended to
///
/// @return void
void Gfx8CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) {
  WriteEventPacket flush_packet;

  flush_packet.item[0] = PACKET3(IT_ACQUIRE_MEM__CI__VI, 6);
  // NOTE(review): raw encoded values below; presumably the coherency
  // control word and the coherency size - confirm against the packet spec
  flush_packet.item[1] = 0x28C00000;
  flush_packet.item[2] = 0xFFFFFFFF;

  // Dwords 3..5 are left as zero
  for (uint32_t idx = 3; idx < 6; ++idx) {
    flush_packet.item[idx] = 0;
  }

  // NOTE(review): presumably the poll interval - confirm
  flush_packet.item[6] = 0x00000004;

  APPEND_COMMAND_WRAPPER(cmdbuf, flush_packet);
}
|
||||
|
||||
/// @brief Appends the barrier command followed by the cache flush packet.
///
/// @param cmdbuf Command buffer the packets are appended to
///
/// @return void
void Gfx8CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) {
  // Order matters: barrier first, cache flush second
  BuildBarrierCommand(cmdbuf);
  BuildCacheFlushPacket(cmdbuf);
}
|
||||
|
||||
// Will issue a VGT event including a cache flush later on
|
||||
void Gfx8CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) {
|
||||
PM4CMDEVENTWRITE cp_event_initiator;
|
||||
|
||||
cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 1);
|
||||
cp_event_initiator.ordinal2 = 0;
|
||||
|
||||
VGT_EVENT_TYPE eventType = Reserved_0x00;
|
||||
switch (vgtEvent) {
|
||||
case kPerfCntrsStart:
|
||||
eventType = PERFCOUNTER_START;
|
||||
break;
|
||||
case kPerfCntrsStop:
|
||||
eventType = PERFCOUNTER_STOP;
|
||||
break;
|
||||
case kPerfCntrsSample:
|
||||
eventType = PERFCOUNTER_SAMPLE;
|
||||
break;
|
||||
case kThrdTraceStart:
|
||||
eventType = THREAD_TRACE_START;
|
||||
break;
|
||||
case kThrdTraceStop:
|
||||
eventType = THREAD_TRACE_STOP;
|
||||
break;
|
||||
case kThrdTraceFlush:
|
||||
eventType = THREAD_TRACE_FLUSH;
|
||||
break;
|
||||
case kThrdTraceFinish:
|
||||
eventType = THREAD_TRACE_FINISH;
|
||||
break;
|
||||
default:
|
||||
assert(false && "Illegal VGT Event Id");
|
||||
}
|
||||
|
||||
cp_event_initiator.eventType = eventType;
|
||||
cp_event_initiator.eventIndex = EventTypeToIndexTable[eventType];
|
||||
|
||||
APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
|
||||
|
||||
// Check If I should be issuing a cache flush operation as well
|
||||
// test and remove it
|
||||
BuildCacheFlushPacket(cmdbuf);
|
||||
return;
|
||||
}
|
||||
|
||||
/// @brief Appends a SET_CONFIG_REG packet, with a ShaderGraphics header,
/// that writes one register.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param addr Register address; encoded relative to CONFIG_SPACE_START
/// @param value Value to write into the register
///
/// @return void
void Gfx8CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  WriteRegPacket reg_write;

  // dword 0: header, dword 1: register offset, dword 2: payload
  reg_write.item[2] = value;
  reg_write.item[1] = addr - CONFIG_SPACE_START;
  reg_write.item[0] =
      (PM4_TYPE_3_HDR(IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
/// @brief Appends an EVENT_WRITE query packet that carries a memory address.
///
/// No event id is handled yet: the switch below has only a default
/// case, so every call trips the "Illegal VGT Event Id" assert when asserts
/// are enabled.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param event Event id (none are currently accepted)
/// @param addr Address stored in the packet; its low 32 bits must be
/// 8 byte aligned
///
/// @return void
void Gfx8CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) {
  PM4CMDEVENTWRITEQUERY cp_event_initiator;
  cp_event_initiator.ordinal1 = PACKET3(IT_EVENT_WRITE, 3);
  cp_event_initiator.ordinal2 = 0;

  // Update switch statements you want to support
  VGT_EVENT_TYPE eventType = Reserved_0x00;
  switch (event) {
    default:
      assert(false && "Illegal VGT Event Id");
  }

  cp_event_initiator.eventType = eventType;
  cp_event_initiator.eventIndex = EventTypeToIndexTable[eventType];

  // Split the address and check its alignment with one direct assertion
  uint32_t addrLo = PtrLow32(addr);
  uint32_t addrHi = PtrHigh32(addr);
  assert(((addrLo & 0x7) == 0) && "query address must be 8 byte aligned");

  cp_event_initiator.ordinal3 = 0;
  cp_event_initiator.ordinal4 = 0;
  cp_event_initiator.addressLo = addrLo;
  cp_event_initiator.addressHi = addrHi;

  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
}
|
||||
|
||||
/// @brief Appends the pre-built barrier packet (pending_dispatch_template_).
///
/// @param cmdBuf Command buffer the packet is appended to
///
/// @return void
void Gfx8CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) {
  // The template is appended as-is; no per-call fields are patched
  APPEND_COMMAND_WRAPPER(cmdBuf, pending_dispatch_template_);
}
|
||||
|
||||
/// @brief Copies count 32-bit words from src_addr to dst_addr.
///
/// @param dst_addr Destination of the copy
/// @param count Number of 32-bit words to copy
/// @param src_addr Source of the copy
///
/// @return void
void Gfx8CmdWriter::WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr) {
  // The count is in dwords; memcpy wants bytes
  const size_t byte_count = count * sizeof(uint32_t);
  memcpy(dst_addr, src_addr, byte_count);
}
|
||||
|
||||
|
||||
/// @brief Appends a 32-bit atomic memory packet built from atomic_template_.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param atomic_op Atomic operation to encode. An operation not listed in
/// the switch leaves the template's operation fields unchanged.
/// @param addr Target address; asserted to be 8 byte aligned
/// @param value Operand for add, subtract, swap and compare-and-swap
/// @param compare Comparison value for the compare-and-swap operations
///
/// @return void
void Gfx8CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op,
                                      volatile uint32_t* addr, uint32_t value,
                                      uint32_t compare) {
  gfx8::AtomicTemplate packet = atomic_template_;

  // make sure the destination address is aligned
  const uint32_t addr_lo = PtrLow32((void*)addr);
  const uint32_t addr_hi = PtrHigh32((void*)addr);
  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");

  packet.atomic.addressLo = addr_lo;
  packet.atomic.addressHi = addr_hi;

  switch (atomic_op) {
    // Increment and decrement are add / subtract with an operand of one
    case CommandWriter::kAtomicTypeIncrement:
      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_32;
      packet.atomic.srcDataLo = 1;
      break;
    case CommandWriter::kAtomicTypeDecrement:
      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_32;
      packet.atomic.srcDataLo = 1;
      break;

    // Both compare-and-swap flavours share the operands; the blocking
    // flavour additionally sets command and loopInterval
    case CommandWriter::kAtomicTypeCompareAndSwap:
    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      packet.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_32;
      packet.atomic.srcDataLo = value;
      packet.atomic.cmpDataLo = compare;
      if (atomic_op == CommandWriter::kAtomicTypeBlockingCompareAndSwap) {
        packet.atomic.command = 1;
        packet.atomic.loopInterval = 128;
      }
      break;

    case CommandWriter::kAtomicAdd:
      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_32;
      packet.atomic.srcDataLo = value;
      break;
    case CommandWriter::kAtomicSubtract:
      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_32;
      packet.atomic.srcDataLo = value;
      break;
    case CommandWriter::kAtomicSwap:
      packet.atomic.atomOp = TC_OP_ATOMIC_SWAP_RTN_32;
      packet.atomic.srcDataLo = value;
      break;
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
/// @brief Appends a 64-bit atomic memory packet built from atomic_template_.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param atomic_op Atomic operation to encode. An operation not listed in
/// the switch leaves the template's operation fields unchanged.
/// @param addr Target address; asserted to be 8 byte aligned
/// @param value Operand for add, subtract, swap and compare-and-swap
/// @param compare Comparison value for the compare-and-swap operations
///
/// @return void
void Gfx8CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op,
                                        volatile uint64_t* addr, uint64_t value,
                                        uint64_t compare) {
  AtomicTemplate packet = atomic_template_;

  // make sure the destination address is aligned
  const uint32_t addr_lo = PtrLow32((void*)addr);
  const uint32_t addr_hi = PtrHigh32((void*)addr);
  assert(!(addr_lo & 0x7) && "destination address must be 8 byte aligned");

  packet.atomic.addressLo = addr_lo;
  packet.atomic.addressHi = addr_hi;

  packet.atomic.atc = (atc_support_) ? 1 : 0;
  packet.atomic.cachePolicy = 2;

  switch (atomic_op) {
    // Increment and decrement only program the low operand dword
    case CommandWriter::kAtomicTypeIncrement:
      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_64;
      packet.atomic.srcDataLo = 1;
      break;
    case CommandWriter::kAtomicTypeDecrement:
      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_64;
      packet.atomic.srcDataLo = 1;
      break;

    // Both compare-and-swap flavours share the operands; the blocking
    // flavour additionally sets command and loopInterval
    case CommandWriter::kAtomicTypeCompareAndSwap:
    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      packet.atomic.atomOp = TC_OP_ATOMIC_CMPSWAP_RTN_64;
      packet.atomic.srcDataLo = Low32(value);
      packet.atomic.srcDataHi = High32(value);
      packet.atomic.cmpDataLo = Low32(compare);
      packet.atomic.cmpDataHi = High32(compare);
      if (atomic_op == CommandWriter::kAtomicTypeBlockingCompareAndSwap) {
        packet.atomic.command = 1;
        packet.atomic.loopInterval = 128;
      }
      break;

    case CommandWriter::kAtomicAdd:
      packet.atomic.atomOp = TC_OP_ATOMIC_ADD_RTN_64;
      packet.atomic.srcDataLo = Low32(value);
      packet.atomic.srcDataHi = High32(value);
      break;
    case CommandWriter::kAtomicSubtract:
      packet.atomic.atomOp = TC_OP_ATOMIC_SUB_RTN_64;
      packet.atomic.srcDataLo = Low32(value);
      packet.atomic.srcDataHi = High32(value);
      break;
    case CommandWriter::kAtomicSwap:
      packet.atomic.atomOp = TC_OP_ATOMIC_SWAP_RTN_64;
      packet.atomic.srcDataLo = Low32(value);
      packet.atomic.srcDataHi = High32(value);
      break;
  }

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
size_t Gfx8CmdWriter::SizeOfAtomicPacket() const {
|
||||
return sizeof(AtomicTemplate) / sizeof(uint32_t);
|
||||
}
|
||||
|
||||
/// @brief Appends a conditional execution packet built from
/// conditional_template_.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param signal Address stored in the packet's boolAddrLo / boolAddrHi
/// fields; asserted to be 8 byte aligned
/// @param count Value stored in the packet's execCount field
///
/// @return void
void Gfx8CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) {
  ConditionalExecuteTemplate packet = conditional_template_;

  // Split the signal address and verify its alignment
  const uint32_t signal_lo = PtrLow32(signal);
  const uint32_t signal_hi = PtrHigh32(signal);
  assert(!(signal_lo & 0x7) && "destination address must be 8 byte aligned");

  packet.conditional.boolAddrLo = signal_lo;
  packet.conditional.boolAddrHi = signal_hi;
  packet.conditional.execCount = count;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
/// @brief Appends a WRITE_DATA packet that writes a 32-bit value.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param write_addr Address the value is written to
/// @param write_value 32-bit value to write
///
/// @return void
void Gfx8CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
                                          uint32_t write_value) {
  // Start from the pre-initialized template
  gfx8::WriteDataTemplate packet = write_data_template_;

  // Destination address, split into its two halves
  packet.write_data.dstAddrLo = PtrLow32(write_addr);
  packet.write_data.dstAddrHi = PtrHigh32(write_addr);

  // Payload
  packet.write_data_value = write_value;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
/// @brief Appends a WRITE_DATA packet that writes a 64-bit value.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param write_addr Address the value is written to
/// @param write_value 64-bit value to write
///
/// @return void
void Gfx8CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
                                            uint64_t write_value) {
  // Start from the pre-initialized template
  gfx8::WriteData64Template packet = write_data64_template_;

  // Destination address, split into its two halves
  packet.write_data.dstAddrLo = PtrLow32(write_addr);
  packet.write_data.dstAddrHi = PtrHigh32(write_addr);

  // Payload
  packet.write_data_value = write_value;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
/// @brief Appends an ACQUIRE_MEM packet that flushes the caches selected
/// in options.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param options Selects which caches are acted on through its l1, l2,
/// icache and kcache members; must not be NULL
/// @param writeAddr Unused here; kept so the signature matches the SI
/// variant of this interface. May be NULL.
/// @param writeVal Unused here; kept for the same reason
///
/// @return void
void Gfx8CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options,
                                       uint32_t* writeAddr, uint32_t writeVal) {
  // The write back address and value are NOT used on CI. They stay in the
  // signature for parity with SI, so any value (including NULL) is accepted.
  (void)writeAddr;
  (void)writeVal;

  PM4CMDACQUIREMEM flushCmd;
  memset(&flushCmd, 0, sizeof(flushCmd));

  // Initialize the command header
  gfx8::GenerateCmdHeader(&flushCmd, IT_ACQUIRE_MEM__CI__VI);

  // Specify the base address of memory being synchronized
  flushCmd.cpCoherBase.u32All = 0;
  flushCmd.cpCoherBaseHi.u32All = 0;

  // Specify the size of memory being synchronized. It is indicated
  // as follows:
  //    COHER_SIZE_256B_MASK = 0xffffffffL
  //    COHER_SIZE_HI_256B_MASK__CI__VI = 0x000000ffL
  flushCmd.cpCoherSize.u32All = CP_COHER_SIZE__COHER_SIZE_256B_MASK;
  flushCmd.cpCoherSizeHi.u32All = CP_COHER_SIZE_HI__COHER_SIZE_HI_256B_MASK__CI__VI;

  // Periodicity of polling - interval to wait from the time
  // an unsuccessful polling result is returned until a new
  // poll is issued
  flushCmd.pollInterval = 0x04;

  // Program Coherence Control Register: enable one action per
  // cache the caller asked for
  uint32_t coher_cntl = 0;

  coher_cntl |= (options->l1) ? CP_COHER_CNTL__TCL1_ACTION_ENA_MASK : 0;
  coher_cntl |= (options->l2)
      ? (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK__CI__VI)
      : 0;
  coher_cntl |= (options->icache) ? CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK : 0;
  coher_cntl |= (options->kcache) ? CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK : 0;
  flushCmd.coherCntl = coher_cntl;

  // Copy AcquireMem command buffer stream
  APPEND_COMMAND_WRAPPER(cmdbuf, flushCmd);
}
|
||||
|
||||
/// @brief Appends a DMA_DATA packet that copies copySize bytes from
/// srcAddr to dstAddr.
///
/// @param cmdbuf Command buffer the packet is appended to
/// @param srcAddr Address of the source buffer
/// @param dstAddr Address of the destination buffer
/// @param copySize Number of bytes to copy; asserted to be below 0x1FFFFF
/// @param waitForConfirm Value stored in the packet's rawWait field
///
/// @return void
void Gfx8CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr,
                                       uint32_t copySize, bool waitForConfirm) {
  PM4CMDDMADATA dma_packet;
  memset(&dma_packet, 0, sizeof(dma_packet));
  dma_packet.header.u32All =
      (PM4_TYPE_3_HDR(IT_DMA_DATA__CI__VI, PM4_CMD_DMA_DATA_DWORDS, ShaderCompute, 0));

  // Id of Micro Engine
  dma_packet.engine = 0;

  // Source buffer attributes: location, ATC property, cache policy
  // and volatile. A cache policy of 1 means to Stream.
  dma_packet.srcSel = 0;
  dma_packet.srcATC = atc_support_;
  dma_packet.srcCachePolicy = 1;
  dma_packet.srcVolatile = 0;

  // Destination buffer attributes, same meaning as for the source
  dma_packet.dstSel = 0;
  dma_packet.dstATC = atc_support_;
  dma_packet.dstCachePolicy = 1;
  dma_packet.dstVolatile = 0;

  // Source and destination addresses
  dma_packet.srcAddrLoOrData = PtrLow32(srcAddr);
  dma_packet.srcAddrHi = PtrHigh32(srcAddr);
  dma_packet.dstAddrLo = PtrLow32(dstAddr);
  dma_packet.dstAddrHi = PtrHigh32(dstAddr);

  // Number of bytes to copy. The command restricts
  // the size to be (2 MB - 1) - 21 Bits
  assert(copySize < 0x1FFFFF);
  dma_packet.command.byteCount = copySize;

  // Indicate that DMA Cmd should wait if its source
  // is the destination of a previous DMA Cmd
  dma_packet.command.rawWait = waitForConfirm;

  APPEND_COMMAND_WRAPPER(cmdbuf, dma_packet);
}
|
||||
|
||||
} // gfx8
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,201 @@
|
||||
#ifndef _GFX8_CMDWRITER_H_
|
||||
#define _GFX8_CMDWRITER_H_
|
||||
|
||||
#include "cmdwriter.h"
|
||||
#include "gfx8_cmds.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
namespace gfx8 {
|
||||
|
||||
/// @brief class Gfx8CmdWriter implements the virtual class CommandWriter
|
||||
/// for Sea Islands (CI) and VI chipset
|
||||
class Gfx8CmdWriter : public CommandWriter {
|
||||
public:
|
||||
Gfx8CmdWriter(bool atc_support, bool pcie_atomic_support);
|
||||
|
||||
/// @brief Dword specifying NOOP command for SI/CI/VI chipsets. The macro
|
||||
/// populates the NOOP command which is 32-bits wide. The second parameter,
|
||||
/// the COUNT field of NOOP command, specifies the number of Dwords to skip.
|
||||
/// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro
|
||||
/// decrements the second parameter by TWO, an artifact of its definition,
|
||||
/// the value is incremented by TWO to 0x4001 (0x3FFF + 2).
|
||||
///
|
||||
inline uint32_t GetNoOpCmd() {
|
||||
static const uint32_t nopCmd = PM4_TYPE_3_HDR(IT_NOP, 0x4001, ShaderCompute, 0);
|
||||
return nopCmd;
|
||||
}
|
||||
|
||||
void BuildBarrierCommand(CmdBuf* cmdBuf);
|
||||
|
||||
void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size);
|
||||
|
||||
void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
|
||||
bool interrupt);
|
||||
|
||||
void BuildBarrierFenceCommands(CmdBuf* cmdbuf);
|
||||
|
||||
void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event);
|
||||
|
||||
void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq,
|
||||
uint32_t mask_val, uint32_t wait_val);
|
||||
|
||||
void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
/// @brief Build CP command to program a Gpu register
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
/// @param addr Register to be programmed
|
||||
/// @param value Value to write into register
|
||||
///
|
||||
/// @return void
|
||||
void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
|
||||
uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait);
|
||||
|
||||
void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf);
|
||||
|
||||
// Will issue a VGT event including a cache flush later on
|
||||
void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent);
|
||||
|
||||
void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr);
|
||||
|
||||
void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
|
||||
uint32_t value, uint32_t compare);
|
||||
|
||||
void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
|
||||
uint64_t value = 0, uint64_t compare = 0);
|
||||
|
||||
size_t SizeOfAtomicPacket() const;
|
||||
|
||||
void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count);
|
||||
|
||||
void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value);
|
||||
|
||||
void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value);
|
||||
|
||||
void BuildCacheFlushPacket(CmdBuf* cmdbuf);
|
||||
|
||||
/// Writes into input buffer Gpu commands to flush its cache. It is
|
||||
/// necessary that the buffer provided for flush commands is large
|
||||
/// enough to accommodate the full set of commands. It should be at
|
||||
/// least 512 bytes.
|
||||
///
|
||||
/// @param tsCmdBuf Buffer to write commands to.
|
||||
/// @param writeAddr Registered address into which GPU should write
|
||||
/// a user provided value upon executing the flush commands.
|
||||
/// @param writeVal User provided value written by GPU at user provided
|
||||
/// address, upon executing the flush commands.
|
||||
///
|
||||
/// @return void
|
||||
void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr,
|
||||
uint32_t writeVal);
|
||||
|
||||
/// Builds Gpu command to copy data from source to destination buffer
|
||||
/// using DMA engine.
|
||||
///
|
||||
/// @param cmdbuf Buffer updated with Gpu copy command
|
||||
/// @param srcAddr Address of source buffer address
|
||||
/// @param dstAddr Address of destination buffer address
|
||||
/// @param copySize Size of data to copy in bytes
|
||||
/// @param waitForCompletion if command should wait for copying to complete
|
||||
void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize,
|
||||
bool waitForCompletion);
|
||||
|
||||
protected:
|
||||
/// @brief Copies data from source buffer to destination buffer
|
||||
///
|
||||
/// @param dst_addr Address of destination buffer data
|
||||
///
|
||||
/// @count Size of data to copy in 32-bit words
|
||||
///
|
||||
/// @param src_addr Address of buffer containing source data
|
||||
///
|
||||
/// @return void
|
||||
virtual void WriteUserData(uint32_t* dst_addr, uint32_t count, const void* src_addr);
|
||||
|
||||
/// @brief Append an instance of Gpu command into input command buffer stream.
|
||||
///
|
||||
/// @param cmdbuf CommandWriter object appended with anohter Gpu command
|
||||
///
|
||||
/// @param cmd Gpu command to be appended into command buffer
|
||||
///
|
||||
/// @return void
|
||||
template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd);
|
||||
|
||||
private:
|
||||
/// @brief Initializes a Gpu command which can be used to
|
||||
/// reference a Gpu command stream indirectly
|
||||
void InitializeLaunchTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command to perform atomic operations
|
||||
////
|
||||
void InitializeAtomicTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command to allow conditional execution
|
||||
/// of a Gpu command stream
|
||||
void InitializeConditionalTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command to let command processor
|
||||
/// wait for some update before letting other commands to be
|
||||
/// processed
|
||||
void InitializeWaitRegMemTemplate();
|
||||
|
||||
/// @brief Initializes the template for Barrier command.
|
||||
/// Applications can use Barrier command to ensure their
|
||||
/// command is executed only after all other commands have
|
||||
/// completed their execution.
|
||||
void InitializeBarrierTemplate();
|
||||
|
||||
void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value);
|
||||
|
||||
/// @brief Initializes Acquire Memory command template. Users
|
||||
/// can submit this command to invalidate Gpu caches - L1 and
|
||||
/// or L2.
|
||||
void InitializeAcquireMemTemplate();
|
||||
|
||||
/// @brief Initializes an instance of Write Data command
|
||||
/// for use by an application
|
||||
void InitializeWriteDataTemplate();
|
||||
void InitializeWriteData64Template();
|
||||
|
||||
/// @brief Instance of Gpu command to reference dispatch commands
|
||||
LaunchTemplate launch_template_;
|
||||
|
||||
/// @brief Instance of Gpu command to use in performing atomic operations
|
||||
AtomicTemplate atomic_template_;
|
||||
|
||||
/// @brief Instance of Gpu command to use in conditional execution
|
||||
/// of a command stream
|
||||
ConditionalExecuteTemplate conditional_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command WRITE_DATA
|
||||
WriteDataTemplate write_data_template_;
|
||||
WriteData64Template write_data64_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command EVENT_WRITE
|
||||
BarrierTemplate pending_dispatch_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command ACQUIRE_MEM
|
||||
AcquireMemTemplate invalidate_cache_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command WAIT_REG_MEM
|
||||
WaitRegMemTemplate wait_reg_mem_template_;
|
||||
|
||||
/// @brief ATC support.
|
||||
bool atc_support_;
|
||||
|
||||
/// @brief PCIe atomic support.
|
||||
bool pcie_atomic_support_;
|
||||
};
|
||||
|
||||
} // gfx8
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GFX8_CMDWRITER_H_
|
||||
@@ -0,0 +1,90 @@
|
||||
#ifndef _GFX9_CMDS_H_
|
||||
#define _GFX9_CMDS_H_
|
||||
|
||||
#include "gfxip/gfx9/gfx9_utils.h"
|
||||
#include "gfxip/gfx9/gfx9_enum.h"
|
||||
#include "gfxip/gfx9/gfx9_mask.h"
|
||||
#include "gfxip/gfx9/gfx9_offset.h"
|
||||
#include "gfxip/gfx9/gfx9_typedef.h"
|
||||
#include "gfxip/gfx9/gfx9_registers.h"
|
||||
#include "gfxip/gfx9/gfx9_pm4_it_opcodes.h"
|
||||
#include "gfxip/gfx9/f32_mec_pm4_packets_vg10.h"
|
||||
#include "gfxip/gfx9/f32_pfp_pm4_packets_vg10.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
namespace gfx9 {
|
||||
|
||||
/// @brief Initializer for commands that set shader registers
|
||||
template <class T> void GenerateSetShRegHeader(T* pm4, uint32_t reg_addr) {
|
||||
pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_SH_REG, sizeof(T) / sizeof(uint32_t));
|
||||
pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - PERSISTENT_SPACE_START;
|
||||
}
|
||||
|
||||
// @brief Initializer for various Gpu command headers
|
||||
template <class T> void GenerateCmdHeader(T* pm4, IT_OpCodeType op_code) {
|
||||
pm4->header.u32All = PM4_TYPE3_HDR(op_code, sizeof(T) / sizeof(uint32_t));
|
||||
}
|
||||
|
||||
// @brief Initializer for commands that set configuration registers
|
||||
template <class T> void GenerateSetConfigRegHeader(T* pm4, uint32_t reg_addr) {
|
||||
pm4->cmd_set_data.header.u32All = PM4_TYPE3_HDR(IT_SET_CONFIG_REG, sizeof(T) / sizeof(uint32_t));
|
||||
pm4->cmd_set_data.bitfields2.reg_offset = reg_addr - CONFIG_SPACE_START;
|
||||
}
|
||||
|
||||
/// @brief Structure used to issue a Gpu Barrier command
|
||||
struct BarrierTemplate {
|
||||
PM4MEC_EVENT_WRITE event_write;
|
||||
};
|
||||
|
||||
/// @brief Structure used to configure the flushing of
|
||||
/// various caches - instruction, constants, L1 and L2
|
||||
struct AcquireMemTemplate {
|
||||
PM4MEC_ACQUIRE_MEM acquire_mem;
|
||||
};
|
||||
|
||||
/// @brief Structure used to reference another Gpu command
|
||||
/// indirectly. Generally used to reference a list of Gpu
|
||||
/// commands (dispatch cmds) indirectly
|
||||
struct LaunchTemplate {
|
||||
PM4MEC_INDIRECT_BUFFER indirect_buffer;
|
||||
};
|
||||
|
||||
/// @brief Structure used to determine the end of
|
||||
/// a kernel including cache flushes and writing to
|
||||
/// a user configurable memory location
|
||||
struct EndofKernelNotifyTemplate {
|
||||
PM4MEC_RELEASE_MEM release_mem;
|
||||
};
|
||||
|
||||
// Desc: Strucuture used to perform various atomic
|
||||
// operations - add, subtract, increment, etc
|
||||
struct AtomicTemplate {
|
||||
PM4MEC_ATOMIC_MEM atomic;
|
||||
};
|
||||
|
||||
/// @brief PM4 command to write a 32-bit value into a memory
|
||||
/// location accessible to Gpu
|
||||
struct WriteDataTemplate {
|
||||
PM4MEC_WRITE_DATA write_data;
|
||||
uint32_t write_data_value;
|
||||
};
|
||||
|
||||
/// @brief PM4 command to write a 64-bit value into a memory
|
||||
/// location accessible to Gpu
|
||||
struct WriteData64Template {
|
||||
PM4MEC_WRITE_DATA write_data;
|
||||
uint64_t write_data_value;
|
||||
};
|
||||
|
||||
/// @brief PM4 command to wait for a certain event before proceeding
|
||||
/// to process another command on the queue
|
||||
struct WaitRegMemTemplate {
|
||||
PM4MEC_WAIT_REG_MEM wait_reg_mem;
|
||||
};
|
||||
|
||||
} // gfx9
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GFX9_CMDS_H_
|
||||
@@ -0,0 +1,743 @@
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
|
||||
#include "gfx9_cmdwriter.h"
|
||||
|
||||
/// @brief Debug helper that dumps a PM4 packet to std::clog as a label
/// followed by a row of zero-padded, 8-digit hexadecimal dwords. Compiles
/// to a no-op when NDEBUG is defined.
///
/// @param command Packet to dump; read as sizeof(command) / 4 dwords
/// @param name Label printed in front of the dump
template <class T>
static void PrintPm4Packet(const T& command, const char* name) {
#if !defined(NDEBUG)
  const uint32_t* cmd = reinterpret_cast<const uint32_t*>(&command);
  const uint32_t size = sizeof(command) / sizeof(uint32_t);

  std::ostringstream oss;
  oss << "'" << name << "' size(" << std::dec << size << ")";

  // Remember the stream state so the sticky manipulators used below
  // (left/right, hex, fill) do not leak into later std::clog output
  const std::ios_base::fmtflags saved_flags = std::clog.flags();
  const char saved_fill = std::clog.fill();

  // Left-justified label column
  std::clog << std::setw(40) << std::left << oss.str() << ":";

  // std::left stays in effect until changed: switch back to right
  // adjustment, otherwise the '0' fill is appended after the digits
  // and 0x1 is printed as "10000000" instead of "00000001"
  std::clog << std::right << std::hex << std::setfill('0');
  for (uint32_t idx = 0; idx < size; idx++) {
    std::clog << " " << std::setw(8) << cmd[idx];
  }
  std::clog << std::endl;

  std::clog.fill(saved_fill);
  std::clog.flags(saved_flags);
#else
  // Nothing is printed in release builds
  (void)command;
  (void)name;
#endif
}
|
||||
|
||||
// Dumps 'command' through PrintPm4Packet, labelled with the calling
// function's name, then appends it to 'cmdbuf'. The do/while(0) wrapper
// makes the two calls behave as one statement, so the macro is safe inside
// an unbraced if/else; invoke it with a trailing semicolon.
#define APPEND_COMMAND_WRAPPER(cmdbuf, command) \
  do {                                          \
    PrintPm4Packet(command, __FUNCTION__);      \
    AppendCommand(cmdbuf, command);             \
  } while (0)
|
||||
|
||||
namespace pm4_profile {
|
||||
namespace gfx9 {
|
||||
|
||||
/// @brief Appends the raw bytes of a command structure to the command buffer.
///
/// @param cmdbuf Command buffer to append to
/// @param command Command structure whose sizeof(T) bytes are appended
///
/// @return void
template <class T> void Gfx9CmdWriter::AppendCommand(CmdBuf* cmdbuf, const T& command) {
  const T* packet = &command;
  cmdbuf->AppendCommand(packet, sizeof(T));
}
|
||||
|
||||
void Gfx9CmdWriter::InitializeLaunchTemplate() {
|
||||
memset(&launch_template_, 0, sizeof(launch_template_));
|
||||
GenerateCmdHeader(&launch_template_.indirect_buffer, IT_INDIRECT_BUFFER);
|
||||
}
|
||||
|
||||
/// @brief Zeroes the atomic template, stamps its ATOMIC_MEM header and
/// selects the stream cache policy.
void Gfx9CmdWriter::InitializeAtomicTemplate() {
  // Clear the whole template. The address and the size now refer to the
  // same object; previously the address was that of the 'atomic' member
  // while the size was that of the enclosing template.
  memset(&atomic_template_, 0, sizeof(atomic_template_));
  GenerateCmdHeader(&atomic_template_.atomic, IT_ATOMIC_MEM);

  // Specify the micro engine and cache policies
  PM4MEC_ATOMIC_MEM* atomicCmd = &atomic_template_.atomic;
  atomicCmd->bitfields2.cache_policy = cache_policy__mec_atomic_mem__stream;
}
|
||||
|
||||
void Gfx9CmdWriter::InitializeBarrierTemplate() {
|
||||
memset(&pending_dispatch_template_, 0, sizeof(pending_dispatch_template_));
|
||||
GenerateCmdHeader(&pending_dispatch_template_.event_write, IT_EVENT_WRITE);
|
||||
|
||||
MEC_EVENT_WRITE_event_index_enum index;
|
||||
index = event_index__mec_event_write__cs_partial_flush;
|
||||
pending_dispatch_template_.event_write.bitfields2.event_index = index;
|
||||
pending_dispatch_template_.event_write.bitfields2.event_type = CS_PARTIAL_FLUSH;
|
||||
}
|
||||
|
||||
void Gfx9CmdWriter::InitializeAcquireMemTemplate() {
|
||||
memset(&invalidate_cache_template_, 0, sizeof(invalidate_cache_template_));
|
||||
GenerateCmdHeader(&invalidate_cache_template_.acquire_mem, IT_ACQUIRE_MEM);
|
||||
|
||||
// Specify the CP module which will process this packet
|
||||
PM4MEC_ACQUIRE_MEM* acquire_mem = &invalidate_cache_template_.acquire_mem;
|
||||
|
||||
// Specify the size of memory to invalidate. Size is
|
||||
// specified in terms of 256 byte chunks. A coher_size
|
||||
// of 0xFFFFFFFF actually specified 0xFFFFFFFF00 (40 bits)
|
||||
// of memory. The field coher_size_hi specifies memory from
|
||||
// bits 40-64 for a total of 256 TB.
|
||||
acquire_mem->coher_size = 0xFFFFFFFF;
|
||||
acquire_mem->bitfields4.coher_size_hi = 0xFFFFFF;
|
||||
|
||||
// Specify the address of memory to invalidate. The
|
||||
// address must be 256 byte aligned.
|
||||
acquire_mem->coher_base_lo = 0x00;
|
||||
acquire_mem->bitfields6.coher_base_hi = 0x00;
|
||||
|
||||
// Specify the poll interval for determing if operation is complete
|
||||
acquire_mem->bitfields7.poll_interval = 0x04;
|
||||
}
|
||||
|
||||
void Gfx9CmdWriter::InitializeWaitRegMemTemplate() {
|
||||
memset(&wait_reg_mem_template_, 0, sizeof(wait_reg_mem_template_));
|
||||
GenerateCmdHeader(&wait_reg_mem_template_.wait_reg_mem, IT_WAIT_REG_MEM);
|
||||
|
||||
PM4MEC_WAIT_REG_MEM* wait_reg_mem = &wait_reg_mem_template_.wait_reg_mem;
|
||||
|
||||
wait_reg_mem->bitfields7.poll_interval = 0x04;
|
||||
wait_reg_mem->bitfields2.operation = operation__mec_wait_reg_mem__wait_reg_mem;
|
||||
}
|
||||
|
||||
// Initializes a WRITE_DATA packet in place.
//
// @param write_data Packet to initialize; it is zero-filled first.
// @param bit32      true when the payload following the packet is one 32-bit
//                   DWord, false when it is two 32-bit DWords.
void Gfx9CmdWriter::InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32) {
  memset(write_data, 0, sizeof(PM4MEC_WRITE_DATA));

  // The size encoded in the header covers the packet itself plus the payload
  const uint32_t payload_dwords = bit32 ? 1 : 2;
  const uint32_t total_dwords = payload_dwords + (sizeof(PM4MEC_WRITE_DATA) / sizeof(uint32_t));
  write_data->ordinal1 = PM4_TYPE3_HDR(IT_WRITE_DATA, total_dwords);

  // The write destination is memory
  write_data->bitfields2.dst_sel = dst_sel__mec_write_data__memory;

  // Increment the address when more than one DWord is written
  write_data->bitfields2.addr_incr = addr_incr__mec_write_data__increment_address;

  // Wait for the write confirmation and select the cache policy
  write_data->bitfields2.wr_confirm = wr_confirm__mec_write_data__wait_for_write_confirmation;
  write_data->bitfields2.cache_policy = cache_policy__mec_write_data__stream;
}
|
||||
|
||||
// Initializes the template used for WRITE_DATA packets with a 32-bit payload.
void Gfx9CmdWriter::InitializeWriteDataTemplate() {
  const bool payload_is_32bit = true;
  InitializeWriteDataTemplate(&write_data_template_.write_data, payload_is_32bit);
}
|
||||
|
||||
// Initializes the template used for WRITE_DATA packets with a 64-bit payload
// (two 32-bit DWords).
void Gfx9CmdWriter::InitializeWriteData64Template() {
  const bool payload_is_32bit = false;
  InitializeWriteDataTemplate(&write_data64_template_.write_data, payload_is_32bit);
}
|
||||
|
||||
// Currently a no-op: the COND_EXEC template setup below is disabled.
// The constructor still calls this method.
void Gfx9CmdWriter::InitializeConditionalTemplate() {
  // Disabled implementation, kept for reference:
  //
  //   memset(&conditional_template_.conditional, 0, sizeof(conditional_template_));
  //   GenerateCmdHeader(&conditional_template_.conditional, IT_COND_EXEC);
  //
  //   if (atc_support_) {
  //     const uint32_t kAtcShift = 24;
  //     conditional_template_.conditional.ordinal4 |= 1 << kAtcShift;
  //   }
}
|
||||
|
||||
// Prepares the reusable RELEASE_MEM packet template that BuildBOPNotifyCmd
// copies and completes: a bottom-of-pipe timestamp event that, by default,
// neither sends data nor raises an interrupt.
void Gfx9CmdWriter::InitializeEndOfKernelNotifyTemplate() {
  // The address-of operator on notify_template_ had been corrupted into the
  // '¬' character (an "&not" entity decoding artifact); it is restored here.
  memset(&notify_template_, 0, sizeof(notify_template_));
  GenerateCmdHeader(&notify_template_.release_mem, IT_RELEASE_MEM);

  // Set the event type to be bottom of pipe and cache policy
  PM4MEC_RELEASE_MEM* rel_mem = &notify_template_.release_mem;
  rel_mem->bitfields2.event_type = BOTTOM_OF_PIPE_TS;
  rel_mem->bitfields2.cache_policy = cache_policy__mec_release_mem__stream;
  rel_mem->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;

  // Specify the attributes of source and destinations of data
  rel_mem->bitfields3.int_sel = int_sel__mec_release_mem__none;
  rel_mem->bitfields3.data_sel = data_sel__mec_release_mem__none;
  rel_mem->bitfields3.dst_sel = dst_sel__mec_release_mem__memory_controller;
}
|
||||
|
||||
// Constructs a GFX9 command writer.
//
// @param atc_support         Stored in atc_support_.
// @param pcie_atomic_support Stored in pcie_atomic_support_; when true,
//                            BuildUpdateHostAddress emits an atomic packet
//                            instead of a plain WRITE_DATA.
Gfx9CmdWriter::Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support) {
  // Record the capability flags before any template is built
  atc_support_ = atc_support;
  pcie_atomic_support_ = pcie_atomic_support;

  // Build every packet template once; the Build* methods copy them later
  InitializeLaunchTemplate();
  InitializeAtomicTemplate();
  InitializeBarrierTemplate();
  InitializeAcquireMemTemplate();
  InitializeWaitRegMemTemplate();
  InitializeWriteDataTemplate();
  InitializeWriteData64Template();
  InitializeConditionalTemplate();
  InitializeEndOfKernelNotifyTemplate();
}
|
||||
|
||||
// Appends an INDIRECT_BUFFER packet that points at another command stream.
//
// @param cmdbuf   Command buffer the packet is appended to.
// @param cmd_addr Address of the indirect buffer; must be 4 byte aligned.
// @param cmd_size Size of the indirect buffer in bytes (encoded in DWords).
void Gfx9CmdWriter::BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr,
                                           std::size_t cmd_size) {
  // Verify the address is 4-byte aligned
  uint64_t addr = uintptr_t(cmd_addr);
  assert(!(addr & 0x3) && "IndirectBuffer address must be 4 byte aligned");

  // Start from a copy of the pre-built template
  LaunchTemplate packet = launch_template_;

  // Address of the indirect buffer: the low part is stored without its
  // two least significant bits
  packet.indirect_buffer.ib_base_hi = PtrHigh32(cmd_addr);
  packet.indirect_buffer.bitfields2.ib_base_lo = (PtrLow32(cmd_addr) >> 2);

  // Size of the indirect buffer in DWords, plus the flags and the cache
  // policy to apply while executing its commands
  packet.indirect_buffer.bitfields4.ib_size = cmd_size / sizeof(uint32_t);
  packet.indirect_buffer.bitfields4.cache_policy = cache_policy__mec_indirect_buffer__stream;
  packet.indirect_buffer.bitfields4.valid = 1;
  packet.indirect_buffer.bitfields4.priv = 0;

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends an ATOMIC_MEM packet performing a 32-bit atomic operation.
//
// @param cmdbuf    Command buffer the packet is appended to.
// @param atomic_op Operation to perform (see the switch below).
// @param addr      Destination address; asserted to be 8 byte aligned.
// @param value     Source operand (must be 1 for increment / decrement).
// @param compare   Comparison value, used by the compare-and-swap variants.
void Gfx9CmdWriter::BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
                                      uint32_t value, uint32_t compare) {
  // Work on a private copy of the pre-built template
  AtomicTemplate packet = atomic_template_;
  PM4MEC_ATOMIC_MEM& atomic = packet.atomic;

  // make sure the destination address is aligned
  uint32_t address_low = PtrLow32((void*)addr);
  uint32_t address_high = PtrHigh32((void*)addr);
  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
  atomic.addr_hi = address_high;
  atomic.addr_lo = address_low;

  switch (atomic_op) {
    case CommandWriter::kAtomicTypeIncrement:
      assert(!(value != 0x01) && "Atomic Increment value should be 1");
      // fall through: an increment is an add of 1
    case CommandWriter::kAtomicAdd:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_32;
      atomic.src_data_lo = value;
      break;

    case CommandWriter::kAtomicTypeDecrement:
      assert(!(value != 0x01) && "Atomic Decrement value should be 1");
      // fall through: a decrement is a subtract of 1
    case CommandWriter::kAtomicSubtract:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_32;
      atomic.src_data_lo = value;
      break;

    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      atomic.bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied;
      atomic.bitfields9.loop_interval = 128;
      // fall through: the blocking variant shares the compare-and-swap setup
    case CommandWriter::kAtomicTypeCompareAndSwap:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_32;
      atomic.src_data_lo = value;
      atomic.cmp_data_lo = compare;
      break;

    case CommandWriter::kAtomicSwap:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_32;
      atomic.src_data_lo = value;
      break;

    default:
      assert((false) && "Atomic operation id is invalid");
  }

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends an ATOMIC_MEM packet performing a 64-bit atomic operation.
//
// @param cmdbuf    Command buffer the packet is appended to.
// @param atomic_op Operation to perform (see the switch below).
// @param addr      Destination address; asserted to be 8 byte aligned.
// @param value     Source operand (must be 1 for increment / decrement).
// @param compare   Comparison value, used by the compare-and-swap variants.
void Gfx9CmdWriter::BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op,
                                        volatile uint64_t* addr, uint64_t value, uint64_t compare) {
  // Work on a private copy of the pre-built template
  AtomicTemplate packet = atomic_template_;
  PM4MEC_ATOMIC_MEM& atomic = packet.atomic;

  // make sure the destination address is aligned
  uint32_t address_low = PtrLow32((void*)addr);
  uint32_t address_high = PtrHigh32((void*)addr);
  assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
  atomic.addr_hi = address_high;
  atomic.addr_lo = address_low;

  switch (atomic_op) {
    case CommandWriter::kAtomicTypeIncrement:
      assert(!(value != 0x01) && "Atomic Increment value should be 1");
      // fall through: an increment is an add of 1
    case CommandWriter::kAtomicAdd:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_ADD_RTN_64;
      atomic.src_data_lo = Low32(value);
      atomic.src_data_hi = High32(value);
      break;

    case CommandWriter::kAtomicTypeDecrement:
      assert(!(value != 0x01) && "Atomic Decrement value should be 1");
      // fall through: a decrement is a subtract of 1
    case CommandWriter::kAtomicSubtract:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_SUB_RTN_64;
      atomic.src_data_lo = Low32(value);
      atomic.src_data_hi = High32(value);
      break;

    case CommandWriter::kAtomicTypeBlockingCompareAndSwap:
      atomic.bitfields2.command = command__mec_atomic_mem__loop_until_compare_satisfied;
      atomic.bitfields9.loop_interval = 128;
      // fall through: the blocking variant shares the compare-and-swap setup
    case CommandWriter::kAtomicTypeCompareAndSwap:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_CMPSWAP_RTN_64;
      atomic.src_data_lo = Low32(value);
      atomic.src_data_hi = High32(value);
      atomic.cmp_data_lo = Low32(compare);
      atomic.cmp_data_hi = High32(compare);
      break;

    case CommandWriter::kAtomicSwap:
      atomic.bitfields2.atomic = TC_OP_ATOMIC_SWAP_RTN_64;
      atomic.src_data_lo = Low32(value);
      atomic.src_data_hi = High32(value);
      break;

    default:
      assert((false) && "Atomic operation id is invalid");
  }

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends the pre-built barrier packet (the EVENT_WRITE / CS_PARTIAL_FLUSH
// template set up by InitializeBarrierTemplate) to the command buffer as is.
void Gfx9CmdWriter::BuildBarrierCommand(CmdBuf* cmdBuf) {
  // No per-call fields: the template is appended unchanged
  APPEND_COMMAND_WRAPPER(cmdBuf, pending_dispatch_template_);
}
|
||||
|
||||
// Appends a WRITE_DATA packet that writes one 32-bit value to memory.
//
// @param cmdbuf      Command buffer the packet is appended to.
// @param write_addr  Destination address; must be 4 byte aligned.
// @param write_value Value to write.
void Gfx9CmdWriter::BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr,
                                          uint32_t write_value) {
  // The destination has to be DWord aligned
  uint64_t addr = uintptr_t(write_addr);
  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");

  // Start from the initialized packet and fill in the payload
  WriteDataTemplate packet = write_data_template_;
  packet.write_data_value = write_value;

  // Destination address: the low part is stored without its two least
  // significant bits
  packet.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);
  packet.write_data.dst_mem_addr_hi = PtrHigh32(write_addr);

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends a WRITE_DATA packet that writes one 64-bit value to memory.
//
// @param cmdbuf      Command buffer the packet is appended to.
// @param write_addr  Destination address; asserted to be 4 byte aligned.
// @param write_value Value to write.
void Gfx9CmdWriter::BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr,
                                            uint64_t write_value) {
  // The destination has to be DWord aligned
  uint64_t addr = uintptr_t(write_addr);
  assert(!(addr & 0x3) && "WriteData address must be 4 byte aligned");

  // Start from the initialized packet and fill in the payload
  WriteData64Template packet = write_data64_template_;
  packet.write_data_value = write_value;

  // Destination address: the low part is stored without its two least
  // significant bits
  packet.write_data.dst_mem_addr_hi = PtrHigh32(write_addr);
  packet.write_data.bitfields3c.dst_mem_addr_lo = (PtrLow32(write_addr) >> 2);

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends a WAIT_REG_MEM packet that polls a register or a memory location.
//
// @param cmdbuf    Command buffer the packet is appended to.
// @param mem_space true to poll memory, false to poll a register.
// @param wait_addr Address / register to poll; memory addresses must be
//                  4 byte aligned.
// @param func_eq   true to wait until the masked value equals wait_val,
//                  false to wait until it differs.
// @param mask_val  Mask applied to the polled value.
// @param wait_val  Reference value for the comparison.
void Gfx9CmdWriter::BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr,
                                           bool func_eq, uint32_t mask_val, uint32_t wait_val) {
  WaitRegMemTemplate packet = wait_reg_mem_template_;

  // Space to which the address belongs
  packet.wait_reg_mem.bitfields2.mem_space = mem_space
      ? mem_space__mec_wait_reg_mem__memory_space
      : mem_space__mec_wait_reg_mem__register_space;

  // Comparison function - equal / not equal
  packet.wait_reg_mem.bitfields2.function = func_eq
      ? function__mec_wait_reg_mem__equal_to_the_reference_value
      : function__mec_wait_reg_mem__not_equal_reference_value;

  // Mask applied to the polled value and the reference it is compared with
  packet.wait_reg_mem.mask = mask_val;
  packet.wait_reg_mem.reference = wait_val;

  // The low part of the address is stored without its two least
  // significant bits in both spaces
  packet.wait_reg_mem.bitfields3a.mem_poll_addr_lo = (Low32(wait_addr) >> 2);

  // Only a memory address must be DWord (4 byte) aligned and carries an
  // upper 32 bit part
  if (mem_space) {
    assert(!(wait_addr & 0x3) && "WaitRegMem address must be 4 byte aligned");
    packet.wait_reg_mem.mem_poll_addr_hi = High32(wait_addr);
  }

  // Append the command to cmd stream
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Not implemented for GFX9: asserts in builds with assertions enabled and
// otherwise appends nothing. The parameters are unused.
void Gfx9CmdWriter::BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count) {
  assert(false && "BuildConditionalExecute method is not implemented");

  // Disabled implementation, kept for reference:
  //
  //   ConditionalExecuteTemplate conditional = conditional_template_;
  //
  //   uint32_t address_low = PtrLow32(signal);
  //   uint32_t address_high = PtrHigh32(signal);
  //   assert(!(address_low & 0x7) && "destination address must be 8 byte aligned");
  //
  //   conditional.conditional.boolAddrLo = address_low;
  //   conditional.conditional.boolAddrHi = address_high;
  //   conditional.conditional.execCount = count;
  //
  //   APPEND_COMMAND_WRAPPER(cmdbuf, conditional);
}
|
||||
|
||||
// Appends a packet that stores a 64-bit value at a host address.
//
// @param cmdbuf Command buffer the packet is appended to.
// @param addr   Destination address.
// @param value  Value to store.
void Gfx9CmdWriter::BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value) {
  if (!pcie_atomic_support_) {
    // Without atomics fall back to a plain 64-bit WRITE_DATA
    BuildWriteData64Command(cmdbuf, addr, value);
  } else {
    // If Atomics are supported, use an atomic swap
    BuildAtomicPacket64(cmdbuf, CommandWriter::AtomicType::kAtomicSwap, (volatile uint64_t*)addr,
                        value);
  }
}
|
||||
|
||||
// Appends a RELEASE_MEM packet (bottom-of-pipe notification) that writes a
// 32-bit value to memory and optionally raises an interrupt.
//
// @param cmdbuf      Command buffer the packet is appended to.
// @param write_addr  Address the value is written to.
// @param write_value 32-bit value to write.
// @param interrupt   true to request an interrupt after the write confirm.
void Gfx9CmdWriter::BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_value,
                                      bool interrupt) {
  // Start from the pre-built template (header already generated)
  EndofKernelNotifyTemplate packet = notify_template_;
  PM4MEC_RELEASE_MEM& rel_mem = packet.release_mem;

  // Cache actions the CP performs before the write is issued
  rel_mem.bitfields2.tc_wb_action_ena = true;
  rel_mem.bitfields2.tc_action_ena = true;

  // Destination address.
  // NOTE(review): the low part is stored shifted right by 3 in the
  // "address_lo_64b" field although only 32 bits of data are sent;
  // presumably the address is expected to be 8 byte aligned - confirm.
  rel_mem.bitfields4b.address_lo_64b = (Low32(uint64_t(write_addr) >> 3));
  rel_mem.address_hi = High32(uint64_t(write_addr));

  // Send the user specified 32-bit value
  rel_mem.bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
  rel_mem.data_lo = write_value;

  // Issue an interrupt only if the user has requested it
  if (interrupt) {
    rel_mem.bitfields3.int_sel = int_sel__mec_release_mem__send_interrupt_after_write_confirm;
  }

  // Serialize the command as stream of Dwords
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends an ACQUIRE_MEM packet with the TC_ACTION and TC_WB_ACTION
// coherency bits set.
void Gfx9CmdWriter::BuildBarrierFenceCommands(CmdBuf* cmdbuf) {
  // TODO: temporarily remove the check because some OpenCL tests
  // (test_buffers, test_relationals) are failing.
  // if (using_cc_memory_policy_)
  //  return;
  AcquireMemTemplate packet = invalidate_cache_template_;

  // wbINVL2 by default writes-back and invalidates both L1 and L2
  packet.acquire_mem.bitfields2.coher_cntl =
      CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
/*
|
||||
// PM4 packet for profilers
|
||||
#define PM4_PACKET3 (0xC0000000)
|
||||
#define PM4_PACKET3_CMD_SHIFT 8
|
||||
#define PM4_PACKET3_COUNT_SHIFT 16
|
||||
|
||||
#define PACKET3(cmd, count) \
|
||||
(PM4_PACKET3 | (((count)-1) << PM4_PACKET3_COUNT_SHIFT) | \
|
||||
((cmd) << PM4_PACKET3_CMD_SHIFT))
|
||||
*/
|
||||
|
||||
// Structure to store the event PM4 packet
|
||||
// Three-DWord register write packet; the builders below fill item[0] with
// the packet header, item[1] with the register offset and item[2] with the
// value to write.
typedef struct WriteRegPacket_ {
  uint32_t item[3];
} WriteRegPacket;
|
||||
|
||||
// Appends an EVENT_WRITE packet for one of the perf-counter events.
//
// @param cmdbuf Command buffer the packet is appended to.
// @param event  kPerfCntrsStart, kPerfCntrsStop or kPerfCntrsSample; any
//               other value trips an assert.
void Gfx9CmdWriter::BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event) {
  // Zero-filled packet with an EVENT_WRITE header
  PM4MEC_EVENT_WRITE packet;
  memset(&packet, 0, sizeof(PM4MEC_EVENT_WRITE));
  packet.ordinal1 = PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE) / sizeof(uint32_t)));
  packet.ordinal2 = 0;

  // Translate the profiler event id into the hardware event type
  VGT_EVENT_TYPE hw_event = Reserved_0x00;
  switch (event) {
    case kPerfCntrsStart:
      hw_event = PERFCOUNTER_START;
      break;
    case kPerfCntrsStop:
      hw_event = PERFCOUNTER_STOP;
      break;
    case kPerfCntrsSample:
      hw_event = PERFCOUNTER_SAMPLE;
      break;
    default:
      assert(false && "Illegal VGT Event Id");
  }

  const MEC_EVENT_WRITE_event_index_enum event_index = event_index__mec_event_write__other;
  packet.bitfields2.event_index = event_index;
  packet.bitfields2.event_type = hw_event;

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends a SET_UCONFIG_REG packet writing one register.
// The packet built here is the same as in BuildWriteUConfigRegPacket.
//
// @param cmdbuf Command buffer the packet is appended to.
// @param addr   Register address; encoded relative to UCONFIG_SPACE_START.
// @param value  Value to write into the register.
void Gfx9CmdWriter::BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // The header size covers the SET_CONFIG_REG packet plus one value DWord
  WriteRegPacket reg_write;
  reg_write.item[2] = value;
  reg_write.item[1] = (addr - UCONFIG_SPACE_START);
  reg_write.item[0] =
      PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
// Appends a SET_UCONFIG_REG packet writing one register.
//
// @param cmdbuf Command buffer the packet is appended to.
// @param addr   Register address; encoded relative to UCONFIG_SPACE_START.
// @param value  Value to write into the register.
void Gfx9CmdWriter::BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // The header size covers the SET_CONFIG_REG packet plus one value DWord
  WriteRegPacket reg_write;
  reg_write.item[2] = value;
  reg_write.item[1] = (addr - UCONFIG_SPACE_START);
  reg_write.item[0] =
      PM4_TYPE3_HDR(IT_SET_UCONFIG_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
// Appends a SET_SH_REG packet writing one register.
//
// @param cmdbuf Command buffer the packet is appended to.
// @param addr   Register address; encoded relative to PERSISTENT_SPACE_START.
// @param value  Value to write into the register.
void Gfx9CmdWriter::BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // The header size covers the SET_CONFIG_REG packet plus one value DWord
  WriteRegPacket reg_write;
  reg_write.item[2] = value;
  reg_write.item[1] = (addr - PERSISTENT_SPACE_START);
  reg_write.item[0] =
      PM4_TYPE3_HDR(IT_SET_SH_REG, (1 + sizeof(PM4MEC_SET_CONFIG_REG) / sizeof(uint32_t)));

  APPEND_COMMAND_WRAPPER(cmdbuf, reg_write);
}
|
||||
|
||||
// Appends a COPY_DATA packet that copies a register / perf-counter value
// into memory.
//
// @param cmdbuf      Command buffer the packet is appended to.
// @param src_sel     Source selector: 0 = memory mapped register,
//                    4 = perfcounters; any other value trips an assert.
// @param src_addr_lo Source register offset.
// @param src_addr_hi Not used by this implementation.
// @param dst_addr    Destination memory address.
// @param size        0 selects a 32-bit copy, any other value a 64-bit copy.
// @param wait        Stored in the packet's wr_confirm field.
void Gfx9CmdWriter::BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
                                        uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size,
                                        bool wait) {
  // Zero-filled packet with a COPY_DATA header
  PM4MEC_COPY_DATA packet;
  memset(&packet, 0, sizeof(PM4MEC_COPY_DATA));
  packet.ordinal1 = PM4_TYPE3_HDR(IT_COPY_DATA, (sizeof(PM4MEC_COPY_DATA) / sizeof(uint32_t)));

  // Translate the caller's source selector
  MEC_COPY_DATA_src_sel_enum source = src_sel__mec_copy_data__memory;
  if (src_sel == 0) {
    source = src_sel__mec_copy_data__mem_mapped_register;
  } else if (src_sel == 4) {
    source = src_sel__mec_copy_data__perfcounters;
  } else {
    assert(false && "CopyData Illegal value for source of data");
  }

  // Source attributes, including the source register offset
  packet.bitfields2.src_sel = source;
  packet.bitfields2.src_cache_policy = src_cache_policy__mec_copy_data__stream;
  packet.bitfields3a.src_reg_offset = src_addr_lo;

  // Destination is always memory
  packet.bitfields2.dst_sel = dst_sel__mec_copy_data__memory;
  packet.bitfields2.dst_cache_policy = dst_cache_policy__mec_copy_data__stream;
  packet.bitfields2.wr_confirm = (MEC_COPY_DATA_wr_confirm_enum)wait;

  // Width of the copy and the matching encoding of the destination address:
  // the low part drops 2 bits for a 32-bit copy and 3 bits for a 64-bit copy
  packet.dst_addr_hi = PtrHigh32(dst_addr);
  if (size == 0) {
    packet.bitfields2.count_sel = count_sel__mec_copy_data__32_bits_of_data;
    packet.bitfields5b.dst_32b_addr_lo = (PtrLow32(dst_addr) >> 2);
  } else {
    packet.bitfields2.count_sel = count_sel__mec_copy_data__64_bits_of_data;
    packet.bitfields5c.dst_64b_addr_lo = (PtrLow32(dst_addr) >> 3);
  }

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends an ACQUIRE_MEM packet with the TC, TCL1, TC_WB, SH_ICACHE and
// SH_KCACHE action bits all enabled.
void Gfx9CmdWriter::BuildCacheFlushPacket(CmdBuf* cmdbuf) {
  // Start from the pre-built ACQUIRE_MEM packet (header already generated)
  PM4MEC_ACQUIRE_MEM packet = invalidate_cache_template_.acquire_mem;

  // Program Coherence Control Register. Initialize L2 Cache flush
  // for Non-Coherent memory blocks
  const uint32_t coher_cntl = 0 | CP_COHER_CNTL__TC_ACTION_ENA_MASK |
      CP_COHER_CNTL__TCL1_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK |
      CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK | CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK;
  packet.bitfields2.coher_cntl = coher_cntl;

  // Copy AcquireMem command buffer stream
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
// Appends two packets in sequence: the barrier packet followed by the
// cache flush packet.
void Gfx9CmdWriter::BuildWriteWaitIdlePacket(CmdBuf* cmdbuf) {
  // Order matters: the barrier goes in first, the cache flush after it
  BuildBarrierCommand(cmdbuf);
  BuildCacheFlushPacket(cmdbuf);
}
|
||||
|
||||
// Will issue a VGT event including a cache flush later on
|
||||
// Appends an EVENT_WRITE packet for a perf-counter or thread-trace event,
// followed by a cache flush packet.
//
// @param cmdbuf   Command buffer the packets are appended to.
// @param vgtEvent One of the kPerfCntrs* / kThrdTrace* ids handled below;
//                 any other value trips an assert.
void Gfx9CmdWriter::BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent) {
  // Zero-filled packet with an EVENT_WRITE header
  PM4MEC_EVENT_WRITE packet;
  memset(&packet, 0, sizeof(PM4MEC_EVENT_WRITE));
  packet.ordinal1 = PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE) / sizeof(uint32_t)));
  packet.ordinal2 = 0;

  // Translate the profiler event id into the hardware event type
  VGT_EVENT_TYPE hw_event = Reserved_0x00;
  switch (vgtEvent) {
    case kPerfCntrsStart:
      hw_event = PERFCOUNTER_START;
      break;
    case kPerfCntrsStop:
      hw_event = PERFCOUNTER_STOP;
      break;
    case kPerfCntrsSample:
      hw_event = PERFCOUNTER_SAMPLE;
      break;
    case kThrdTraceStart:
      hw_event = THREAD_TRACE_START;
      break;
    case kThrdTraceStop:
      hw_event = THREAD_TRACE_STOP;
      break;
    case kThrdTraceFlush:
      hw_event = THREAD_TRACE_FLUSH;
      break;
    case kThrdTraceFinish:
      hw_event = THREAD_TRACE_FINISH;
      break;
    default:
      assert(false && "Illegal VGT Event Id");
  }

  const MEC_EVENT_WRITE_event_index_enum event_index = event_index__mec_event_write__other;
  packet.bitfields2.event_index = event_index;
  packet.bitfields2.event_type = hw_event;

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, packet);

  // Check If I should be issuing a cache flush operation as well
  // test and remove it
  BuildCacheFlushPacket(cmdbuf);
}
|
||||
|
||||
// Currently a no-op: the SET_CONFIG_REG implementation below is disabled,
// so nothing is appended and the parameters are unused.
void Gfx9CmdWriter::BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value) {
  // Disabled implementation, kept for reference:
  //
  //   WriteRegPacket packet;
  //   packet.item[0] = (PM4_TYPE3_HDR(
  //       IT_SET_CONFIG_REG, 1 + PM4_CMD_SET_CONFIG_REG_DWORDS, ShaderGraphics, 0));
  //   packet.item[1] = addr - CONFIG_SPACE_START;
  //   packet.item[2] = value;
  //
  //   APPEND_COMMAND_WRAPPER(cmdbuf, packet);
  //
  //   return;
}
|
||||
|
||||
// Appends an EVENT_WRITE (query form) packet carrying a result address.
//
// No event id is supported yet: the switch below has only a default case,
// so every call trips the "Illegal VGT Event Id" assert in builds with
// assertions enabled; otherwise the packet is built with Reserved_0x00.
//
// @param cmdbuf Command buffer the packet is appended to.
// @param event  Event id (currently none is accepted).
// @param addr   Address stored in the packet; must be 8 byte aligned.
void Gfx9CmdWriter::BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr) {
  PM4MEC_EVENT_WRITE_QUERY cp_event_initiator;
  memset(&cp_event_initiator, 0, sizeof(PM4MEC_EVENT_WRITE_QUERY));
  cp_event_initiator.ordinal1 =
      PM4_TYPE3_HDR(IT_EVENT_WRITE, (sizeof(PM4MEC_EVENT_WRITE_QUERY) / sizeof(uint32_t)));
  cp_event_initiator.ordinal2 = 0;

  // Update switch statements you want to support
  VGT_EVENT_TYPE eventType = Reserved_0x00;
  switch (event) {
    default:
      assert(false && "Illegal VGT Event Id");
  }

  // The event index is looked up from the event type
  MEC_EVENT_WRITE_event_index_enum index;
  cp_event_initiator.bitfields2.event_type = eventType;
  index = (MEC_EVENT_WRITE_event_index_enum)EventTypeToIndexTable[eventType];
  cp_event_initiator.bitfields2.event_index = index;

  // Set the address. A single assert with a message replaces the former
  // "cond ? assert(false) : assert(true)" construct.
  uint32_t addrLo = PtrLow32(addr);
  uint32_t addrHi = PtrHigh32(addr);
  assert(!(addrLo & 0x7) && "EventWriteQuery address must be 8 byte aligned");

  // The low part is stored without its three least significant bits
  cp_event_initiator.address_hi = addrHi;
  cp_event_initiator.bitfields3.address_lo = (addrLo >> 3);

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, cp_event_initiator);
}
|
||||
|
||||
size_t Gfx9CmdWriter::SizeOfAtomicPacket() const {
|
||||
return sizeof(AtomicTemplate) / sizeof(uint32_t);
|
||||
}
|
||||
|
||||
// Appends an ACQUIRE_MEM packet that flushes the caches selected in options.
//
// @param cmdbuf    Command buffer the packet is appended to.
// @param options   Selects which caches to act on (l1, l2, icache, kcache);
//                  must not be null.
// @param writeAddr Not used by this implementation; kept so the interface
//                  matches the other chip generations. May be NULL.
// @param writeVal  Not used by this implementation.
void Gfx9CmdWriter::BuildFlushCacheCmd(CmdBuf* cmdbuf, FlushCacheOptions* options,
                                       uint32_t* writeAddr, uint32_t writeVal) {
  // options is dereferenced below, so make the precondition explicit
  assert(options != NULL && "FlushCacheOptions must not be null");

  // The write-back address and value are accepted only for interface
  // compatibility. Marking them unused replaces the former no-op
  // "(writeAddr == NULL) ? assert(true) : assert(true)" statement.
  (void)writeAddr;
  (void)writeVal;

  PM4MEC_ACQUIRE_MEM cache_flush = invalidate_cache_template_.acquire_mem;

  // Program Coherence Control Register. Initialize L2 Cache flush
  // for Non-Coherent memory blocks
  uint32_t coher_cntl = 0;
  coher_cntl |= (options->l1) ? CP_COHER_CNTL__TCL1_ACTION_ENA_MASK : 0;
  coher_cntl |= (options->l2)
      ? (CP_COHER_CNTL__TC_ACTION_ENA_MASK | CP_COHER_CNTL__TC_WB_ACTION_ENA_MASK)
      : 0;
  coher_cntl |= (options->icache) ? CP_COHER_CNTL__SH_ICACHE_ACTION_ENA_MASK : 0;
  coher_cntl |= (options->kcache) ? CP_COHER_CNTL__SH_KCACHE_ACTION_ENA_MASK : 0;
  cache_flush.bitfields2.coher_cntl = coher_cntl;

  // Append the built command into output Command Buffer
  APPEND_COMMAND_WRAPPER(cmdbuf, cache_flush);
}
|
||||
|
||||
// Appends a DMA_DATA packet that copies a block of memory.
//
// @param cmdbuf         Command buffer the packet is appended to.
// @param srcAddr        Source buffer address.
// @param dstAddr        Destination buffer address.
// @param copySize       Number of bytes to copy; asserted to be below 0x1FFFFF.
// @param waitForConfirm Stored in the packet's raw_wait field.
void Gfx9CmdWriter::BuildDmaDataPacket(CmdBuf* cmdbuf, uint32_t* srcAddr, uint32_t* dstAddr,
                                       uint32_t copySize, bool waitForConfirm) {
  // Zero-filled packet with a DMA_DATA header
  PM4MEC_DMA_DATA packet;
  memset(&packet, 0, sizeof(PM4MEC_DMA_DATA));
  packet.header.u32All = PM4_TYPE3_HDR(IT_DMA_DATA, (sizeof(PM4MEC_DMA_DATA) / sizeof(uint32_t)));

  // Source buffer: location selector, cache policy and address
  packet.bitfields2.src_sel = src_sel__mec_dma_data__src_addr_using_sas;
  packet.bitfields2.src_cache_policy = src_cache_policy__mec_dma_data__stream;
  packet.src_addr_lo_or_data = PtrLow32(srcAddr);
  packet.src_addr_hi = PtrHigh32(srcAddr);

  // Destination buffer: location selector, cache policy and address
  packet.bitfields2.dst_sel = dst_sel__mec_dma_data__dst_addr_using_das;
  packet.bitfields2.dst_cache_policy = dst_cache_policy__mec_dma_data__stream;
  packet.dst_addr_lo = PtrLow32(dstAddr);
  packet.dst_addr_hi = PtrHigh32(dstAddr);

  // Number of bytes to copy. The command restricts
  // the size to be (64 MB - 1) - 26 Bits
  // NOTE(review): the bound checked below, 0x1FFFFF, is a 21 bit value
  // (2 MB - 1) and does not match the 26 bits stated above - confirm which
  // limit is intended.
  assert(copySize < 0x1FFFFF);
  packet.bitfields7.byte_count = copySize;

  // Indicate that DMA Cmd should wait if its source
  // is the destination of a previous DMA Cmd
  packet.bitfields7.raw_wait = waitForConfirm;

  APPEND_COMMAND_WRAPPER(cmdbuf, packet);
}
|
||||
|
||||
|
||||
} // gfx9 namespace
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,199 @@
|
||||
#ifndef _GFX9_CMDWRITER_H_
|
||||
#define _GFX9_CMDWRITER_H_
|
||||
|
||||
#include "cmdwriter.h"
|
||||
#include "gfx9_cmds.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
namespace gfx9 {
|
||||
|
||||
|
||||
/// @brief class Gfx9CmdWriter implements the virtual class CommandWriter
|
||||
/// for GFX9 chipsets
|
||||
class Gfx9CmdWriter : public CommandWriter {
|
||||
public:
|
||||
Gfx9CmdWriter(bool atc_support, bool pcie_atomic_support);
|
||||
|
||||
/// @brief Dword specifying NOOP command for GFX9 chipsets. The macro
|
||||
/// populates the NOOP command which is 32-bits wide. The second parameter,
|
||||
/// the COUNT field of NOOP command, specifies the number of Dwords to skip.
|
||||
/// To skip ZERO Dwords the value should be set to 0x3FFF. Since the macro
|
||||
/// decrements the second parameter by TWO, an artifact of its definition,
|
||||
/// the value is incremented by TWO to 0x4001 (0x3FFF + 2).
|
||||
///
|
||||
inline uint32_t GetNoOpCmd() {
|
||||
static const uint32_t nopCmd = PM4_TYPE3_HDR(IT_NOP, 0x4001);
|
||||
return nopCmd;
|
||||
}
|
||||
|
||||
void BuildBarrierCommand(CmdBuf* cmdBuf);
|
||||
|
||||
void BuildIndirectBufferCmd(CmdBuf* cmdbuf, const void* cmd_addr, std::size_t cmd_size);
|
||||
|
||||
void BuildBOPNotifyCmd(CmdBuf* cmdbuf, const void* write_addr, uint32_t write_val,
|
||||
bool interrupt);
|
||||
|
||||
void BuildBarrierFenceCommands(CmdBuf* cmdbuf);
|
||||
|
||||
void BuildWriteEventPacket(CmdBuf* cmdbuf, uint32_t event);
|
||||
|
||||
void BuildWaitRegMemCommand(CmdBuf* cmdbuf, bool mem_space, uint64_t wait_addr, bool func_eq,
|
||||
uint32_t mask_val, uint32_t wait_val);
|
||||
|
||||
void BuildWriteUnshadowRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
/// @brief Build CP command to program a Gpu register
|
||||
///
|
||||
/// @param cmdbuf Pointer to command buffer to be appended
|
||||
/// @param addr Register to be programmed
|
||||
/// @param value Value to write into register
|
||||
///
|
||||
/// @return void
|
||||
void BuildWriteUConfigRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
void BuildWriteShRegPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
void BuildCopyDataPacket(CmdBuf* cmdbuf, uint32_t src_sel, uint32_t src_addr_lo,
|
||||
uint32_t src_addr_hi, uint32_t* dst_addr, uint32_t size, bool wait);
|
||||
|
||||
void BuildWriteWaitIdlePacket(CmdBuf* cmdbuf);
|
||||
|
||||
// Will issue a VGT event including a cache flush later on
|
||||
void BuildVgtEventPacket(CmdBuf* cmdbuf, uint32_t vgtEvent);
|
||||
|
||||
void BuildWriteRegisterPacket(CmdBuf* cmdbuf, uint32_t addr, uint32_t value);
|
||||
|
||||
void BuildWriteEventQueryPacket(CmdBuf* cmdbuf, uint32_t event, uint32_t* addr);
|
||||
|
||||
void BuildAtomicPacket(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint32_t* addr,
|
||||
uint32_t value, uint32_t compare);
|
||||
|
||||
void BuildAtomicPacket64(CmdBuf* cmdbuf, AtomicType atomic_op, volatile uint64_t* addr,
|
||||
uint64_t value = 0, uint64_t compare = 0);
|
||||
|
||||
size_t SizeOfAtomicPacket() const;
|
||||
|
||||
void BuildConditionalExecute(CmdBuf* cmdbuf, uint32_t* signal, uint16_t count);
|
||||
|
||||
void BuildWriteDataCommand(CmdBuf* cmdbuf, uint32_t* write_addr, uint32_t write_value);
|
||||
|
||||
void BuildWriteData64Command(CmdBuf* cmdbuf, uint64_t* write_addr, uint64_t write_value);
|
||||
|
||||
void BuildCacheFlushPacket(CmdBuf* cmdbuf);
|
||||
|
||||
/// Writes into input buffer Gpu commands to flush its cache. It is
|
||||
/// necessary that the buffer provided for flush commands is large
|
||||
/// enough to accommodate the full set of commands. It should be at
|
||||
/// least 512 bytes.
|
||||
///
|
||||
/// @param cmdBuf Buffer to write commands to.
|
||||
/// @param writeAddr Registered address into which GPU should write
|
||||
/// a user provided value upon executing the flush commands.
|
||||
/// @param writeVal User provided value written by GPU at user provided
|
||||
/// address, upon executing the flush commands.
|
||||
///
|
||||
/// @return void
|
||||
void BuildFlushCacheCmd(CmdBuf* cmdBuf, FlushCacheOptions* options, uint32_t* writeAddr,
|
||||
uint32_t writeVal);
|
||||
|
||||
/// Builds Gpu command to copy data from source to destination buffer
|
||||
/// using DMA engine.
|
||||
///
|
||||
/// @param cmdbuf Buffer updated with Gpu copy command
|
||||
/// @param srcAddr Address of the source buffer
|
||||
/// @param dstAddr Address of the destination buffer
|
||||
/// @param copySize Size of data to copy in bytes
|
||||
/// @param waitForCompletion if command should wait for copying to complete
|
||||
void BuildDmaDataPacket(CmdBuf* cmdBuf, uint32_t* srcAddr, uint32_t* dstAddr, uint32_t copySize,
|
||||
bool waitForCompletion);
|
||||
|
||||
protected:
|
||||
/// @brief Append an instance of Gpu command into input command buffer stream.
|
||||
///
|
||||
/// @param cmdbuf Command buffer to be appended with another Gpu command
|
||||
///
|
||||
/// @param cmd Gpu command to be appended into command buffer
|
||||
///
|
||||
/// @return void
|
||||
template <class T> void AppendCommand(CmdBuf* cmdbuf, const T& cmd);
|
||||
|
||||
private:
|
||||
/// @brief Initializes a Gpu command which can be used to
|
||||
/// reference a Gpu command stream indirectly
|
||||
void InitializeLaunchTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command which can be used to
|
||||
/// flush Gpu caches and write to a user configurable address
|
||||
/// to indicate an end of kernel
|
||||
void InitializeEndOfKernelNotifyTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command to perform atomic operations
|
||||
///
|
||||
void InitializeAtomicTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command to allow conditional execution
|
||||
/// of a Gpu command stream
|
||||
void InitializeConditionalTemplate();
|
||||
|
||||
/// @brief Initializes a Gpu command to let command processor
|
||||
/// wait for some update before letting other commands to be
|
||||
/// processed
|
||||
void InitializeWaitRegMemTemplate();
|
||||
|
||||
/// @brief Initializes the template for Barrier command.
|
||||
/// Applications can use Barrier command to ensure their
|
||||
/// command is executed only after all other commands have
|
||||
/// completed their execution.
|
||||
void InitializeBarrierTemplate();
|
||||
|
||||
void BuildUpdateHostAddress(CmdBuf* cmdbuf, uint64_t* addr, int64_t value);
|
||||
|
||||
/// @brief Initializes Acquire Memory command template. Users
|
||||
/// can submit this command to invalidate Gpu caches - L1,
|
||||
/// L2, or both.
|
||||
void InitializeAcquireMemTemplate();
|
||||
|
||||
/// @brief Initializes an instance of Write Data command
|
||||
/// for use by an application
|
||||
void InitializeWriteDataTemplate();
|
||||
void InitializeWriteData64Template();
|
||||
void InitializeWriteDataTemplate(PM4MEC_WRITE_DATA* write_data, bool bit32);
|
||||
|
||||
/// @brief Builds wait_reg_mem with EQUALS condition
|
||||
void BuildWaitRegMemCommand(CmdBuf* cmdbuf, uint64_t wait_addr, uint32_t wait_value);
|
||||
|
||||
/// @brief Instance of Gpu command to reference dispatch commands
|
||||
LaunchTemplate launch_template_;
|
||||
|
||||
/// @brief Instance of Gpu command to use in determining end of kernel
|
||||
EndofKernelNotifyTemplate notify_template_;
|
||||
|
||||
/// @brief Instance of Gpu command to use in performing atomic operations
|
||||
AtomicTemplate atomic_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command WRITE_DATA
|
||||
WriteDataTemplate write_data_template_;
|
||||
WriteData64Template write_data64_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command EVENT_WRITE
|
||||
BarrierTemplate pending_dispatch_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command ACQUIRE_MEM
|
||||
AcquireMemTemplate invalidate_cache_template_;
|
||||
|
||||
/// @brief Instance of Pm4 command WAIT_REG_MEM
|
||||
WaitRegMemTemplate wait_reg_mem_template_;
|
||||
|
||||
/// @brief ATC support.
|
||||
bool atc_support_;
|
||||
|
||||
/// @brief PCIe atomic support.
|
||||
bool pcie_atomic_support_;
|
||||
};
|
||||
|
||||
} // gfx9
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GFX9_CMDWRITER_H_
|
||||
@@ -0,0 +1,24 @@
|
||||
#
# Source files for the Rocr PerfCntr library.
# Listed explicitly, one per line, so that adding or removing a file is
# visible in review.
#
set ( LIB_SRC
  var_data.cpp
  info_set.cpp
  parameter_set.cpp
  gpu_counter.cpp
  gpu_countergroup.cpp
  vi_blockinfo.cpp
  vi_pmu.cpp
  ai_blockinfo.cpp
  ai_pmu.cpp
)

#
# Build PerfCntr as a Static Library object.
# PMC_LIB (the target name) is expected to be set by the enclosing build
# script before this file is processed.
#
add_library ( ${PMC_LIB} STATIC ${LIB_SRC} )

#
# Header files include path(s), scoped to this target only so they do not
# leak into unrelated targets.
#   - ROCR_INC_DIR is read from the environment at configure time.
#   - ROCR_INC_DIR and CORE_UTIL_DIR are intentionally left unquoted: when
#     either is empty it then contributes no path instead of an empty one.
#
target_include_directories ( ${PMC_LIB} PRIVATE
  $ENV{ROCR_INC_DIR}
  "${PROJ_DIR}/commandwriter"
  ${CORE_UTIL_DIR}
)
|
||||
@@ -0,0 +1,555 @@
|
||||
#include "ai_blockinfo.h"
|
||||
#include "gfxip/gfx9/gfx9_offset.h"
|
||||
#include "gfxip/gfx9/gfx9_typedef.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
/**
|
||||
* Table containing CounterGroups which represent AI hardware blocks
|
||||
* as defined by \ref GpuBlockInfo structure
|
||||
*/
|
||||
GpuBlockInfo AiPmuHwBlocks[] = {
|
||||
// Counter block CB
|
||||
{"AI_CB0", kHsaAiCounterBlockIdCb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_CB1", kHsaAiCounterBlockIdCb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_CB2", kHsaAiCounterBlockIdCb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_CB3", kHsaAiCounterBlockIdCb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
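// Temp commented for Vega10: the real CPF entry is disabled below and a
// CB3-configured placeholder (name "AI_CB3") is registered under the CPF
// block id instead. NOTE(review): presumably this keeps the table layout
// unchanged while CPF is disabled - confirm, since the name "AI_CB3" now
// appears twice in this table.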
|
||||
// Counter block CPF
|
||||
/*
|
||||
{"AI_CPF", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
|
||||
AI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0},
|
||||
*/
|
||||
{"AI_CB3", kHsaAiCounterBlockIdCpf, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, AI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block DB
|
||||
{"AI_DB0", kHsaAiCounterBlockIdDb0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_DB1", kHsaAiCounterBlockIdDb1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_DB2", kHsaAiCounterBlockIdDb2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_DB3", kHsaAiCounterBlockIdDb3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, AI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block GRBM
|
||||
{"AI_GRBM", kHsaAiCounterBlockIdGrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33,
|
||||
AI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block GRBMSE
|
||||
{"AI_GRBMSE", kHsaAiCounterBlockIdGrbmSe, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14,
|
||||
AI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block PA_SU
|
||||
{"AI_PA_SU", kHsaAiCounterBlockIdPaSu, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152,
|
||||
AI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block PA_SC
|
||||
{"AI_PA_SC", kHsaAiCounterBlockIdPaSc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396,
|
||||
AI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SPI
|
||||
{"AI_SPI", kHsaAiCounterBlockIdSpi, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196,
|
||||
AI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SQ
|
||||
{"AI_SQ", kHsaAiCounterBlockIdSq, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_SQ_GS", kHsaAiCounterBlockIdSqGs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_SQ_VS", kHsaAiCounterBlockIdSqVs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_SQ_PS", kHsaAiCounterBlockIdSqPs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_SQ_HS", kHsaAiCounterBlockIdSqHs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_SQ_CS", kHsaAiCounterBlockIdSqCs, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
AI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SX
|
||||
{"AI_SX", kHsaAiCounterBlockIdSx, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33,
|
||||
AI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TA
|
||||
{"AI_TA0", kHsaAiCounterBlockIdTa0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA1", kHsaAiCounterBlockIdTa1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA2", kHsaAiCounterBlockIdTa2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA3", kHsaAiCounterBlockIdTa3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA4", kHsaAiCounterBlockIdTa4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA5", kHsaAiCounterBlockIdTa5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA6", kHsaAiCounterBlockIdTa6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA7", kHsaAiCounterBlockIdTa7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA8", kHsaAiCounterBlockIdTa8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA9", kHsaAiCounterBlockIdTa9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA10", kHsaAiCounterBlockIdTa10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA11", kHsaAiCounterBlockIdTa11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA12", kHsaAiCounterBlockIdTa12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA13", kHsaAiCounterBlockIdTa13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA14", kHsaAiCounterBlockIdTa14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TA15", kHsaAiCounterBlockIdTa15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, AI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TCA
|
||||
{"AI_TCA0", kHsaAiCounterBlockIdTca0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA,
|
||||
CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCA1", kHsaAiCounterBlockIdTca1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCA,
|
||||
CntlMethodByInstance, 34, AI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TCC
|
||||
{"AI_TCC0", kHsaAiCounterBlockIdTcc0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC1", kHsaAiCounterBlockIdTcc1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC2", kHsaAiCounterBlockIdTcc2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC3", kHsaAiCounterBlockIdTcc3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC4", kHsaAiCounterBlockIdTcc4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC5", kHsaAiCounterBlockIdTcc5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC6", kHsaAiCounterBlockIdTcc6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC7", kHsaAiCounterBlockIdTcc7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC8", kHsaAiCounterBlockIdTcc8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC9", kHsaAiCounterBlockIdTcc9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC10", kHsaAiCounterBlockIdTcc10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC11", kHsaAiCounterBlockIdTcc11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC12", kHsaAiCounterBlockIdTcc12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC13", kHsaAiCounterBlockIdTcc13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC14", kHsaAiCounterBlockIdTcc14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCC15", kHsaAiCounterBlockIdTcc15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, AI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TD
|
||||
{"AI_TD0", kHsaAiCounterBlockIdTd0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD1", kHsaAiCounterBlockIdTd1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD2", kHsaAiCounterBlockIdTd2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD3", kHsaAiCounterBlockIdTd3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD4", kHsaAiCounterBlockIdTd4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD5", kHsaAiCounterBlockIdTd5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD6", kHsaAiCounterBlockIdTd6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD7", kHsaAiCounterBlockIdTd7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD8", kHsaAiCounterBlockIdTd8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD9", kHsaAiCounterBlockIdTd9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD10", kHsaAiCounterBlockIdTd10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD11", kHsaAiCounterBlockIdTd11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD12", kHsaAiCounterBlockIdTd12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD13", kHsaAiCounterBlockIdTd13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD14", kHsaAiCounterBlockIdTd14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TD15", kHsaAiCounterBlockIdTd15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, AI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TCP
|
||||
{"AI_TCP0", kHsaAiCounterBlockIdTcp0, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP1", kHsaAiCounterBlockIdTcp1, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP2", kHsaAiCounterBlockIdTcp2, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP3", kHsaAiCounterBlockIdTcp3, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP4", kHsaAiCounterBlockIdTcp4, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP5", kHsaAiCounterBlockIdTcp5, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP6", kHsaAiCounterBlockIdTcp6, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP7", kHsaAiCounterBlockIdTcp7, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP8", kHsaAiCounterBlockIdTcp8, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP9", kHsaAiCounterBlockIdTcp9, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP10", kHsaAiCounterBlockIdTcp10, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP11", kHsaAiCounterBlockIdTcp11, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP12", kHsaAiCounterBlockIdTcp12, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP13", kHsaAiCounterBlockIdTcp13, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP14", kHsaAiCounterBlockIdTcp14, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"AI_TCP15", kHsaAiCounterBlockIdTcp15, AI_MAX_NUM_SHADER_ENGINES, 2, AI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, AI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block GDS
|
||||
{"AI_GDS", kHsaAiCounterBlockIdGds, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120,
|
||||
AI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block VGT
|
||||
{"AI_VGT", kHsaAiCounterBlockIdVgt, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
|
||||
AI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block IA
|
||||
{"AI_IA", kHsaAiCounterBlockIdIa, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
|
||||
AI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block MC
|
||||
{"AI_MC", kHsaAiCounterBlockIdMc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
|
||||
AI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Temp commented out for Vega10
|
||||
// Counter block SRBM
|
||||
/*
|
||||
{"AI_SRBM", kHsaAiCounterBlockIdSrbm, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
|
||||
AI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
|
||||
*/
|
||||
|
||||
// Counter block WD
|
||||
{"AI_WD", kHsaAiCounterBlockIdWd, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
|
||||
AI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block CPG
|
||||
// Temp commented for Vega10
|
||||
/*
|
||||
{"AI_CPG", kHsaAiCounterBlockIdCpg, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
|
||||
AI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
|
||||
*/
|
||||
|
||||
// Counter block CPC
|
||||
// Temp commented for Vega10
|
||||
/*
|
||||
{"AI_CPC", kHsaAiCounterBlockIdCpc, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
|
||||
AI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
*/
|
||||
|
||||
// Counter block IOMMUV2
|
||||
{"AI_IOMMUV2", kHsaAiCounterBlockIdIommuV2, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
|
||||
8, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block KernelDriver
|
||||
{"AI_KD", kHsaAiCounterBlockIdKernelDriver, AI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
|
||||
0, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Name of the last line should be empty to indicate end of all counter groups
|
||||
{"", kHsaAiCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
|
||||
0}};
|
||||
|
||||
/*
|
||||
* The following tables contain register addresses of the SQ counter registers
|
||||
*/
|
||||
|
||||
/*
|
||||
* SQ
|
||||
*/
|
||||
GpuCounterRegInfo AiSqCounterRegAddr[] = {
|
||||
{mmSQ_PERFCOUNTER0_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER0_LO, mmSQ_PERFCOUNTER0_HI},
|
||||
{mmSQ_PERFCOUNTER1_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER1_LO, mmSQ_PERFCOUNTER1_HI},
|
||||
{mmSQ_PERFCOUNTER2_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER2_LO, mmSQ_PERFCOUNTER2_HI},
|
||||
{mmSQ_PERFCOUNTER3_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER3_LO, mmSQ_PERFCOUNTER3_HI},
|
||||
{mmSQ_PERFCOUNTER4_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER4_LO, mmSQ_PERFCOUNTER4_HI},
|
||||
{mmSQ_PERFCOUNTER5_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER5_LO, mmSQ_PERFCOUNTER5_HI},
|
||||
{mmSQ_PERFCOUNTER6_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER6_LO, mmSQ_PERFCOUNTER6_HI},
|
||||
{mmSQ_PERFCOUNTER7_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER7_LO, mmSQ_PERFCOUNTER7_HI},
|
||||
{mmSQ_PERFCOUNTER8_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER8_LO, mmSQ_PERFCOUNTER8_HI},
|
||||
{mmSQ_PERFCOUNTER9_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER9_LO, mmSQ_PERFCOUNTER9_HI},
|
||||
{mmSQ_PERFCOUNTER10_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER10_LO,
|
||||
mmSQ_PERFCOUNTER10_HI},
|
||||
{mmSQ_PERFCOUNTER11_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER11_LO,
|
||||
mmSQ_PERFCOUNTER11_HI},
|
||||
{mmSQ_PERFCOUNTER12_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER12_LO,
|
||||
mmSQ_PERFCOUNTER12_HI},
|
||||
{mmSQ_PERFCOUNTER13_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER13_LO,
|
||||
mmSQ_PERFCOUNTER13_HI},
|
||||
{mmSQ_PERFCOUNTER14_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER14_LO,
|
||||
mmSQ_PERFCOUNTER14_HI},
|
||||
{mmSQ_PERFCOUNTER15_SELECT, mmSQ_PERFCOUNTER_CTRL, mmSQ_PERFCOUNTER15_LO,
|
||||
mmSQ_PERFCOUNTER15_HI}};
|
||||
|
||||
/*
|
||||
* DRMDMA
|
||||
*/
|
||||
GpuCounterRegInfo AiDrmdmaCounterRegAddr[] = {
|
||||
{mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER0_RESULT, 0},
|
||||
{mmSDMA0_PERFMON_CNTL, 0, mmSDMA0_PERFCOUNTER1_RESULT, 0},
|
||||
{mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER0_RESULT, 0},
|
||||
{mmSDMA1_PERFMON_CNTL, 0, mmSDMA1_PERFCOUNTER1_RESULT, 0},
|
||||
};
|
||||
|
||||
/*
|
||||
* IH
|
||||
*/
|
||||
GpuCounterRegInfo AiIhCounterRegAddr[] = {{mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER0_RESULT, 0},
|
||||
{mmIH_PERFMON_CNTL, 0, mmIH_PERFCOUNTER1_RESULT, 0}};
|
||||
|
||||
/*
|
||||
* CPF
|
||||
*/
|
||||
GpuCounterRegInfo AiCpfCounterRegAddr[] = {
|
||||
{mmCPF_PERFCOUNTER0_SELECT, 0, mmCPF_PERFCOUNTER0_LO, mmCPF_PERFCOUNTER0_HI},
|
||||
{mmCPF_PERFCOUNTER1_SELECT, 0, mmCPF_PERFCOUNTER1_LO, mmCPF_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
|
||||
* DRM
|
||||
*/
|
||||
// Register table for the DRM counter block. Every entry is currently
// commented out, so the initializer list is empty.
// NOTE(review): an array of unknown bound with an empty initializer list is
// not valid ISO C++; it only builds where the compiler accepts zero-length
// arrays as an extension. Confirm this is intentional before building with a
// stricter compiler, and check any sizeof-based element count that uses it.
GpuCounterRegInfo AiDrmCounterRegAddr[] = {
|
||||
/*
|
||||
{mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
|
||||
{mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}
|
||||
*/
|
||||
};
|
||||
|
||||
/*
|
||||
* GRBM
|
||||
*/
|
||||
GpuCounterRegInfo AiGrbmCounterRegAddr[] = {
|
||||
{mmGRBM_PERFCOUNTER0_SELECT, 0, mmGRBM_PERFCOUNTER0_LO, mmGRBM_PERFCOUNTER0_HI},
|
||||
{mmGRBM_PERFCOUNTER1_SELECT, 0, mmGRBM_PERFCOUNTER1_LO, mmGRBM_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
|
||||
* GRBM_SE
|
||||
*/
|
||||
GpuCounterRegInfo AiGrbmSeCounterRegAddr[] = {
|
||||
{mmGRBM_SE0_PERFCOUNTER_SELECT, 0, mmGRBM_SE0_PERFCOUNTER_LO, mmGRBM_SE0_PERFCOUNTER_HI},
|
||||
{mmGRBM_SE1_PERFCOUNTER_SELECT, 0, mmGRBM_SE1_PERFCOUNTER_LO, mmGRBM_SE1_PERFCOUNTER_HI},
|
||||
{mmGRBM_SE2_PERFCOUNTER_SELECT, 0, mmGRBM_SE2_PERFCOUNTER_LO, mmGRBM_SE2_PERFCOUNTER_HI},
|
||||
{mmGRBM_SE3_PERFCOUNTER_SELECT, 0, mmGRBM_SE3_PERFCOUNTER_LO, mmGRBM_SE3_PERFCOUNTER_HI}};
|
||||
|
||||
/*
|
||||
* PA_SU
|
||||
*/
|
||||
GpuCounterRegInfo AiPaSuCounterRegAddr[] = {
|
||||
{mmPA_SU_PERFCOUNTER0_SELECT, 0, mmPA_SU_PERFCOUNTER0_LO, mmPA_SU_PERFCOUNTER0_HI},
|
||||
{mmPA_SU_PERFCOUNTER1_SELECT, 0, mmPA_SU_PERFCOUNTER1_LO, mmPA_SU_PERFCOUNTER1_HI},
|
||||
{mmPA_SU_PERFCOUNTER2_SELECT, 0, mmPA_SU_PERFCOUNTER2_LO, mmPA_SU_PERFCOUNTER2_HI},
|
||||
{mmPA_SU_PERFCOUNTER3_SELECT, 0, mmPA_SU_PERFCOUNTER3_LO, mmPA_SU_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* PA_SC
|
||||
*/
|
||||
GpuCounterRegInfo AiPaScCounterRegAddr[] = {
|
||||
{mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
|
||||
{mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
|
||||
{mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
|
||||
{mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* SPI
|
||||
*/
|
||||
GpuCounterRegInfo AiSpiCounterRegAddr[] = {
|
||||
{mmSPI_PERFCOUNTER0_SELECT, 0, mmSPI_PERFCOUNTER0_LO, mmSPI_PERFCOUNTER0_HI},
|
||||
{mmSPI_PERFCOUNTER1_SELECT, 0, mmSPI_PERFCOUNTER1_LO, mmSPI_PERFCOUNTER1_HI},
|
||||
{mmSPI_PERFCOUNTER2_SELECT, 0, mmSPI_PERFCOUNTER2_LO, mmSPI_PERFCOUNTER2_HI},
|
||||
{mmSPI_PERFCOUNTER3_SELECT, 0, mmSPI_PERFCOUNTER3_LO, mmSPI_PERFCOUNTER3_HI},
|
||||
{mmSPI_PERFCOUNTER4_SELECT, 0, mmSPI_PERFCOUNTER4_LO, mmSPI_PERFCOUNTER4_HI},
|
||||
{mmSPI_PERFCOUNTER5_SELECT, 0, mmSPI_PERFCOUNTER5_LO, mmSPI_PERFCOUNTER5_HI}};
|
||||
|
||||
/*
|
||||
* TCA
|
||||
*/
|
||||
GpuCounterRegInfo AiTcaCounterRegAddr[] = {
|
||||
{mmTCA_PERFCOUNTER0_SELECT, 0, mmTCA_PERFCOUNTER0_LO, mmTCA_PERFCOUNTER0_HI},
|
||||
{mmTCA_PERFCOUNTER1_SELECT, 0, mmTCA_PERFCOUNTER1_LO, mmTCA_PERFCOUNTER1_HI},
|
||||
{mmTCA_PERFCOUNTER2_SELECT, 0, mmTCA_PERFCOUNTER2_LO, mmTCA_PERFCOUNTER2_HI},
|
||||
{mmTCA_PERFCOUNTER3_SELECT, 0, mmTCA_PERFCOUNTER3_LO, mmTCA_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* TCC
|
||||
*/
|
||||
GpuCounterRegInfo AiTccCounterRegAddr[] = {
|
||||
{mmTCC_PERFCOUNTER0_SELECT, 0, mmTCC_PERFCOUNTER0_LO, mmTCC_PERFCOUNTER0_HI},
|
||||
{mmTCC_PERFCOUNTER1_SELECT, 0, mmTCC_PERFCOUNTER1_LO, mmTCC_PERFCOUNTER1_HI},
|
||||
{mmTCC_PERFCOUNTER2_SELECT, 0, mmTCC_PERFCOUNTER2_LO, mmTCC_PERFCOUNTER2_HI},
|
||||
{mmTCC_PERFCOUNTER3_SELECT, 0, mmTCC_PERFCOUNTER3_LO, mmTCC_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* TCP
|
||||
*/
|
||||
GpuCounterRegInfo AiTcpCounterRegAddr[] = {
|
||||
{mmTCP_PERFCOUNTER0_SELECT, 0, mmTCP_PERFCOUNTER0_LO, mmTCP_PERFCOUNTER0_HI},
|
||||
{mmTCP_PERFCOUNTER1_SELECT, 0, mmTCP_PERFCOUNTER1_LO, mmTCP_PERFCOUNTER1_HI},
|
||||
{mmTCP_PERFCOUNTER2_SELECT, 0, mmTCP_PERFCOUNTER2_LO, mmTCP_PERFCOUNTER2_HI},
|
||||
{mmTCP_PERFCOUNTER3_SELECT, 0, mmTCP_PERFCOUNTER3_LO, mmTCP_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* CB
|
||||
*/
|
||||
GpuCounterRegInfo AiCbCounterRegAddr[] = {
|
||||
{mmCB_PERFCOUNTER0_SELECT, 0, mmCB_PERFCOUNTER0_LO, mmCB_PERFCOUNTER0_HI},
|
||||
{mmCB_PERFCOUNTER1_SELECT, 0, mmCB_PERFCOUNTER1_LO, mmCB_PERFCOUNTER1_HI},
|
||||
{mmCB_PERFCOUNTER2_SELECT, 0, mmCB_PERFCOUNTER2_LO, mmCB_PERFCOUNTER2_HI},
|
||||
{mmCB_PERFCOUNTER3_SELECT, 0, mmCB_PERFCOUNTER3_LO, mmCB_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* DB
|
||||
*/
|
||||
GpuCounterRegInfo AiDbCounterRegAddr[] = {
|
||||
{mmDB_PERFCOUNTER0_SELECT, 0, mmDB_PERFCOUNTER0_LO, mmDB_PERFCOUNTER0_HI},
|
||||
{mmDB_PERFCOUNTER1_SELECT, 0, mmDB_PERFCOUNTER1_LO, mmDB_PERFCOUNTER1_HI},
|
||||
{mmDB_PERFCOUNTER2_SELECT, 0, mmDB_PERFCOUNTER2_LO, mmDB_PERFCOUNTER2_HI},
|
||||
{mmDB_PERFCOUNTER3_SELECT, 0, mmDB_PERFCOUNTER3_LO, mmDB_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* RLC
|
||||
*/
|
||||
GpuCounterRegInfo AiRlcCounterRegAddr[] = {
|
||||
{mmRLC_PERFCOUNTER0_SELECT, 0, mmRLC_PERFCOUNTER0_LO, mmRLC_PERFCOUNTER0_HI},
|
||||
{mmRLC_PERFCOUNTER1_SELECT, 0, mmRLC_PERFCOUNTER1_LO, mmRLC_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
|
||||
* SC
|
||||
*/
|
||||
GpuCounterRegInfo AiScCounterRegAddr[] = {
|
||||
{mmPA_SC_PERFCOUNTER0_SELECT, 0, mmPA_SC_PERFCOUNTER0_LO, mmPA_SC_PERFCOUNTER0_HI},
|
||||
{mmPA_SC_PERFCOUNTER1_SELECT, 0, mmPA_SC_PERFCOUNTER1_LO, mmPA_SC_PERFCOUNTER1_HI},
|
||||
{mmPA_SC_PERFCOUNTER2_SELECT, 0, mmPA_SC_PERFCOUNTER2_LO, mmPA_SC_PERFCOUNTER2_HI},
|
||||
{mmPA_SC_PERFCOUNTER3_SELECT, 0, mmPA_SC_PERFCOUNTER3_LO, mmPA_SC_PERFCOUNTER3_HI},
|
||||
{mmPA_SC_PERFCOUNTER4_SELECT, 0, mmPA_SC_PERFCOUNTER4_LO, mmPA_SC_PERFCOUNTER4_HI},
|
||||
{mmPA_SC_PERFCOUNTER5_SELECT, 0, mmPA_SC_PERFCOUNTER5_LO, mmPA_SC_PERFCOUNTER5_HI},
|
||||
{mmPA_SC_PERFCOUNTER6_SELECT, 0, mmPA_SC_PERFCOUNTER6_LO, mmPA_SC_PERFCOUNTER6_HI},
|
||||
{mmPA_SC_PERFCOUNTER7_SELECT, 0, mmPA_SC_PERFCOUNTER7_LO, mmPA_SC_PERFCOUNTER7_HI}};
|
||||
|
||||
/*
|
||||
* SX
|
||||
*/
|
||||
GpuCounterRegInfo AiSxCounterRegAddr[] = {
|
||||
{mmSX_PERFCOUNTER0_SELECT, 0, mmSX_PERFCOUNTER0_LO, mmSX_PERFCOUNTER0_HI},
|
||||
{mmSX_PERFCOUNTER1_SELECT, 0, mmSX_PERFCOUNTER1_LO, mmSX_PERFCOUNTER1_HI},
|
||||
{mmSX_PERFCOUNTER2_SELECT, 0, mmSX_PERFCOUNTER2_LO, mmSX_PERFCOUNTER2_HI},
|
||||
{mmSX_PERFCOUNTER3_SELECT, 0, mmSX_PERFCOUNTER3_LO, mmSX_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* TA
|
||||
*/
|
||||
GpuCounterRegInfo AiTaCounterRegAddr[] = {
|
||||
{mmTA_PERFCOUNTER0_SELECT, 0, mmTA_PERFCOUNTER0_LO, mmTA_PERFCOUNTER0_HI},
|
||||
{mmTA_PERFCOUNTER1_SELECT, 0, mmTA_PERFCOUNTER1_LO, mmTA_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
|
||||
* TD
|
||||
*/
|
||||
GpuCounterRegInfo AiTdCounterRegAddr[] = {
|
||||
{mmTD_PERFCOUNTER0_SELECT, 0, mmTD_PERFCOUNTER0_LO, mmTD_PERFCOUNTER0_HI},
|
||||
{mmTD_PERFCOUNTER1_SELECT, 0, mmTD_PERFCOUNTER1_LO, mmTD_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
|
||||
* GDS
|
||||
*/
|
||||
GpuCounterRegInfo AiGdsCounterRegAddr[] = {
|
||||
{mmGDS_PERFCOUNTER0_SELECT, 0, mmGDS_PERFCOUNTER0_LO, mmGDS_PERFCOUNTER0_HI},
|
||||
{mmGDS_PERFCOUNTER1_SELECT, 0, mmGDS_PERFCOUNTER1_LO, mmGDS_PERFCOUNTER1_HI},
|
||||
{mmGDS_PERFCOUNTER2_SELECT, 0, mmGDS_PERFCOUNTER2_LO, mmGDS_PERFCOUNTER2_HI},
|
||||
{mmGDS_PERFCOUNTER3_SELECT, 0, mmGDS_PERFCOUNTER3_LO, mmGDS_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* VGT
|
||||
*/
|
||||
GpuCounterRegInfo AiVgtCounterRegAddr[] = {
|
||||
{mmVGT_PERFCOUNTER0_SELECT, 0, mmVGT_PERFCOUNTER0_LO, mmVGT_PERFCOUNTER0_HI},
|
||||
{mmVGT_PERFCOUNTER1_SELECT, 0, mmVGT_PERFCOUNTER1_LO, mmVGT_PERFCOUNTER1_HI},
|
||||
{mmVGT_PERFCOUNTER2_SELECT, 0, mmVGT_PERFCOUNTER2_LO, mmVGT_PERFCOUNTER2_HI},
|
||||
{mmVGT_PERFCOUNTER3_SELECT, 0, mmVGT_PERFCOUNTER3_LO, mmVGT_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* IA
|
||||
*/
|
||||
GpuCounterRegInfo AiIaCounterRegAddr[] = {
|
||||
{mmIA_PERFCOUNTER0_SELECT, 0, mmIA_PERFCOUNTER0_LO, mmIA_PERFCOUNTER0_HI},
|
||||
{mmIA_PERFCOUNTER1_SELECT, 0, mmIA_PERFCOUNTER1_LO, mmIA_PERFCOUNTER1_HI},
|
||||
{mmIA_PERFCOUNTER2_SELECT, 0, mmIA_PERFCOUNTER2_LO, mmIA_PERFCOUNTER2_HI},
|
||||
{mmIA_PERFCOUNTER3_SELECT, 0, mmIA_PERFCOUNTER3_LO, mmIA_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* MC
|
||||
*/
|
||||
GpuCounterRegInfo AiMcCounterRegAddr[] = {
|
||||
/*
|
||||
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}
|
||||
|
||||
*/
|
||||
};
|
||||
|
||||
/*
|
||||
* SRBM
|
||||
*/
|
||||
GpuCounterRegInfo AiSrbmCounterRegAddr[] = {
|
||||
/*
|
||||
{mmSRBM_PERFCOUNTER0_SELECT, 0, mmSRBM_PERFCOUNTER0_LO,
|
||||
mmSRBM_PERFCOUNTER0_HI},
|
||||
{mmSRBM_PERFCOUNTER1_SELECT, 0, mmSRBM_PERFCOUNTER1_LO,
|
||||
mmSRBM_PERFCOUNTER1_HI}
|
||||
*/
|
||||
};
|
||||
|
||||
/*
|
||||
* WD
|
||||
*/
|
||||
GpuCounterRegInfo AiWdCounterRegAddr[] = {
|
||||
{mmWD_PERFCOUNTER0_SELECT, 0, mmWD_PERFCOUNTER0_LO, mmWD_PERFCOUNTER0_HI},
|
||||
{mmWD_PERFCOUNTER1_SELECT, 0, mmWD_PERFCOUNTER1_LO, mmWD_PERFCOUNTER1_HI},
|
||||
{mmWD_PERFCOUNTER2_SELECT, 0, mmWD_PERFCOUNTER2_LO, mmWD_PERFCOUNTER2_HI},
|
||||
{mmWD_PERFCOUNTER3_SELECT, 0, mmWD_PERFCOUNTER3_LO, mmWD_PERFCOUNTER3_HI}};
|
||||
|
||||
/*
|
||||
* CPG
|
||||
*/
|
||||
GpuCounterRegInfo AiCpgCounterRegAddr[] = {
|
||||
{mmCPG_PERFCOUNTER0_SELECT, 0, mmCPG_PERFCOUNTER0_LO, mmCPG_PERFCOUNTER0_HI},
|
||||
{mmCPG_PERFCOUNTER1_SELECT, 0, mmCPG_PERFCOUNTER1_LO, mmCPG_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
|
||||
* CPC
|
||||
*/
|
||||
GpuCounterRegInfo AiCpcCounterRegAddr[] = {
|
||||
{mmCPC_PERFCOUNTER0_SELECT, 0, mmCPC_PERFCOUNTER0_LO, mmCPC_PERFCOUNTER0_HI},
|
||||
{mmCPC_PERFCOUNTER1_SELECT, 0, mmCPC_PERFCOUNTER1_LO, mmCPC_PERFCOUNTER1_HI}};
|
||||
|
||||
/*
 * Privileged counter block IDs, four dwords each (GpuPrivCounterBlockId::items).
 * The comment on GpuPrivCounterBlockId in gpu_blockinfo.h says these numbers
 * should be the same as those defined in KFD, so do not edit them in isolation.
 */
GpuPrivCounterBlockId AiBlockIdSq = {
    {0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}
};
GpuPrivCounterBlockId AiBlockIdMc = {
    {0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}
};
GpuPrivCounterBlockId AiBlockIdIommuV2 = {
    {0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}
};
GpuPrivCounterBlockId AiBlockIdKernelDriver = {
    {0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}
};
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,252 @@
|
||||
#ifndef _AI_BLOCKINFO_H_
|
||||
#define _AI_BLOCKINFO_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "rocr_profiler.h"
|
||||
#include "gpu_enum.h"
|
||||
#include "gpu_blockinfo.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// Maximum number of block instances for Arctic Islands (from Vega10).
// Values are found here: //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
// NOTE(review): the path above names gfx8/Fiji although this header is for
// Arctic Islands -- confirm it is the right source for these numbers.

// @brief Number of block instances.

// CB: block instances per SE, and perf counters per CB block
#define AI_NUM_CB 4
#define AI_COUNTER_NUM_PER_CB 4

// DB: block instances per SE, and perf counters per DB block
#define AI_NUM_DB 4
#define AI_COUNTER_NUM_PER_DB 4

// TA: block instances per SE, and perf counters per TA block
#define AI_NUM_TA 16
#define AI_COUNTER_NUM_PER_TA 2

// TD: block instances per SE, and perf counters per TD block
#define AI_NUM_TD 16
#define AI_COUNTER_NUM_PER_TD 2

// TCP: block instances per SE, and perf counters per TCP block
#define AI_NUM_TCP 16
#define AI_COUNTER_NUM_PER_TCP 4

// TCA: block instances per chip, and perf counters per TCA block
#define AI_NUM_TCA 2
#define AI_COUNTER_NUM_PER_TCA 4

// TCC: block instances per chip, and perf counters per TCC block
#define AI_NUM_TCC 16
#define AI_COUNTER_NUM_PER_TCC 4

// SDMA: block instances per chip.
// NOTE(review): the original comment also promised a perf-counter count per
// SDMA block, but no such macro is defined here.
#define AI_NUM_SDMA 2

// Number of counter registers per block for Arctic Islands
#define AI_COUNTER_NUM_PER_DRM 2
#define AI_COUNTER_NUM_PER_DRMDMA 2
#define AI_COUNTER_NUM_PER_IH 2
#define AI_COUNTER_NUM_PER_SRBM 2
#define AI_COUNTER_NUM_PER_CPF 2
#define AI_COUNTER_NUM_PER_GRBM 2
#define AI_COUNTER_NUM_PER_GRBMSE 4
#define AI_COUNTER_NUM_PER_PA_SU 4
#define AI_COUNTER_NUM_PER_RLC 2
#define AI_COUNTER_NUM_PER_PA_SC 8
#define AI_COUNTER_NUM_PER_SPI 6  // [Shucai: To do: double check the value]
#define AI_COUNTER_NUM_PER_SQ 16
#define AI_COUNTER_NUM_PER_SX 4
#define AI_COUNTER_NUM_PER_GDS 4
#define AI_COUNTER_NUM_PER_VGT 4
#define AI_COUNTER_NUM_PER_IA 4
#define AI_COUNTER_NUM_PER_MC 4
#define AI_COUNTER_NUM_PER_TCS 4
#define AI_COUNTER_NUM_PER_WD 4
#define AI_COUNTER_NUM_PER_CPG 2
#define AI_COUNTER_NUM_PER_CPC 2
#define AI_COUNTER_NUM_PER_VM 1
#define AI_COUNTER_NUM_PER_VM_MD 1
#define AI_COUNTER_NUM_PER_PIPESTATS 12

// Upper bound on the number of shader engines
#define AI_MAX_NUM_SHADER_ENGINES 1
|
||||
|
||||
// Enumeration of AI hardware counter blocks.
//
// Only the first enumerator has an explicit value, so every other value is
// implied by position: inserting, removing or reordering entries renumbers
// everything that follows. BlocksFirst / BlocksLast alias the first and last
// real entries so the whole range can be iterated.
typedef enum HsaAiCounterBlockId {
  // CB instances
  kHsaAiCounterBlockIdCb0 = 0,
  kHsaAiCounterBlockIdCb1,
  kHsaAiCounterBlockIdCb2,
  kHsaAiCounterBlockIdCb3,

  // NOTE(review): the original comment here read "Temp commented for Vega10",
  // but this enumerator is active -- confirm which is intended.
  kHsaAiCounterBlockIdCpf,

  // DB instances
  kHsaAiCounterBlockIdDb0,
  kHsaAiCounterBlockIdDb1,
  kHsaAiCounterBlockIdDb2,
  kHsaAiCounterBlockIdDb3,

  kHsaAiCounterBlockIdGrbm,
  kHsaAiCounterBlockIdGrbmSe,
  kHsaAiCounterBlockIdPaSu,
  kHsaAiCounterBlockIdPaSc,
  kHsaAiCounterBlockIdSpi,

  // SQ and its per-stage variants
  kHsaAiCounterBlockIdSq,
  kHsaAiCounterBlockIdSqGs,
  kHsaAiCounterBlockIdSqVs,
  kHsaAiCounterBlockIdSqPs,
  kHsaAiCounterBlockIdSqHs,
  kHsaAiCounterBlockIdSqCs,

  kHsaAiCounterBlockIdSx,

  // TA instances
  kHsaAiCounterBlockIdTa0,
  kHsaAiCounterBlockIdTa1,
  kHsaAiCounterBlockIdTa2,
  kHsaAiCounterBlockIdTa3,
  kHsaAiCounterBlockIdTa4,
  kHsaAiCounterBlockIdTa5,
  kHsaAiCounterBlockIdTa6,
  kHsaAiCounterBlockIdTa7,
  kHsaAiCounterBlockIdTa8,
  kHsaAiCounterBlockIdTa9,
  kHsaAiCounterBlockIdTa10,
  kHsaAiCounterBlockIdTa11,
  kHsaAiCounterBlockIdTa12,
  kHsaAiCounterBlockIdTa13,
  kHsaAiCounterBlockIdTa14,
  kHsaAiCounterBlockIdTa15,

  // TCA instances
  kHsaAiCounterBlockIdTca0,
  kHsaAiCounterBlockIdTca1,

  // TCC instances
  kHsaAiCounterBlockIdTcc0,
  kHsaAiCounterBlockIdTcc1,
  kHsaAiCounterBlockIdTcc2,
  kHsaAiCounterBlockIdTcc3,
  kHsaAiCounterBlockIdTcc4,
  kHsaAiCounterBlockIdTcc5,
  kHsaAiCounterBlockIdTcc6,
  kHsaAiCounterBlockIdTcc7,
  kHsaAiCounterBlockIdTcc8,
  kHsaAiCounterBlockIdTcc9,
  kHsaAiCounterBlockIdTcc10,
  kHsaAiCounterBlockIdTcc11,
  kHsaAiCounterBlockIdTcc12,
  kHsaAiCounterBlockIdTcc13,
  kHsaAiCounterBlockIdTcc14,
  kHsaAiCounterBlockIdTcc15,

  // TD instances
  kHsaAiCounterBlockIdTd0,
  kHsaAiCounterBlockIdTd1,
  kHsaAiCounterBlockIdTd2,
  kHsaAiCounterBlockIdTd3,
  kHsaAiCounterBlockIdTd4,
  kHsaAiCounterBlockIdTd5,
  kHsaAiCounterBlockIdTd6,
  kHsaAiCounterBlockIdTd7,
  kHsaAiCounterBlockIdTd8,
  kHsaAiCounterBlockIdTd9,
  kHsaAiCounterBlockIdTd10,
  kHsaAiCounterBlockIdTd11,
  kHsaAiCounterBlockIdTd12,
  kHsaAiCounterBlockIdTd13,
  kHsaAiCounterBlockIdTd14,
  kHsaAiCounterBlockIdTd15,

  // TCP instances
  kHsaAiCounterBlockIdTcp0,
  kHsaAiCounterBlockIdTcp1,
  kHsaAiCounterBlockIdTcp2,
  kHsaAiCounterBlockIdTcp3,
  kHsaAiCounterBlockIdTcp4,
  kHsaAiCounterBlockIdTcp5,
  kHsaAiCounterBlockIdTcp6,
  kHsaAiCounterBlockIdTcp7,
  kHsaAiCounterBlockIdTcp8,
  kHsaAiCounterBlockIdTcp9,
  kHsaAiCounterBlockIdTcp10,
  kHsaAiCounterBlockIdTcp11,
  kHsaAiCounterBlockIdTcp12,
  kHsaAiCounterBlockIdTcp13,
  kHsaAiCounterBlockIdTcp14,
  kHsaAiCounterBlockIdTcp15,

  kHsaAiCounterBlockIdGds,
  kHsaAiCounterBlockIdVgt,
  kHsaAiCounterBlockIdIa,
  kHsaAiCounterBlockIdMc,

  // Temp commented out for Vega10
  // kHsaAiCounterBlockIdSrbm,

  kHsaAiCounterBlockIdTcs,
  kHsaAiCounterBlockIdWd,

  // Temp commented out for Vega10
  // kHsaAiCounterBlockIdCpg,

  // NOTE(review): the original comment here read "Temp commented for Vega10",
  // but this enumerator is active -- confirm which is intended.
  kHsaAiCounterBlockIdCpc,

  // Counters retrieved by KFD
  kHsaAiCounterBlockIdIommuV2,
  kHsaAiCounterBlockIdKernelDriver,

  kHsaAiCounterBlockIdCpPipeStats,
  kHsaAiCounterBlockIdHwInfo,

  // Range markers: aliases of the first and last entries above
  kHsaAiCounterBlockIdBlocksFirst = kHsaAiCounterBlockIdCb0,
  kHsaAiCounterBlockIdBlocksLast = kHsaAiCounterBlockIdHwInfo
} HsaAiCounterBlockId;
|
||||
|
||||
extern GpuBlockInfo AiPmuHwBlocks[];
|
||||
extern GpuCounterRegInfo AiSqCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiCbCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiDrmdmaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiIhCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiCpfCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiCpgCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiCpcCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiDrmCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiGrbmCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiGrbmSeCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiPaSuCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiPaScCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiSpiCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiTcaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiTccCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiTcpCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiDbCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiRlcCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiScCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiSxCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiTaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiTdCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiGdsCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiVgtCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiIaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiMcCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiSrbmCounterRegAddr[];
|
||||
|
||||
// No Tcs Counter block on AI
|
||||
// extern GpuCounterRegInfo AiTcsCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiWdCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiCpgCounterRegAddr[];
|
||||
extern GpuCounterRegInfo AiCpcCounterRegAddr[];
|
||||
|
||||
extern GpuPrivCounterBlockId AiBlockIdSq;
|
||||
extern GpuPrivCounterBlockId AiBlockIdMc;
|
||||
extern GpuPrivCounterBlockId AiBlockIdIommuV2;
|
||||
extern GpuPrivCounterBlockId AiBlockIdKernelDriver;
|
||||
}
|
||||
|
||||
#endif // _AI_BLOCKINFO_H_
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,137 @@
|
||||
#ifndef _AI_PMU_H_
|
||||
#define _AI_PMU_H_
|
||||
|
||||
#include "hsa.h"
|
||||
#include "cmdwriter.h"
|
||||
#include "hsa_perf.h"
|
||||
#include "info_set.h"
|
||||
#include "parameter_set.h"
|
||||
#include "ai_blockinfo.h"
|
||||
#include "rocr_profiler.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <map>
|
||||
|
||||
namespace pm4_profile {
|
||||
typedef std::map<HsaAiCounterBlockId, pm4_profile::CounterBlock*> AiCounterBlockMap;
|
||||
|
||||
// This class implement the AI PMU. It is responsible for setting up
|
||||
// CounterGroups to represent each AI hardware block which exposes performance
|
||||
// counters.
|
||||
class AiPmu : public pm4_profile::Pmu {
|
||||
public:
|
||||
AiPmu();
|
||||
|
||||
~AiPmu();
|
||||
|
||||
// Returns number of shader engines per block
|
||||
// for the blocks featured shader engines instancing
|
||||
uint32_t getNumSe() { return num_se_; }
|
||||
|
||||
// Initializes the handle of buffer used to collect PMC data
|
||||
bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz);
|
||||
|
||||
int getLastError();
|
||||
|
||||
std::string getErrorString(int error);
|
||||
|
||||
virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true);
|
||||
|
||||
virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter);
|
||||
|
||||
// IPMU inherits the IParameterSet and IInfoSetso we implement it
|
||||
// through composition and function forwarding
|
||||
bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
|
||||
|
||||
bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
pm4_profile::CounterBlock* getCounterBlockById(uint32_t id);
|
||||
|
||||
rocr_pmu_state_t getCurrentState() { return profiler_state_; }
|
||||
|
||||
pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups);
|
||||
|
||||
private:
|
||||
// Addr of Counter Data Buffer
|
||||
uint32_t* pmcData_;
|
||||
|
||||
// Size of Counter Data Buffer
|
||||
uint32_t pmcDataSz_;
|
||||
|
||||
void Init();
|
||||
|
||||
bool initCounterBlock();
|
||||
|
||||
bool isResultReady();
|
||||
|
||||
// Clear CounterBlockMap
|
||||
void clearCounterBlockMap();
|
||||
|
||||
// Reset SQ and CB counters
|
||||
void ResetCounterBlocks(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter);
|
||||
|
||||
// Program SQ block related counters
|
||||
uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TA block related counters
|
||||
uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TCA block related counters
|
||||
uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TCC block related counters
|
||||
uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TCP block related counters
|
||||
uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TD block related counters
|
||||
uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Build counter selection register, return how many registers are built
|
||||
uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal,
|
||||
uint32_t blkId, pm4_profile::Counter* blkCntr);
|
||||
|
||||
// Build counter selection register, return how many registers are built
|
||||
uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr,
|
||||
uint32_t* reg_val);
|
||||
|
||||
private:
|
||||
// Delete counter blocks in the PMU
|
||||
hsa_status_t RemoveCounterBlocks();
|
||||
|
||||
private:
|
||||
// This contains the available counter groups.
|
||||
AiCounterBlockMap blk_map_;
|
||||
|
||||
// This stores the current profiling state.
|
||||
rocr_pmu_state_t profiler_state_;
|
||||
|
||||
pm4_profile::ParameterSet* parameter_set_;
|
||||
|
||||
pm4_profile::InfoSet* info_set_;
|
||||
|
||||
int error_code_;
|
||||
|
||||
// Pointer used to store counter block list internally
|
||||
uint32_t blk_list_size_;
|
||||
pm4_profile::CounterBlock** blk_list_;
|
||||
|
||||
// Indicates the number of Shader Engines Present
|
||||
uint32_t num_se_;
|
||||
|
||||
// Used to reset GRBM to its default state
|
||||
uint32_t reset_grbm_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif // _AI_PMU_H_
|
||||
@@ -0,0 +1,101 @@
|
||||
#ifndef _GPU_BLOCKINFO_H_
|
||||
#define _GPU_BLOCKINFO_H_
|
||||
|
||||
#include "rocr_profiler.h"
|
||||
#include "gpu_enum.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// Counter control method of a hardware block (see GpuBlockInfo::method).
// NOTE(review): judging by the names, BySeAndInstance (3) is the bitwise OR
// of ByInstance (1) and BySe (2) -- confirm whether callers treat the values
// as flags.
typedef enum CntlMethod {
  CntlMethodNone = 0,
  CntlMethodByInstance = 1,
  CntlMethodBySe = 2,
  CntlMethodBySeAndInstance = 3
} CntlMethod;
|
||||
|
||||
// Structure which contains information about a specific hardware block for CI.
|
||||
#define GPU_BLOCK_NAME_SIZE 15
|
||||
|
||||
typedef struct GpuBlockInfo_ {
|
||||
// Unique string identifier of the block.
|
||||
const char blockName[GPU_BLOCK_NAME_SIZE];
|
||||
|
||||
// Unique string identifier of the block.
|
||||
uint32_t counterGroupId;
|
||||
|
||||
// Maximum number of shader engines
|
||||
uint32_t maxShaderEngineCount;
|
||||
|
||||
// Maximum number of shader arrays
|
||||
uint32_t maxShaderArrayCount;
|
||||
|
||||
// Maximum number of block instances in the group per shader array
|
||||
uint32_t maxInstanceCount;
|
||||
|
||||
// Counter control method
|
||||
CntlMethod method;
|
||||
|
||||
// Maximum counter event ID
|
||||
uint32_t maxEventId;
|
||||
|
||||
// Maximum number of counters that can be enabled at once
|
||||
uint32_t maxSimultaneousCounters;
|
||||
|
||||
// Maximum number of streaming counters that can be enabled at once
|
||||
uint32_t maxStreamingCounters;
|
||||
|
||||
// The number of hardware counters that are shared
|
||||
// between regular and streaming counters.
|
||||
// This is important so that resources are not double-booked
|
||||
// between the two types of counters.
|
||||
uint32_t sharedHWCounters;
|
||||
|
||||
// Block counters can be configured with additional filters
|
||||
bool hasFilters;
|
||||
|
||||
//------------------------------------------
|
||||
// Trace specific stuff regarding when they get locked
|
||||
|
||||
// Buffer size in bytes
|
||||
uint32_t bufferSize;
|
||||
|
||||
// Current write pointer offset from beginning of the buffer
|
||||
uint32_t wptrOffset;
|
||||
|
||||
// Flag that buffer might have wrapped
|
||||
bool wrapped;
|
||||
|
||||
// If buffer has wrapped, this could indicate approximate
|
||||
// total amount of data that was dumpued in the trace buffer
|
||||
uint32_t dataSizeEstimate;
|
||||
|
||||
// Buffer data pointer
|
||||
void* pData;
|
||||
} GpuBlockInfo;
|
||||
|
||||
// Register addresses corresponding to one hardware counter. Tables of this
// type are brace-initialized in field order, i.e.
// {select, control, read-low, read-high}, so the field order must not change.
typedef struct GpuCounterRegInfo_ {
  // counter select register address
  uint32_t counterSelRegAddr;

  // counter control register address
  uint32_t counterCntlRegAddr;

  // counter read register address low
  uint32_t counterReadRegAddrLo;

  // counter read register address high
  uint32_t counterReadRegAddrHi;
} GpuCounterRegInfo;
|
||||
|
||||
// GPU privileged block ID. The value should be the same as the one defined
// in KFD for the same block.
typedef struct GpuPrivCounterBlockId_ {
  // Block ID consists of 4 dwords
  uint32_t items[4];
} GpuPrivCounterBlockId;
|
||||
|
||||
} // pm4_profile
|
||||
#endif
|
||||
@@ -0,0 +1,73 @@
|
||||
#include "gpu_counter.h"
|
||||
|
||||
using namespace pm4_profile;
|
||||
|
||||
namespace pm4_profile {
|
||||
// Messages returned by GpuCounter::getErrorString(), indexed by error code.
// Each slot is a fixed 64-byte buffer.
static char error_string[][64] = {
    {"No error"},
    {"Counter generic error"},
    {"Counter is already set"},
    {"Counter not ready"},
};
|
||||
|
||||
// Constructs a disabled counter with no result, no owning block and no error.
//
// Every data member is initialized here. Previously only counter_enabled_ and
// parameter_set_ were set, so isResultReady(), getResult(), getCounterBlock()
// and getLastError() read indeterminate values when called before the
// matching setter. Initializers follow the declaration order in gpu_counter.h.
GpuCounter::GpuCounter()
    : Counter(),
      counter_enabled_(false),
      is_result_ready_(false),
      result_(0),
      parameter_set_(new ParameterSet()),
      counter_block_(NULL),
      error_code_(kHsaCounterErrorCodeNoError) {}
|
||||
|
||||
// Releases the parameter set allocated by the constructor. The owning counter
// block pointer is not deleted here.
GpuCounter::~GpuCounter() {
  delete parameter_set_;
}
|
||||
|
||||
// Copies the stored counter value into *p_result.
// Returns false, without writing anything, when p_result is NULL; true
// otherwise.
bool GpuCounter::getResult(uint64_t* p_result) {
  const bool has_destination = (p_result != NULL);
  if (has_destination) {
    *p_result = result_;
  }
  return has_destination;
}
|
||||
|
||||
// Records the counter block this counter belongs to.
// Returns false and leaves the current association untouched when
// p_cntr_group is NULL; true otherwise.
bool GpuCounter::setCounterBlock(pm4_profile::CounterBlock* p_cntr_group) {
  const bool has_block = (p_cntr_group != NULL);
  if (has_block) {
    counter_block_ = p_cntr_group;
  }
  return has_block;
}
|
||||
|
||||
// Returns the counter block last stored by setCounterBlock().
pm4_profile::CounterBlock* GpuCounter::getCounterBlock() {
  return counter_block_;
}
|
||||
|
||||
// Enables (b == true) or disables (b == false) this counter.
// Always returns true: no validation is performed yet.
bool GpuCounter::setEnable(bool b) {
  // TODO: Validate counter
  counter_enabled_ = b;

  return true;
}
|
||||
|
||||
void GpuCounter::setResult(uint64_t result) { result_ = result; }
|
||||
|
||||
// Returns the stored error code. The member is a uint32_t and is converted
// to int on return.
int GpuCounter::getLastError() {
  return error_code_;
}
|
||||
|
||||
std::string GpuCounter::getErrorString(int error) {
|
||||
if ((error >= 0) && (error < kHsaCounterErrorCodeMax)) {
|
||||
std::string err_string(error_string[error]);
|
||||
return err_string;
|
||||
}
|
||||
return "Incorrect error index";
|
||||
}
|
||||
|
||||
bool GpuCounter::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
|
||||
return parameter_set_->getParameter(param, ret_size, pp_data);
|
||||
}
|
||||
|
||||
bool GpuCounter::setParameter(uint32_t param, uint32_t param_size, const void* p_data) {
|
||||
bool ret_code;
|
||||
|
||||
error_code_ = kHsaCounterErrorCodeNoError;
|
||||
|
||||
ret_code = parameter_set_->setParameter(param, param_size, p_data);
|
||||
if (ret_code == false) {
|
||||
error_code_ = kHsaCounterErrorCodeAlreadySet;
|
||||
}
|
||||
|
||||
return ret_code;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
#ifndef _GPU_COUNTER_H_
|
||||
#define _GPU_COUNTER_H_
|
||||
|
||||
#include "hsa_perf.h"
|
||||
#include "parameter_set.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <list>
|
||||
|
||||
namespace pm4_profile {
|
||||
// @brief This class represent each CI performance counter
|
||||
class GpuCounter : public pm4_profile::Counter {
|
||||
public:
|
||||
GpuCounter();
|
||||
|
||||
virtual ~GpuCounter();
|
||||
|
||||
virtual int getLastError();
|
||||
|
||||
virtual std::string getErrorString(int error);
|
||||
|
||||
virtual bool getResult(uint64_t* p_result);
|
||||
|
||||
virtual pm4_profile::CounterBlock* getCounterBlock();
|
||||
|
||||
virtual bool setEnable(bool b);
|
||||
|
||||
virtual bool isEnabled() { return counter_enabled_; }
|
||||
|
||||
virtual bool isResultReady() { return is_result_ready_; }
|
||||
|
||||
virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
|
||||
|
||||
bool setCounterBlock(pm4_profile::CounterBlock* p_cntr_group);
|
||||
|
||||
void setResult(uint64_t result);
|
||||
|
||||
private:
|
||||
bool counter_enabled_;
|
||||
bool is_result_ready_;
|
||||
uint64_t result_;
|
||||
pm4_profile::ParameterSet* parameter_set_;
|
||||
pm4_profile::CounterBlock* counter_block_;
|
||||
uint32_t error_code_;
|
||||
};
|
||||
|
||||
typedef std::list<GpuCounter*> GpuCounterList;
|
||||
}
|
||||
#endif // _GPU_COUNTER_H_
|
||||
@@ -0,0 +1,215 @@
|
||||
#include "gpu_countergroup.h"
|
||||
#include "gpu_counter.h"
|
||||
#include "gpu_enum.h"
|
||||
|
||||
using namespace pm4_profile;
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// Messages returned by GpuCounterBlock::getErrorString(), indexed by error
// code. Each slot is a fixed 64-byte buffer.
// The last message previously read "Unkown counter"; the typo is fixed.
static char error_string[][64] = {
    {"No error"},
    {"Counter block error"},
    {"Max counter reached"},
    {"Unknown counter"}
};
|
||||
|
||||
// Creates an empty counter block: no counters, fresh parameter and info sets,
// no cached counter array, and the block type set by _initCounterBlockType().
GpuCounterBlock::GpuCounterBlock() : CounterBlock() {
  // No counter array has been handed out yet
  pp_cntrs_ = NULL;

  cntr_list_.clear();

  parameter_set_ = new ParameterSet();
  info_set_ = new InfoSet();

  _initCounterBlockType();
}
|
||||
|
||||
// Destroys every counter owned by the block, then the parameter/info sets and
// the cached counter array.
GpuCounterBlock::~GpuCounterBlock() {
  // Delete each counter and empty the list. The call is qualified so it is
  // clear that this class's implementation runs during destruction.
  GpuCounterBlock::destroyAllCounters();

  delete parameter_set_;
  delete info_set_;

  // free(NULL) is a no-op, so no guard is needed
  free(pp_cntrs_);
  pp_cntrs_ = NULL;
}
|
||||
|
||||
void GpuCounterBlock::_initCounterBlockType() {
|
||||
block_type_ = HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC;
|
||||
}
|
||||
|
||||
// Creates a new counter owned by this block and appends it to the block's
// counter list.
// Returns NULL when _checkMaxNumOfCounters() refuses the request (or when the
// allocation yields NULL); otherwise the new counter.
Counter* GpuCounterBlock::createCounter() {
  if (_checkMaxNumOfCounters()) {
    GpuCounter* counter = new GpuCounter();
    if (counter) {
      cntr_list_.push_back(counter);
      return (Counter*)counter;
    }
  }

  return NULL;
}
|
||||
|
||||
// Deletes one counter owned by this block and removes it from the list.
// Returns true when p_cntr was found and destroyed; false when it is NULL or
// not in this block's list.
bool GpuCounterBlock::destroyCounter(Counter* p_cntr) {
  if (!p_cntr) {
    return false;
  }

  GpuCounterList::iterator pos = cntr_list_.begin();
  while (pos != cntr_list_.end()) {
    if (*pos == p_cntr) {
      // Free the object first, then drop its slot; stop at the first match
      delete (*pos);
      cntr_list_.erase(pos);
      return true;
    }
    pos++;
  }

  return false;
}
|
||||
|
||||
// Deletes every counter owned by this block and empties the list.
// Always returns true.
bool GpuCounterBlock::destroyAllCounters() {
  while (!cntr_list_.empty()) {
    // Deleting a NULL entry is a no-op, so no per-entry check is needed
    delete cntr_list_.front();
    cntr_list_.pop_front();
  }

  return true;
}
|
||||
|
||||
// Builds an array of pointers to the counters that are currently enabled.
//
// num receives the number of enabled counters. It is now written on every
// path; previously it was left untouched when the allocation failed.
// Returns NULL when no counter is enabled or the allocation fails. The
// returned array is owned by this block and is freed by the next call to
// getEnabledCounters()/getAllCounters() or by the destructor, so callers must
// not free it or keep it across such calls.
Counter** GpuCounterBlock::getEnabledCounters(uint32_t& num) {
  num = 0;

  // Drop the array handed out by a previous call
  free(pp_cntrs_);
  pp_cntrs_ = NULL;

  // Nothing to report; also avoids a zero-byte malloc, whose result is
  // implementation-defined
  if (cntr_list_.empty()) {
    return NULL;
  }

  pp_cntrs_ = (Counter**)malloc(sizeof(Counter*) * cntr_list_.size());
  if (!pp_cntrs_) {
    return NULL;
  }

  uint32_t cnt = 0;
  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
    if ((*it)->isEnabled()) {
      pp_cntrs_[cnt] = (Counter*)*it;
      ++cnt;
    }
  }

  num = cnt;
  return (cnt == 0) ? NULL : pp_cntrs_;
}
|
||||
|
||||
// Builds an array of pointers to every counter in this block, enabled or not.
//
// num receives the number of counters. It is now written on every path;
// previously it was left untouched when the allocation failed.
// Returns NULL when the block has no counters or the allocation fails. The
// returned array is owned by this block and is freed by the next call to
// getEnabledCounters()/getAllCounters() or by the destructor, so callers must
// not free it or keep it across such calls.
Counter** GpuCounterBlock::getAllCounters(uint32_t& num) {
  num = 0;

  // Drop the array handed out by a previous call
  free(pp_cntrs_);
  pp_cntrs_ = NULL;

  // Nothing to report; also avoids a zero-byte malloc, whose result is
  // implementation-defined
  if (cntr_list_.empty()) {
    return NULL;
  }

  pp_cntrs_ = (Counter**)malloc(sizeof(Counter*) * cntr_list_.size());
  if (!pp_cntrs_) {
    return NULL;
  }

  uint32_t cnt = 0;
  for (GpuCounterList::iterator it = cntr_list_.begin(); it != cntr_list_.end(); ++it) {
    pp_cntrs_[cnt] = (Counter*)*it;
    ++cnt;
  }

  num = cnt;
  return pp_cntrs_;
}
|
||||
|
||||
// Forwards a block-info update to the owned InfoSet and returns its result.
bool GpuCounterBlock::setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data) {
  const bool stored = info_set_->setInfo(blk_info, size, data);
  return stored;
}
|
||||
|
||||
bool GpuCounterBlock::_checkMaxNumOfCounters() {
|
||||
uint32_t num_enabled = _getNumOfEnabledCounters();
|
||||
|
||||
uint32_t* p_num_max = NULL;
|
||||
uint32_t size = 0;
|
||||
|
||||
if (!getInfo(GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS, size, (void**)&p_num_max)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (num_enabled >= *p_num_max) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t GpuCounterBlock::_getNumOfEnabledCounters() {
|
||||
uint32_t cnt = 0;
|
||||
GpuCounterList::iterator it = cntr_list_.begin();
|
||||
GpuCounterList::iterator it_end = cntr_list_.end();
|
||||
|
||||
for (; it != it_end; it++) {
|
||||
GpuCounter* p_cntr = (*it);
|
||||
bool is_enabled;
|
||||
is_enabled = p_cntr->isEnabled();
|
||||
if (is_enabled) {
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
std::string GpuCounterBlock::getErrorString(int error) {
|
||||
if ((error >= 0) && (error < kHsaCounterBlockErrorCodeMaxError)) {
|
||||
std::string err_string(error_string[error]);
|
||||
return err_string;
|
||||
}
|
||||
return "incorrect error code";
|
||||
}
|
||||
|
||||
bool GpuCounterBlock::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
|
||||
return parameter_set_->getParameter(param, ret_size, pp_data);
|
||||
}
|
||||
|
||||
bool GpuCounterBlock::setParameter(uint32_t param, uint32_t param_size, const void* pData) {
|
||||
return parameter_set_->setParameter(param, param_size, pData);
|
||||
}
|
||||
|
||||
bool GpuCounterBlock::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) {
|
||||
return info_set_->getInfo(info, ret_size, pp_data);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
#ifndef _GPU_COUNTER_GROUP_H_
|
||||
#define _GPU_COUNTER_GROUP_H_
|
||||
|
||||
// This file contains the declaration of the GpuCounterBlock class.
|
||||
#include "hsa_perf.h"
|
||||
#include "gpu_counter.h"
|
||||
#include "parameter_set.h"
|
||||
#include "info_set.h"
|
||||
#include "gpu_enum.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace pm4_profile {
|
||||
// This class represents one CI hardware block. Each block contains
|
||||
// multiple performance counters.
|
||||
class GpuCounterBlock : public pm4_profile::CounterBlock {
|
||||
public:
|
||||
GpuCounterBlock();
|
||||
~GpuCounterBlock();
|
||||
|
||||
// NOTE [Suravee] : We specify CiPmu as a friend
|
||||
// because the CiPmu needs to be able to setup info of
|
||||
// the counter block.
|
||||
friend class CiPmu;
|
||||
friend class ViPmu;
|
||||
friend class AiPmu;
|
||||
|
||||
std::string getErrorString(int error);
|
||||
|
||||
pm4_profile::Counter* createCounter();
|
||||
|
||||
virtual bool destroyCounter(pm4_profile::Counter* p_cntr);
|
||||
|
||||
virtual bool destroyAllCounters();
|
||||
|
||||
virtual pm4_profile::Counter** getEnabledCounters(uint32_t& num);
|
||||
|
||||
virtual pm4_profile::Counter** getAllCounters(uint32_t& num);
|
||||
|
||||
virtual bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
|
||||
|
||||
virtual bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
protected:
|
||||
void _initCounterBlockType();
|
||||
|
||||
bool setInfo(GPU_BLK_INFOS blk_info, uint32_t size, void* data);
|
||||
|
||||
hsa_ext_tools_counter_block_type_t block_type_;
|
||||
|
||||
private:
|
||||
bool _checkMaxNumOfCounters();
|
||||
|
||||
uint32_t _getNumOfEnabledCounters();
|
||||
|
||||
pm4_profile::ParameterSet* parameter_set_;
|
||||
pm4_profile::InfoSet* info_set_;
|
||||
GpuCounterList cntr_list_;
|
||||
uint32_t error_code_;
|
||||
|
||||
// Pointer of buffer to store counter list
|
||||
pm4_profile::Counter** pp_cntrs_;
|
||||
};
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GPU_COUNTER_GROUP_H_
|
||||
@@ -0,0 +1,65 @@
|
||||
#ifndef _GPU_ENUM_H_
|
||||
#define _GPU_ENUM_H_
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// Identifiers of the information items that can be stored for, and queried
// from, a GPU hardware block. The values are consecutive, starting at 0.
enum GPU_BLK_INFOS {
  // Counter-block information
  GPU_BLK_INFO_BLOCK_NAME = 0,
  GPU_BLK_INFO_ID,
  GPU_BLK_INFO_MAX_SHADER_ENGINE_COUNT,
  GPU_BLK_INFO_MAX_SHADER_ARRAY_COUNT,
  GPU_BLK_INFO_MAX_INSTANCE_COUNT,
  GPU_BLK_INFO_CONTROL_METHOD,
  GPU_BLK_INFO_MAX_EVENT_ID,
  GPU_BLK_INFO_MAX_SIMULTANEOUS_COUNTERS,
  GPU_BLK_INFO_MAX_STREAMING_COUNTERS,
  GPU_BLK_INFO_SHARED_HW_COUNTERS,
  GPU_BLK_INFO_HAS_FILTERS,

  // Trace-specific information
  GPU_TRC_BLK_INFO_BUFFER_SIZE,
  GPU_TRC_BLK_INFO_BUFFER_WRITE_POINTER_OFFSET,
  GPU_TRC_BLK_INFO_BUFFER_WRAPPED,
  GPU_TRC_BLK_INFO_DATA_SIZE_ESTIMATE,
  GPU_TRC_BLK_INFO_DATA_POINTER
};
|
||||
|
||||
|
||||
/**
 * Trace buffer parameters. The values are consecutive, starting at 0.
 */
enum GPU_BLK_PARAMS {
  // Lets the user specify the size of the trace buffers.
  GPU_BLK_PARAM_TRACE_BUFFER_SIZE = 0,

  // If this functionality gets implemented, it will let the user specify
  // the number of trace buffers to create.
  GPU_BLK_PARAM_TRACE_BUFFER_ARRAY,

  // Specifies whether a new trace buffer should be used for each cmd buffer,
  // which allows better correlation of data back to the host application.
  //  - Enabled without an explicit TRACE_BUFFER_ARRAY: the driver should
  //    automatically allocate additional buffers as needed, so that as much
  //    of the application as possible is traced until the PerfExperiment is
  //    ended.
  //  - Enabled with a TRACE_BUFFER_ARRAY: only as many buffers as specified
  //    should be created; if more cmd buffers get submitted than there are
  //    trace buffers, the later cmd buffers should not be traced.
  GPU_BLK_PARAM_TRACE_NEW_BUFFER_ON_SUBMIT
};
|
||||
|
||||
|
||||
// Identifiers of the parameters that can be attached to a GPU counter.
// The values are consecutive, starting at 0.
enum GPU_CNTR_PARAMS {
  GPU_CNTR_PARAM_SHADERENGINE_ID = 0,
  GPU_CNTR_PARAM_SHADERARRAY_ID,
  GPU_CNTR_PARAM_INSTANCE_ID,
  GPU_CNTR_PARAM_EVENT_SELECT_ID,
  GPU_CNTR_PARAM_SIMD_MASK,
  GPU_CNTR_PARAM_PERF_MODE,
  GPU_CNTR_PARAM_TRACE_TYPE
};
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,436 @@
|
||||
#ifndef _HSA_PERF_H_
|
||||
#define _HSA_PERF_H_
|
||||
|
||||
#include "rocr_profiler.h"
|
||||
|
||||
#if !defined(AMD_AMP_HSA_INCLUDES)
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
namespace pm4_profile {
|
||||
class Pmu;
|
||||
class Counter;
|
||||
class CounterBlock;
|
||||
class TraceGroup;
|
||||
class CommandWriter;
|
||||
class DefaultCmdBuf;
|
||||
|
||||
|
||||
// @brief This is an abstract class for defining a CounterBlock. Each
|
||||
// CounterBlock contains a set of Counters that often belong to the
|
||||
// same functional unit
|
||||
//
|
||||
// For AMD GPU, this can represent blocks of Counters in each HW block
|
||||
// (e.g. SQ, SQI, CP, etc.).
|
||||
// For AMD CPU, this can represent blocks of core PMCs, NB PMCs, L2I PMCs
|
||||
// on each CPU device
|
||||
//
|
||||
// Generally, CounterBlocks are created and initialized by the \ref Pmu class.
|
||||
// Users can query them by calling \ref Pmu::getAllCounterBlocks() or
|
||||
// \ref Pmu::getCounterBlockById(). A CounterBlock is enabled if it contains
|
||||
// enabled Counters in the block.
|
||||
//
|
||||
// Users can manage Counters in each GounterBlock (e.g. create, destroy,
|
||||
// enable and disable). To specify a Counter, users simply call \ref
|
||||
// createCounter. Then it can be enabled or disabled using \ref
|
||||
// Counter::setEnable. When a Counter is enabled, it is checked against the
|
||||
// CounterBlock checks to make sure that the enabled-counter is valid and is
|
||||
// not conflicting with the current Counters in the block.
|
||||
class CounterBlock {
|
||||
public:
|
||||
typedef enum HsaCounterBlockErrorCode {
|
||||
// Generic CounterBlock error
|
||||
kHsaCounterBlockErrorCodeNoError = 0x0,
|
||||
|
||||
// Generic CounterBlock error
|
||||
kHsaCounterBlockErrorCodeGenericError,
|
||||
|
||||
// The maximum number of Counters in the block is reached.
|
||||
kHsaCounterBlockErrorCodeMaxNumCounterReached,
|
||||
|
||||
// The counter does not belong to this block.
|
||||
kHsaCounterBlockErrorCodeUnknownCounter,
|
||||
|
||||
// The counter does not belong to this block.
|
||||
kHsaCounterBlockErrorCodeMaxError
|
||||
} HsaCounterBlockErrorCode;
|
||||
|
||||
// Destructor of CounterBlock.
|
||||
virtual ~CounterBlock() {}
|
||||
|
||||
// Given and error number reported from getLastError or returned from a
|
||||
// function call, retreive the corresponding stl string.
|
||||
// @param[in] error The error corresponding to a call to getLastError
|
||||
// or a return code from a function call.
|
||||
// Return An stl string representing a text corresponding to the error
|
||||
// number.
|
||||
// If invalid error code is given, the returned string is empty.
|
||||
virtual std::string getErrorString(int error) = 0;
|
||||
|
||||
// Create an Counter object return a pointer to caller.
|
||||
// Return On success, this function returns a pointer to Counter
|
||||
// On failure, this function returns NULL
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
// kHsaCounterBlockErrorCodeMaxNumCounterReached
|
||||
virtual Counter* createCounter() = 0;
|
||||
|
||||
// Destroy the Counter. The CounterBlock which owns the Counter must be in
|
||||
// disabled state.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidAargs
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
// kHsaCounterBlockErrorCodeUnknownCounter
|
||||
virtual bool destroyCounter(Counter* p_counter) = 0;
|
||||
|
||||
// Destroy all counters in the block. The CounterBlock must be in disable
|
||||
// state.
|
||||
// Return true or false.
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
virtual bool destroyAllCounters() = 0;
|
||||
|
||||
// Get a list of pointers to the enabled Counters in this CounterBlock.
|
||||
// note The Counter must be created by the same CounterBlock object using
|
||||
// createCounter().
|
||||
// @param[in] num The number of Counter pointers returned.
|
||||
// Return
|
||||
// return a list of pointers to the enabled Counters.
|
||||
// return NULL if no counter is enabled.
|
||||
virtual Counter** getEnabledCounters(uint32_t& num) = 0;
|
||||
|
||||
// Get a list of pointers to the all Counters in this CounterBlock.
|
||||
// note The Counter must be created by the same CounterBlock object using
|
||||
// createCounter().
|
||||
// @param[in] num The number of Counter pointers returned.
|
||||
// Return
|
||||
// return a list of pointers in the CounterBlock.
|
||||
// return NULL if no counter is enabled.
|
||||
virtual Counter** getAllCounters(uint32_t& num) = 0;
|
||||
|
||||
// Query value of the parameter specified by param
|
||||
// @param[in] param The enumeration of parameter to be queried
|
||||
// @param[out] return_size The returned size of data
|
||||
// @param[out] pp_data The pointer to the returned data. The API is
|
||||
// responsible for managing the memory to store the information as specified
|
||||
// by return_size.
|
||||
//
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidParam
|
||||
// kHSAPerfErrorCodesInvalidParamSize
|
||||
// kHSAPerfErrorCodesInvalidParamData
|
||||
virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
|
||||
|
||||
// Set value for the parameter specified by param
|
||||
// @param[in] param The enumeration of parameter to be queried
|
||||
// @param[out] param_size The size of data
|
||||
// @param[out] p_data The pointer to the data to be set. Users are responsible
|
||||
// for deallocating the memory of p_data after calling the API.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
// kHSAPerfErrorCodesInvalidParam
|
||||
// kHSAPerfErrorCodesInvalidParamSize
|
||||
// kHSAPerfErrorCodesInvalidParamData
|
||||
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
|
||||
|
||||
// Query value of the information specified by info
|
||||
// @param[in] info The enumeration of information to be queried
|
||||
// @param[out] Return_size The returned size of data
|
||||
// @param[out] pp_data The pointer to the returned data
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidInfo
|
||||
// kHSAPerfErrorCodesInvalidInfoSize
|
||||
// kHSAPerfErrorCodesInvalidInfoData
|
||||
virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0;
|
||||
}; // class CounterBlock
|
||||
|
||||
|
||||
// This is an abstract class for defining a TraceGroup. TraceGroup inherits
|
||||
// CounterBlock and add interfaces for managing trace buffer. It also supports
|
||||
// user-data insertion into trace. This allows users to insert arbitary data
|
||||
// (e.g. markers) into trace which and can be used to correlating a specific
|
||||
// events to the collected trace data.
|
||||
class TraceGroup : public CounterBlock {
|
||||
public:
|
||||
typedef enum HsaTraceGroupErrorCode {
|
||||
// Generic TraceGroup error
|
||||
HsaTraceGroupErrorCodeGenericError = 0x100,
|
||||
} HsaTraceGroupErrorCode;
|
||||
|
||||
// Destructor of TraceGroup.
|
||||
virtual ~TraceGroup() {}
|
||||
|
||||
// Obtains the number of buffers which were collected as part of
|
||||
// the trace.
|
||||
// Return The number of collected buffers.
|
||||
virtual uint32_t getCollectedBufferCount() = 0;
|
||||
|
||||
// Locks a trace buffer for host access.
|
||||
// @param[in] buffer_id The index of the buffer to be locked.
|
||||
// Return true or false
|
||||
virtual bool lock(uint32_t buffer_id) = 0;
|
||||
|
||||
// Unlock a trace buffer that was previously locked.
|
||||
// @param[in] buffer_id The index of the buffer to be unlocked.
|
||||
// Return true or false
|
||||
virtual bool unlock(uint32_t buffer_id) = 0;
|
||||
|
||||
// Inserts data (e.g. trace marker) into the trace.
|
||||
// @param[in] type The type of data to be inserted.
|
||||
// @param[in] p_data The data to be inserted.
|
||||
// @param[in] data_size The size of data to be inserted.
|
||||
// Return true or false
|
||||
virtual bool insertUserData(uint32_t type, void* p_data, uint32_t data_size) = 0;
|
||||
}; // class TraceGroup
|
||||
|
||||
|
||||
// This is an abstract class for defining a performance Counter.
|
||||
// Users can obtain a Counter from \ref CounterBlock::createCounter().
|
||||
// Once obtained, users can set up Counter parameters, and enable it using
|
||||
// \ref Counter::setEnable().
|
||||
//
|
||||
// There are several types of Counter as defined in \ref
|
||||
// HsaCounterBlockTypeMask.
|
||||
// Only the supported Counter type can be added to the CounterBlock.
|
||||
//
|
||||
// Each Counter can store Counter-specific parameters. The Counter is used to
|
||||
// specify types of event to be counted.
|
||||
class Counter {
|
||||
public:
|
||||
typedef enum HsaCounterErrorCode {
|
||||
// Generic Counter error
|
||||
kHsaCounterErrorCodeNoError = 0x0,
|
||||
|
||||
// Generic Counter error
|
||||
kHsaCounterErrorCodeGenericError = 0x1,
|
||||
|
||||
// Counter already error
|
||||
kHsaCounterErrorCodeAlreadySet = 0x2,
|
||||
|
||||
// Counter result is not ready.
|
||||
kHsaCounterErrorCodeResultNotReady = 0x3,
|
||||
|
||||
// Max counter error num
|
||||
kHsaCounterErrorCodeMax,
|
||||
} HsaCounterErrorCode;
|
||||
|
||||
// Destructor of Counter
|
||||
virtual ~Counter() {}
|
||||
|
||||
// Retrieve the last error code generated. This should be checked when
|
||||
// values returned are NULL or void.
|
||||
// Return an integer corresponding to the last error reported.
|
||||
virtual int getLastError() = 0;
|
||||
|
||||
// Given and error number reported from getLastError or returned from a
|
||||
// function call, retreive the corresponding stl string.
|
||||
// @param[in] error The error corresponding to a call to getLastError
|
||||
// or a return code from a function call.
|
||||
// Return An stl string representing a text corresponding to the error
|
||||
// number. If invalid error code is given, the returned string is empty.
|
||||
virtual std::string getErrorString(int error) = 0;
|
||||
|
||||
// Get the \ref CounterBlock which owns this counter.
|
||||
// Return
|
||||
// On success, it returns a pointer to the CounterBlock.
|
||||
// On Failure, it returns NULL.
|
||||
virtual CounterBlock* getCounterBlock() = 0;
|
||||
|
||||
// Enable or disable the Counter.
|
||||
// @param[in] b Set to true to enable the CounterBlock.
|
||||
// Return
|
||||
// return true when successfully set the state.
|
||||
// return false otherwise.
|
||||
// In case of the current state already is set to the specified value,
|
||||
// the API returns true.
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
virtual bool setEnable(bool b) = 0;
|
||||
|
||||
// Return the current state of the Counter.
|
||||
// Return true or false
|
||||
virtual bool isEnabled() = 0;
|
||||
|
||||
// Return the status of this Counter whether the result is available.
|
||||
// Return true or false
|
||||
virtual bool isResultReady() = 0;
|
||||
|
||||
// Query Counter result
|
||||
// note Must be implemented by derived classes
|
||||
// @param[out] p_result The pointer containing the returned result.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidAargs
|
||||
// kHsaCounterErrorCodeResultNotReady
|
||||
virtual bool getResult(uint64_t* p_result) = 0;
|
||||
|
||||
// Query value of the parameter specified by param
|
||||
// @param[in] param The enumeration of parameter to be queried
|
||||
// @param[out] Return_size The returned size of data
|
||||
// @param[out] pp_data The pointer to the returned data. The API is
|
||||
// responsible for managing the memory to store the information as
|
||||
// specified by return_size.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidParam
|
||||
// kHSAPerfErrorCodesInvalidParamSize
|
||||
// kHSAPerfErrorCodesInvalidParamData
|
||||
virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
|
||||
|
||||
// Set value for the parameter specified by param
|
||||
// @param[in] param The enumeration of parameter to be queried
|
||||
// @param[out] param_size The size of data
|
||||
// @param[out] p_data The pointer to the data to be set. Users are responsible
|
||||
// for deallocating the memory of p_data after calling the API.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
// kHSAPerfErrorCodesInvalidParam
|
||||
// kHSAPerfErrorCodesInvalidParamSize
|
||||
// kHSAPerfErrorCodesInvalidParamData
|
||||
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
|
||||
}; // class Counter
|
||||
|
||||
class Pmu {
|
||||
public:
|
||||
// Enumeration of Pmu error codes
|
||||
typedef enum HsaPmuErrorCode {
|
||||
// Generic PMU error
|
||||
kHsaPmuErrorCodeNoError = 0x0,
|
||||
|
||||
// Unknown CounterBlock ID
|
||||
kHsaPmuErrorCodeUnknownCounterBlockId,
|
||||
|
||||
// No CounterBlock exists
|
||||
kHsaPmuErrorCodeNoCounterBlock,
|
||||
|
||||
// The previously operation is not valid. This could be due to
|
||||
// invalid transition from the current state.
|
||||
kHsaPmuErrorCodeInvalidOperation,
|
||||
|
||||
// PMU is not currently available (e.g. PMU is currently
|
||||
// in-used by others)
|
||||
kHsaPmuErrorCodeNotAvailable,
|
||||
|
||||
// PMU is not currently available (e.g. PMU is currently
|
||||
// in-used by others)
|
||||
kHsaPmuErrorCodeErrorState,
|
||||
|
||||
// PMU result is timeout
|
||||
kHsaPmuErrorCodeTimeOut,
|
||||
|
||||
// Max error count
|
||||
kHsaPmuErrorCodeMax
|
||||
} HsaPmuErrorCode;
|
||||
|
||||
// Destructor of PMU.
|
||||
// note This stops the performance counters if running and releases
|
||||
// any resources used by the PMU.
|
||||
virtual ~Pmu() {}
|
||||
|
||||
// Retrieve the last error code generated. This should be checked when
|
||||
// values returned are NULL or void.
|
||||
// Return an integer corresponding to the last error reported.
|
||||
virtual int getLastError() = 0;
|
||||
|
||||
// Given and error number reported from getLastError or returned from a
|
||||
// function call, retreive the corresponding stl string.
|
||||
// @param[in] error The error corresponding to a call to getLastError
|
||||
// or a return code from a function call.
|
||||
// Return An stl string representing a text corresponding to the error
|
||||
// number. If invalid error code is given, the returned string is empty.
|
||||
virtual std::string getErrorString(int error) = 0;
|
||||
|
||||
// Get CounterBlock from Id
|
||||
// @param[in] id ID of the target CounterBlock
|
||||
// Return
|
||||
// On success, it returns a pointer to specified CounterBlock.
|
||||
// On Failure, it returns NULL.
|
||||
// Possible error codes are:
|
||||
// kHsaPmuErrorCodeUnknownCounterBlockId.
|
||||
virtual CounterBlock* getCounterBlockById(uint32_t id) = 0;
|
||||
|
||||
// Get all available CounterBlock
|
||||
// @param[out] num_block The returned number of CounterBlocks
|
||||
// Return On success, it returns an array of CounterBlock pointers.
|
||||
// On Failure, it returns NULL.
|
||||
virtual CounterBlock** getAllCounterBlocks(uint32_t& num_block) = 0;
|
||||
|
||||
// Get current PMU profiling state.
|
||||
// Return The PMU profiling state as defined in \ref PMU_PROFILE_STATES
|
||||
virtual rocr_pmu_state_t getCurrentState() = 0;
|
||||
|
||||
// Start profiling on the PMU.
|
||||
// @param[in] reset_counter indicates whether reset counter before
|
||||
// recording. Default is reset counters.
|
||||
// note This function must be implemented by children classes.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHsaPmuErrorCodeInvalidOperation
|
||||
// kHsaPmuErrorCodeNotAvailable
|
||||
virtual bool begin(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter, bool reset = true) = 0;
|
||||
|
||||
// Stop profiling on the PMU.
|
||||
// note This function must be called after \ref begin().
|
||||
// note This function must be implemented by children classes.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHsaPmuErrorCodeInvalidOperation
|
||||
virtual bool end(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) = 0;
|
||||
|
||||
// Initializes the handle of buffer used to collect PMC data
|
||||
// @param pmcBuffer The buffer pointer
|
||||
// @param cmdBufSz Size in terms of bytes
|
||||
virtual bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz) = 0;
|
||||
|
||||
// Query value of the parameter specified by param
|
||||
// @param[in] param The enumeration of parameter to be queried
|
||||
// @param[out] Return_size The returned size of data
|
||||
// @param[out] pp_data The pointer to the returned data. The API is
|
||||
// responsible for managing the memory to store the information as
|
||||
// specified by return_size.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidParam
|
||||
// kHSAPerfErrorCodesInvalidParamSize
|
||||
// kHSAPerfErrorCodesInvalidParamData
|
||||
virtual bool getParameter(uint32_t param, uint32_t& return_size, void** pp_data) = 0;
|
||||
|
||||
// Set value for the parameter specified by param
|
||||
// @param[in] param The enumeration of parameter to be queried
|
||||
// @param[out] param_size The size of data
|
||||
// @param[out] p_data The pointer to the data to be set. Users are responsible
|
||||
// for deallocating the memory of p_data after calling the API.
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesUnmodifiableState
|
||||
// kHSAPerfErrorCodesInvalidParam
|
||||
// kHSAPerfErrorCodesInvalidParamSize
|
||||
// kHSAPerfErrorCodesInvalidParamData
|
||||
virtual bool setParameter(uint32_t param, uint32_t param_size, const void* p_data) = 0;
|
||||
|
||||
// Query value of the information specified by info
|
||||
// @param[in] info The enumeration of information to be queried
|
||||
// @param[out] Return_size The returned size of data
|
||||
// @param[out] pp_data The pointer to the returned data
|
||||
// Return true or false
|
||||
// Possible error codes are:
|
||||
// kHSAPerfErrorCodesInvalidInfo
|
||||
// kHSAPerfErrorCodesInvalidInfoSize
|
||||
// kHSAPerfErrorCodesInvalidInfoData
|
||||
virtual bool getInfo(uint32_t info, uint32_t& return_size, void** pp_data) = 0;
|
||||
|
||||
// Returns number of shader engines per block
|
||||
// for the blocks featured shader engines instancing
|
||||
virtual uint32_t getNumSe() = 0;
|
||||
|
||||
}; // class Pmu
|
||||
} // pm4_profile
|
||||
#endif // _HSA_PERF_H_
|
||||
@@ -0,0 +1,74 @@
|
||||
#include "info_set.h"
|
||||
#include "var_data.h"
|
||||
using namespace std;
|
||||
|
||||
namespace pm4_profile {
|
||||
// Creates an info set with no lookup buffer allocated.
InfoSet::InfoSet() : p_data_(NULL) {
  // Same clean-up sequence as the destructor; the table is presumably still
  // empty at this point, so there is nothing to release.
  releaseParameters();
  info_table_.clear();
}
|
||||
|
||||
// Releases the buffer handed out by the last getInfo() call and every entry
// of the info table.
InfoSet::~InfoSet() {
  // free() accepts NULL, so no guard is needed when getInfo() never ran.
  free(p_data_);
  p_data_ = NULL;

  releaseParameters();
  info_table_.clear();
}
|
||||
|
||||
bool InfoSet::setInfo(uint32_t info, uint32_t info_size, void* p_data) {
|
||||
if (info_table_.end() != info_table_.find(info)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VarData data;
|
||||
if (!data.set(info_size, p_data)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
info_table_.insert(VarDataMap::value_type(info, data));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool InfoSet::getInfo(uint32_t info, uint32_t& ret_size, void** pp_data) {
|
||||
if (!pp_data || (0 == info_table_.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VarDataMap::iterator it = info_table_.find(info);
|
||||
if (it == info_table_.end()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int size = it->second.getSize();
|
||||
if (size == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
free(p_data_);
|
||||
p_data_ = NULL;
|
||||
|
||||
p_data_ = malloc(size);
|
||||
if (!p_data_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*pp_data = p_data_;
|
||||
|
||||
ret_size = info_table_[info].get(size, *pp_data);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void InfoSet::releaseParameters() {
|
||||
VarDataMap::iterator it = info_table_.begin();
|
||||
VarDataMap::iterator table_end = info_table_.end();
|
||||
|
||||
for (; it != table_end; it++) {
|
||||
it->second.clear();
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,48 @@
|
||||
#ifndef _INFO_SET_H_
|
||||
#define _INFO_SET_H_
|
||||
|
||||
// This file contains the declaration of the InfoSet class.
|
||||
#include "hsa_perf.h"
|
||||
#include "var_data.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace pm4_profile {
|
||||
// An abstract class defining a container to hold a information data set
|
||||
// (e.g. PMU info, CounterGroup info, etc.). Unlike \ref IParameterSet,
|
||||
// This class allows only the children of the class to set the information.
|
||||
class InfoSet {
|
||||
public:
|
||||
// IInfoSet constructor
|
||||
InfoSet();
|
||||
|
||||
// IInfoSet destructor
|
||||
virtual ~InfoSet();
|
||||
|
||||
// Query value of the information specified by info
|
||||
// @param[in] info The enumeration of information to be queried
|
||||
// @param[out] ret_size The returned size of data
|
||||
// @param[out] pp_data The pointer to the returned data
|
||||
// /return true or false
|
||||
bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
// Set value for the information specified by info
|
||||
// @param[in] info The enumeration of information to be queried
|
||||
// @param[out] info_size The size of data
|
||||
// @param[out] p_data The pointer to the data to be set
|
||||
// /return true or false
|
||||
bool setInfo(uint32_t info, uint32_t info_size, void* p_data);
|
||||
|
||||
private:
|
||||
// Remove all data in the parameter table
|
||||
void releaseParameters();
|
||||
|
||||
// InfoSet property: The info table
|
||||
VarDataMap info_table_;
|
||||
|
||||
// Pointer to the buffer used in getInfo
|
||||
void* p_data_;
|
||||
};
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,74 @@
|
||||
#include "parameter_set.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace pm4_profile {
|
||||
// Creates a parameter set with no lookup buffer allocated.
ParameterSet::ParameterSet() : p_data_(NULL) {
  // Same clean-up sequence as the destructor; the table is presumably still
  // empty at this point, so there is nothing to release.
  releaseParameters();
  param_table_.clear();
}
|
||||
|
||||
// Releases the buffer handed out by the last getParameter() call and every
// entry of the parameter table.
ParameterSet::~ParameterSet() {
  // free() accepts NULL, so no guard is needed when getParameter() never ran.
  free(p_data_);
  p_data_ = NULL;

  releaseParameters();
  param_table_.clear();
}
|
||||
|
||||
bool ParameterSet::setParameter(uint32_t param, uint32_t param_size, const void* p_data) {
|
||||
if (param_table_.end() != param_table_.find(param)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VarData data;
|
||||
if (!data.set(param_size, p_data)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
param_table_.insert(VarDataMap::value_type(param, data));
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ParameterSet::getParameter(uint32_t param, uint32_t& ret_size, void** pp_data) {
|
||||
if (!pp_data || (0 == param_table_.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VarDataMap::iterator it = param_table_.find(param);
|
||||
if (it == param_table_.end()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int size = it->second.getSize();
|
||||
if (size == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// for NULL pointer, free does nothing
|
||||
free(p_data_);
|
||||
p_data_ = malloc(size);
|
||||
if (!p_data_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// store the pointer to be freed
|
||||
*pp_data = p_data_;
|
||||
|
||||
ret_size = param_table_[param].get(size, *pp_data);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ParameterSet::releaseParameters() {
|
||||
VarDataMap::iterator it = param_table_.begin();
|
||||
VarDataMap::iterator table_end = param_table_.end();
|
||||
|
||||
for (; it != table_end; it++) {
|
||||
it->second.clear();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,75 @@
|
||||
#ifndef _PARAMETER_SET_H_
|
||||
#define _PARAMETER_SET_H_
|
||||
|
||||
/*!
|
||||
\note This file contains the declaration of the ParameterSet class.
|
||||
*/
|
||||
#include "hsa_perf.h"
|
||||
#include "var_data.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace pm4_profile {
|
||||
/*!
|
||||
A class defining a container to hold parameter data set
|
||||
(e.g. PMU parameter, CounterGroup parameter, etc.).
|
||||
*/
|
||||
class ParameterSet {
|
||||
public:
|
||||
/*!
|
||||
Enumeration containing types of parameters
|
||||
*/
|
||||
enum parameter {
|
||||
PARAM_MAX,
|
||||
};
|
||||
|
||||
/*! IParameterSet constructor */
|
||||
ParameterSet();
|
||||
|
||||
/*! IParameterSet destructor */
|
||||
virtual ~ParameterSet();
|
||||
|
||||
/*!
|
||||
Query value of the parameter specified by param
|
||||
@param[in] param The enumeration of parameter to be queried
|
||||
@param[out] ret_size The returned size of data
|
||||
@param[out] pp_data The pointer to the returned data
|
||||
/return true or false
|
||||
*/
|
||||
bool getParameter(
|
||||
/*in*/ uint32_t param,
|
||||
/*out*/ uint32_t& ret_size,
|
||||
/*out*/ void** pp_data);
|
||||
|
||||
/*!
|
||||
Set value for the parameter specified by param
|
||||
@param[in] param The enumeration of parameter to be queried
|
||||
@param[out] param_size The size of data
|
||||
@param[out] p_data The pointer to the data to be set
|
||||
/return true or false
|
||||
*/
|
||||
bool setParameter(
|
||||
/*in*/ uint32_t param,
|
||||
/*in*/ uint32_t param_size,
|
||||
/*in*/ const void* p_data);
|
||||
|
||||
private:
|
||||
/*!
|
||||
Remove all data in the parameter table
|
||||
*/
|
||||
bool releaseParameters();
|
||||
|
||||
/*!
|
||||
IParameterSet property: The parameter table
|
||||
*/
|
||||
VarDataMap param_table_;
|
||||
|
||||
/*!
|
||||
Pointer to the buffer used in getParameter
|
||||
*/
|
||||
void* p_data_;
|
||||
};
|
||||
}
|
||||
|
||||
#endif // _PARAMETER_SET_H_
|
||||
@@ -0,0 +1,254 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _ROCR_PROFILER_H_
|
||||
#define _ROCR_PROFILER_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
#if defined _WIN32 || defined __CYGWIN__
|
||||
#ifdef __GNUC__
|
||||
#define HSA_TOOLS_API __attribute__((dllexport))
|
||||
#else
|
||||
#define HSA_TOOLS_API __declspec(dllexport) // Note: actually gcc seems
|
||||
// to also support this
|
||||
// syntax.
|
||||
#endif
|
||||
#ifndef DLL_LOCAL
|
||||
#define DLL_LOCAL
|
||||
#endif
|
||||
|
||||
#else // defined _WIN32 || defined __CYGWIN__
|
||||
#if __GNUC__ >= 4
|
||||
#define HSA_TOOLS_API __attribute__((visibility("default")))
|
||||
#ifndef DLL_LOCAL
|
||||
#define DLL_LOCAL __attribute__((visibility("hidden")))
|
||||
#endif
|
||||
#else
|
||||
#define HSA_TOOLS_API
|
||||
#ifndef DLL_LOCAL
|
||||
#define DLL_LOCAL
|
||||
#endif
|
||||
#endif
|
||||
#endif // defined _WIN32 || defined __CYGWIN__
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// @brief Enumeration of various information that is set for a counter. //
|
||||
// @details This enumeration defines the various counter info that could be //
|
||||
// used in a counter. This is used by a counter object to specify //
|
||||
// its type and other conditions that are needed to retrieve a //
|
||||
// counter value. //
|
||||
//---------------------------------------------------------------------------//
|
||||
typedef enum hsa_ext_tools_counter_parameter_s {
  HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX = 0,  // Counter's event index
  HSA_EXT_TOOLS_COUNTER_PARAMETER_SIMD_MASK = 1,    // Counter's SIMD mask
  HSA_EXT_TOOLS_COUNTER_PARAMETER_SHADER_MASK = 2,  // Counter's shader engine mask
  HSA_EXT_TOOLS_COUNTER_PARAMETER_INFO_MAX          // Upper bound of the counter info indices
} hsa_ext_tools_counter_parameter_t;
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// @brief Enumeration of counter block type mask //
|
||||
// @details This enumeration defines the bit mask representing types of //
|
||||
// counter group supported by HSA. This is used by counter block object to //
|
||||
// specify its type. //
|
||||
//---------------------------------------------------------------------------//
|
||||
typedef enum hsa_ext_tools_counter_block_type_s {
  // Counter block whose type is not known.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_UNKNOWN = 0,

  // Counter block that can be accessed at any time
  // (for example software counters and CPU counters).
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_SYNC = 1,

  // Counter block that can be accessed asynchronously; the counter
  // is required to be stopped before it is accessed.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_ASYNC = 2,

  // Counter block that is used for generating a trace.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_TRACE = 3,

  // Upper bound of the counter block types.
  HSA_EXT_TOOLS_COUNTER_BLOCK_TYPE_MAX
} hsa_ext_tools_counter_block_type_t;
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// @brief Enumeration of various information that is set for a counter block.//
|
||||
// @details This enumeration defines the various info that could be used //
|
||||
// in a counter block. This is used by a counter object to specify its type //
|
||||
// and other conditions that are needed for a counter block. //
|
||||
//---------------------------------------------------------------------------//
|
||||
/*
|
||||
typedef enum hsa_ext_tools_counter_block_info_s {
|
||||
// Index of a counter block
|
||||
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_EVENT_INDEX = 0,
|
||||
|
||||
// Shader bits of a counter block
|
||||
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_SHADER_BITS = 1,
|
||||
|
||||
// Simd mask of a counter
|
||||
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_CONTROL_METHOD = 2,
|
||||
|
||||
// Max index of counter block info
|
||||
HSA_EXT_TOOLS_COUNTER_BLOCK_INFO_MAX
|
||||
} hsa_ext_tools_counter_block_info_t;
|
||||
*/
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// Enumeration for the methods used to index into the correct registers. //
|
||||
//---------------------------------------------------------------------------//
|
||||
/*
|
||||
typedef enum hsa_ext_tools_counter_index_method_s {
|
||||
// No index
|
||||
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_NONE = 0,
|
||||
|
||||
// Index by block instance
|
||||
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_INSTANCE = 1,
|
||||
|
||||
// Index by shader engine
|
||||
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE = 2,
|
||||
|
||||
// Index by shader and instance
|
||||
HSA_EXT_TOOLS_COUNTER_INDEX_METHOD_BY_SHADER_ENGINE_ANDINSTANCE = 3
|
||||
} hsa_ext_tools_counter_index_method_t;
|
||||
*/
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// Enumeration for the HSAPerf generic error codes //
|
||||
//---------------------------------------------------------------------------//
|
||||
/*
|
||||
typedef enum hsa_ext_tools_error_codes_s {
|
||||
// Successful
|
||||
HSA_EXT_TOOLS_ERROR_CODE_OK = 0,
|
||||
|
||||
// Generic error code
|
||||
HSA_EXT_TOOLS_ERROR_CODE_ERROR,
|
||||
|
||||
// Generic invalid HSAPerf API arguments
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_ARGS,
|
||||
|
||||
// The operation is not permit due to currently in the unmodifiable
|
||||
// HSAPerf state .
|
||||
HSA_EXT_TOOLS_ERROR_CODE_UNMODIFIABLE_STATE,
|
||||
|
||||
// The hsa_ext_tools_set_pmu_parameter() or
|
||||
// hsa_ext_tools_get_pmu_parameter() API contains invalid parameter value.
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM,
|
||||
|
||||
// The hsa_ext_tools_set_pmu_parameter() or
|
||||
// hsa_ext_tools_get_pmu_parameter() API contains invalid parameter size
|
||||
// or return size.
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_SIZE,
|
||||
|
||||
// The hsa_ext_tools_set_pmu_parameter() or
|
||||
// hsa_ext_tools_get_pmu_parameter() API contains invalid
|
||||
// pointer (e.g. NULL).
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_PARAM_DATA,
|
||||
|
||||
// The hsa_ext_tools_get_pmu_info() API contains invalid info value.
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO,
|
||||
|
||||
// The hsa_ext_tools_get_pmu_info() API contains invalid info
|
||||
// size (e.g. zero).
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_SIZE,
|
||||
|
||||
// The hsa_ext_tools_get_pmu_info() API contains invalid
|
||||
// data (e.g. NULL).
|
||||
HSA_EXT_TOOLS_ERROR_CODE_INVALID_INFO_DATA
|
||||
} hsa_ext_tools_error_codes_t;
|
||||
*/
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// Enumeration for Pmu profiling state //
|
||||
//---------------------------------------------------------------------------//
|
||||
typedef enum rocr_pmu_state_s {
  // Idle: the PMU, counter blocks and counters may be modified.
  // Represents the moment before begin is called, or after
  // hsa_ext_tools_pmu_wait_for_completion() has been called.
  ROCR_PMU_STATE_IDLE,

  // Started: the PMU, counter blocks and counters may not be modified
  // while the PMU is collecting performance counter data. Represents the
  // moment between the calls to hsa_ext_tools_pmu_begin() and
  // hsa_ext_tools_pmu_end().
  ROCR_PMU_STATE_START,

  // Stopped: the PMU, counter blocks and counters may not be modified.
  // Data collection has stopped, but the result might not be available
  // yet. Represents the moment after hsa_ext_tools_pmu_end() was called
  // and before hsa_ext_tools_pmu_wait_for_completion() has returned
  // success.
  ROCR_PMU_STATE_STOP
} rocr_pmu_state_t;
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// Opaque pointer to HSA performance monitor unit (PMU) //
|
||||
//---------------------------------------------------------------------------//
|
||||
// typedef void * hsa_ext_tools_pmu_t;
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// Opaque pointer to HSA counter block //
|
||||
//---------------------------------------------------------------------------//
|
||||
// typedef void * hsa_ext_tools_counter_block_t;
|
||||
|
||||
//---------------------------------------------------------------------------//
|
||||
// Opaque pointer to HSA counter //
|
||||
//---------------------------------------------------------------------------//
|
||||
// typedef void * hsa_ext_tools_counter_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
#endif // _ROCR_PROFILER_H_
|
||||
@@ -0,0 +1,48 @@
|
||||
#include <string.h>
|
||||
#include "var_data.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
// Creates an empty object: recorded size is zero and no buffer is attached.
VarData::VarData() : size_(0), p_data_(NULL) {}
|
||||
|
||||
// Empty destructor: it does not free p_data_. Among the VarData methods
// defined here, only clear() releases the buffer.
// NOTE(review): presumably owners must call clear() before an instance is
// destroyed to avoid leaking the buffer -- confirm against callers.
VarData::~VarData() {}
|
||||
|
||||
void VarData::clear() {
|
||||
size_ = 0;
|
||||
if (p_data_) {
|
||||
free(p_data_);
|
||||
p_data_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Stores a private copy of `size` bytes read from `p_data`, replacing any
// previously stored value.
//
// @param[in] size    Number of bytes to copy; must be non-zero.
// @param[in] p_data  Source buffer; must be non-NULL.
// \return true when the copy was stored; false on invalid arguments or when
//         the allocation fails. On every failure path the previously stored
//         value is left untouched.
bool VarData::set(uint32_t size, const void* p_data) {
  if (!p_data || (size == 0)) {
    return false;
  }

  // Allocate and fill the replacement buffer before releasing the current
  // one, so that an allocation failure does not discard the old value and a
  // source pointer that aliases the current buffer is still valid when read.
  void* p_new = malloc(size);
  if (NULL == p_new) {
    return false;
  }
  memcpy(p_new, p_data, size);

  clear();
  p_data_ = p_new;
  size_ = size;

  return true;
}
|
||||
|
||||
uint32_t VarData::get(uint32_t size, void* p_data) {
|
||||
if (!p_data || !size || !p_data_ || !size_) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t ret_size = size < size_ ? size : size_;
|
||||
|
||||
memcpy(p_data, p_data_, ret_size);
|
||||
|
||||
return ret_size;
|
||||
}
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,65 @@
|
||||
#ifndef _VAR_DATA_H_
|
||||
#define _VAR_DATA_H_
|
||||
|
||||
/*!
|
||||
\note This file contains the declaration of the VarData class.
|
||||
*/
|
||||
|
||||
#include "hsa_perf.h"
|
||||
|
||||
#include <map>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace pm4_profile {
|
||||
/*!
|
||||
This class implements variable-size storage for information and
|
||||
parameter
|
||||
sets.
|
||||
*/
|
||||
class VarData {
 public:
  /*! Constructor for VarData */
  VarData();

  /*! Destructor for VarData */
  ~VarData();

  /*! Deallocate the memory and clean up */
  void clear();

  /*!
     Set the data to be stored.
     @param[in] size Size of data to be stored.
     @param[in] p_data Pointer to data to be stored.
     \return true or false
  */
  bool set(uint32_t size, const void* p_data);

  /*!
     Query the data that was stored.
     @param[in] size Size (in bytes) of the memory pointed to by p_data.
     This determines maximum size of the returned data.
     @param[in,out] p_data Pointer to the result buffer.
     \return Size (in bytes) of the returned result which is copied into
     the buffer pointed to by p_data.
  */
  uint32_t get(uint32_t size, void* p_data);

  /*!
     Get size of the current data stored.
     Const-qualified so it can also be called through a const VarData.
     \return Size (in bytes) of the data stored.
  */
  uint32_t getSize() const { return size_; }

 private:
  /*! Size of data being stored */
  uint32_t size_;

  /*! Pointer to the stored data */
  void* p_data_;
};
|
||||
|
||||
typedef std::map<uint32_t, VarData> VarDataMap;
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,622 @@
|
||||
#include "vi_blockinfo.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
/**
|
||||
* Table containing CounterGroups which represent VI hardware blocks
|
||||
* as defined by \ref GpuBlockInfo structure
|
||||
*/
|
||||
GpuBlockInfo ViPmuHwBlocks[] = {
|
||||
// Counter block CB
|
||||
{"VI_CB0", kHsaViCounterBlockIdCb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_CB1", kHsaViCounterBlockIdCb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_CB2", kHsaViCounterBlockIdCb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_CB3", kHsaViCounterBlockIdCb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_CB,
|
||||
CntlMethodBySeAndInstance, 395, VI_COUNTER_NUM_PER_CB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block CPF
|
||||
{"VI_CPF", kHsaViCounterBlockIdCpf, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
|
||||
VI_COUNTER_NUM_PER_CPF, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block DB
|
||||
{"VI_DB0", kHsaViCounterBlockIdDb0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_DB1", kHsaViCounterBlockIdDb1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_DB2", kHsaViCounterBlockIdDb2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_DB3", kHsaViCounterBlockIdDb3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_DB,
|
||||
CntlMethodBySeAndInstance, 256, VI_COUNTER_NUM_PER_DB, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block GRBM
|
||||
{"VI_GRBM", kHsaViCounterBlockIdGrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 33,
|
||||
VI_COUNTER_NUM_PER_GRBM, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block GRBMSE
|
||||
{"VI_GRBMSE", kHsaViCounterBlockIdGrbmSe, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 14,
|
||||
VI_COUNTER_NUM_PER_GRBMSE, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block PA_SU
|
||||
{"VI_PA_SU", kHsaViCounterBlockIdPaSu, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 152,
|
||||
VI_COUNTER_NUM_PER_PA_SU, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block PA_SC
|
||||
{"VI_PA_SC", kHsaViCounterBlockIdPaSc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 396,
|
||||
VI_COUNTER_NUM_PER_PA_SC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SPI
|
||||
{"VI_SPI", kHsaViCounterBlockIdSpi, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 196,
|
||||
VI_COUNTER_NUM_PER_SPI, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SQ
|
||||
{"VI_SQ", kHsaViCounterBlockIdSq, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_ES", kHsaViCounterBlockIdSqEs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_GS", kHsaViCounterBlockIdSqGs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_VS", kHsaViCounterBlockIdSqVs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_PS", kHsaViCounterBlockIdSqPs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_LS", kHsaViCounterBlockIdSqLs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_HS", kHsaViCounterBlockIdSqHs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_SQ_CS", kHsaViCounterBlockIdSqCs, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 298,
|
||||
VI_COUNTER_NUM_PER_SQ, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SX
|
||||
{"VI_SX", kHsaViCounterBlockIdSx, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 33,
|
||||
VI_COUNTER_NUM_PER_SX, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TA
|
||||
{"VI_TA0", kHsaViCounterBlockIdTa0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA1", kHsaViCounterBlockIdTa1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA2", kHsaViCounterBlockIdTa2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA3", kHsaViCounterBlockIdTa3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA4", kHsaViCounterBlockIdTa4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA5", kHsaViCounterBlockIdTa5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA6", kHsaViCounterBlockIdTa6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA7", kHsaViCounterBlockIdTa7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA8", kHsaViCounterBlockIdTa8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA9", kHsaViCounterBlockIdTa9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA10", kHsaViCounterBlockIdTa10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA11", kHsaViCounterBlockIdTa11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA12", kHsaViCounterBlockIdTa12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA13", kHsaViCounterBlockIdTa13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA14", kHsaViCounterBlockIdTa14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TA15", kHsaViCounterBlockIdTa15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TA,
|
||||
CntlMethodBySeAndInstance, 118, VI_COUNTER_NUM_PER_TA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TCA
|
||||
{"VI_TCA0", kHsaViCounterBlockIdTca0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA,
|
||||
CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCA1", kHsaViCounterBlockIdTca1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCA,
|
||||
CntlMethodByInstance, 34, VI_COUNTER_NUM_PER_TCA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TCC
|
||||
{"VI_TCC0", kHsaViCounterBlockIdTcc0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC1", kHsaViCounterBlockIdTcc1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC2", kHsaViCounterBlockIdTcc2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC3", kHsaViCounterBlockIdTcc3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC4", kHsaViCounterBlockIdTcc4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC5", kHsaViCounterBlockIdTcc5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC6", kHsaViCounterBlockIdTcc6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC7", kHsaViCounterBlockIdTcc7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC8", kHsaViCounterBlockIdTcc8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC9", kHsaViCounterBlockIdTcc9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC10", kHsaViCounterBlockIdTcc10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC11", kHsaViCounterBlockIdTcc11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC12", kHsaViCounterBlockIdTcc12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC13", kHsaViCounterBlockIdTcc13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC14", kHsaViCounterBlockIdTcc14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCC15", kHsaViCounterBlockIdTcc15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCC,
|
||||
CntlMethodByInstance, 191, VI_COUNTER_NUM_PER_TCC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TD
|
||||
{"VI_TD0", kHsaViCounterBlockIdTd0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD1", kHsaViCounterBlockIdTd1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD2", kHsaViCounterBlockIdTd2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD3", kHsaViCounterBlockIdTd3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD4", kHsaViCounterBlockIdTd4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD5", kHsaViCounterBlockIdTd5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD6", kHsaViCounterBlockIdTd6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD7", kHsaViCounterBlockIdTd7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD8", kHsaViCounterBlockIdTd8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD9", kHsaViCounterBlockIdTd9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD10", kHsaViCounterBlockIdTd10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD11", kHsaViCounterBlockIdTd11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD12", kHsaViCounterBlockIdTd12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD13", kHsaViCounterBlockIdTd13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD14", kHsaViCounterBlockIdTd14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TD15", kHsaViCounterBlockIdTd15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TD,
|
||||
CntlMethodBySeAndInstance, 54, VI_COUNTER_NUM_PER_TD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block TCP
|
||||
{"VI_TCP0", kHsaViCounterBlockIdTcp0, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP1", kHsaViCounterBlockIdTcp1, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP2", kHsaViCounterBlockIdTcp2, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP3", kHsaViCounterBlockIdTcp3, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP4", kHsaViCounterBlockIdTcp4, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP5", kHsaViCounterBlockIdTcp5, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP6", kHsaViCounterBlockIdTcp6, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP7", kHsaViCounterBlockIdTcp7, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP8", kHsaViCounterBlockIdTcp8, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP9", kHsaViCounterBlockIdTcp9, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP10", kHsaViCounterBlockIdTcp10, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP11", kHsaViCounterBlockIdTcp11, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP12", kHsaViCounterBlockIdTcp12, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP13", kHsaViCounterBlockIdTcp13, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP14", kHsaViCounterBlockIdTcp14, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
{"VI_TCP15", kHsaViCounterBlockIdTcp15, VI_MAX_NUM_SHADER_ENGINES, 2, VI_NUM_TCP,
|
||||
CntlMethodBySeAndInstance, 182, VI_COUNTER_NUM_PER_TCP, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block GDS
|
||||
{"VI_GDS", kHsaViCounterBlockIdGds, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 120,
|
||||
VI_COUNTER_NUM_PER_GDS, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block VGT
|
||||
{"VI_VGT", kHsaViCounterBlockIdVgt, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 145,
|
||||
VI_COUNTER_NUM_PER_VGT, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block IA
|
||||
{"VI_IA", kHsaViCounterBlockIdIa, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodBySe, 23,
|
||||
VI_COUNTER_NUM_PER_IA, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block MC
|
||||
{"VI_MC", kHsaViCounterBlockIdMc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 22,
|
||||
VI_COUNTER_NUM_PER_MC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block SRBM
|
||||
{"VI_SRBM", kHsaViCounterBlockIdSrbm, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 19,
|
||||
VI_COUNTER_NUM_PER_SRBM, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block WD
|
||||
{"VI_WD", kHsaViCounterBlockIdWd, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 36,
|
||||
VI_COUNTER_NUM_PER_WD, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block CPG
|
||||
{"VI_CPG", kHsaViCounterBlockIdCpg, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 48,
|
||||
VI_COUNTER_NUM_PER_CPG, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block CPC
|
||||
{"VI_CPC", kHsaViCounterBlockIdCpc, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 24,
|
||||
VI_COUNTER_NUM_PER_CPC, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block IOMMUV2
|
||||
{"VI_IOMMUV2", kHsaViCounterBlockIdIommuV2, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 25,
|
||||
8, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Counter block KernelDriver
|
||||
{"VI_KD", kHsaViCounterBlockIdKernelDriver, VI_MAX_NUM_SHADER_ENGINES, 2, 1, CntlMethodNone, 0,
|
||||
0, 0, 0, true, 0, 0, false, 0, 0},
|
||||
|
||||
// Name of the last line should be empty to indicate end of all counter groups
|
||||
{"", kHsaViCounterBlockIdBlocksLast, 0, 0, 0, CntlMethodNone, 0, 0, 0, 0, false, 0, 0, false, 0,
|
||||
0}};
|
||||
|
||||
/*
|
||||
* The following tables contain register addresses of the SQ counter registers
|
||||
*/
|
||||
|
||||
/*
|
||||
* SQ
|
||||
*/
|
||||
GpuCounterRegInfo ViSqCounterRegAddr[] = {
|
||||
{mmSQ_PERFCOUNTER0_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER0_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER1_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER1_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER2_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER2_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER3_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER3_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER3_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER4_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER4_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER4_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER5_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER5_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER5_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER6_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER6_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER6_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER7_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER7_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER7_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER8_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER8_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER8_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER9_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI, mmSQ_PERFCOUNTER9_LO__CI__VI,
|
||||
mmSQ_PERFCOUNTER9_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER10_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
|
||||
mmSQ_PERFCOUNTER10_LO__CI__VI, mmSQ_PERFCOUNTER10_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER11_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
|
||||
mmSQ_PERFCOUNTER11_LO__CI__VI, mmSQ_PERFCOUNTER11_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER12_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
|
||||
mmSQ_PERFCOUNTER12_LO__CI__VI, mmSQ_PERFCOUNTER12_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER13_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
|
||||
mmSQ_PERFCOUNTER13_LO__CI__VI, mmSQ_PERFCOUNTER13_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER14_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
|
||||
mmSQ_PERFCOUNTER14_LO__CI__VI, mmSQ_PERFCOUNTER14_HI__CI__VI},
|
||||
{mmSQ_PERFCOUNTER15_SELECT__CI__VI, mmSQ_PERFCOUNTER_CTRL__CI__VI,
|
||||
mmSQ_PERFCOUNTER15_LO__CI__VI, mmSQ_PERFCOUNTER15_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* DRMDMA
|
||||
*/
|
||||
GpuCounterRegInfo ViDrmdmaCounterRegAddr[] = {
|
||||
{mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER0_RESULT__VI, 0},
|
||||
{mmSDMA0_PERFMON_CNTL__VI, 0, mmSDMA0_PERFCOUNTER1_RESULT__VI, 0},
|
||||
{mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER0_RESULT__VI, 0},
|
||||
{mmSDMA1_PERFMON_CNTL__VI, 0, mmSDMA1_PERFCOUNTER1_RESULT__VI, 0},
|
||||
};
|
||||
|
||||
/*
|
||||
* IH
|
||||
*/
|
||||
GpuCounterRegInfo ViIhCounterRegAddr[] = {
|
||||
{mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER0_RESULT__VI, 0},
|
||||
{mmIH_PERFMON_CNTL__VI, 0, mmIH_PERFCOUNTER1_RESULT__VI, 0}};
|
||||
|
||||
/*
|
||||
* CPF
|
||||
*/
|
||||
GpuCounterRegInfo ViCpfCounterRegAddr[] = {
|
||||
{mmCPF_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER0_LO__CI__VI,
|
||||
mmCPF_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmCPF_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPF_PERFCOUNTER1_LO__CI__VI,
|
||||
mmCPF_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* DRM
|
||||
*/
|
||||
GpuCounterRegInfo ViDrmCounterRegAddr[] = {
|
||||
{mmDRM_PERFCOUNTER1_SELECT, 0, mmDRM_PERFCOUNTER1_LO, mmDRM_PERFCOUNTER1_HI},
|
||||
{mmDRM_PERFCOUNTER2_SELECT, 0, mmDRM_PERFCOUNTER2_LO, mmDRM_PERFCOUNTER2_HI}};
|
||||
|
||||
/*
|
||||
* GRBM
|
||||
*/
|
||||
GpuCounterRegInfo ViGrbmCounterRegAddr[] = {
|
||||
{mmGRBM_PERFCOUNTER0_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER0_LO__CI__VI,
|
||||
mmGRBM_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmGRBM_PERFCOUNTER1_SELECT__CI__VI, 0, mmGRBM_PERFCOUNTER1_LO__CI__VI,
|
||||
mmGRBM_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* GRBM_SE
|
||||
*/
|
||||
GpuCounterRegInfo ViGrbmSeCounterRegAddr[] = {
|
||||
{mmGRBM_SE0_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE0_PERFCOUNTER_LO__CI__VI,
|
||||
mmGRBM_SE0_PERFCOUNTER_HI__CI__VI},
|
||||
{mmGRBM_SE1_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE1_PERFCOUNTER_LO__CI__VI,
|
||||
mmGRBM_SE1_PERFCOUNTER_HI__CI__VI},
|
||||
{mmGRBM_SE2_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE2_PERFCOUNTER_LO__CI__VI,
|
||||
mmGRBM_SE2_PERFCOUNTER_HI__CI__VI},
|
||||
{mmGRBM_SE3_PERFCOUNTER_SELECT__CI__VI, 0, mmGRBM_SE3_PERFCOUNTER_LO__CI__VI,
|
||||
mmGRBM_SE3_PERFCOUNTER_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* PA_SU
|
||||
*/
|
||||
GpuCounterRegInfo ViPaSuCounterRegAddr[] = {
|
||||
{mmPA_SU_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER0_LO__CI__VI,
|
||||
mmPA_SU_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmPA_SU_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER1_LO__CI__VI,
|
||||
mmPA_SU_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmPA_SU_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER2_LO__CI__VI,
|
||||
mmPA_SU_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmPA_SU_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SU_PERFCOUNTER3_LO__CI__VI,
|
||||
mmPA_SU_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* PA_SC
|
||||
*/
|
||||
GpuCounterRegInfo ViPaScCounterRegAddr[] = {
|
||||
{mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* SPI
|
||||
*/
|
||||
GpuCounterRegInfo ViSpiCounterRegAddr[] = {
|
||||
{mmSPI_PERFCOUNTER0_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER0_LO__CI__VI,
|
||||
mmSPI_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmSPI_PERFCOUNTER1_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER1_LO__CI__VI,
|
||||
mmSPI_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmSPI_PERFCOUNTER2_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER2_LO__CI__VI,
|
||||
mmSPI_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmSPI_PERFCOUNTER3_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER3_LO__CI__VI,
|
||||
mmSPI_PERFCOUNTER3_HI__CI__VI},
|
||||
{mmSPI_PERFCOUNTER4_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER4_LO__CI__VI,
|
||||
mmSPI_PERFCOUNTER4_HI__CI__VI},
|
||||
{mmSPI_PERFCOUNTER5_SELECT__CI__VI, 0, mmSPI_PERFCOUNTER5_LO__CI__VI,
|
||||
mmSPI_PERFCOUNTER5_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* TCA
|
||||
*/
|
||||
GpuCounterRegInfo ViTcaCounterRegAddr[] = {
|
||||
{mmTCA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER0_LO__CI__VI,
|
||||
mmTCA_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmTCA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER1_LO__CI__VI,
|
||||
mmTCA_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmTCA_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER2_LO__CI__VI,
|
||||
mmTCA_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmTCA_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCA_PERFCOUNTER3_LO__CI__VI,
|
||||
mmTCA_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* TCC
|
||||
*/
|
||||
GpuCounterRegInfo ViTccCounterRegAddr[] = {
|
||||
{mmTCC_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER0_LO__CI__VI,
|
||||
mmTCC_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmTCC_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER1_LO__CI__VI,
|
||||
mmTCC_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmTCC_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER2_LO__CI__VI,
|
||||
mmTCC_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmTCC_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCC_PERFCOUNTER3_LO__CI__VI,
|
||||
mmTCC_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* TCP
|
||||
*/
|
||||
GpuCounterRegInfo ViTcpCounterRegAddr[] = {
|
||||
{mmTCP_PERFCOUNTER0_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER0_LO__CI__VI,
|
||||
mmTCP_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmTCP_PERFCOUNTER1_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER1_LO__CI__VI,
|
||||
mmTCP_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmTCP_PERFCOUNTER2_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER2_LO__CI__VI,
|
||||
mmTCP_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmTCP_PERFCOUNTER3_SELECT__CI__VI, 0, mmTCP_PERFCOUNTER3_LO__CI__VI,
|
||||
mmTCP_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* CB
|
||||
*/
|
||||
GpuCounterRegInfo ViCbCounterRegAddr[] = {
|
||||
{mmCB_PERFCOUNTER0_SELECT__CI__VI, 0, mmCB_PERFCOUNTER0_LO__CI__VI,
|
||||
mmCB_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmCB_PERFCOUNTER1_SELECT__CI__VI, 0, mmCB_PERFCOUNTER1_LO__CI__VI,
|
||||
mmCB_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmCB_PERFCOUNTER2_SELECT__CI__VI, 0, mmCB_PERFCOUNTER2_LO__CI__VI,
|
||||
mmCB_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmCB_PERFCOUNTER3_SELECT__CI__VI, 0, mmCB_PERFCOUNTER3_LO__CI__VI,
|
||||
mmCB_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* DB
|
||||
*/
|
||||
GpuCounterRegInfo ViDbCounterRegAddr[] = {
|
||||
{mmDB_PERFCOUNTER0_SELECT__CI__VI, 0, mmDB_PERFCOUNTER0_LO__CI__VI,
|
||||
mmDB_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmDB_PERFCOUNTER1_SELECT__CI__VI, 0, mmDB_PERFCOUNTER1_LO__CI__VI,
|
||||
mmDB_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmDB_PERFCOUNTER2_SELECT__CI__VI, 0, mmDB_PERFCOUNTER2_LO__CI__VI,
|
||||
mmDB_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmDB_PERFCOUNTER3_SELECT__CI__VI, 0, mmDB_PERFCOUNTER3_LO__CI__VI,
|
||||
mmDB_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* RLC
|
||||
*/
|
||||
GpuCounterRegInfo ViRlcCounterRegAddr[] = {
|
||||
{mmRLC_PERFCOUNTER0_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER0_LO__CI__VI,
|
||||
mmRLC_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmRLC_PERFCOUNTER1_SELECT__CI__VI, 0, mmRLC_PERFCOUNTER1_LO__CI__VI,
|
||||
mmRLC_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* SC
|
||||
*/
|
||||
GpuCounterRegInfo ViScCounterRegAddr[] = {
|
||||
{mmPA_SC_PERFCOUNTER0_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER0_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER1_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER1_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER2_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER2_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER3_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER3_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER3_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER4_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER4_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER4_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER5_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER5_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER5_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER6_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER6_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER6_HI__CI__VI},
|
||||
{mmPA_SC_PERFCOUNTER7_SELECT__CI__VI, 0, mmPA_SC_PERFCOUNTER7_LO__CI__VI,
|
||||
mmPA_SC_PERFCOUNTER7_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* SX
|
||||
*/
|
||||
GpuCounterRegInfo ViSxCounterRegAddr[] = {
|
||||
{mmSX_PERFCOUNTER0_SELECT__CI__VI, 0, mmSX_PERFCOUNTER0_LO__CI__VI,
|
||||
mmSX_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmSX_PERFCOUNTER1_SELECT__CI__VI, 0, mmSX_PERFCOUNTER1_LO__CI__VI,
|
||||
mmSX_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmSX_PERFCOUNTER2_SELECT__CI__VI, 0, mmSX_PERFCOUNTER2_LO__CI__VI,
|
||||
mmSX_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmSX_PERFCOUNTER3_SELECT__CI__VI, 0, mmSX_PERFCOUNTER3_LO__CI__VI,
|
||||
mmSX_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* TA
|
||||
*/
|
||||
GpuCounterRegInfo ViTaCounterRegAddr[] = {
|
||||
{mmTA_PERFCOUNTER0_SELECT__CI__VI, 0, mmTA_PERFCOUNTER0_LO__CI__VI,
|
||||
mmTA_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmTA_PERFCOUNTER1_SELECT__CI__VI, 0, mmTA_PERFCOUNTER1_LO__CI__VI,
|
||||
mmTA_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* TD
|
||||
*/
|
||||
GpuCounterRegInfo ViTdCounterRegAddr[] = {
|
||||
{mmTD_PERFCOUNTER0_SELECT__CI__VI, 0, mmTD_PERFCOUNTER0_LO__CI__VI,
|
||||
mmTD_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmTD_PERFCOUNTER1_SELECT__CI__VI, 0, mmTD_PERFCOUNTER1_LO__CI__VI,
|
||||
mmTD_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* GDS
|
||||
*/
|
||||
GpuCounterRegInfo ViGdsCounterRegAddr[] = {
|
||||
{mmGDS_PERFCOUNTER0_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER0_LO__CI__VI,
|
||||
mmGDS_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmGDS_PERFCOUNTER1_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER1_LO__CI__VI,
|
||||
mmGDS_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmGDS_PERFCOUNTER2_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER2_LO__CI__VI,
|
||||
mmGDS_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmGDS_PERFCOUNTER3_SELECT__CI__VI, 0, mmGDS_PERFCOUNTER3_LO__CI__VI,
|
||||
mmGDS_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* VGT
|
||||
*/
|
||||
GpuCounterRegInfo ViVgtCounterRegAddr[] = {
|
||||
{mmVGT_PERFCOUNTER0_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER0_LO__CI__VI,
|
||||
mmVGT_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmVGT_PERFCOUNTER1_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER1_LO__CI__VI,
|
||||
mmVGT_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmVGT_PERFCOUNTER2_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER2_LO__CI__VI,
|
||||
mmVGT_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmVGT_PERFCOUNTER3_SELECT__CI__VI, 0, mmVGT_PERFCOUNTER3_LO__CI__VI,
|
||||
mmVGT_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* IA
|
||||
*/
|
||||
GpuCounterRegInfo ViIaCounterRegAddr[] = {
|
||||
{mmIA_PERFCOUNTER0_SELECT__CI__VI, 0, mmIA_PERFCOUNTER0_LO__CI__VI,
|
||||
mmIA_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmIA_PERFCOUNTER1_SELECT__CI__VI, 0, mmIA_PERFCOUNTER1_LO__CI__VI,
|
||||
mmIA_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmIA_PERFCOUNTER2_SELECT__CI__VI, 0, mmIA_PERFCOUNTER2_LO__CI__VI,
|
||||
mmIA_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmIA_PERFCOUNTER3_SELECT__CI__VI, 0, mmIA_PERFCOUNTER3_LO__CI__VI,
|
||||
mmIA_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* MC
|
||||
*/
|
||||
GpuCounterRegInfo ViMcCounterRegAddr[] = {
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_A_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_A_I1__VI},
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_B_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_B_I1__VI},
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_C_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_C_I1__VI},
|
||||
{mmMC_SEQ_PERF_SEQ_CTL__SI__VI, 0, mmMC_SEQ_PERF_SEQ_CNT_D_I0__VI,
|
||||
mmMC_SEQ_PERF_SEQ_CNT_D_I1__VI}};
|
||||
|
||||
/*
|
||||
* SRBM
|
||||
*/
|
||||
GpuCounterRegInfo ViSrbmCounterRegAddr[] = {
|
||||
{mmSRBM_PERFCOUNTER0_SELECT__VI, 0, mmSRBM_PERFCOUNTER0_LO__VI, mmSRBM_PERFCOUNTER0_HI__VI},
|
||||
{mmSRBM_PERFCOUNTER1_SELECT__VI, 0, mmSRBM_PERFCOUNTER1_LO__VI, mmSRBM_PERFCOUNTER1_HI__VI}};
|
||||
|
||||
/*
|
||||
* WD
|
||||
*/
|
||||
GpuCounterRegInfo ViWdCounterRegAddr[] = {
|
||||
{mmWD_PERFCOUNTER0_SELECT__CI__VI, 0, mmWD_PERFCOUNTER0_LO__CI__VI,
|
||||
mmWD_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmWD_PERFCOUNTER1_SELECT__CI__VI, 0, mmWD_PERFCOUNTER1_LO__CI__VI,
|
||||
mmWD_PERFCOUNTER1_HI__CI__VI},
|
||||
{mmWD_PERFCOUNTER2_SELECT__CI__VI, 0, mmWD_PERFCOUNTER2_LO__CI__VI,
|
||||
mmWD_PERFCOUNTER2_HI__CI__VI},
|
||||
{mmWD_PERFCOUNTER3_SELECT__CI__VI, 0, mmWD_PERFCOUNTER3_LO__CI__VI,
|
||||
mmWD_PERFCOUNTER3_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* CPG
|
||||
*/
|
||||
GpuCounterRegInfo ViCpgCounterRegAddr[] = {
|
||||
{mmCPG_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER0_LO__CI__VI,
|
||||
mmCPG_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmCPG_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPG_PERFCOUNTER1_LO__CI__VI,
|
||||
mmCPG_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
/*
|
||||
* CPC
|
||||
*/
|
||||
GpuCounterRegInfo ViCpcCounterRegAddr[] = {
|
||||
{mmCPC_PERFCOUNTER0_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER0_LO__CI__VI,
|
||||
mmCPC_PERFCOUNTER0_HI__CI__VI},
|
||||
{mmCPC_PERFCOUNTER1_SELECT__CI__VI, 0, mmCPC_PERFCOUNTER1_LO__CI__VI,
|
||||
mmCPC_PERFCOUNTER1_HI__CI__VI}};
|
||||
|
||||
GpuPrivCounterBlockId ViBlockIdSq = {{0xb5c396b6, 0x47e4d310, 0xc35cfc86, 0x08f53a04}};
|
||||
GpuPrivCounterBlockId ViBlockIdMc = {{0x13900b57, 0x4d984956, 0x5268d081, 0x9cf53719}};
|
||||
GpuPrivCounterBlockId ViBlockIdIommuV2 = {{0x80969879, 0x4be6b0f6, 0x636af697, 0x1d10f500}};
|
||||
GpuPrivCounterBlockId ViBlockIdKernelDriver = {{0xea9b5ae1, 0x44b36c3f, 0xf0da5489, 0x0aa96575}};
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,230 @@
|
||||
#ifndef _VI_BLOCKINFO_H_
|
||||
#define _VI_BLOCKINFO_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "rocr_profiler.h"
|
||||
#include "gpu_enum.h"
|
||||
#include "gpu_blockinfo.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// MAX Number of block instances for VOLCANIC ISLANDS (From Fiji)
|
||||
// Values are found here //gfxip/gfx8/main/src/meta/features/variant/Fiji/album.dj
|
||||
|
||||
// @brief Number of block instances.
|
||||
|
||||
// We index per SE and instance
|
||||
#define VI_NUM_CB 4 // CB has 4 instances per SE
|
||||
#define VI_NUM_DB 4 // DB has 4 instances per SE
|
||||
|
||||
// For TA, TD and TCP, the values below are the same as the number of CUs
|
||||
// per SH. We index per SE and instance
|
||||
#define VI_NUM_TA 16 // TA: 16 instances, indexed per SE (the old note said 11 -- TODO confirm)
|
||||
#define VI_NUM_TD 16 // TD: 16 instances, indexed per SE (the old note said 11 -- TODO confirm)
|
||||
#define VI_NUM_TCP 16 // TCP: 16 instances, indexed per SE (the old note said 11 -- TODO confirm)
|
||||
|
||||
// These values are per chip, we index directly per instance
|
||||
#define VI_NUM_TCA 2 // TCA has 2 instances per chip
|
||||
#define VI_NUM_TCC 16 // TCC has 16 instances per chip
|
||||
#define VI_NUM_SDMA 2 // There are two SDMA blocks on VI, exposed as 2
|
||||
// instances here
|
||||
|
||||
// Number of counter registers per block for volcanic islands
|
||||
#define VI_COUNTER_NUM_PER_DRM 2
|
||||
#define VI_COUNTER_NUM_PER_DRMDMA 2
|
||||
#define VI_COUNTER_NUM_PER_IH 2
|
||||
#define VI_COUNTER_NUM_PER_SRBM 2
|
||||
#define VI_COUNTER_NUM_PER_CB 4
|
||||
#define VI_COUNTER_NUM_PER_CPF 2
|
||||
#define VI_COUNTER_NUM_PER_DB 4
|
||||
#define VI_COUNTER_NUM_PER_GRBM 2
|
||||
#define VI_COUNTER_NUM_PER_GRBMSE 4
|
||||
#define VI_COUNTER_NUM_PER_PA_SU 4
|
||||
#define VI_COUNTER_NUM_PER_RLC 2
|
||||
#define VI_COUNTER_NUM_PER_PA_SC 8
|
||||
#define VI_COUNTER_NUM_PER_SPI 6 // [Shucai: To do: double check the value]
|
||||
#define VI_COUNTER_NUM_PER_SQ 16
|
||||
#define VI_COUNTER_NUM_PER_SX 4
|
||||
#define VI_COUNTER_NUM_PER_TA 2
|
||||
#define VI_COUNTER_NUM_PER_TCA 4
|
||||
#define VI_COUNTER_NUM_PER_TCC 4
|
||||
#define VI_COUNTER_NUM_PER_TD 2 // [Shucai: To do: double check the value]
|
||||
#define VI_COUNTER_NUM_PER_TCP 4
|
||||
#define VI_COUNTER_NUM_PER_GDS 4
|
||||
#define VI_COUNTER_NUM_PER_VGT 4
|
||||
#define VI_COUNTER_NUM_PER_IA 4
|
||||
#define VI_COUNTER_NUM_PER_MC 4
|
||||
#define VI_COUNTER_NUM_PER_TCS 4
|
||||
#define VI_COUNTER_NUM_PER_WD 4
|
||||
#define VI_COUNTER_NUM_PER_CPG 2
|
||||
#define VI_COUNTER_NUM_PER_CPC 2
|
||||
#define VI_COUNTER_NUM_PER_VM 1
|
||||
#define VI_COUNTER_NUM_PER_VM_MD 1
|
||||
#define VI_COUNTER_NUM_PER_PIPESTATS 12
|
||||
|
||||
#define VI_MAX_NUM_SHADER_ENGINES 1
|
||||
|
||||
// Enumeration of VI hardware counter blocks
|
||||
typedef enum HsaViCounterBlockId {
|
||||
kHsaViCounterBlockIdCb0 = 0,
|
||||
kHsaViCounterBlockIdCb1,
|
||||
kHsaViCounterBlockIdCb2,
|
||||
kHsaViCounterBlockIdCb3,
|
||||
|
||||
kHsaViCounterBlockIdCpf,
|
||||
|
||||
kHsaViCounterBlockIdDb0,
|
||||
kHsaViCounterBlockIdDb1,
|
||||
kHsaViCounterBlockIdDb2,
|
||||
kHsaViCounterBlockIdDb3,
|
||||
|
||||
kHsaViCounterBlockIdGrbm,
|
||||
kHsaViCounterBlockIdGrbmSe,
|
||||
kHsaViCounterBlockIdPaSu,
|
||||
kHsaViCounterBlockIdPaSc,
|
||||
kHsaViCounterBlockIdSpi,
|
||||
|
||||
kHsaViCounterBlockIdSq,
|
||||
kHsaViCounterBlockIdSqEs,
|
||||
kHsaViCounterBlockIdSqGs,
|
||||
kHsaViCounterBlockIdSqVs,
|
||||
kHsaViCounterBlockIdSqPs,
|
||||
kHsaViCounterBlockIdSqLs,
|
||||
kHsaViCounterBlockIdSqHs,
|
||||
kHsaViCounterBlockIdSqCs,
|
||||
|
||||
kHsaViCounterBlockIdSx,
|
||||
|
||||
kHsaViCounterBlockIdTa0,
|
||||
kHsaViCounterBlockIdTa1,
|
||||
kHsaViCounterBlockIdTa2,
|
||||
kHsaViCounterBlockIdTa3,
|
||||
kHsaViCounterBlockIdTa4,
|
||||
kHsaViCounterBlockIdTa5,
|
||||
kHsaViCounterBlockIdTa6,
|
||||
kHsaViCounterBlockIdTa7,
|
||||
kHsaViCounterBlockIdTa8,
|
||||
kHsaViCounterBlockIdTa9,
|
||||
kHsaViCounterBlockIdTa10,
|
||||
kHsaViCounterBlockIdTa11,
|
||||
kHsaViCounterBlockIdTa12,
|
||||
kHsaViCounterBlockIdTa13,
|
||||
kHsaViCounterBlockIdTa14,
|
||||
kHsaViCounterBlockIdTa15,
|
||||
|
||||
kHsaViCounterBlockIdTca0,
|
||||
kHsaViCounterBlockIdTca1,
|
||||
|
||||
kHsaViCounterBlockIdTcc0,
|
||||
kHsaViCounterBlockIdTcc1,
|
||||
kHsaViCounterBlockIdTcc2,
|
||||
kHsaViCounterBlockIdTcc3,
|
||||
kHsaViCounterBlockIdTcc4,
|
||||
kHsaViCounterBlockIdTcc5,
|
||||
kHsaViCounterBlockIdTcc6,
|
||||
kHsaViCounterBlockIdTcc7,
|
||||
kHsaViCounterBlockIdTcc8,
|
||||
kHsaViCounterBlockIdTcc9,
|
||||
kHsaViCounterBlockIdTcc10,
|
||||
kHsaViCounterBlockIdTcc11,
|
||||
kHsaViCounterBlockIdTcc12,
|
||||
kHsaViCounterBlockIdTcc13,
|
||||
kHsaViCounterBlockIdTcc14,
|
||||
kHsaViCounterBlockIdTcc15,
|
||||
|
||||
kHsaViCounterBlockIdTd0,
|
||||
kHsaViCounterBlockIdTd1,
|
||||
kHsaViCounterBlockIdTd2,
|
||||
kHsaViCounterBlockIdTd3,
|
||||
kHsaViCounterBlockIdTd4,
|
||||
kHsaViCounterBlockIdTd5,
|
||||
kHsaViCounterBlockIdTd6,
|
||||
kHsaViCounterBlockIdTd7,
|
||||
kHsaViCounterBlockIdTd8,
|
||||
kHsaViCounterBlockIdTd9,
|
||||
kHsaViCounterBlockIdTd10,
|
||||
kHsaViCounterBlockIdTd11,
|
||||
kHsaViCounterBlockIdTd12,
|
||||
kHsaViCounterBlockIdTd13,
|
||||
kHsaViCounterBlockIdTd14,
|
||||
kHsaViCounterBlockIdTd15,
|
||||
|
||||
kHsaViCounterBlockIdTcp0,
|
||||
kHsaViCounterBlockIdTcp1,
|
||||
kHsaViCounterBlockIdTcp2,
|
||||
kHsaViCounterBlockIdTcp3,
|
||||
kHsaViCounterBlockIdTcp4,
|
||||
kHsaViCounterBlockIdTcp5,
|
||||
kHsaViCounterBlockIdTcp6,
|
||||
kHsaViCounterBlockIdTcp7,
|
||||
kHsaViCounterBlockIdTcp8,
|
||||
kHsaViCounterBlockIdTcp9,
|
||||
kHsaViCounterBlockIdTcp10,
|
||||
kHsaViCounterBlockIdTcp11,
|
||||
kHsaViCounterBlockIdTcp12,
|
||||
kHsaViCounterBlockIdTcp13,
|
||||
kHsaViCounterBlockIdTcp14,
|
||||
kHsaViCounterBlockIdTcp15,
|
||||
|
||||
kHsaViCounterBlockIdGds,
|
||||
kHsaViCounterBlockIdVgt,
|
||||
kHsaViCounterBlockIdIa,
|
||||
kHsaViCounterBlockIdMc,
|
||||
kHsaViCounterBlockIdSrbm,
|
||||
|
||||
kHsaViCounterBlockIdTcs,
|
||||
kHsaViCounterBlockIdWd,
|
||||
kHsaViCounterBlockIdCpg,
|
||||
kHsaViCounterBlockIdCpc,
|
||||
|
||||
// Counters retrieved by KFD
|
||||
kHsaViCounterBlockIdIommuV2,
|
||||
kHsaViCounterBlockIdKernelDriver,
|
||||
|
||||
kHsaViCounterBlockIdCpPipeStats,
|
||||
kHsaViCounterBlockIdHwInfo,
|
||||
kHsaViCounterBlockIdBlocksFirst = kHsaViCounterBlockIdCb0,
|
||||
kHsaViCounterBlockIdBlocksLast = kHsaViCounterBlockIdHwInfo
|
||||
} HsaViCounterBlockId;
|
||||
|
||||
extern GpuBlockInfo ViPmuHwBlocks[];
|
||||
extern GpuCounterRegInfo ViSqCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViCbCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViDrmdmaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViIhCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViCpfCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViCpgCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViCpcCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViDrmCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViGrbmCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViGrbmSeCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViPaSuCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViPaScCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViSpiCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViTcaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViTccCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViTcpCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViDbCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViRlcCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViScCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViSxCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViTaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViTdCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViGdsCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViVgtCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViIaCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViMcCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViSrbmCounterRegAddr[];
|
||||
|
||||
// No Tcs Counter block on VI
|
||||
// extern GpuCounterRegInfo ViTcsCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViWdCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViCpgCounterRegAddr[];
|
||||
extern GpuCounterRegInfo ViCpcCounterRegAddr[];
|
||||
|
||||
extern GpuPrivCounterBlockId ViBlockIdSq;
|
||||
extern GpuPrivCounterBlockId ViBlockIdMc;
|
||||
extern GpuPrivCounterBlockId ViBlockIdIommuV2;
|
||||
extern GpuPrivCounterBlockId ViBlockIdKernelDriver;
|
||||
}
|
||||
#endif
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,141 @@
|
||||
#ifndef _VI_PMU_H_
|
||||
#define _VI_PMU_H_
|
||||
|
||||
#include "hsa.h"
|
||||
#include "cmdwriter.h"
|
||||
#include "hsa_perf.h"
|
||||
#include "info_set.h"
|
||||
#include "parameter_set.h"
|
||||
#include "vi_blockinfo.h"
|
||||
#include "rocr_profiler.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <map>
|
||||
|
||||
namespace pm4_profile {
|
||||
typedef std::map<HsaViCounterBlockId, pm4_profile::CounterBlock*> ViCounterBlockMap;
|
||||
|
||||
// This class implements the VI PMU. It is responsible for setting up
|
||||
// CounterGroups to represent each VI hardware block which exposes performance
|
||||
// counters.
|
||||
class ViPmu : public pm4_profile::Pmu {
|
||||
public:
|
||||
ViPmu();
|
||||
~ViPmu();
|
||||
|
||||
// Returns number of shader engines per block
|
||||
// for the blocks featured shader engines instancing
|
||||
uint32_t getNumSe() { return num_se_; }
|
||||
|
||||
// Initializes the handle of buffer used to collect PMC data
|
||||
bool setPmcDataBuff(uint8_t* pmcBuffer, uint32_t pmcBuffSz);
|
||||
|
||||
int getLastError();
|
||||
|
||||
std::string getErrorString(int error);
|
||||
|
||||
virtual bool begin(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter,
|
||||
bool reset = true);
|
||||
|
||||
virtual bool end(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
|
||||
|
||||
// IPMU inherits the IParameterSet and IInfoSet, so we implement it
|
||||
// through composition and function forwarding
|
||||
bool getParameter(uint32_t param, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
bool setParameter(uint32_t param, uint32_t param_size, const void* p_data);
|
||||
|
||||
bool getInfo(uint32_t info, uint32_t& ret_size, void** pp_data);
|
||||
|
||||
pm4_profile::CounterBlock* getCounterBlockById(uint32_t id);
|
||||
|
||||
rocr_pmu_state_t getCurrentState() { return profiler_state_; }
|
||||
|
||||
pm4_profile::CounterBlock** getAllCounterBlocks(uint32_t& num_groups);
|
||||
|
||||
private:
|
||||
// Addr of Counter Data Buffer
|
||||
uint32_t* pmcData_;
|
||||
|
||||
// Size of Counter Data Buffer
|
||||
uint32_t pmcDataSz_;
|
||||
|
||||
void Init();
|
||||
|
||||
bool initCounterBlock();
|
||||
|
||||
bool isResultReady();
|
||||
|
||||
// Clear CounterBlockMap
|
||||
void clearCounterBlockMap();
|
||||
|
||||
// Reset SQ and CB counters
|
||||
void ResetCounterBlocks(pm4_profile::DefaultCmdBuf* cmdBuff,
|
||||
pm4_profile::CommandWriter* cmdWriter);
|
||||
|
||||
// Program SQ block related counters
|
||||
uint32_t ProgramSQCntrs(uint32_t sqRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TA block related counters
|
||||
uint32_t ProgramTaCntrs(uint32_t taRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TCA block related counters
|
||||
uint32_t ProgramTcaCntrs(uint32_t tcaRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TCC block related counters
|
||||
uint32_t ProgramTccCntrs(uint32_t tccRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TCP block related counters
|
||||
uint32_t ProgramTcpCntrs(uint32_t tcpRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Program TD block related counters
|
||||
uint32_t ProgramTdCntrs(uint32_t tdRegIdx, uint32_t* regAddr, uint32_t* regVal, uint32_t blkId,
|
||||
uint32_t blkCntrIdx);
|
||||
|
||||
// Build counter selection register, return how many registers are built
|
||||
uint32_t BuildCounterSelRegister(uint32_t cntrIdx, uint32_t* regAddr, uint32_t* regVal,
|
||||
uint32_t blkId, pm4_profile::Counter* blkCntr);
|
||||
|
||||
// Build counter read registers, return how many registers are built
|
||||
uint32_t BuildCounterReadRegisters(uint32_t reg_index, uint32_t block_id, uint32_t* reg_addr,
|
||||
uint32_t* reg_val);
|
||||
|
||||
private:
|
||||
// Delete counter blocks in the PMU
|
||||
hsa_status_t RemoveCounterBlocks();
|
||||
|
||||
private:
|
||||
// This contains the available counter groups.
|
||||
ViCounterBlockMap blk_map_;
|
||||
|
||||
// This stores the current profiling state.
|
||||
rocr_pmu_state_t profiler_state_;
|
||||
|
||||
pm4_profile::ParameterSet* parameter_set_;
|
||||
|
||||
pm4_profile::InfoSet* info_set_;
|
||||
|
||||
int error_code_;
|
||||
|
||||
// A flag to indicate the current packet is for copy register value
|
||||
#define COPY_DATA_FLAG 0xFFFFFFFF
|
||||
#define MAX_REG_NUM 100
|
||||
|
||||
// Pointer used to store counter block list internally
|
||||
uint32_t blk_list_size_;
|
||||
pm4_profile::CounterBlock** blk_list_;
|
||||
|
||||
// Indicates the number of Shader Engines Present
|
||||
uint32_t num_se_;
|
||||
|
||||
// Used to reset GRBM to its default state
|
||||
uint32_t reset_grbm_;
|
||||
};
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,18 @@
|
||||
#
# Source files for Rocr ThreadTrace
#
set ( LIB_SRC
  thread_trace.cpp
  gfx8_thread_trace.cpp
  gfx9_thread_trace.cpp
)

#
# Build ThreadTrace as a Static Library object
#
add_library ( ${SQTT_LIB} STATIC ${LIB_SRC} )

#
# Header files include path(s).
#
# Attached to the library target (PRIVATE) instead of the directory scope so
# the search paths apply only to the ThreadTrace sources. The environment and
# cache driven entries are intentionally left unquoted: when they are unset
# they expand to nothing rather than to an empty include path.
#
target_include_directories ( ${SQTT_LIB} PRIVATE
  $ENV{ROCR_INC_DIR}
  "${PROJ_DIR}/commandwriter"
  ${HSA_RUNTIME_OSC_DIR}
)
|
||||
@@ -0,0 +1,360 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <random>
|
||||
#include <memory>
|
||||
|
||||
#include "core/util/os.h"
|
||||
#include "gfx8_thread_trace.h"
|
||||
|
||||
/// @brief Extracts bits [31:0] of a 64-bit value
/// @param u 64-bit input value
/// @return The least significant 32 bits of @p u
inline uint32_t Low32(uint64_t u) {
  // Narrowing to 32 bits keeps exactly the low word (value modulo 2^32)
  return static_cast<uint32_t>(u);
}
|
||||
|
||||
/// @brief Extracts bits [63:32] of a 64-bit value
/// @param u 64-bit input value
/// @return The most significant 32 bits of @p u
inline uint32_t High32(uint64_t u) {
  const uint64_t upper_half = u >> 32;
  return static_cast<uint32_t>(upper_half);
}
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
Gfx8ThreadTrace::Gfx8ThreadTrace() {
  // Shader engine count this implementation starts out with
  const int shader_engine_count = 4;
  numSE_ = shader_engine_count;
}
|
||||
|
||||
Gfx8ThreadTrace::~Gfx8ThreadTrace() {}
|
||||
|
||||
bool Gfx8ThreadTrace::Init(const ThreadTraceConfig* config) {
|
||||
// Initialize SQTT Configuration and Register objects
|
||||
if (!ThreadTrace::Init(config)) return false;
|
||||
InitThreadTraceCfgRegs();
|
||||
return true;
|
||||
}
|
||||
|
||||
void Gfx8ThreadTrace::InitThreadTraceCfgRegs() {
|
||||
// Indicates the size of buffer to use per Shader Engine instance.
|
||||
// The size is specified in terms of 4KB blocks
|
||||
ttCfgRegs_.ttRegSize.u32All = 0;
|
||||
|
||||
// Indicates various attributes of a thread trace session.
|
||||
//
|
||||
// MASK_CS: Which shader types should be enabled for data collection
|
||||
// Enable CS Shader types.
|
||||
//
|
||||
// WRAP: How trace buffer should be used as a ring buffer or as a linear
|
||||
// buffer - Disable WRAP mode i.e use it as a linear buffer
|
||||
//
|
||||
// MODE: Enables a thread trace session
|
||||
//
|
||||
// CAPTURE_MODE: When thread trace data is collected immediately after MODE
|
||||
// is enabled or wait until a Thread Trace Start event is received
|
||||
//
|
||||
// AUTOFLUSH_EN: Flush thread trace data to buffer often automatically
|
||||
//
|
||||
ttCfgRegs_.ttRegMode.u32All = 0;
|
||||
ttCfgRegs_.ttRegMode.bits.WRAP = 0;
|
||||
ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0;
|
||||
ttCfgRegs_.ttRegMode.bits.MASK_CS = 1;
|
||||
ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1;
|
||||
ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
|
||||
|
||||
// Enable Thread Trace for all VM Id's
|
||||
// Enable all of the SIMD's of the compute unit
|
||||
// Enable Compute Unit (CU) at index Zero to be used for fine-grained data
|
||||
// Enable Shader Array (SH) at index Zero to be used for fine-grained data
|
||||
//
|
||||
// @note: Not enabling REG_STALL_EN, SPI_STALL_EN and SQ_STALL_EN bits. They
|
||||
// are useful if we wish to program buffer throttling.
|
||||
//
|
||||
ttCfgRegs_.ttRegMask.u32All = 0;
|
||||
ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0;
|
||||
ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF;
|
||||
ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId();
|
||||
ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI = 0x1;
|
||||
ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI = 0x1;
|
||||
ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI = 0x1;
|
||||
ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId();
|
||||
|
||||
// Override Mask value if a user value is available
|
||||
uint32_t ttMask = SetMask();
|
||||
if (ttMask) {
|
||||
ttCfgRegs_.ttRegMask.u32All = ttMask;
|
||||
}
|
||||
|
||||
// Mask of compute units to get thread trace data from
|
||||
ttCfgRegs_.ttRegPerfMask.u32All = 0;
|
||||
ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF;
|
||||
ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF;
|
||||
|
||||
// Indicate the different TT messages/tokens that should be enabled/logged
|
||||
// Indicate the different TT tokens that specify register operations to be logged
|
||||
ttCfgRegs_.ttRegTokenMask.u32All = 0;
|
||||
ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF;
|
||||
ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF;
|
||||
ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI = 0x1;
|
||||
|
||||
// Override TokenMask1 value if a user value is available
|
||||
uint32_t tokenMask1 = SetTokenMask();
|
||||
if (tokenMask1) {
|
||||
ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1;
|
||||
}
|
||||
|
||||
// Indicate the different TT tokens that specify instruction operations to be logged
|
||||
// Disabling specifically instruction operations updating Program Counter (PC).
|
||||
// @note: The field is defined in the spec incorrectly as a 16-bit value
|
||||
ttCfgRegs_.ttRegTokenMask2.u32All = 0;
|
||||
ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F;
|
||||
|
||||
// Override TokenMask2 value if a user value is available
|
||||
uint32_t tokenMask2 = SetTokenMask2();
|
||||
if (tokenMask2) {
|
||||
ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2;
|
||||
}
|
||||
}
|
||||
|
||||
void Gfx8ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) {
|
||||
// Compute the size of buffer available for each shader engine
|
||||
ttBuffSize_ = sqttBuffSz / numSE_;
|
||||
|
||||
// Populate the sqtt buffer array submitted to device
|
||||
for (int idx = 0; idx < numSE_; idx++) {
|
||||
uint64_t sqttSEAddr = uint64_t(sqttBuffer + (ttBuffSize_ * idx));
|
||||
devMemList_.push_back(sqttSEAddr);
|
||||
}
|
||||
|
||||
// Update the size bit-field of sqtt ctrl register
|
||||
ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT;
|
||||
}
|
||||
|
||||
/// @brief Appends the PM4 packets that configure and start a thread trace session.
///
/// Packet order: broadcast GRBM index, wait-idle, mask / perf mask / token
/// mask / token mask2 / mode (still OFF), optional HIWATER, then per shader
/// engine the buffer base, size and a control write with RESET_BUFFER set,
/// back to broadcast, wait-idle, mode ON, wait-idle.
///
/// @param cmdBuff   Command buffer the packets are appended to.
/// @param cmdWriter Writer used to build every packet.
void Gfx8ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program GRBM to broadcast the following writes to all shader engines.
  regGRBM_GFX_INDEX grbmIndex;
  grbmIndex.u32All = 0;
  grbmIndex.bitfields.SH_BROADCAST_WRITES = 1;
  grbmIndex.bitfields.SE_BROADCAST_WRITES = 1;
  grbmIndex.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbmIndex.u32All);

  // Emit a wait-idle packet before touching the thread trace registers.
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Neither RLC_PERFMON_CLK_CNTL nor COMPUTE_THREAD_TRACE_ENABLE is written
  // by this function.

  // Thread trace mask: SH, CU, SIMD and VM Id selection plus the stall enables.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK__VI,
                                        ttCfgRegs_.ttRegMask.u32All);

  // Thread trace perf mask.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK__VI,
                                        ttCfgRegs_.ttRegPerfMask.u32All);

  // Thread trace token mask.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK__VI,
                                        ttCfgRegs_.ttRegTokenMask.u32All);

  // Thread trace token mask2: the instruction tokens to record.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2__VI,
                                        ttCfgRegs_.ttRegTokenMask2.u32All);

  // Thread trace mode; the cached image still has MODE set to OFF here.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
                                        ttCfgRegs_.ttRegMode.u32All);

  // The HiWaterMark register is only programmed when some form of stalling
  // (or drop-on-stall) is enabled.
  const bool stallInUse = (ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN__CI__VI) ||
                          (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN__CI__VI) ||
                          (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN__CI__VI) ||
                          (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL__CI__VI);
  if (stallInUse) {
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER__VI, 0x06);
  }

  // Per shader engine: program the address and size of its trace buffer.
  for (uint32_t se = 0; se < numSE_; ++se) {
    // Direct the following writes to this shader engine only.
    grbmIndex.bitfields.SH_INDEX = 0;
    grbmIndex.bitfields.SE_INDEX = se;
    grbmIndex.bitfields.SH_BROADCAST_WRITES = 0;
    grbmIndex.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbmIndex.u32All);

    // SQ_THREAD_TRACE_BASE2 (ATC bit and high address bits) is not written
    // here; only the low 32 bits of the shifted address are programmed.
    const uint64_t shiftedBase = devMemList_[se] >> TT_BUFF_ALIGN_SHIFT;

    regSQ_THREAD_TRACE_BASE baseReg = {};
    baseReg.bits.ADDR = Low32(shiftedBase);
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE__VI, baseReg.u32All);

    // Size of the thread trace buffer of this shader engine.
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI,
                                          ttCfgRegs_.ttRegSize.u32All);

    // Control register with only RESET_BUFFER set.
    regSQ_THREAD_TRACE_CTRL ctrlReg = {};
    ctrlReg.u32All = 0;
    ctrlReg.bits.RESET_BUFFER = 1;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, ctrlReg.u32All);
  }

  // Put GRBM back into broadcast mode.
  grbmIndex.bitfields.SH_BROADCAST_WRITES = 1;
  grbmIndex.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbmIndex.u32All);

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Switch the session on. The cached image goes straight back to OFF so
  // that StopSession() can write it unchanged to disable the trace.
  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
                                        ttCfgRegs_.ttRegMode.u32All);
  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
}
|
||||
|
||||
/// @brief Appends the PM4 packets that stop a thread trace session and collect its status.
///
/// Packet order: broadcast GRBM index, wait-idle, mode OFF, wait-idle, then
/// per shader engine a wait on the status register followed by copies of the
/// STATUS, CNTR and WPTR registers into ttStatus_, back to broadcast, cache
/// flush, size register cleared, control write with RESET_BUFFER set,
/// wait-idle.
///
/// @param cmdBuff   Command buffer the packets are appended to.
/// @param cmdWriter Writer used to build every packet.
void Gfx8ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program Grbm to broadcast messages to all shader engines
  regGRBM_GFX_INDEX grbm_gfx_index;
  grbm_gfx_index.u32All = 0;
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

  // Emit a wait-idle packet before disabling the trace
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Program the thread trace mode register to disable thread trace.
  // BeginSession() leaves MODE set to OFF in the cached register image.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE__VI,
                                        ttCfgRegs_.ttRegMode.u32All);

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Iterate through the list of SE's and read the Status, Counter and
  // Write Pointer registers of Thread Trace subsystem
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    // Program Grbm to direct writes to one SE
    grbm_gfx_index.bitfields.SH_INDEX = 0;
    grbm_gfx_index.bitfields.SE_INDEX = idx;
    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

    // Issue WaitRegMem command to wait until SQTT event has completed.
    // NOTE(review): with mask 0x40000000 the masked status is either 0 or
    // 0x40000000 and can never equal the reference value 0x01 - confirm
    // this is the comparison BuildWaitRegMemCommand() is meant to perform
    // when funcEq is false.
    bool funcEq = false;
    bool memSpace = false;
    uint32_t waitVal = 0x01;
    uint32_t maskVal = 0x40000000L;
    uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS__VI - UCONFIG_SPACE_START__CI__VI;
    cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal);

    // Each shader engine owns TT_STATUS_IDX_MAX consecutive entries of ttStatus_
    uint32_t* seStatus = ttStatus_ + (TT_STATUS_IDX_MAX * idx);

    // Retrieve the values from various status registers
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_STATUS__VI, 0,
                                   seStatus + TT_STATUS_IDX_STATUS,
                                   COPY_DATA_SEL_COUNT_1DW, true);

    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_CNTR, 0,
                                   seStatus + TT_STATUS_IDX_CNTR,
                                   COPY_DATA_SEL_COUNT_1DW, true);

    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_WPTR__VI, 0,
                                   seStatus + TT_STATUS_IDX_WPTR,
                                   COPY_DATA_SEL_COUNT_1DW, true);
  }

  // Reset the GRBM to broadcast mode
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX__CI__VI, grbm_gfx_index.u32All);

  // Initialize cache flush request object
  FlushCacheOptions flush;
  flush.l1 = true;
  flush.l2 = true;
  flush.icache = true;
  flush.kcache = true;
  cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0);

  // Clear the size of thread trace buffer
  regSQ_THREAD_TRACE_SIZE ttRegSize = {0};
  ttRegSize.u32All = 0;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE__VI, ttRegSize.u32All);

  // Program the thread trace ctrl register with only RESET_BUFFER set
  regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
  sqttCtrl.u32All = 0;
  sqttCtrl.bits.RESET_BUFFER = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL__VI, sqttCtrl.u32All);

  // Neither COMPUTE_THREAD_TRACE_ENABLE nor RLC_PERFMON_CLK_CNTL is written
  // by this function.

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
}
|
||||
|
||||
bool Gfx8ThreadTrace::Validate() {
|
||||
// Iterate through the list of SE to verify
|
||||
for (int idx = 0; idx < numSE_; idx++) {
|
||||
// Determine if the buffer has wrapped
|
||||
uint32_t statusIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS);
|
||||
if (ttStatus_[statusIdx] & 0x80000000) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Adjust the value of Write Ptr which is bits [29-0]
|
||||
uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
|
||||
ttStatus_[wptrIdx] = (ttStatus_[wptrIdx] & TT_WRITE_PTR_MASK);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,101 @@
|
||||
#ifndef _GFX8_THREAD_TRACE_H_
|
||||
#define _GFX8_THREAD_TRACE_H_
|
||||
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_typedef.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_offset.h"
|
||||
#include "gfxip/gfx8/si_ci_vi_merged_enum.h"
|
||||
#include "gfxip/gfx8/si_pm4defs.h"
|
||||
#include "thread_trace.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
typedef struct Gfx8ThreadTraceCfgRegs {
|
||||
// Size of thread trace buffer
|
||||
regSQ_THREAD_TRACE_SIZE ttRegSize;
|
||||
// Thread trace mode
|
||||
regSQ_THREAD_TRACE_MODE ttRegMode;
|
||||
// Thread trace wave mask
|
||||
regSQ_THREAD_TRACE_MASK ttRegMask;
|
||||
// Thread trace token mask
|
||||
regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask;
|
||||
// Thread trace token mask2
|
||||
regSQ_THREAD_TRACE_TOKEN_MASK2__VI ttRegTokenMask2;
|
||||
// Thread trace perf mask
|
||||
regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask;
|
||||
} Gfx8ThreadTraceCfgRegs;
|
||||
|
||||
// Encapsulates the various Api and structures used to enable a thread
|
||||
// trace session and collect its data
|
||||
class Gfx8ThreadTrace : public ThreadTrace {
|
||||
public:
|
||||
Gfx8ThreadTrace();
|
||||
|
||||
~Gfx8ThreadTrace();
|
||||
|
||||
// Initializes various data structures and handles that
|
||||
// are needed to support a thread trace session
|
||||
bool Init(const ThreadTraceConfig* config);
|
||||
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// enable a thread trace session, including the issue of an event
|
||||
// to begin thread session
|
||||
void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
|
||||
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// disable a thread trace session, including the issue of an event
|
||||
// to stop currently ongoing thread session
|
||||
void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
|
||||
|
||||
// Validates that thread trace session ran correctly i.e. did not
|
||||
// encounter any errors.
|
||||
bool Validate();
|
||||
|
||||
// Initializes the handle of buffer used to collect SQTT data
|
||||
void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz);
|
||||
|
||||
// Initializes the handle of buffer used to read control data of SQTT
|
||||
void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; }
|
||||
|
||||
// Return status info size
|
||||
uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; }
|
||||
|
||||
// Return number of Shader Engines
|
||||
uint32_t getNumSe() { return numSE_; }
|
||||
|
||||
private:
|
||||
// Holds number of Shader Engines present on device
|
||||
uint32_t numSE_;
|
||||
|
||||
// Thread traces status register indices to determine
|
||||
// status of thread trace run
|
||||
typedef enum {
|
||||
TT_STATUS_IDX_STATUS = 0,
|
||||
TT_STATUS_IDX_CNTR = 1,
|
||||
TT_STATUS_IDX_WPTR = 2,
|
||||
TT_STATUS_IDX_MAX = 3
|
||||
} TTStatusReg;
|
||||
|
||||
// A list of tuples of TT_STATUS_IDX_MAX size,
|
||||
// giving status of thread trace
|
||||
uint32_t* ttStatus_;
|
||||
|
||||
// Size of thread trace buffer per shader engine
|
||||
uint32_t ttBuffSize_;
|
||||
|
||||
// Handles of Device memory used for thread trace
|
||||
std::vector<uint64_t> devMemList_;
|
||||
|
||||
// Registers that need to be programmed for Thread Trace
|
||||
Gfx8ThreadTraceCfgRegs ttCfgRegs_;
|
||||
|
||||
// Initializes thread trace registers with default parameters.
|
||||
// These are potentially updated based on updates to thread trace
|
||||
// configuration object by user
|
||||
void InitThreadTraceCfgRegs();
|
||||
};
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GFX8_THREAD_TRACE_H_
|
||||
@@ -0,0 +1,356 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <random>
|
||||
#include <memory>
|
||||
|
||||
#include "core/util/os.h"
|
||||
#include "gfx9_thread_trace.h"
|
||||
|
||||
/// @brief Returns the lower 32-bits of a value
|
||||
inline uint32_t Low32(uint64_t u) {
  // Keep bits [31:0] only, then narrow explicitly to 32 bits.
  const uint64_t lowBits = u & 0xFFFFFFFFULL;
  return static_cast<uint32_t>(lowBits);
}
|
||||
|
||||
/// @brief Returns the upper 32-bits of a value
|
||||
inline uint32_t High32(uint64_t u) {
  // Move bits [63:32] down to [31:0], then narrow explicitly to 32 bits.
  const uint64_t highBits = u >> 32;
  return static_cast<uint32_t>(highBits);
}
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
/// @brief Creates a GFX9 thread trace object with no buffers attached.
///
/// The number of shader engines is fixed at 4. The status buffer pointer and
/// the per-engine buffer size are given defined initial values so that they
/// are never indeterminate before setSqttCtrlBuff() / setSqttDataBuff() are
/// called.
Gfx9ThreadTrace::Gfx9ThreadTrace() {
  // Initialize the number of shader engines
  numSE_ = 4;

  // No status buffer and no data buffer attached yet
  ttStatus_ = NULL;
  ttBuffSize_ = 0;
}
|
||||
|
||||
// Nothing to release explicitly: the destructor body is empty.
Gfx9ThreadTrace::~Gfx9ThreadTrace() = default;
|
||||
|
||||
bool Gfx9ThreadTrace::Init(const ThreadTraceConfig* config) {
|
||||
// Initialize SQTT Configuration and Register objects
|
||||
if (!ThreadTrace::Init(config)) return false;
|
||||
InitThreadTraceCfgRegs();
|
||||
return true;
|
||||
}
|
||||
|
||||
void Gfx9ThreadTrace::InitThreadTraceCfgRegs() {
|
||||
// Indicates the size of buffer to use per Shader Engine instance.
|
||||
// The size is specified in terms of 4KB blocks
|
||||
ttCfgRegs_.ttRegSize.u32All = 0;
|
||||
|
||||
// Indicates various attributes of a thread trace session.
|
||||
//
|
||||
// MASK_CS: Which shader types should be enabled for data collection
|
||||
// Enable CS Shader types.
|
||||
//
|
||||
// WRAP: How trace buffer should be used as a ring buffer or as a linear
|
||||
// buffer - Disable WRAP mode i.e use it as a linear buffer
|
||||
//
|
||||
// MODE: Enables a thread trace session
|
||||
//
|
||||
// CAPTURE_MODE: When thread trace data is collected immediately after MODE
|
||||
// is enabled or wait until a Thread Trace Start event is received
|
||||
//
|
||||
// AUTOFLUSH_EN: Flush thread trace data to buffer often automatically
|
||||
//
|
||||
ttCfgRegs_.ttRegMode.u32All = 0;
|
||||
ttCfgRegs_.ttRegMode.bits.WRAP = 0;
|
||||
ttCfgRegs_.ttRegMode.bits.CAPTURE_MODE = 0;
|
||||
ttCfgRegs_.ttRegMode.bits.MASK_CS = 1;
|
||||
ttCfgRegs_.ttRegMode.bits.AUTOFLUSH_EN = 1;
|
||||
ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;
|
||||
|
||||
// Enable Thread Trace for all VM Id's
|
||||
// Enable all of the SIMD's of the compute unit
|
||||
// Enable Compute Unit (CU) at index Zero to be used for fine-grained data
|
||||
// Enable Shader Array (SH) at index Zero to be used for fine-grained data
|
||||
//
|
||||
// @note: Not enabling REG_STALL_EN, SPI_STALL_EN and SQ_STALL_EN bits. They
|
||||
// are useful if we wish to program buffer throttling.
|
||||
//
|
||||
ttCfgRegs_.ttRegMask.u32All = 0;
|
||||
ttCfgRegs_.ttRegMask.bits.SH_SEL = 0x0;
|
||||
ttCfgRegs_.ttRegMask.bits.SIMD_EN = 0xF;
|
||||
ttCfgRegs_.ttRegMask.bits.CU_SEL = SetCuId();
|
||||
ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN = 0x1;
|
||||
ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN = 0x1;
|
||||
ttCfgRegs_.ttRegMask.bits.REG_STALL_EN = 0x1;
|
||||
ttCfgRegs_.ttRegMask.bits.VM_ID_MASK = SetVmId();
|
||||
|
||||
// Override Mask value if a user value is available
|
||||
uint32_t ttMask = SetMask();
|
||||
if (ttMask) {
|
||||
ttCfgRegs_.ttRegMask.u32All = ttMask;
|
||||
}
|
||||
|
||||
// Mask of compute units to get thread trace data from
|
||||
ttCfgRegs_.ttRegPerfMask.u32All = 0;
|
||||
ttCfgRegs_.ttRegPerfMask.bits.SH0_MASK = 0xFFFF;
|
||||
ttCfgRegs_.ttRegPerfMask.bits.SH1_MASK = 0xFFFF;
|
||||
|
||||
// Indicate the different TT messages/tokens that should be enabled/logged
|
||||
// Indicate the different TT tokens that specify register operations to be logged
|
||||
ttCfgRegs_.ttRegTokenMask.u32All = 0;
|
||||
ttCfgRegs_.ttRegTokenMask.bits.REG_MASK = 0xFF;
|
||||
ttCfgRegs_.ttRegTokenMask.bits.TOKEN_MASK = 0xFFFF;
|
||||
ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL = 0x1;
|
||||
|
||||
// Override TokenMask1 value if a user value is available
|
||||
uint32_t tokenMask1 = SetTokenMask();
|
||||
if (tokenMask1) {
|
||||
ttCfgRegs_.ttRegTokenMask.u32All = tokenMask1;
|
||||
}
|
||||
|
||||
// Indicate the different TT tokens that specify instruction operations to be logged
|
||||
// Disabling specifically instruction operations updating Program Counter (PC).
|
||||
// @note: The field is defined in the spec incorrectly as a 16-bit value
|
||||
ttCfgRegs_.ttRegTokenMask2.u32All = 0;
|
||||
ttCfgRegs_.ttRegTokenMask2.bits.INST_MASK = 0xFFFFFF7F;
|
||||
|
||||
// Override TokenMask2 value if a user value is available
|
||||
uint32_t tokenMask2 = SetTokenMask2();
|
||||
if (tokenMask2) {
|
||||
ttCfgRegs_.ttRegTokenMask2.u32All = tokenMask2;
|
||||
}
|
||||
}
|
||||
|
||||
void Gfx9ThreadTrace::setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) {
|
||||
// Compute the size of buffer available for each shader engine
|
||||
ttBuffSize_ = sqttBuffSz / numSE_;
|
||||
|
||||
// Populate the sqtt buffer array submitted to device
|
||||
for (int idx = 0; idx < numSE_; idx++) {
|
||||
uint64_t sqttSEAddr = uint64_t(sqttBuffer + (ttBuffSize_ * idx));
|
||||
devMemList_.push_back(sqttSEAddr);
|
||||
}
|
||||
|
||||
// Update the size bit-field of sqtt ctrl register
|
||||
ttCfgRegs_.ttRegSize.bits.SIZE = ttBuffSize_ >> TT_BUFF_ALIGN_SHIFT;
|
||||
}
|
||||
|
||||
/// @brief Appends the PM4 packets that configure and start a thread trace session.
///
/// Packet order: broadcast GRBM index, wait-idle, mask / perf mask / token
/// mask / token mask2 / mode (still OFF), optional HIWATER, then per shader
/// engine the buffer base, size and a control write with RESET_BUFFER set,
/// back to broadcast, wait-idle, mode ON, wait-idle.
///
/// @param cmdBuff   Command buffer the packets are appended to.
/// @param cmdWriter Writer used to build every packet.
void Gfx9ThreadTrace::BeginSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program GRBM to broadcast the following writes to all shader engines.
  regGRBM_GFX_INDEX grbmIndex;
  grbmIndex.u32All = 0;
  grbmIndex.bitfields.SH_BROADCAST_WRITES = 1;
  grbmIndex.bitfields.SE_BROADCAST_WRITES = 1;
  grbmIndex.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbmIndex.u32All);

  // Emit a wait-idle packet before touching the thread trace registers.
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Neither RLC_PERFMON_CLK_CNTL nor COMPUTE_THREAD_TRACE_ENABLE is written
  // by this function.

  // Thread trace mask: SH, CU, SIMD and VM Id selection plus the stall enables.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MASK,
                                        ttCfgRegs_.ttRegMask.u32All);

  // Thread trace perf mask.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_PERF_MASK,
                                        ttCfgRegs_.ttRegPerfMask.u32All);

  // Thread trace token mask.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK,
                                        ttCfgRegs_.ttRegTokenMask.u32All);

  // Thread trace token mask2: the instruction tokens to record.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_TOKEN_MASK2,
                                        ttCfgRegs_.ttRegTokenMask2.u32All);

  // Thread trace mode; the cached image still has MODE set to OFF here.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
                                        ttCfgRegs_.ttRegMode.u32All);

  // The HiWaterMark register is only programmed when some form of stalling
  // (or drop-on-stall) is enabled.
  const bool stallInUse = (ttCfgRegs_.ttRegMask.bits.SQ_STALL_EN) ||
                          (ttCfgRegs_.ttRegMask.bits.SPI_STALL_EN) ||
                          (ttCfgRegs_.ttRegMask.bits.REG_STALL_EN) ||
                          (ttCfgRegs_.ttRegTokenMask.bits.REG_DROP_ON_STALL);
  if (stallInUse) {
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_HIWATER, 0x06);
  }

  // Per shader engine: program the address and size of its trace buffer.
  for (uint32_t se = 0; se < numSE_; ++se) {
    // Direct the following writes to this shader engine only.
    grbmIndex.bitfields.SH_INDEX = 0;
    grbmIndex.bitfields.SE_INDEX = se;
    grbmIndex.bitfields.SH_BROADCAST_WRITES = 0;
    grbmIndex.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbmIndex.u32All);

    // SQ_THREAD_TRACE_BASE2 (high address bits) is not written here; only
    // the low 32 bits of the shifted address are programmed.
    const uint64_t shiftedBase = devMemList_[se] >> TT_BUFF_ALIGN_SHIFT;

    regSQ_THREAD_TRACE_BASE baseReg = {};
    baseReg.bits.ADDR = Low32(shiftedBase);
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_BASE, baseReg.u32All);

    // Size of the thread trace buffer of this shader engine.
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE,
                                          ttCfgRegs_.ttRegSize.u32All);

    // Control register with only RESET_BUFFER set.
    regSQ_THREAD_TRACE_CTRL ctrlReg = {};
    ctrlReg.u32All = 0;
    ctrlReg.bits.RESET_BUFFER = 1;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, ctrlReg.u32All);
  }

  // Put GRBM back into broadcast mode.
  grbmIndex.bitfields.SH_BROADCAST_WRITES = 1;
  grbmIndex.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbmIndex.u32All);

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Switch the session on. The cached image goes straight back to OFF so
  // that StopSession() can write it unchanged to disable the trace.
  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_ON;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
                                        ttCfgRegs_.ttRegMode.u32All);
  ttCfgRegs_.ttRegMode.bits.MODE = SQ_THREAD_TRACE_MODE_OFF;

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
}
|
||||
|
||||
/// @brief Appends the PM4 packets that stop a thread trace session and collect its status.
///
/// Packet order: broadcast GRBM index, wait-idle, mode OFF, wait-idle, then
/// per shader engine a wait on the status register followed by copies of the
/// STATUS, CNTR and WPTR registers into ttStatus_, back to broadcast, cache
/// flush, size register cleared, control write with RESET_BUFFER set,
/// wait-idle.
///
/// @param cmdBuff   Command buffer the packets are appended to.
/// @param cmdWriter Writer used to build every packet.
void Gfx9ThreadTrace::StopSession(DefaultCmdBuf* cmdBuff, CommandWriter* cmdWriter) {
  // Program Grbm to broadcast messages to all shader engines
  regGRBM_GFX_INDEX grbm_gfx_index;
  grbm_gfx_index.u32All = 0;
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.INSTANCE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);

  // Emit a wait-idle packet before disabling the trace
  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Program the thread trace mode register to disable thread trace.
  // BeginSession() leaves MODE set to OFF in the cached register image.
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_MODE,
                                        ttCfgRegs_.ttRegMode.u32All);

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);

  // Iterate through the list of SE's and read the Status, Counter and
  // Write Pointer registers of Thread Trace subsystem
  for (uint32_t idx = 0; idx < numSE_; idx++) {
    // Program Grbm to direct writes to one SE
    grbm_gfx_index.bitfields.SH_INDEX = 0;
    grbm_gfx_index.bitfields.SE_INDEX = idx;
    grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 0;
    grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 0;
    cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);

    // Issue WaitRegMem command to wait until SQTT event has completed.
    // NOTE(review): with mask 0x40000000 the masked status is either 0 or
    // 0x40000000 and can never equal the reference value 0x01 - confirm
    // this is the comparison BuildWaitRegMemCommand() is meant to perform
    // when funcEq is false.
    bool funcEq = false;
    bool memSpace = false;
    uint32_t waitVal = 0x01;
    uint32_t maskVal = 0x40000000L;
    uint32_t statusOffset = mmSQ_THREAD_TRACE_STATUS - UCONFIG_SPACE_START;
    cmdWriter->BuildWaitRegMemCommand(cmdBuff, memSpace, statusOffset, funcEq, maskVal, waitVal);

    // Each shader engine owns TT_STATUS_IDX_MAX consecutive entries of ttStatus_
    uint32_t* seStatus = ttStatus_ + (TT_STATUS_IDX_MAX * idx);

    // Retrieve the values from various status registers
    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_STATUS, 0,
                                   seStatus + TT_STATUS_IDX_STATUS,
                                   COPY_DATA_SEL_COUNT_1DW, true);

    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_CNTR, 0,
                                   seStatus + TT_STATUS_IDX_CNTR,
                                   COPY_DATA_SEL_COUNT_1DW, true);

    cmdWriter->BuildCopyDataPacket(cmdBuff, COPY_DATA_SEL_SRC_SYS_PERF_COUNTER,
                                   mmSQ_THREAD_TRACE_WPTR, 0,
                                   seStatus + TT_STATUS_IDX_WPTR,
                                   COPY_DATA_SEL_COUNT_1DW, true);
  }

  // Reset the GRBM to broadcast mode
  grbm_gfx_index.bitfields.SH_BROADCAST_WRITES = 1;
  grbm_gfx_index.bitfields.SE_BROADCAST_WRITES = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmGRBM_GFX_INDEX, grbm_gfx_index.u32All);

  // Initialize cache flush request object
  FlushCacheOptions flush;
  flush.l1 = true;
  flush.l2 = true;
  flush.icache = true;
  flush.kcache = true;
  cmdWriter->BuildFlushCacheCmd(cmdBuff, &flush, NULL, 0);

  // Clear the size of thread trace buffer
  regSQ_THREAD_TRACE_SIZE ttRegSize = {0};
  ttRegSize.u32All = 0;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_SIZE, ttRegSize.u32All);

  // Program the thread trace ctrl register with only RESET_BUFFER set
  regSQ_THREAD_TRACE_CTRL sqttCtrl = {};
  sqttCtrl.u32All = 0;
  sqttCtrl.bits.RESET_BUFFER = 1;
  cmdWriter->BuildWriteUConfigRegPacket(cmdBuff, mmSQ_THREAD_TRACE_CTRL, sqttCtrl.u32All);

  // Neither COMPUTE_THREAD_TRACE_ENABLE nor RLC_PERFMON_CLK_CNTL is written
  // by this function.

  cmdWriter->BuildWriteWaitIdlePacket(cmdBuff);
}
|
||||
|
||||
bool Gfx9ThreadTrace::Validate() {
|
||||
// Iterate through the list of SE to verify
|
||||
for (int idx = 0; idx < numSE_; idx++) {
|
||||
// Determine if the buffer has wrapped
|
||||
uint32_t statusIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_STATUS);
|
||||
if (ttStatus_[statusIdx] & 0x80000000) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Adjust the value of Write Ptr which is bits [29-0]
|
||||
uint32_t wptrIdx = ((TT_STATUS_IDX_MAX * idx) + TT_STATUS_IDX_WPTR);
|
||||
ttStatus_[wptrIdx] = (ttStatus_[wptrIdx] & TT_WRITE_PTR_MASK);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,104 @@
|
||||
#ifndef _GFX9_THREAD_TRACE_H_
|
||||
#define _GFX9_THREAD_TRACE_H_
|
||||
|
||||
#include "gfxip/gfx9/gfx9_registers.h"
|
||||
#include "gfxip/gfx9/gfx9_typedef.h"
|
||||
#include "gfxip/gfx9/gfx9_enum.h"
|
||||
#include "gfxip/gfx9/gfx9_offset.h"
|
||||
#include "gfxip/gfx9/gfx9_pm4defs.h"
|
||||
#include "thread_trace.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
using namespace pm4_profile::gfx9;
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
typedef struct Gfx9ThreadTraceCfgRegs {
|
||||
// Size of thread trace buffer
|
||||
regSQ_THREAD_TRACE_SIZE ttRegSize;
|
||||
// Thread trace mode
|
||||
regSQ_THREAD_TRACE_MODE ttRegMode;
|
||||
// Thread trace wave mask
|
||||
regSQ_THREAD_TRACE_MASK ttRegMask;
|
||||
// Thread trace token mask
|
||||
regSQ_THREAD_TRACE_TOKEN_MASK ttRegTokenMask;
|
||||
// Thread trace token mask2
|
||||
regSQ_THREAD_TRACE_TOKEN_MASK2 ttRegTokenMask2;
|
||||
// Thread trace perf mask
|
||||
regSQ_THREAD_TRACE_PERF_MASK ttRegPerfMask;
|
||||
} Gfx9ThreadTraceCfgRegs;
|
||||
|
||||
// Encapsulates the various Api and structures used to enable a thread
|
||||
// trace session and collect its data
|
||||
class Gfx9ThreadTrace : public ThreadTrace {
|
||||
public:
|
||||
Gfx9ThreadTrace();
|
||||
|
||||
~Gfx9ThreadTrace();
|
||||
|
||||
// Initializes various data structures and handles that
|
||||
// are needed to support a thread trace session
|
||||
bool Init(const ThreadTraceConfig* config);
|
||||
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// enable a thread trace session, including the issue of an event
|
||||
// to begin thread session
|
||||
void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
|
||||
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// disable a thread trace session, including the issue of an event
|
||||
// to stop currently ongoing thread session
|
||||
void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff, pm4_profile::CommandWriter* cmdWriter);
|
||||
|
||||
// Validates that thread trace session ran correctly i.e. did not
|
||||
// encounter any errors.
|
||||
bool Validate();
|
||||
|
||||
// Initializes the handle of buffer used to collect SQTT data
|
||||
void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz);
|
||||
|
||||
// Initializes the handle of buffer used to read control data of SQTT
|
||||
void setSqttCtrlBuff(uint32_t* ctrlBuff) { ttStatus_ = ctrlBuff; }
|
||||
|
||||
// Return status info size
|
||||
uint32_t StatusSizeInfo() const { return TT_STATUS_IDX_MAX * sizeof(uint32_t) * numSE_; }
|
||||
|
||||
// Return number of Shader Engines
|
||||
uint32_t getNumSe() { return numSE_; }
|
||||
|
||||
private:
|
||||
// Holds number of Shader Engines present on device
|
||||
uint32_t numSE_;
|
||||
|
||||
// Thread traces status register indices to determine
|
||||
// status of thread trace run
|
||||
typedef enum {
|
||||
TT_STATUS_IDX_STATUS = 0,
|
||||
TT_STATUS_IDX_CNTR = 1,
|
||||
TT_STATUS_IDX_WPTR = 2,
|
||||
TT_STATUS_IDX_MAX = 3
|
||||
} TTStatusReg;
|
||||
|
||||
// A list of tuples of TT_STATUS_IDX_MAX size,
|
||||
// giving status of thread trace
|
||||
uint32_t* ttStatus_;
|
||||
|
||||
// Size of thread trace buffer per shader engine
|
||||
uint32_t ttBuffSize_;
|
||||
|
||||
// Handles of Device memory used for thread trace
|
||||
std::vector<uint64_t> devMemList_;
|
||||
|
||||
// Registers that need to be programmed for Thread Trace
|
||||
Gfx9ThreadTraceCfgRegs ttCfgRegs_;
|
||||
|
||||
// Initializes thread trace registers with default parameters.
|
||||
// These are potentially updated based on updates to thread trace
|
||||
// configuration object by user
|
||||
void InitThreadTraceCfgRegs();
|
||||
};
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _GFX9_THREAD_TRACE_H_
|
||||
@@ -0,0 +1,105 @@
|
||||
#include <cassert>
#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

#include "core/util/os.h"
#include "thread_trace.h"
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// Stores the session configuration: a copy of *config when one is supplied,
// otherwise the defaults produced by InitThreadTraceConfig(). Always
// returns true.
bool ThreadTrace::Init(const ThreadTraceConfig* config) {
  if (config == nullptr) {
    InitThreadTraceConfig(&ttConfig_);
    return true;
  }

  ttConfig_ = *config;
  return true;
}
|
||||
|
||||
// Resets every field of *config (target CU, VMID mask, mask, token mask and
// token mask 2) to zero. ThreadTraceConfig consists solely of uint32_t
// members, so value-initialization yields the same all-zero object as the
// byte-wise clear followed by per-field zero assignments.
void ThreadTrace::InitThreadTraceConfig(ThreadTraceConfig* config) const {
  *config = ThreadTraceConfig();
}
|
||||
|
||||
uint8_t ThreadTrace::SetCuId() {
|
||||
uint32_t cuId = ttConfig_.threadTraceTargetCu;
|
||||
|
||||
// Allow users to specify the CU to choose for Target tokens
|
||||
std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_CU");
|
||||
if (var.length() > 0) {
|
||||
cuId = std::stol(var, nullptr, 16);
|
||||
std::cout << "Using " << cuId << " as CUID for Thread Trace" << std::endl;
|
||||
}
|
||||
|
||||
assert((cuId <= 15) && "Cu Id must be between 0 and 15");
|
||||
|
||||
return cuId;
|
||||
}
|
||||
|
||||
uint8_t ThreadTrace::SetVmId() {
|
||||
uint32_t vmId = ttConfig_.threadTraceVmIdMask;
|
||||
|
||||
// Allow users to specify the VMID to choose for Target tokens
|
||||
std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_VMID");
|
||||
if (var.length() > 0) {
|
||||
vmId = std::stol(var, nullptr, 16);
|
||||
std::cout << "Using " << vmId << " as VMID for Thread Trace" << std::endl;
|
||||
}
|
||||
|
||||
assert((vmId <= 2) && "VmId must be between 0 and 2");
|
||||
|
||||
return vmId;
|
||||
}
|
||||
|
||||
uint32_t ThreadTrace::SetMask() {
|
||||
uint32_t ttMask = ttConfig_.threadTraceMask;
|
||||
const uint32_t validMask = 0x00C0D0;
|
||||
|
||||
// Allow users to specify the Mask to choose for configuration parameters
|
||||
std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_MASK");
|
||||
if (var.length() > 0) {
|
||||
ttMask = std::stol(var, nullptr, 16);
|
||||
std::cout << "Using " << ttMask << " as Mask for Thread Trace" << std::endl;
|
||||
}
|
||||
|
||||
assert(((ttMask & validMask) == 0) && "Mask should have bits [4,6,7] set to Zero");
|
||||
|
||||
return ttMask;
|
||||
}
|
||||
|
||||
uint32_t ThreadTrace::SetTokenMask() {
|
||||
uint32_t tokenMask = ttConfig_.threadTraceTokenMask;
|
||||
const uint32_t validMask = 0xFF000000;
|
||||
|
||||
// Allow users to specify the TokenMask to choose for Target tokens
|
||||
std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK1");
|
||||
if (var.length() > 0) {
|
||||
tokenMask = std::stol(var, nullptr, 16);
|
||||
std::cout << "Using " << tokenMask << " as TokenMask for Thread Trace" << std::endl;
|
||||
}
|
||||
|
||||
assert(((tokenMask & validMask) == 0) && "TokenMask should have bits [31:25] set to Zero");
|
||||
|
||||
return tokenMask;
|
||||
}
|
||||
|
||||
uint32_t ThreadTrace::SetTokenMask2() {
|
||||
uint32_t tokenMask2 = ttConfig_.threadTraceTokenMask2;
|
||||
const uint32_t validMask = 0xFFFF0000;
|
||||
|
||||
// Allow users to specify the TokenMask2 to choose for Target tokens
|
||||
std::string var = os::GetEnvVar("HSA_THREAD_TRACE_SELECT_TOKEN_MASK2");
|
||||
if (var.length() > 0) {
|
||||
tokenMask2 = std::stol(var, nullptr, 16);
|
||||
std::cout << "Using " << tokenMask2 << " as TokenMask2 for Thread Trace" << std::endl;
|
||||
}
|
||||
|
||||
assert(((tokenMask2 & validMask) == 0) && "TokenMask2 should have bits [31:16] set to Zero");
|
||||
|
||||
return tokenMask2;
|
||||
}
|
||||
|
||||
} // pm4_profile
|
||||
@@ -0,0 +1,104 @@
|
||||
#ifndef _THREAD_TRACE_H_
|
||||
#define _THREAD_TRACE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "cmdwriter.h"
|
||||
|
||||
// Move them as static variables later on
|
||||
#define TT_WRITE_PTR_MASK (0x3FFFFFFF)
|
||||
#define TT_DEFAULT_BUFF_SIZE_SCALE (16)
|
||||
#define TT_DEFAULT_BUFF_SIZE (1024 * 1024 * 8)
|
||||
|
||||
// Size of block in bytes per increment in WPTR
|
||||
#define TT_WRITE_PTR_BLK (32)
|
||||
|
||||
// Factor by which to shift buffer address
|
||||
#define TT_BUFF_ALIGN_SHIFT (12)
|
||||
|
||||
// Align address to 64 Kilobytes
|
||||
#define TT_BUFF_ADDR_ALIGN (0x10000)
|
||||
|
||||
namespace pm4_profile {
|
||||
|
||||
// ThreadTrace config
|
||||
// User-adjustable parameters of a thread trace session. Each field is read
// by the matching ThreadTrace::Set*() accessor.
struct ThreadTraceConfig {
  uint32_t threadTraceTargetCu;    // read by SetCuId()
  uint32_t threadTraceVmIdMask;    // read by SetVmId()
  uint32_t threadTraceMask;        // read by SetMask()
  uint32_t threadTraceTokenMask;   // read by SetTokenMask()
  uint32_t threadTraceTokenMask2;  // read by SetTokenMask2()
};
|
||||
|
||||
// Encapsulates the various Api and structures that are used to enable
|
||||
// a thread trace session and collect its data. Implementations of this
|
||||
// interface program device specific registers to realize the functionality
|
||||
class ThreadTrace {
|
||||
// Holds Thread Trace configuration information
|
||||
// @note: Currently not used i.e. is not exposed to users
|
||||
ThreadTraceConfig ttConfig_;
|
||||
|
||||
public:
|
||||
// Destructor of the thread trace service handle
|
||||
virtual ~ThreadTrace(){};
|
||||
|
||||
// Obtain the CU id to use for thread tracing
|
||||
uint8_t SetCuId();
|
||||
|
||||
// Obtain the VM id to use for thread tracing
|
||||
uint8_t SetVmId();
|
||||
|
||||
// Obtain the Mask to use for thread tracing
|
||||
uint32_t SetMask();
|
||||
|
||||
// Obtain the Token Mask 1 to use for thread tracing
|
||||
uint32_t SetTokenMask();
|
||||
|
||||
// Obtain the Token Mask 2 to use for thread tracing
|
||||
uint32_t SetTokenMask2();
|
||||
|
||||
// Initializes various data structures and handles that
|
||||
// are needed to support a thread trace session
|
||||
virtual bool Init(const ThreadTraceConfig* config);
|
||||
|
||||
// Initializes thread trace configuration object with default
|
||||
// parameters, that could potentially be overriden by user
|
||||
// @note: Currently not used i.e. is not exposed to users
|
||||
virtual void InitThreadTraceConfig(ThreadTraceConfig* config) const;
|
||||
|
||||
// Allows user to configure various parameters of a thread trace session
|
||||
// @note: Currently not used i.e. is not exposed to users
|
||||
bool Config(uint32_t key, uint32_t value) { return true; };
|
||||
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// enable a thread trace session, including the issue of an event
|
||||
// to begin thread session
|
||||
virtual void BeginSession(pm4_profile::DefaultCmdBuf* cmdBuff,
|
||||
pm4_profile::CommandWriter* cmdWriter) = 0;
|
||||
|
||||
// Builds Pm4 command stream to program hardware registers that
|
||||
// disable a thread trace session, including the issue of an event
|
||||
// to stop currently ongoing thread session
|
||||
virtual void StopSession(pm4_profile::DefaultCmdBuf* cmdBuff,
|
||||
pm4_profile::CommandWriter* cmdWriter) = 0;
|
||||
|
||||
// Validates that thread trace session ran correctly i.e. did not
|
||||
// encounter any errors.
|
||||
virtual bool Validate() = 0;
|
||||
|
||||
// Initializes the handle of buffer used to collect SQTT data
|
||||
virtual void setSqttDataBuff(uint8_t* sqttBuffer, uint32_t sqttBuffSz) = 0;
|
||||
|
||||
// Initializes the handle of buffer used to read control data of SQTT
|
||||
virtual void setSqttCtrlBuff(uint32_t* ctrlBuff) = 0;
|
||||
|
||||
// Return number of Shader Engines
|
||||
virtual uint32_t getNumSe() = 0;
|
||||
|
||||
// Return status info size
|
||||
virtual uint32_t StatusSizeInfo() const = 0;
|
||||
};
|
||||
|
||||
} // pm4_profile
|
||||
|
||||
#endif // _THREAD_TRACE_H_
|
||||
@@ -0,0 +1,17 @@
|
||||
#
# Rocr Utils library: a single Linux OS-abstraction source file
#
set ( MODULE_SRC ${CORE_UTIL_DIR}/lnx/os_linux.cpp )

#
# Header search paths, in the same order as before
#
include_directories (
  $ENV{ROCR_INC_DIR}
  ${HSA_RUNTIME_OSC_DIR}
  ${CORE_UTIL_DIR}
)

#
# Utils is built as a static library and linked against the
# system libraries it depends on
#
add_library ( ${UTIL_LIB} STATIC ${MODULE_SRC} )
target_link_libraries ( ${UTIL_LIB} c stdc++ dl pthread rt )
|
||||
@@ -0,0 +1,48 @@
|
||||
#
# Header files include path(s).
# Kept at directory scope because the 'common' subdirectory added below
# inherits them.
#
include_directories ( $ENV{ROCR_INC_DIR} )
include_directories ( ${API_DIR} )
include_directories ( ${PROJ_DIR}/cmdwriter )
include_directories ( ${PROJ_DIR}/perfcounter )
include_directories ( ${PROJ_DIR}/threadtrace )
include_directories ( ${PROJ_DIR}/aqlprofile )
include_directories ( ${TEST_DIR}/common )
include_directories ( ${TEST_DIR}/ctrl )
include_directories ( ${CORE_UTIL_DIR} )

#
# Specify the directory containing the libraries of HsaRt
# to be linked against for building a Hsa Perf application
#
link_directories ( $ENV{ROCR_LIB_DIR} )
find_library ( ROCR_LIB NAMES hsa-runtime64 PATHS $ENV{ROCR_LIB_DIR} )
# Fail with a clear message instead of the generic NOTFOUND error that
# would otherwise only appear when the library is linked below.
if ( NOT ROCR_LIB )
  message ( FATAL_ERROR "hsa-runtime64 not found; set the ROCR_LIB_DIR environment variable" )
endif ()

#
# Set Name for Common library and build it as a
# static library to be linked with others
#
set ( COMMON_LIB "common${ONLY64STR}" )
add_subdirectory ( ${TEST_DIR}/common "${PROJECT_BINARY_DIR}/common" )

#
# Build the test library
#
set ( TEST_NAME simple_convolution )
include_directories ( ${TEST_DIR}/${TEST_NAME} )
set ( LIB_NAME "${TEST_NAME}${ONLY64STR}" )
add_library ( ${LIB_NAME} STATIC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp )
# PUBLIC keeps the transitive behaviour of the keyword-less form.
target_link_libraries ( ${LIB_NAME} PUBLIC c stdc++ )

# Copy the pre-built kernel objects next to the test binaries. Uses CMake's
# own file commands instead of spawning 'sh -c cp', which is not portable
# and whose failure was silently ignored.
file ( GLOB TEST_HSACO_FILES "${TEST_DIR}/${TEST_NAME}/*.hsaco" )
if ( TEST_HSACO_FILES )
  file ( COPY ${TEST_HSACO_FILES} DESTINATION "${PROJECT_BINARY_DIR}" )
endif ()
set ( TEST_LIBS ${LIB_NAME} )

#
# Build the test control
#
set ( SRC_LIST
  ${TEST_DIR}/ctrl/test.cpp
  ${TEST_DIR}/ctrl/test_pmgr.cpp
  ${TEST_DIR}/ctrl/test_hsa.cpp
)
# NOTE(review): the utils library is built under the name ${UTIL_LIB};
# ${CORE_UTILS_LIB} is not set anywhere in this file - confirm it is defined
# by the including project.
set ( LIB_LIST ${TEST_LIBS} ${COMMON_LIB} ${CORE_UTILS_LIB} ${ROCR_LIB} ${TARGET_LIB} )
set ( EXE_NAME "ctrl" )
add_executable ( ${EXE_NAME} ${SRC_LIST} )
target_link_libraries ( ${EXE_NAME} PRIVATE ${LIB_LIST} c stdc++ dl pthread rt )
|
||||
Исполняемый файл
+876
@@ -0,0 +1,876 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <climits>
|
||||
#include "hsa/hsa.h"
|
||||
#include "hsa/hsa_ext_amd.h"
|
||||
|
||||
// Logs the failing status together with source location and returns it from
// the enclosing function when `err` is not HSA_STATUS_SUCCESS.
//
// The argument is evaluated exactly once, so passing a call expression is
// safe. The body is wrapped in do { } while (0) so the macro behaves as a
// single statement (e.g. inside an unbraced if/else); it must be followed by
// a semicolon.
#define RET_IF_HSA_ERR(err)                                                     \
  do {                                                                          \
    const hsa_status_t ret_if_hsa_err_status_ = (err);                          \
    if (ret_if_hsa_err_status_ != HSA_STATUS_SUCCESS) {                         \
      std::cout << "hsa api call failure at line " << __LINE__ << ", file: "    \
                << __FILE__ << ". Call returned " << ret_if_hsa_err_status_     \
                << std::endl;                                                   \
      return ret_if_hsa_err_status_;                                            \
    }                                                                           \
  } while (0)
|
||||
|
||||
static const uint32_t kBinarySearchLength = 512;
|
||||
static const uint32_t kBinarySearchFindMe = 108;
|
||||
static const uint32_t kWorkGroupSize = 256;
|
||||
|
||||
// Hold all the info specific to binary search
|
||||
typedef struct BinarySearch {
|
||||
// Binary Search parameters
|
||||
uint32_t length;
|
||||
uint32_t work_group_size;
|
||||
uint32_t work_grid_size;
|
||||
uint32_t num_sub_divisions;
|
||||
uint32_t find_me;
|
||||
|
||||
// Buffers needed for this application
|
||||
uint32_t* input;
|
||||
uint32_t* input_arr;
|
||||
uint32_t* input_arr_local;
|
||||
uint32_t* output;
|
||||
// Keneral argument buffers and addresses
|
||||
void* kern_arg_buffer; // Begin of allocated memory
|
||||
// this pointer to be deallocated
|
||||
void* kern_arg_address; // Properly aligned address to be used in aql
|
||||
// packet (don't use for deallocation)
|
||||
|
||||
// Kernel code
|
||||
std::string kernel_file_name;
|
||||
std::string kernel_name;
|
||||
uint32_t kernarg_size;
|
||||
uint32_t kernarg_align;
|
||||
|
||||
// HSA/RocR objects needed for this application
|
||||
hsa_agent_t gpu_dev;
|
||||
hsa_agent_t cpu_dev;
|
||||
hsa_signal_t signal;
|
||||
hsa_queue_t* queue;
|
||||
hsa_amd_memory_pool_t cpu_pool;
|
||||
hsa_amd_memory_pool_t gpu_pool;
|
||||
hsa_amd_memory_pool_t kern_arg_pool;
|
||||
|
||||
// Other items we need to populate AQL packet
|
||||
uint64_t kernel_object;
|
||||
uint32_t group_segment_size; ///< Kernel group seg size
|
||||
uint32_t private_segment_size; ///< Kernel private seg size
|
||||
} BinarySearch;
|
||||
|
||||
// Fills in the fixed parameters of the sample: the kernel object file and
// kernel name, the array length, the value searched for, the work-group
// size and the derived number of sub-divisions. The numeric values come
// from the file-level constants (512, 108 and 256 respectively).
void InitializeBinarySearch(BinarySearch* bs) {
  // Problem size and launch geometry
  bs->length = kBinarySearchLength;
  bs->find_me = kBinarySearchFindMe;
  bs->work_group_size = kWorkGroupSize;
  bs->num_sub_divisions = bs->length / bs->work_group_size;

  // Kernel location
  bs->kernel_file_name = "./binary_search_kernels.hsaco";
  bs->kernel_name = "binarySearch";
}
|
||||
|
||||
// This function is called by the call-back functions used to find an agent of
|
||||
// the specified hsa_device_type_t. Note that it cannot be called directly from
|
||||
// hsa_iterate_agents() as it does not match the prototype of the call-back
|
||||
// function. It must be wrapped by a function with the correct prototype.
|
||||
//
|
||||
// Return values:
|
||||
// HSA_STATUS_INFO_BREAK -- "agent" is of the specified type (dev_type)
|
||||
// HSA_STATUS_SUCCESS -- "agent" is not of the specified type
|
||||
// Other -- Some error occurred
|
||||
// Shared body of the agent-iteration call-backs. Queries the device type of
// `agent`; when it equals `dev_type`, stores the agent through `data`
// (treated as hsa_agent_t*) and returns HSA_STATUS_INFO_BREAK. Returns
// HSA_STATUS_SUCCESS for a non-matching agent,
// HSA_STATUS_ERROR_INVALID_ARGUMENT when `data` is null, or the error
// reported by hsa_agent_get_info().
static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
                              hsa_device_type_t dev_type) {
  if (!data) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Ask the runtime what kind of device this agent is
  hsa_device_type_t agent_type;
  const hsa_status_t status =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &agent_type);
  RET_IF_HSA_ERR(status);

  if (agent_type != dev_type) {
    // Not the kind of agent requested; let the iteration continue
    return HSA_STATUS_SUCCESS;
  }

  *static_cast<hsa_agent_t*>(data) = agent;
  return HSA_STATUS_INFO_BREAK;
}
|
||||
|
||||
// This is the call-back function used to find a GPU type agent. Note that the
|
||||
// prototype of this function is dictated by the HSA specification
|
||||
// hsa_iterate_agents() call-back that selects agents of type
// HSA_DEVICE_TYPE_GPU by delegating to FindAgent().
hsa_status_t FindGPUDevice(hsa_agent_t agent, void* data) {
  const hsa_device_type_t wanted = HSA_DEVICE_TYPE_GPU;
  return FindAgent(agent, data, wanted);
}
|
||||
|
||||
// This is the call-back function used to find a CPU type agent. Note that the
|
||||
// prototype of this function is dictated by the HSA specification
|
||||
// hsa_iterate_agents() call-back that selects agents of type
// HSA_DEVICE_TYPE_CPU by delegating to FindAgent().
hsa_status_t FindCPUDevice(hsa_agent_t agent, void* data) {
  const hsa_device_type_t wanted = HSA_DEVICE_TYPE_CPU;
  return FindAgent(agent, data, wanted);
}
|
||||
|
||||
// Find the CPU and GPU agents we need to run this sample, and save them in the
|
||||
// BinarySearch structure for later use.
|
||||
// Locates one GPU agent and one CPU agent and records them in bs->gpu_dev
// and bs->cpu_dev. Returns HSA_STATUS_SUCCESS when both were found and
// HSA_STATUS_ERROR otherwise.
hsa_status_t FindDevices(BinarySearch* bs) {
  // hsa_iterate_agents() keeps calling the call-back until it returns
  // something other than HSA_STATUS_SUCCESS and then hands that value back.
  // The call-backs return HSA_STATUS_INFO_BREAK once a matching agent was
  // stored, so any other result means "no such agent".
  bs->gpu_dev.handle = 0;
  if (hsa_iterate_agents(FindGPUDevice, &bs->gpu_dev) != HSA_STATUS_INFO_BREAK) {
    return HSA_STATUS_ERROR;
  }

  bs->cpu_dev.handle = 0;
  if (hsa_iterate_agents(FindCPUDevice, &bs->cpu_dev) != HSA_STATUS_INFO_BREAK) {
    return HSA_STATUS_ERROR;
  }

  // A zero handle after a reported match is treated as a failure too
  if (bs->gpu_dev.handle == 0) {
    std::cout << "GPU Device is not Created properly!" << std::endl;
    RET_IF_HSA_ERR(HSA_STATUS_ERROR);
  }

  if (bs->cpu_dev.handle == 0) {
    std::cout << "CPU Device is not Created properly!" << std::endl;
    RET_IF_HSA_ERR(HSA_STATUS_ERROR);
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// This function checks to see if the provided
|
||||
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
|
||||
// the function adds an additional requirement that the pool have the
|
||||
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false,
|
||||
// pools must NOT have this property.
|
||||
// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
|
||||
// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
|
||||
// no pool was found meeting the requirements. If an error is encountered, we
|
||||
// return that error.
|
||||
|
||||
// Note that this function does not match the required prototype for the
|
||||
// hsa_amd_agent_iterate_memory_pools call back function, and therefore must be
|
||||
// wrapped by a function with the correct prototype.
|
||||
// Shared body of the memory-pool iteration call-backs. Accepts `pool` when
// it lives in the global segment and its kernarg-init flag matches
// `kern_arg`; the pool is then stored through `data` (treated as
// hsa_amd_memory_pool_t*) and HSA_STATUS_INFO_BREAK is returned. Returns
// HSA_STATUS_SUCCESS for a pool that does not qualify,
// HSA_STATUS_ERROR_INVALID_ARGUMENT when `data` is null, or the error
// reported by hsa_amd_memory_pool_get_info().
static hsa_status_t
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
  if (data == nullptr) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Only pools in the global segment are of interest
  hsa_amd_segment_t pool_segment;
  hsa_status_t status = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_segment);
  RET_IF_HSA_ERR(status);

  if (pool_segment != HSA_AMD_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }

  uint32_t global_flags;
  status = hsa_amd_memory_pool_get_info(
      pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flags);
  RET_IF_HSA_ERR(status);

  // The pool qualifies only when its kernarg capability is exactly what
  // the caller asked for
  const bool is_kernarg_pool =
      (global_flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
  if (is_kernarg_pool != kern_arg) {
    return HSA_STATUS_SUCCESS;
  }

  *static_cast<hsa_amd_memory_pool_t*>(data) = pool;
  return HSA_STATUS_INFO_BREAK;
}
|
||||
|
||||
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
|
||||
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
|
||||
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
|
||||
// Pool-iteration call-back: accepts a global pool that is NOT a kernarg pool.
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kernarg = false;
  return FindGlobalPool(pool, data, want_kernarg);
}
|
||||
|
||||
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
|
||||
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS
|
||||
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
|
||||
// Pool-iteration call-back: accepts a global pool that IS a kernarg pool.
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kernarg = true;
  return FindGlobalPool(pool, data, want_kernarg);
}
|
||||
|
||||
// Find memory pools that we will need to allocate from for this sample
|
||||
// application. We will need memory associated with the host CPU, the GPU
|
||||
// executing the kernels, and for kernel arguments. This function will
|
||||
// save the found pools to the BinarySearch structure for use elsewhere
|
||||
// in this program.
|
||||
// Looks up the three memory pools the sample allocates from and stores them
// in *bs: a standard pool of the CPU agent, a standard pool of the GPU
// agent, and a kernarg pool of the CPU agent. Returns HSA_STATUS_SUCCESS
// when all three were found, HSA_STATUS_ERROR as soon as one lookup does
// not end with HSA_STATUS_INFO_BREAK.
hsa_status_t FindPools(BinarySearch* bs) {
  // Host memory
  if (hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool,
                                         &bs->cpu_pool) != HSA_STATUS_INFO_BREAK) {
    return HSA_STATUS_ERROR;
  }

  // Device memory
  if (hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool,
                                         &bs->gpu_pool) != HSA_STATUS_INFO_BREAK) {
    return HSA_STATUS_ERROR;
  }

  // Kernel argument memory, taken from the CPU agent
  if (hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindKernArgPool,
                                         &bs->kern_arg_pool) != HSA_STATUS_INFO_BREAK) {
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Once the needed memory pools have been found and the BinarySearch structure
|
||||
// has been updated with these handles, this function is then used to allocate
|
||||
// memory from those pools.
|
||||
// Devices with which a pool is associated already have access to the pool.
|
||||
// However, other devices may also need to read or write to that memory. Below,
|
||||
// we see how we can grant access to other devices to address this issue.
|
||||
// Allocates the four application buffers (input, output, input_arr,
// input_arr_local) from the CPU pool, grants both the GPU and the CPU agent
// access to each of them, and fills `input` with a non-decreasing sequence
// of pseudo-random values.
//
// Returns HSA_STATUS_SUCCESS on success, otherwise the first HSA error
// encountered. Buffers allocated before a failure are not released here.
hsa_status_t AllocateAndInitBuffers(BinarySearch* bs) {
  hsa_status_t err;
  const uint32_t out_length = 4 * sizeof(uint32_t);
  const uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);
  // `input` receives bs->length elements in the fill loop below, so it must
  // be sized for the whole array rather than for the sub-division table.
  const uint32_t input_length = bs->length * sizeof(uint32_t);

  // In all of these examples, we want both the cpu and gpu to have access to
  // the buffer in question. We use the array of agents below in the subsequent
  // calls to hsa_amd_agents_allow_access() for this purpose.
  hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};

  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, input_length, 0,
                                     reinterpret_cast<void**>(&bs->input));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input);
  RET_IF_HSA_ERR(err);
  (void)memset(bs->input, 0, input_length);

  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, out_length, 0,
                                     reinterpret_cast<void**>(&bs->output));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->output);
  RET_IF_HSA_ERR(err);
  // Clear the buffer that was just allocated (previously `input` was
  // cleared again here and `output` was left uninitialized).
  (void)memset(bs->output, 0, out_length);

  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
                                     reinterpret_cast<void**>(&bs->input_arr));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr);
  RET_IF_HSA_ERR(err);
  // Same fix as above: clear `input_arr`, not `input`.
  (void)memset(bs->input_arr, 0, in_length);

  err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
                                     reinterpret_cast<void**>(&bs->input_arr_local));
  RET_IF_HSA_ERR(err);
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local);
  RET_IF_HSA_ERR(err);

  // Binary-search application specific code...
  // Initialize input buffer with random values in an increasing order
  uint32_t max = bs->length * 20;
  bs->input[0] = 0;

  uint32_t seed = (unsigned int)time(NULL);
  srand(seed);

  for (uint32_t i = 1; i < bs->length; ++i) {
    // NOTE(review): `max * rand_r(&seed)` is evaluated in unsigned 32-bit
    // arithmetic and wraps before the division, so the increment is much
    // smaller than `max`. Left unchanged because the searched-for value may
    // depend on the resulting distribution - confirm the intent.
    bs->input[i] = bs->input[i - 1] +
        static_cast<uint32_t>(max * rand_r(&seed) / static_cast<float>(RAND_MAX));
  }

// #define VERBOSE 1
#ifdef VERBOSE
  std::cout << "Input array values:" << std::endl;

  for (uint32_t i = 0; i < bs->length; ++i) {
    std::cout << "input[" << i << "] = " << bs->input[i] << "  ";

    if (i % 4 == 0) {
      std::cout << std::endl;
    }
  }

  std::cout << std::endl;
#endif

  return err;
}
|
||||
|
||||
// The code in this function illustrates how to load a kernel from
|
||||
// pre-compiled code. The goal is to get a handle that can be later
|
||||
// used in an AQL packet and also to extract information about kernel
|
||||
// that we will need. All of the information and the kernel handle will
|
||||
// be saved to the BinarySearch structure. It will be used when we
|
||||
// populate the AQL packet.
|
||||
// Loads the pre-compiled code object named by bs->kernel_file_name, builds
// and freezes an executable for bs->gpu_dev, looks up the kernel symbol
// bs->kernel_name and stores its kernel object handle, private and group
// segment sizes, and kernarg segment size and alignment in *bs.
//
// Returns HSA_STATUS_SUCCESS on success, HSA_STATUS_ERROR when the file
// cannot be opened, or the first HSA error encountered.
hsa_status_t LoadKernelFromObjFile(BinarySearch* bs) {
  hsa_status_t err;
  hsa_code_object_reader_t code_obj_rdr = {0};
  hsa_executable_t executable = {0};

  hsa_file_t file_handle = open(bs->kernel_file_name.c_str(), O_RDONLY);

  if (file_handle == -1) {
    std::cout << "failed to open " << bs->kernel_file_name.c_str() <<
                  " at line " << __LINE__ << ", errno: " << errno << std::endl;
    return HSA_STATUS_ERROR;
  }

  err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
  // Close the descriptor before checking the result so that it is not
  // leaked when reader creation fails.
  close(file_handle);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_create_alt(HSA_PROFILE_FULL,
                      HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev,
                                                    code_obj_rdr, NULL, NULL);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_freeze(executable, NULL);
  RET_IF_HSA_ERR(err);

  hsa_executable_symbol_t kern_sym;
  err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(),
                                                   bs->gpu_dev, 0, &kern_sym);
  RET_IF_HSA_ERR(err);

  // Handle that goes into the AQL dispatch packet
  err = hsa_executable_symbol_get_info(kern_sym,
                                     HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
                                                          &bs->kernel_object);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_symbol_get_info(kern_sym,
                       HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
                                                   &bs->private_segment_size);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_symbol_get_info(kern_sym,
                         HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
                                                     &bs->group_segment_size);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_symbol_get_info(kern_sym,
                       HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
                                                           &bs->kernarg_size);
  RET_IF_HSA_ERR(err);

  err = hsa_executable_symbol_get_info(kern_sym,
                  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
                                                          &bs->kernarg_align);
  RET_IF_HSA_ERR(err);

  return err;
}
|
||||
|
||||
// This function shows how to do an asynchronous copy. We have to create a
|
||||
// signal and use the signal to notify us when the copy has completed.
|
||||
// Copies `size` bytes from `src` (owned by agent `src_ag`) to `dst` (owned by
// agent `dst_ag`) with hsa_amd_memory_async_copy and blocks until the copy's
// completion signal drops below 1.
//
// Returns HSA_STATUS_SUCCESS on success, or the failing HSA status otherwise.
// The temporary completion signal is destroyed on every path after it has been
// created, including the failure paths.
hsa_status_t AgentMemcpy(void* dst, const void* src,
                         size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
  hsa_signal_t s;
  hsa_status_t err;

  // The signal starts at 1; the wait below completes once it is < 1.
  err = hsa_signal_create(1, 0, NULL, &s);
  RET_IF_HSA_ERR(err);

  err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
  if (err != HSA_STATUS_SUCCESS) {
    // Do not leak the signal when the copy could not be submitted.
    (void)hsa_signal_destroy(s);
    RET_IF_HSA_ERR(err);
  }

  if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
    std::cout << "Async copy signal error" << std::endl;

    // Release the signal before reporting the failure.
    (void)hsa_signal_destroy(s);
    err = HSA_STATUS_ERROR;
    RET_IF_HSA_ERR(err);
  }

  err = hsa_signal_destroy(s);
  RET_IF_HSA_ERR(err);

  return err;
}
|
||||
|
||||
// AlignDown and AlignUp are 2 utility functions we use to find an aligned
|
||||
// boundary either below or above a given value (address). The function will
|
||||
// return a value that has the specified alignment.
|
||||
// Rounds `value` down to the nearest multiple of `alignment` by clearing the
// low bits selected by (alignment - 1).
// NOTE(review): the mask only works when `alignment` is a power of two --
// confirm callers guarantee this.
static intptr_t
AlignDown(intptr_t value, size_t alignment) {
  const size_t low_bits = alignment - 1;
  return (intptr_t) (value & ~low_bits);
}

// Rounds the address `value` up to the nearest multiple of `alignment`: the
// address is bumped by (alignment - 1) and then aligned down.
static void*
AlignUp(void* value, size_t alignment) {
  const uintptr_t bumped =
      (uintptr_t) (reinterpret_cast<uintptr_t>(value) + alignment - 1);
  const intptr_t aligned = AlignDown(bumped, alignment);
  return reinterpret_cast<void*>(aligned);
}
|
||||
|
||||
// This function populates the AQL packet with the information
|
||||
// we have collected and stored in the BinarySearch structure thus far.
|
||||
// Fills the caller-provided dispatch packet `aql` from the values stored in
// `bs`. The header is left as a placeholder (0); the real header is written
// separately, right before the doorbell is rung.
void PopulateAQLPacket(BinarySearch const* bs,
                       hsa_kernel_dispatch_packet_t* aql) {
  // Placeholder header; setup is written as 1.
  aql->header = 0;
  aql->setup = 1;

  // Work-group shape: only the x dimension comes from `bs`.
  aql->workgroup_size_x = bs->work_group_size;
  aql->workgroup_size_y = 1;
  aql->workgroup_size_z = 1;

  // Grid shape: only the x dimension comes from `bs`.
  aql->grid_size_x = bs->work_grid_size;
  aql->grid_size_y = 1;
  aql->grid_size_z = 1;

  // Segment sizes that were read from the kernel symbol.
  aql->private_segment_size = bs->private_segment_size;
  aql->group_segment_size = bs->group_segment_size;

  // Kernel entry, its argument block and the completion signal.
  aql->kernel_object = bs->kernel_object;
  aql->kernarg_address = bs->kern_arg_address;
  aql->completion_signal = bs->signal;
}
|
||||
/*
|
||||
* Write everything in the provided AQL packet to the queue except the first 32
|
||||
* bits which include the header and setup fields. That should be done
|
||||
* last.
|
||||
*/
|
||||
// Reserves the next slot of queue `q` by bumping its write index, then copies
// every field of `in_aql` into that slot except the header and setup words,
// which the caller publishes afterwards.
void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql,
                     hsa_queue_t* q) {
  hsa_kernel_dispatch_packet_t* ring =
      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(q->base_address);
  const uint32_t wrap_mask = q->size - 1;

  // The value returned is the index of the slot this call now owns.
  const uint64_t slot_index = hsa_queue_add_write_index_relaxed(q, 1);
  hsa_kernel_dispatch_packet_t* slot = &ring[slot_index & wrap_mask];

  slot->workgroup_size_x = in_aql->workgroup_size_x;
  slot->workgroup_size_y = in_aql->workgroup_size_y;
  slot->workgroup_size_z = in_aql->workgroup_size_z;

  slot->grid_size_x = in_aql->grid_size_x;
  slot->grid_size_y = in_aql->grid_size_y;
  slot->grid_size_z = in_aql->grid_size_z;

  slot->private_segment_size = in_aql->private_segment_size;
  slot->group_segment_size = in_aql->group_segment_size;

  slot->kernel_object = in_aql->kernel_object;
  slot->kernarg_address = in_aql->kernarg_address;
  slot->completion_signal = in_aql->completion_signal;
}
|
||||
|
||||
// This function allocates memory from the kern_arg pool we already found, and
|
||||
// then sets the argument values needed by the kernel code.
|
||||
// Allocates a kernel-argument buffer from bs->kern_arg_pool, copies the
// `arg_size` bytes at `args` into it at an address aligned to
// bs->kernarg_align, and grants both the GPU and CPU agents access to it.
//
// On success:
//   bs->kern_arg_buffer  - start of the allocation (the pointer to free)
//   bs->kern_arg_address - aligned address holding the copied arguments
//   *aql_buf_ptr         - same value as bs->kern_arg_address
// Returns HSA_STATUS_SUCCESS, or the failing HSA status otherwise.
hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args,
                                 size_t arg_size, void** aql_buf_ptr) {
  void* kern_arg_buf = nullptr;
  hsa_status_t err;

  // The kernel arguments must be written to memory at the correct alignment.
  // We already queried the executable to get that alignment, which is stored
  // in bs->kernarg_align. In case the memory returned from the pool is not
  // suitably aligned, we request a little more than what we need so the start
  // address can be adjusted.
  const size_t req_align = bs->kernarg_align;
  // Allocate enough extra space for alignment adjustments if necessary
  const size_t buf_size = arg_size + (req_align << 1);

  err = hsa_amd_memory_pool_allocate(bs->kern_arg_pool, buf_size, 0,
                                     &kern_arg_buf);
  RET_IF_HSA_ERR(err);

  // Address of the allocated buffer
  bs->kern_arg_buffer = kern_arg_buf;

  // Addr. of kern arg start.
  bs->kern_arg_address = AlignUp(kern_arg_buf, req_align);

  assert(arg_size >= bs->kernarg_size);
  assert(((uintptr_t)bs->kern_arg_address + arg_size) <
         ((uintptr_t)bs->kern_arg_buffer + buf_size));

  // memcpy has no status to check, so no error test follows it.
  (void)memcpy(bs->kern_arg_address, args, arg_size);

  // Make sure both the CPU and GPU can access the kernel arguments
  hsa_agent_t ag_list[2] = {bs->gpu_dev, bs->cpu_dev};
  err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->kern_arg_buffer);
  RET_IF_HSA_ERR(err);

  // Hand the aligned argument address back to the caller.
  *aql_buf_ptr = bs->kern_arg_address;

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// This wrapper atomically writes the provided header and setup to the
|
||||
// provided AQL packet. The provided AQL packet address should be in the
|
||||
// queue memory space.
|
||||
inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
|
||||
hsa_kernel_dispatch_packet_t* queue_packet) {
|
||||
__atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
|
||||
header | (setup << 16), __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
// Once all the required data for kernel execution is collected (in this
|
||||
// application it is stored in the BinarySearch structure) we can put it in
|
||||
// an AQL packet and ring the queue door bell to tell the command processor to
|
||||
// execute it.
|
||||
// Once all the required data for kernel execution is collected (in this
// application it is stored in the BinarySearch structure) we can put it in
// an AQL packet and ring the queue door bell to tell the command processor to
// execute it.
//
// Runs the N-ary search passes on the GPU, then finishes with a linear scan
// on the host over the final window. The outcome is written to bs->output
// (index / index + 1 / found flag) and printed to std::cout.
//
// Returns HSA_STATUS_SUCCESS when the search ran (whether or not the element
// was found), HSA_STATUS_ERROR_INVALID_ARGUMENT for an empty input or a zero
// sub-division count, or the failing HSA status otherwise.
hsa_status_t Run(BinarySearch* bs) {
  hsa_status_t err;

  std::cout << "Executing kernel " << bs->kernel_name << std::endl;

  // Adjust the size of workgroup
  // This is mostly application specific.
  if (bs->work_group_size > 64) {
    bs->work_group_size = 64;
    bs->num_sub_divisions = bs->length / bs->work_group_size;

    if (bs->num_sub_divisions < bs->work_group_size) {
      bs->num_sub_divisions = bs->work_group_size;
    }

    bs->work_grid_size = bs->num_sub_divisions;
  }

  // Both values are used as a divisor or as "length - 1" below.
  if ((bs->length == 0) || (bs->num_sub_divisions == 0)) {
    std::cout << "Invalid search configuration" << std::endl;
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Explanation of BinarySearch algorithm.
  /*
   * Since a plain binary search on the GPU would not achieve much benefit
   * over the CPU we are doing an N'ary search. We split the array into N
   * segments every pass and therefore get log (base N) passes instead of log
   * (base 2) passes.
   *
   * In every pass, only the thread that can potentially have the element we
   * are looking for writes to the output array. For ex: if we are looking to
   * find 4567 in the array and every thread is searching over a segment of
   * 1000 values and the input array is 1, 2, 3, 4,... then the first thread
   * is searching in 1 to 1000, the second one from 1001 to 2000, etc. The
   * first one does not write to the output. The second one doesn't either.
   * The fifth one however is from 4001 to 5000. So it can potentially have
   * the element 4567 which lies between them.
   *
   * This particular thread writes to the output the lower bound, upper bound
   * and whether the element equals the lower bound element. So, it would be
   * 4001, 5000, 0
   *
   * The next pass would subdivide 4001 to 5000 into smaller segments and
   * continue the same process from there.
   *
   * When a pass returns 1 in the third element, it means the element has been
   * found and we can stop executing the kernel. If the element is not found,
   * then the execution stops after looking at segment of size 1.
   */

  uint32_t global_lower_bound = 0;
  uint32_t global_upper_bound = bs->length - 1;
  uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) /
                          bs->num_sub_divisions;

  // The input is sorted, so a value outside [first, last] cannot be present.
  if ((bs->input[0] > bs->find_me) ||
      (bs->input[bs->length - 1] < bs->find_me)) {
    bs->output[0] = 0;
    bs->output[1] = bs->length - 1;
    bs->output[2] = 0;
    std::cout << "Returning too early" << std::endl;
    return HSA_STATUS_SUCCESS;
  }

  // Non-zero so that the first pass of the loop below is entered.
  bs->output[3] = 1;

  // Setup the kernel args
  // See the meta-data for the compiled OpenCL kernel code to ascertain
  // the sizes, padding and alignment required for kernel arguments.
  // This can be seen by executing
  // $ amdgcn-amd-amdhsa-readelf -aw ./binary_search_kernels.hsaco
  // The kernel code will expect the following arguments aligned as shown.
  typedef uint32_t uint2[2];
  typedef uint32_t uint4[4];
  struct __attribute__((aligned(16))) local_args_t {
    uint4* outputArray;
    uint2* sortedArray;
    uint32_t findMe;
    uint32_t pad;
    uint64_t global_offset_x;
    uint64_t global_offset_y;
    uint64_t global_offset_z;
  } local_args;

  local_args.outputArray = reinterpret_cast<uint4*>(bs->output);
  local_args.sortedArray = reinterpret_cast<uint2*>(bs->input_arr_local);
  local_args.findMe = bs->find_me;
  local_args.pad = 0;  // avoid copying an uninitialised word to the device
  local_args.global_offset_x = 0;
  local_args.global_offset_y = 0;
  local_args.global_offset_z = 0;

  // Copy the kernel args structure into kernel arg memory
  err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args),
                            &bs->kern_arg_address);
  RET_IF_HSA_ERR(err);

  // Populate an AQL packet with the info we've gathered
  hsa_kernel_dispatch_packet_t aql;
  PopulateAQLPacket(bs, &aql);

  uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);

  while ((sub_div_size > 1) && (bs->output[3] != 0)) {
    // Record the first and last value of every segment of the current search
    // window. Segments are offset by global_lower_bound so that passes after
    // the first one subdivide the narrowed window rather than the start of
    // the array.
    for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) {
      uint32_t idx1 = global_lower_bound + (i * sub_div_size);
      uint32_t idx2 = idx1 + sub_div_size - 1;
      bs->input_arr[2 * i] = bs->input[idx1];
      bs->input_arr[2 * i + 1] = bs->input[idx2];
    }

    // Copy kernel parameter from system memory to local memory
    err = AgentMemcpy(reinterpret_cast<uint8_t*>(bs->input_arr_local),
                      reinterpret_cast<uint8_t*>(bs->input_arr),
                      in_length, bs->gpu_dev, bs->cpu_dev);

    RET_IF_HSA_ERR(err);

    // Reset output buffer to zero
    bs->output[3] = 0;

    // Dispatch kernel with global work size, work group size with ONE
    // dimension and wait for kernel to complete

    // Compute the write index of queue and copy Aql packet into it
    uint64_t que_idx = hsa_queue_load_write_index_relaxed(bs->queue);

    const uint32_t mask = bs->queue->size - 1;

    // This function simply copies the data we've collected so far into the
    // queue's AQL packet, except the setup and header fields.
    WriteAQLToQueue(&aql, bs->queue);

    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
                  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
                  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

    // Set the packet's type, acquire and release fences. This should be done
    // atomically after all the other fields have been set, using release
    // memory ordering to ensure all the fields are set when the door bell
    // signal is activated.
    void* q_base = bs->queue->base_address;

    AtomicSetPacketHeader(aql_header, aql.setup,
                          &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
                              (q_base))[que_idx & mask]);

    // Increment the write index and ring the doorbell to dispatch kernel.
    hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1));
    hsa_signal_store_relaxed(bs->queue->doorbell_signal, que_idx);

    // Wait on the dispatch signal until the kernel is finished.
    // Modify the wait condition to HSA_WAIT_STATE_ACTIVE (instead of
    // HSA_WAIT_STATE_BLOCKED) if polling is needed instead of blocking, as we
    // have below.
    // The call below will block until the condition is met. Below we have said
    // the condition is that the signal value (initialized to 1) associated with
    // the queue is less than 1. When the kernel associated with the queued AQL
    // packet has completed execution, the signal value is automatically
    // decremented by the packet processor.
    hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal,
                                   HSA_SIGNAL_CONDITION_LT, 1,
                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED);

    // value should be 0, or we timed-out
    if (value) {
      std::cout << "Timed out waiting for kernel to complete?" << std::endl;
      RET_IF_HSA_ERR(HSA_STATUS_ERROR);
    }

    // Reset the signal to its initial value for the next iteration
    hsa_signal_store_screlease(bs->signal, 1);

    // Narrow the window to the segment reported by the kernel. output[0] is
    // the segment number within the current window, hence the "+=".
    global_lower_bound += bs->output[0] * sub_div_size;
    global_upper_bound = global_lower_bound + sub_div_size - 1;
    sub_div_size = (global_upper_bound - global_lower_bound + 1) /
                   bs->num_sub_divisions;
  }

  uint32_t element_index = UINT_MAX;

  // Finish with a linear scan of the remaining window on the host.
  for (uint32_t i = global_lower_bound; i <= global_upper_bound; i++) {
    if (bs->input[i] == bs->find_me) {
      element_index = i;
      bs->output[0] = i;
      bs->output[1] = i + 1;
      bs->output[2] = 1;
      break;
    }

    // Element is not found in region specified
    // by global lower bound to global upper bound
    bs->output[2] = 0;
  }

  uint32_t is_elem_found = bs->output[2];

  std::cout << "Lower bound = " << global_lower_bound << std::endl;
  std::cout << "Upper bound = " << global_upper_bound << std::endl;
  std::cout << "Element search for = " << bs->find_me << std::endl;

  if (is_elem_found == 1) {
    std::cout << "Element found at index " << element_index << std::endl;
  } else {
    std::cout << "Element value " << bs->find_me << " not found" << std::endl;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Release all the RocR resources we have acquired in this application.
|
||||
// Release all the RocR resources we have acquired in this application and
// shut the runtime down.
//
// Every release is attempted even when an earlier one fails, so a single
// failure no longer leaves the remaining buffers, the queue, the signal and
// the runtime itself un-released. Returns HSA_STATUS_SUCCESS when everything
// succeeded, otherwise the status of the first failure.
//
// NOTE(review): bs->input_arr_local is not freed here; its allocation is not
// visible in this part of the file -- confirm whether it needs releasing.
hsa_status_t CleanUp(BinarySearch* bs) {
  hsa_status_t first_err = HSA_STATUS_SUCCESS;

  // Remembers only the first failing status.
  auto note = [&first_err](hsa_status_t status) {
    if ((status != HSA_STATUS_SUCCESS) &&
        (first_err == HSA_STATUS_SUCCESS)) {
      first_err = status;
    }
  };

  note(hsa_amd_memory_pool_free(bs->input));
  note(hsa_amd_memory_pool_free(bs->output));
  note(hsa_amd_memory_pool_free(bs->input_arr));
  note(hsa_amd_memory_pool_free(bs->kern_arg_buffer));

  note(hsa_queue_destroy(bs->queue));
  note(hsa_signal_destroy(bs->signal));

  note(hsa_shut_down());

  RET_IF_HSA_ERR(first_err);

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
// This BinarySearch structure (bs) below holds all of the appl. specific
|
||||
// info we need to run the sample. This includes algorithm specific
|
||||
// information as well as handles to RocR/HSA objects.
|
||||
|
||||
// The basic structure of this sample is to fill in this structure with the
|
||||
// required RocR/HSA handles to RocR resources (e.g., agents, memory pools,
|
||||
// queues, etc.) and then dispatch the packets to the queue, and examine the
|
||||
// output.
|
||||
|
||||
BinarySearch bs;
|
||||
hsa_status_t err;
|
||||
|
||||
// Set some working values specific to this application
|
||||
InitializeBinarySearch(&bs);
|
||||
|
||||
// hsa_init() initializes internal data structures and causes devices
|
||||
// (agents), memory pools and other resources to be discovered.
|
||||
err = hsa_init();
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Find the agents needed for the sample
|
||||
err = FindDevices(&bs);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Create the completion signal used when dispatching a packet
|
||||
err = hsa_signal_create(1, 0, NULL, &bs.signal);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Create a queue to submit our binary search AQL packets
|
||||
err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
|
||||
UINT32_MAX, UINT32_MAX, &bs.queue);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Find the HSA memory pools we need to run this sample
|
||||
err = FindPools(&bs);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Allocate memory from the correct memory pool, and initialize them as
|
||||
// neeeded for the algorihm.
|
||||
err = AllocateAndInitBuffers(&bs);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Create a kernel object from the pre-compiled kernel, and read some
|
||||
// attributes associated with the kernel that we will need.
|
||||
err = LoadKernelFromObjFile(&bs);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Fill in the AQL packet, assign the kernel arguments, enqueue the packet,
|
||||
// "ring" the doorbell, and wait for completion.
|
||||
err = Run(&bs);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
// Release all the RocR resources we've acquired and shutdown HSA.
|
||||
err = CleanUp(&bs);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#undef RET_IF_HSA_ERR
|
||||
+127
@@ -0,0 +1,127 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* One instance of this kernel call is a thread.
|
||||
* Each thread finds out the segment in which it should look for the element.
|
||||
* After that, it checks if the element is between the lower bound and upper
|
||||
* bound of its segment. If yes, then this segment becomes the total
|
||||
* searchspace for the next pass.
|
||||
*
|
||||
* To achieve this, it writes the lower bound and upper bound to the output
|
||||
* array. In case the element at the left end (lower bound) matches the element
|
||||
* we are looking for, that is marked in the output and we no longer need to
|
||||
* look any further.
|
||||
*/
|
||||
|
||||
/*
 * Each work-item reads the pair (x = first value, y = last value) that
 * describes its segment. When findMe lies within [x, y], the work-item
 * records its id in outputArray[0].x and sets outputArray[0].w to 1;
 * otherwise it writes nothing.
 */
__kernel void
binarySearch(__global uint4 * outputArray,
             __const __global uint2 * sortedArray,
             const unsigned int findMe) {
    const unsigned int tid = get_global_id(0);

    // Bounds of the segment owned by this work-item
    const uint2 bounds = sortedArray[tid];

    // Only the work-item whose segment can contain the value reports back;
    // its segment becomes the search space of the next pass.
    if ((bounds.x <= findMe) && (bounds.y >= findMe)) {
        outputArray[0].x = tid;
        outputArray[0].w = 1;
    }
}
|
||||
|
||||
|
||||
/*
 * Each work-item owns a fixed window of 256 consecutive input elements,
 * starting at (global id * 256). For every key that lies between the first
 * and the last element of that window, the window's start index is stored
 * in output at the key's position.
 */
__kernel void
binarySearch_mulkeys(__global int *keys,
                    __global uint *input,
                    const unsigned int numKeys,
                    __global int *output) {
    const int gid = get_global_id(0);
    const int lBound = gid * 256;
    const int uBound = lBound + 255;

    int k = 0;
    while(k < numKeys) {
        if(keys[k] >= input[lBound] && keys[k] <= input[uBound]) {
            output[k] = lBound;
        }
        k++;
    }
}
|
||||
|
||||
|
||||
/*
 * Searches several keys at once. Work-items are grouped in runs of
 * numSubdivisions: (global id / numSubdivisions) selects the key and
 * (global id % numSubdivisions) selects which segment of the input that
 * work-item binary-searches. When the key is found, its index is written to
 * output at the key's position.
 *
 * The inclusive upper bound keeps the one-element overlap with the next
 * segment but is clamped to the last valid index, so the final segment no
 * longer reads input[inputSize].
 */
__kernel void
binarySearch_mulkeysConcurrent(__global uint *keys,
                    __global uint *input,
                    const unsigned int inputSize, // num. of inputs
                    const unsigned int numSubdivisions,
                    __global int *output) {

    int lBound = (get_global_id(0) % numSubdivisions) * (inputSize / numSubdivisions);
    int uBound = lBound + inputSize / numSubdivisions;
    int myKey = keys[get_global_id(0) / numSubdivisions];
    int mid;

    // Never search past the end of the input (also yields an empty range
    // when inputSize is 0).
    const int lastIndex = (int)inputSize - 1;
    if(uBound > lastIndex) {
        uBound = lastIndex;
    }

    while(uBound >= lBound) {
        // Overflow-safe midpoint
        mid = lBound + (uBound - lBound) / 2;
        if(input[mid] == myKey) {
            output[get_global_id(0) / numSubdivisions] = mid;
            return;
        } else if(input[mid] > myKey) {
            uBound = mid - 1;
        } else {
            lBound = mid + 1;
        }
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
#
# Collect every .cpp file of this directory as the sources of the
# Rocr Utils library
#
file(GLOB MODULE_SRC "*.cpp")

#
# Header search path, taken from the ROCR_INC_DIR environment variable
# (read when CMake configures the project).
#
include_directories($ENV{ROCR_INC_DIR})

#
# Build Utils as a static library named by COMMON_LIB and record the
# system libraries it links against
#
add_library(${COMMON_LIB} STATIC ${MODULE_SRC})
target_link_libraries(${COMMON_LIB} c stdc++ dl pthread rt)
|
||||
@@ -0,0 +1,45 @@
|
||||
#include "common.hpp"
|
||||
|
||||
// Terminates the process with EXIT_FAILURE, after printing a message to
// std::cerr, when `hsa_error_code` is anything other than HSA_STATUS_SUCCESS.
void ErrorCheck(hsa_status_t hsa_error_code) {
  if (hsa_error_code == HSA_STATUS_SUCCESS) {
    return;
  }

  std::cerr << "HSA reported error!" << std::endl;
  exit(EXIT_FAILURE);
}
|
||||
|
||||
// Agent callback: when `agent` is a GPU, stores it in the hsa_agent_t that
// `data` points to. Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL
// `data`, the query's status when the device-type query fails, and
// HSA_STATUS_SUCCESS otherwise (whether or not the agent was a GPU).
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  hsa_device_type_t device_kind;
  const hsa_status_t status =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_kind);

  // `status` is HSA_STATUS_SUCCESS whenever the assignment below can happen.
  if ((status == HSA_STATUS_SUCCESS) && (device_kind == HSA_DEVICE_TYPE_GPU)) {
    *((hsa_agent_t*)data) = agent;
  }

  return status;
}
|
||||
|
||||
// Region callback: when `region` reports itself as host accessible, stores it
// in the hsa_region_t that `data` points to. Returns
// HSA_STATUS_ERROR_INVALID_ARGUMENT for a NULL `data`, the query's status
// when the query fails, and HSA_STATUS_SUCCESS otherwise.
hsa_status_t FindHostRegion(hsa_region_t region, void* data) {
  if (data == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  bool host_accessible = false;
  const hsa_region_info_t attribute =
      (hsa_region_info_t)HSA_AMD_REGION_INFO_HOST_ACCESSIBLE;
  const hsa_status_t status =
      hsa_region_get_info(region, attribute, &host_accessible);

  // `status` is HSA_STATUS_SUCCESS whenever the assignment below can happen.
  if ((status == HSA_STATUS_SUCCESS) && host_accessible) {
    *((hsa_region_t*)data) = region;
  }

  return status;
}
|
||||
@@ -0,0 +1,27 @@
|
||||
#ifndef COMMON_COMMON_HPP
|
||||
#define COMMON_COMMON_HPP
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_finalize.h"
|
||||
#include "hsa_ext_amd.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define ALIGNED_(x) __declspec(align(x))
|
||||
#else
|
||||
#if defined(__GNUC__)
|
||||
#define ALIGNED_(x) __attribute__((aligned(x)))
|
||||
#endif // __GNUC__
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define MULTILINE(...) #__VA_ARGS__
|
||||
|
||||
void ErrorCheck(hsa_status_t hsa_error_code);
|
||||
|
||||
hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data);
|
||||
|
||||
hsa_status_t FindHostRegion(hsa_region_t region, void* data);
|
||||
|
||||
#endif // COMMON_COMMON_HPP
|
||||
@@ -0,0 +1,262 @@
|
||||
/**********************************************************************
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
provided that the following conditions are met:
|
||||
|
||||
• Redistributions of source code must retain the above copyright notice, this list of
|
||||
conditions and the following disclaimer.
|
||||
• Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
#include "helper_funcs.hpp"
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Prints the array name followed by all width * height elements of the
 * given array, one row of `width` elements per output line.
 */
template <typename T>
void printArray(const std::string header, const T* data, const int width, const int height) {
  std::cout << header << " :\n";

  for (int row = 0; row < height; ++row) {
    const int row_base = row * width;

    std::cout << "> ";
    for (int col = 0; col < width; ++col) {
      std::cout << data[row_base + col] << " ";
    }
    std::cout << "\n";
  }
}
|
||||
|
||||
template <typename T>
|
||||
int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax,
|
||||
unsigned int seed) {
|
||||
if (!arrayPtr) {
|
||||
error("Cannot fill array. NULL pointer.");
|
||||
return HSA_SDK_FAILURE;
|
||||
}
|
||||
|
||||
if (!seed) seed = (unsigned int)time(NULL);
|
||||
|
||||
srand(seed);
|
||||
double range = double(rangeMax - rangeMin) + 1.0;
|
||||
|
||||
/* random initialisation of input */
|
||||
for (int i = 0; i < height; i++)
|
||||
for (int j = 0; j < width; j++) {
|
||||
int index = i * width + j;
|
||||
arrayPtr[index] = rangeMin + T(range * rand() / (RAND_MAX + 1.0));
|
||||
}
|
||||
|
||||
return HSA_SDK_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename T> int fillPos(T* arrayPtr, const int width, const int height) {
|
||||
if (!arrayPtr) {
|
||||
error("Cannot fill array. NULL pointer.");
|
||||
return HSA_SDK_FAILURE;
|
||||
}
|
||||
|
||||
/* initialisation of input with positions*/
|
||||
for (T i = 0; i < height; i++)
|
||||
for (T j = 0; j < width; j++) {
|
||||
T index = i * width + j;
|
||||
arrayPtr[index] = index;
|
||||
}
|
||||
|
||||
return HSA_SDK_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int fillConstant(T* arrayPtr, const int width, const int height, const T val) {
|
||||
if (!arrayPtr) {
|
||||
error("Cannot fill array. NULL pointer.");
|
||||
return HSA_SDK_FAILURE;
|
||||
}
|
||||
|
||||
/* initialisation of input with constant value*/
|
||||
for (int i = 0; i < height; i++)
|
||||
for (int j = 0; j < width; j++) {
|
||||
int index = i * width + j;
|
||||
arrayPtr[index] = val;
|
||||
}
|
||||
|
||||
return HSA_SDK_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Rounds val up to the next power of two; a value that already is a power of
 * two is returned unchanged.
 *
 * The highest set bit of (val - 1) is smeared into every lower bit using
 * shifts of 1, 2, 4, ... up to half the width of T, and adding 1 then gives
 * the power of two. The shift distances are derived from the bit width of T,
 * so every bit is covered and no shift reaches the full width of the type.
 */
template <typename T> T roundToPowerOf2(T val) {
  const unsigned int bits = static_cast<unsigned int>(sizeof(T) * 8);

  val--;
  for (unsigned int shift = 1; shift < bits; shift <<= 1) {
    val |= val >> shift;
  }
  val++;

  return val;
}
|
||||
|
||||
template <typename T> int isPowerOf2(T val) {
|
||||
long long _val = val;
|
||||
if ((_val & (-_val)) - _val == 0 && _val != 0)
|
||||
return HSA_SDK_SUCCESS;
|
||||
else
|
||||
return HSA_SDK_FAILURE;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Returns true when input equals reference. Otherwise reports `message`
 * through error() and returns false. `isAPIerror` is accepted but not used.
 */
template <typename T> bool checkVal(T input, T reference, std::string message, bool isAPIerror) {
  if (!(input == reference)) {
    error(message);
    return false;
  }

  return true;
}
|
||||
|
||||
|
||||
/*
 * Formats t as text after applying the stream manipulator r (for example
 * std::dec or std::hex) and returns the resulting string.
 */
template <typename T> std::string toString(T t, std::ios_base& (*r)(std::ios_base&)) {
  std::ostringstream formatter;

  formatter << r;
  formatter << t;

  return formatter.str();
}
|
||||
|
||||
|
||||
/*
 * Compares data against refData using the relative L2 error
 *   sqrt(sum((ref - data)^2)) / sqrt(sum(ref^2))
 * over all `length` elements, starting with element 0.
 * Returns true when that error is below epsilon. Returns false when the
 * reference has (almost) zero energy, because the relative error is then
 * undefined.
 */
bool compare(const float* refData, const float* data, const int length, const float epsilon) {
  float error = 0.0f;
  float ref = 0.0f;

  for (int i = 0; i < length; ++i) {
    float diff = refData[i] - data[i];
    error += diff * diff;
    ref += refData[i] * refData[i];
  }

  float normRef = ::sqrtf((float)ref);
  if (::fabs((float)ref) < 1e-7f) {
    return false;
  }
  float normError = ::sqrtf((float)error);
  error = normError / normRef;

  return error < epsilon;
}
|
||||
|
||||
/*
 * Double-precision variant: compares data against refData using the
 * relative L2 error
 *   sqrt(sum((ref - data)^2)) / sqrt(sum(ref^2))
 * over all `length` elements, starting with element 0.
 * Returns true when that error is below epsilon. Returns false when the
 * reference has (almost) zero energy, because the relative error is then
 * undefined.
 */
bool compare(const double* refData, const double* data, const int length, const double epsilon) {
  double error = 0.0;
  double ref = 0.0;

  for (int i = 0; i < length; ++i) {
    double diff = refData[i] - data[i];
    error += diff * diff;
    ref += refData[i] * refData[i];
  }

  double normRef = ::sqrt((double)ref);
  if (::fabs((double)ref) < 1e-7) {
    return false;
  }
  double normError = ::sqrt((double)error);
  error = normError / normRef;

  return error < epsilon;
}
|
||||
|
||||
// Writes "Error: " followed by errorMsg and a newline to std::cout.
void error(const char* errorMsg) {
  std::cout << "Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
|
||||
|
||||
// std::string overload: writes "Error: " followed by errorMsg and a newline
// to std::cout.
void error(std::string errorMsg) {
  std::cout << "Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
|
||||
|
||||
// Writes "Expected Error: " followed by errorMsg and a newline to std::cout.
void expectedError(const char* errorMsg) {
  std::cout << "Expected Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
|
||||
|
||||
// std::string overload: writes "Expected Error: " followed by errorMsg and a
// newline to std::cout.
void expectedError(std::string errorMsg) {
  std::cout << "Expected Error: ";
  std::cout << errorMsg;
  std::cout << std::endl;
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Template Instantiations
|
||||
/////////////////////////////////////////////////////////////////
|
||||
template void printArray<short>(const std::string, const short*, int, int);
|
||||
template void printArray<unsigned char>(const std::string, const unsigned char*, int, int);
|
||||
template void printArray<unsigned int>(const std::string, const unsigned int*, int, int);
|
||||
template void printArray<int>(const std::string, const int*, int, int);
|
||||
template void printArray<long>(const std::string, const long*, int, int);
|
||||
template void printArray<float>(const std::string, const float*, int, int);
|
||||
template void printArray<double>(const std::string, const double*, int, int);
|
||||
|
||||
template int fillRandom<unsigned char>(unsigned char* arrayPtr, const int width, const int height,
|
||||
unsigned char rangeMin, unsigned char rangeMax,
|
||||
unsigned int seed);
|
||||
template int fillRandom<unsigned int>(unsigned int* arrayPtr, const int width, const int height,
|
||||
unsigned int rangeMin, unsigned int rangeMax,
|
||||
unsigned int seed);
|
||||
template int fillRandom<int>(int* arrayPtr, const int width, const int height, int rangeMin,
|
||||
int rangeMax, unsigned int seed);
|
||||
template int fillRandom<long>(long* arrayPtr, const int width, const int height, long rangeMin,
|
||||
long rangeMax, unsigned int seed);
|
||||
template int fillRandom<float>(float* arrayPtr, const int width, const int height, float rangeMin,
|
||||
float rangeMax, unsigned int seed);
|
||||
template int fillRandom<double>(double* arrayPtr, const int width, const int height,
|
||||
double rangeMin, double rangeMax, unsigned int seed);
|
||||
|
||||
template short roundToPowerOf2<short>(short val);
|
||||
template unsigned int roundToPowerOf2<unsigned int>(unsigned int val);
|
||||
template int roundToPowerOf2<int>(int val);
|
||||
template long roundToPowerOf2<long>(long val);
|
||||
|
||||
template int isPowerOf2<short>(short val);
|
||||
template int isPowerOf2<unsigned int>(unsigned int val);
|
||||
template int isPowerOf2<int>(int val);
|
||||
template int isPowerOf2<long>(long val);
|
||||
|
||||
template <> int fillPos<short>(short* arrayPtr, const int width, const int height);
|
||||
template <> int fillPos<unsigned int>(unsigned int* arrayPtr, const int width, const int height);
|
||||
template <> int fillPos<int>(int* arrayPtr, const int width, const int height);
|
||||
template <> int fillPos<long>(long* arrayPtr, const int width, const int height);
|
||||
|
||||
template <>
|
||||
int fillConstant<short>(short* arrayPtr, const int width, const int height, const short val);
|
||||
template <>
|
||||
int fillConstant(unsigned int* arrayPtr, const int width, const int height, const unsigned int val);
|
||||
template <> int fillConstant(int* arrayPtr, const int width, const int height, const int val);
|
||||
template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val);
|
||||
template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val);
|
||||
template <> int fillConstant(long* arrayPtr, const int width, const int height, const long val);
|
||||
|
||||
|
||||
template bool checkVal<char>(char input, char reference, std::string message, bool isAPIerror);
|
||||
template bool checkVal<bool>(bool input, bool reference, std::string message, bool isAPIerror);
|
||||
template bool checkVal<std::string>(std::string input, std::string reference, std::string message,
|
||||
bool isAPIerror);
|
||||
template bool checkVal<short>(short input, short reference, std::string message, bool isAPIerror);
|
||||
template bool checkVal<unsigned int>(unsigned int input, unsigned int reference,
|
||||
std::string message, bool isAPIerror);
|
||||
template bool checkVal<int>(int input, int reference, std::string message, bool isAPIerror);
|
||||
template bool checkVal<long>(long input, long reference, std::string message, bool isAPIerror);
|
||||
|
||||
|
||||
template std::string toString<char>(char t, std::ios_base& (*r)(std::ios_base&));
|
||||
template std::string toString<short>(short t, std::ios_base& (*r)(std::ios_base&));
|
||||
template std::string toString<unsigned int>(unsigned int t, std::ios_base& (*r)(std::ios_base&));
|
||||
template std::string toString<int>(int t, std::ios_base& (*r)(std::ios_base&));
|
||||
template std::string toString<long>(long t, std::ios_base& (*r)(std::ios_base&));
|
||||
template std::string toString<float>(float t, std::ios_base& (*r)(std::ios_base&));
|
||||
template std::string toString<double>(double t, std::ios_base& (*r)(std::ios_base&));
|
||||
@@ -0,0 +1,141 @@
|
||||
/**********************************************************************
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
provided that the following conditions are met:
|
||||
|
||||
• Redistributions of source code must retain the above copyright notice, this list of
|
||||
conditions and the following disclaimer.
|
||||
• Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
#ifndef HELPER_FUNCS_HPP_
|
||||
#define HELPER_FUNCS_HPP_
|
||||
|
||||
#define HSA_SDK_SUCCESS 0
|
||||
#define HSA_SDK_FAILURE 1
|
||||
#define HSA_SDK_EXPECTED_FAILURE 2
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <ctime>
|
||||
#include <cmath>
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <malloc.h>
|
||||
|
||||
/**
|
||||
* error
|
||||
* constant function, Prints error messages
|
||||
* @param errorMsg char* message
|
||||
*/
|
||||
void error(const char* errorMsg);
|
||||
|
||||
/**
|
||||
* error
|
||||
* constant function, Prints error messages
|
||||
* @param errorMsg std::string message
|
||||
*/
|
||||
void error(std::string errorMsg);
|
||||
|
||||
/**
|
||||
* expectedError
|
||||
* constant function, Prints error messages
|
||||
* @param errorMsg char* message
|
||||
*/
|
||||
void expectedError(const char* errorMsg);
|
||||
|
||||
/**
|
||||
* expectedError
|
||||
* constant function, Prints error messages
|
||||
* @param errorMsg string message
|
||||
*/
|
||||
void expectedError(std::string errorMsg);
|
||||
|
||||
/**
|
||||
* compare (overloaded for float and double)
|
||||
* compare data to check error
|
||||
* @param refData templated input
|
||||
* @param data templated input
|
||||
* @param length number of values to compare
|
||||
* @param epsilon errorWindow
|
||||
*/
|
||||
bool compare(const float* refData, const float* data, const int length,
|
||||
const float epsilon = 1e-6f);
|
||||
bool compare(const double* refData, const double* data, const int length,
|
||||
const double epsilon = 1e-6);
|
||||
|
||||
/**
|
||||
* printArray
|
||||
* displays an array on standard output
|
||||
*/
|
||||
template <typename T>
|
||||
void printArray(const std::string header, const T* data, const int width, const int height);
|
||||
|
||||
|
||||
/**
|
||||
* fillRandom
|
||||
* fill array with random values
|
||||
*/
|
||||
template <typename T>
|
||||
int fillRandom(T* arrayPtr, const int width, const int height, const T rangeMin, const T rangeMax,
|
||||
unsigned int seed = 123);
|
||||
|
||||
/**
|
||||
* fillPos
|
||||
* fill the specified positions
|
||||
*/
|
||||
template <typename T> int fillPos(T* arrayPtr, const int width, const int height);
|
||||
|
||||
/**
|
||||
* fillConstant
|
||||
* fill the array with constant value
|
||||
*/
|
||||
template <typename T> int fillConstant(T* arrayPtr, const int width, const int height, const T val);
|
||||
|
||||
|
||||
/**
|
||||
* roundToPowerOf2
|
||||
* rounds to a power of 2
|
||||
*/
|
||||
template <typename T> T roundToPowerOf2(T val);
|
||||
|
||||
/**
|
||||
* isPowerOf2
|
||||
* checks if input is a power of 2
|
||||
*/
|
||||
template <typename T> int isPowerOf2(T val);
|
||||
|
||||
/**
|
||||
* checkVal
|
||||
* Set default(isAPIerror) parameter to false
|
||||
* if checkVal is used to check something other than an OpenCL API error code
|
||||
*/
|
||||
template <typename T>
|
||||
bool checkVal(T input, T reference, std::string message, bool isAPIerror = true);
|
||||
|
||||
/**
|
||||
* toString
|
||||
* convert a T type to string
|
||||
*/
|
||||
template <typename T> std::string toString(T t, std::ios_base& (*r)(std::ios_base&));
|
||||
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,155 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <cassert>
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_profiler.h"
|
||||
#include "amd_hsa_tools_interfaces.h"
|
||||
|
||||
#include "hsa_perf_cntrs.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
void PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
|
||||
assert((dispParam->pre_dispatch) && "Pre Dispatch Callback Param is Malformed");
|
||||
|
||||
hsa_ext_tools_pmu_t* perfMgr = reinterpret_cast<hsa_ext_tools_pmu_t*>(usrArg);
|
||||
hsa_status_t status =
|
||||
hsa_ext_tools_pmu_begin(*perfMgr, dispParam->queue, dispParam->aql_translation_handle, true);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in beginning Perf Cntr Session");
|
||||
}
|
||||
|
||||
void PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
|
||||
assert((!dispParam->pre_dispatch) && "Post Dispatch Callback Param is Malformed");
|
||||
|
||||
hsa_ext_tools_pmu_t* perfMgr = reinterpret_cast<hsa_ext_tools_pmu_t*>(usrArg);
|
||||
hsa_status_t status =
|
||||
hsa_ext_tools_pmu_end(*perfMgr, dispParam->queue, dispParam->aql_translation_handle);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in endning Perf Cntr Session");
|
||||
}
|
||||
|
||||
// Constructor of the class
|
||||
RocrPerfCntrApp::RocrPerfCntrApp() : perfMgr_(NULL) {}
|
||||
|
||||
// Destructor of the class. Ideally it should delete the
|
||||
// PMU and its counters
|
||||
RocrPerfCntrApp::~RocrPerfCntrApp() {}
|
||||
|
||||
// Return the number of perf counters
|
||||
uint32_t RocrPerfCntrApp::GetNumPerfCntrs() { return uint32_t(cntrList_.size()); }
|
||||
|
||||
// Return the handle of perf counter at specified index
|
||||
CntrInfo* RocrPerfCntrApp::GetPerfCntr(uint32_t idx) { return cntrList_[idx]; }
|
||||
|
||||
// Print the various fields of Perf Cntrs being programmed
|
||||
bool RocrPerfCntrApp::PrintCntrs() {
|
||||
CntrInfo* info;
|
||||
int size = uint32_t(cntrList_.size());
|
||||
for (int idx = 0; idx < size; idx++) {
|
||||
info = cntrList_[idx];
|
||||
std::cout << std::endl;
|
||||
std::cout << "Rocr Perf Cntr Id: " << info->cntrId << std::endl;
|
||||
std::cout << "Rocr Perf Cntr Name: " << info->cntrName << std::endl;
|
||||
std::cout << "Rocr Perf Cntr Blk Id: " << info->blkId << std::endl;
|
||||
std::cout << "Rocr Perf Cntr Value: " << info->cntrResult << std::endl;
|
||||
std::cout << "Rocr Perf Cntr Validation: " << info->cnfType << std::endl;
|
||||
std::cout << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Initialize the list of perf counters
|
||||
// block id of kHsaAiCounterBlockSQ = 14 == 0x0E
|
||||
hsa_status_t RocrPerfCntrApp::Init(hsa_agent_t agent) {
|
||||
// Initialize the list of Perf Cntrs
|
||||
// Add SQ counter for number of waves
|
||||
CntrInfo* info = NULL;
|
||||
cntrList_.reserve(23);
|
||||
|
||||
// Event for number of Waves
|
||||
info = new CntrInfo(0x4, "SQ_SQ_PERF_SEL_WAVES", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF,
|
||||
CntrValCnf_Exact);
|
||||
cntrList_.push_back(info);
|
||||
|
||||
// Event for number of Threads
|
||||
info = new CntrInfo(0xE, "SQ_SQ_PERF_SEL_ITEMS", NULL, 0x0E, NULL, 0x00, 0xFFFFFFFF,
|
||||
CntrValCnf_Exact);
|
||||
cntrList_.push_back(info);
|
||||
|
||||
|
||||
// Create an instance of Perf Mgr
|
||||
hsa_status_t status;
|
||||
status = hsa_ext_tools_create_pmu(agent, &perfMgr_);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr Mgr");
|
||||
|
||||
// Process each counter from the list as necessary
|
||||
// each counter descriptor with its perf block handle
|
||||
// and create an instance of counter in that block
|
||||
uint32_t size = GetNumPerfCntrs();
|
||||
for (uint32_t idx = 0; idx < size; idx++) {
|
||||
info = GetPerfCntr(idx);
|
||||
|
||||
// Obtain the handle of perf block
|
||||
if (info->blkHndl == NULL) {
|
||||
status = hsa_ext_tools_get_counter_block_by_id(perfMgr_, info->blkId, &info->blkHndl);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in getting Perf Cntr Blk Hndl");
|
||||
}
|
||||
|
||||
// Create an instance of counter in the perf block
|
||||
status = hsa_ext_tools_create_counter(info->blkHndl, &info->cntrHndl);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in creating Perf Cntr in Perf Blk");
|
||||
|
||||
// Update the Event Index property of counter
|
||||
uint32_t cntrProp = HSA_EXT_TOOLS_COUNTER_PARAMETER_EVENT_INDEX;
|
||||
status = hsa_ext_tools_set_counter_parameter(info->cntrHndl, cntrProp, sizeof(uint32_t),
|
||||
(void*)&info->cntrId);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in updating Perf Cntr Property Event Index");
|
||||
|
||||
// Enable the updated perf counter
|
||||
status = hsa_ext_tools_set_counter_enabled(info->cntrHndl, true);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in enabing Perf Cntr");
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Register Pre and Post dispatch callbacks
|
||||
void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) {
|
||||
hsa_status_t status;
|
||||
status = hsa_ext_tools_set_callback_functions(queue, PreDispatchCallback, PostDispatchCallback);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in registering Pre & Post Dispatch Callbacks");
|
||||
status = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_);
|
||||
assert((status == HSA_STATUS_SUCCESS) &&
|
||||
"Error in registering Pre & Post Dispatch Callback Params");
|
||||
return;
|
||||
}
|
||||
|
||||
// Wait for perf counter collection to complete
|
||||
hsa_status_t RocrPerfCntrApp::Wait() {
|
||||
hsa_status_t status;
|
||||
status = hsa_ext_tools_pmu_wait_for_completion(perfMgr_, 5000);
|
||||
assert((status == HSA_STATUS_SUCCESS) && "Error in Waiting for Perf Cntr Completion");
|
||||
return status;
|
||||
}
|
||||
|
||||
// Validate perf counter values
|
||||
hsa_status_t RocrPerfCntrApp::Validate() {
|
||||
// Retrieve the results of the different Perf Cntrs
|
||||
// and validate them as configured
|
||||
CntrInfo* info = NULL;
|
||||
hsa_status_t status = HSA_STATUS_SUCCESS;
|
||||
uint32_t size = GetNumPerfCntrs();
|
||||
for (uint32_t idx = 0; idx < size; idx++) {
|
||||
info = GetPerfCntr(idx);
|
||||
status = hsa_ext_tools_get_counter_result(info->cntrHndl, &info->cntrResult);
|
||||
std::cout << "Value of Perf Cntr is: " << info->cntrResult << std::endl;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
@@ -0,0 +1,110 @@
|
||||
#ifndef ROCR_PERF_CNTR_APP_H_
|
||||
#define ROCR_PERF_CNTR_APP_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_profiler.h"
|
||||
|
||||
/// Kind of check to apply to a collected counter value.
typedef enum CntrValCnfType {
  /// No counter value validation should be performed
  CntrValCnf_None,

  /// Counter value should be an exact match to expectedResult
  CntrValCnf_Exact,

  /// Counter value should be greater than expectedResult
  CntrValCnf_GreaterThan,

  /// Counter value should be less than expectedResult
  CntrValCnf_LessThan
} CntrValCnfType;
|
||||
|
||||
/// Struct used to encapsulate Counter Info
|
||||
typedef struct CntrInfo {
|
||||
///< Id of counter in hardware block
|
||||
uint32_t cntrId;
|
||||
|
||||
///< Name of counter
|
||||
char cntrName[72];
|
||||
|
||||
///< Handle of perf counter
|
||||
hsa_ext_tools_counter_t cntrHndl;
|
||||
|
||||
///< Id of hardware block containing the counter
|
||||
uint32_t blkId;
|
||||
|
||||
///< Handle of counter block
|
||||
hsa_ext_tools_counter_block_t blkHndl;
|
||||
|
||||
///< Expected value of perf counte
|
||||
uint64_t expectedResult;
|
||||
|
||||
///< Value of perf counter expected
|
||||
uint64_t cntrResult;
|
||||
|
||||
///< Type of validation upon completion of dispatch
|
||||
CntrValCnfType cnfType;
|
||||
|
||||
CntrInfo(uint32_t cntrId, char* cntrName, void* cntrHndl, uint32_t blkId, void* blkHndl,
|
||||
uint64_t expResult, uint64_t result, CntrValCnfType cnfType) {
|
||||
this->cntrId = cntrId;
|
||||
this->cntrHndl = cntrHndl;
|
||||
this->blkId = blkId;
|
||||
this->blkHndl = blkHndl;
|
||||
this->expectedResult = expResult;
|
||||
this->cntrResult = result;
|
||||
this->cnfType = cnfType;
|
||||
memcpy(this->cntrName, cntrName, strlen(cntrName));
|
||||
}
|
||||
|
||||
} CntrInfo;
|
||||
|
||||
class RocrPerfCntrApp {
|
||||
public:
|
||||
// Constructor of the class. Will initialize the list of perf counters
|
||||
// that will be used to program the device
|
||||
RocrPerfCntrApp();
|
||||
|
||||
// Destructor of the class
|
||||
~RocrPerfCntrApp();
|
||||
|
||||
// Return the number of perf counters
|
||||
uint32_t GetNumPerfCntrs();
|
||||
|
||||
// Return the handle of perf counter at specified index
|
||||
CntrInfo* GetPerfCntr(uint32_t idx);
|
||||
|
||||
// Print the list of perf counters
|
||||
bool PrintCntrs();
|
||||
|
||||
// Initialize the list of perf counters
|
||||
hsa_status_t Init(hsa_agent_t agent);
|
||||
|
||||
// Register Pre and Post dispatch callbacks
|
||||
void RegisterCallbacks(hsa_queue_t* queue);
|
||||
|
||||
// Wait for perf counter collection to complete
|
||||
hsa_status_t Wait();
|
||||
|
||||
// Validate perf counter values
|
||||
hsa_status_t Validate();
|
||||
|
||||
private:
|
||||
// Number of queues to create
|
||||
std::vector<CntrInfo*> cntrList_;
|
||||
|
||||
// Handle of Perf Cntr Manager
|
||||
hsa_ext_tools_pmu_t perfMgr_;
|
||||
};
|
||||
|
||||
#endif // ROCR_PERF_CNTR_APP_H_
|
||||
@@ -0,0 +1,476 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <cassert>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "hsa.h"
|
||||
#include "hsa_rsrc_factory.hpp"
|
||||
#include "hsa_ext_finalize.h"
|
||||
#include "hsa_ext_profiler.h"
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Provide access to command line arguments passed in by user
|
||||
uint32_t hsa_cmdline_arg_cnt;
|
||||
char** hsa_cmdline_arg_list;
|
||||
|
||||
// Callback function to find and bind kernarg region of an agent
|
||||
static hsa_status_t find_memregions(hsa_region_t region, void* data) {
|
||||
hsa_region_global_flag_t flags;
|
||||
hsa_region_segment_t segment_id;
|
||||
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
|
||||
if (segment_id != HSA_REGION_SEGMENT_GLOBAL) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
AgentInfo* agent_info = (AgentInfo*)data;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
|
||||
if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
|
||||
agent_info->coarse_region = region;
|
||||
}
|
||||
|
||||
if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
|
||||
agent_info->kernarg_region = region;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Callback function to get the number of agents
|
||||
static hsa_status_t get_hsa_agents(hsa_agent_t agent, void* data) {
|
||||
// Copy handle of agent and increment number of agents reported
|
||||
HsaRsrcFactory* rsrcFactory = reinterpret_cast<HsaRsrcFactory*>(data);
|
||||
|
||||
// Determine if device is a Gpu agent
|
||||
hsa_status_t status;
|
||||
hsa_device_type_t type;
|
||||
status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
|
||||
if (type == HSA_DEVICE_TYPE_DSP) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
if (type == HSA_DEVICE_TYPE_CPU) {
|
||||
AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(malloc(sizeof(AgentInfo)));
|
||||
agent_info->dev_id = agent;
|
||||
agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
|
||||
rsrcFactory->AddAgentInfo(agent_info, false);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Device is a Gpu agent, build an instance of AgentInfo
|
||||
AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(malloc(sizeof(AgentInfo)));
|
||||
agent_info->dev_id = agent;
|
||||
agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
|
||||
agent_info->max_wave_size = 0;
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
|
||||
agent_info->max_queue_size = 0;
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
|
||||
agent_info->profile = hsa_profile_t(108);
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
|
||||
|
||||
// Initialize memory regions to zero
|
||||
agent_info->kernarg_region.handle = 0;
|
||||
agent_info->coarse_region.handle = 0;
|
||||
|
||||
// Find and Bind Memory regions of the Gpu agent
|
||||
hsa_agent_iterate_regions(agent, find_memregions, agent_info);
|
||||
|
||||
// Save the instance of AgentInfo
|
||||
rsrcFactory->AddAgentInfo(agent_info, true);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Definitions for Static Data members of the class
|
||||
char* HsaRsrcFactory::brig_path_ = NULL;
|
||||
uint32_t HsaRsrcFactory::num_cus_ = 4;
|
||||
uint32_t HsaRsrcFactory::num_waves_;
|
||||
uint32_t HsaRsrcFactory::num_workitems_;
|
||||
uint32_t HsaRsrcFactory::kernel_loop_count_;
|
||||
bool HsaRsrcFactory::print_debug_info_ = false;
|
||||
|
||||
char* HsaRsrcFactory::num_cus_key_ = "num_cus";
|
||||
char* HsaRsrcFactory::brig_path_key_ = "brig_path";
|
||||
char* HsaRsrcFactory::num_waves_key_ = "waves_per_cu";
|
||||
char* HsaRsrcFactory::num_workitems_key_ = "workitems_per_wave";
|
||||
char* HsaRsrcFactory::print_debug_key_ = "print_debug";
|
||||
char* HsaRsrcFactory::kernel_loop_count_key_ = "kernel_loop_count";
|
||||
|
||||
// Constructor of the class
|
||||
HsaRsrcFactory::HsaRsrcFactory() {
|
||||
// Initialize the Hsa Runtime
|
||||
hsa_status_t status = hsa_init();
|
||||
check("Error in hsa_init", status);
|
||||
|
||||
// Discover the set of Gpu devices available on the platform
|
||||
status = hsa_iterate_agents(get_hsa_agents, this);
|
||||
check("Error Calling hsa_iterate_agents", status);
|
||||
|
||||
// Process command line arguments
|
||||
ProcessCmdline();
|
||||
}
|
||||
|
||||
// Destructor of the class
|
||||
HsaRsrcFactory::~HsaRsrcFactory() {}
|
||||
|
||||
// Get the count of Hsa Gpu Agents available on the platform
|
||||
//
|
||||
// @return uint32_t Number of Gpu agents on platform
|
||||
//
|
||||
uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); }
|
||||
|
||||
// Get the count of Hsa Cpu Agents available on the platform
|
||||
//
|
||||
// @return uint32_t Number of Cpu agents on platform
|
||||
//
|
||||
uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); }
|
||||
|
||||
// Get the AgentInfo handle of a Gpu device
|
||||
//
|
||||
// @param idx Gpu Agent at specified index
|
||||
//
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
|
||||
// Determine if request is valid
|
||||
uint32_t size = uint32_t(gpu_list_.size());
|
||||
if (idx >= size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy AgentInfo from specified index
|
||||
*agent_info = gpu_list_[idx];
|
||||
return true;
|
||||
}
|
||||
|
||||
// Get the AgentInfo handle of a Cpu device
|
||||
//
|
||||
// @param idx Cpu Agent at specified index
|
||||
//
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
|
||||
// Determine if request is valid
|
||||
uint32_t size = uint32_t(cpu_list_.size());
|
||||
if (idx >= size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy AgentInfo from specified index
|
||||
*agent_info = cpu_list_[idx];
|
||||
return true;
|
||||
}
|
||||
|
||||
// Create a Queue object and return its handle. The queue object is expected
|
||||
// to support user requested number of Aql dispatch packets.
|
||||
//
|
||||
// @param agent_info Gpu Agent on which to create a queue object
|
||||
//
|
||||
// @param num_Pkts Number of packets to be held by queue
|
||||
//
|
||||
// @param queue Output parameter updated with handle of queue object
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) {
|
||||
hsa_status_t status;
|
||||
status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
|
||||
UINT32_MAX, UINT32_MAX, queue);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Create a Signal object and return its handle.
|
||||
//
|
||||
// @param value Initial value of signal object
|
||||
//
|
||||
// @param signal Output parameter updated with handle of signal object
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
|
||||
hsa_status_t status;
|
||||
status = hsa_signal_create(value, 0, NULL, signal);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Allocate memory for use by a kernel of specified size in specified
|
||||
// agent's memory region. Currently supports Global segment whose Kernarg
|
||||
// flag set.
|
||||
//
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
//
|
||||
// @param size Size of memory in terms of bytes
|
||||
//
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
//
|
||||
uint8_t* HsaRsrcFactory::AllocateLocalMemory(AgentInfo* agent_info, size_t size) {
|
||||
hsa_status_t status;
|
||||
uint8_t* buffer = NULL;
|
||||
|
||||
if (agent_info->coarse_region.handle != 0) {
|
||||
// Allocate in local memory if it is available
|
||||
status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW);
|
||||
}
|
||||
} else {
|
||||
// Allocate in system memory if local memory is not available
|
||||
status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer);
|
||||
}
|
||||
|
||||
return (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
}
|
||||
|
||||
// Allocate memory tp pass kernel parameters.
|
||||
//
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
//
|
||||
// @param size Size of memory in terms of bytes
|
||||
//
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
//
|
||||
uint8_t* HsaRsrcFactory::AllocateSysMemory(AgentInfo* agent_info, size_t size) {
|
||||
hsa_status_t status;
|
||||
uint8_t* buffer = NULL;
|
||||
status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer);
|
||||
return (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
}
|
||||
|
||||
bool HsaRsrcFactory::TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length,
|
||||
bool host_to_dev) {
|
||||
hsa_status_t status;
|
||||
status = hsa_memory_copy(dest_buff, src_buff, length);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Fake method for compilation steps only
|
||||
uint8_t* HsaRsrcFactory::AllocateMemory(AgentInfo* agent_info, size_t size) {
|
||||
hsa_status_t status;
|
||||
uint8_t* buffer = NULL;
|
||||
status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer);
|
||||
return (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
}
|
||||
|
||||
// Loads an Assembled Brig file and Finalizes it into Device Isa
|
||||
//
|
||||
// @param agent_info Gpu device for which to finalize
|
||||
//
|
||||
// @param brig_path File path of the Assembled Brig file
|
||||
//
|
||||
// @param kernel_name Name of the kernel to finalize
|
||||
//
|
||||
// @param code_desc Handle of finalized Code Descriptor that could
|
||||
// be used to submit for execution
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path,
|
||||
char* kernel_name, hsa_executable_symbol_t* code_desc) {
|
||||
// Finalize the Hsail object into code object
|
||||
hsa_status_t status;
|
||||
hsa_code_object_t code_object;
|
||||
|
||||
// Build the code object filename
|
||||
std::string filename(brig_path);
|
||||
std::cout << "Code object filename: " << filename << std::endl;
|
||||
|
||||
// Open the file containing code object
|
||||
std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
|
||||
if (!codeStream) {
|
||||
std::cout << "Error: failed to load " << filename << std::endl;
|
||||
assert(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allocate memory to read in code object from file
|
||||
size_t size = std::string::size_type(codeStream.tellg());
|
||||
char* codeBuff = (char*)AllocateSysMemory(agent_info, size);
|
||||
if (!codeBuff) {
|
||||
std::cout << "Error: failed to allocate memory for code object." << std::endl;
|
||||
assert(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Read the code object into allocated memory
|
||||
codeStream.seekg(0, std::ios::beg);
|
||||
std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), codeBuff);
|
||||
|
||||
// De-Serialize the code object that has been read into memory
|
||||
status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
std::cout << "Failed to deserialize code object" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create executable.
|
||||
hsa_executable_t hsaExecutable;
|
||||
// status = hsa_executable_create(agent_info->profile,
|
||||
status =
|
||||
hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable);
|
||||
check("Error in creating executable object", status);
|
||||
|
||||
// Load code object.
|
||||
status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, "");
|
||||
check("Error in loading executable object", status);
|
||||
|
||||
// Freeze executable.
|
||||
status = hsa_executable_freeze(hsaExecutable, "");
|
||||
check("Error in freezing executable object", status);
|
||||
|
||||
// Get symbol handle.
|
||||
hsa_executable_symbol_t kernelSymbol;
|
||||
status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0,
|
||||
&kernelSymbol);
|
||||
check("Error in looking up kernel symbol", status);
|
||||
|
||||
// Update output parameter
|
||||
*code_desc = kernelSymbol;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Add an instance of AgentInfo representing a Hsa Gpu agent
|
||||
void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) {
|
||||
// Add input to Gpu list
|
||||
if (gpu) {
|
||||
gpu_list_.push_back(agent_info);
|
||||
return;
|
||||
}
|
||||
|
||||
// Add input to Cpu list
|
||||
cpu_list_.push_back(agent_info);
|
||||
}
|
||||
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
|
||||
std::cout << header << " :" << std::endl;
|
||||
|
||||
AgentInfo* agent_info;
|
||||
int size = uint32_t(gpu_list_.size());
|
||||
for (int idx = 0; idx < size; idx++) {
|
||||
agent_info = gpu_list_[idx];
|
||||
|
||||
std::cout << "> agent[" << idx << "] :" << std::endl;
|
||||
std::cout << ">> Name : " << agent_info->name << std::endl;
|
||||
std::cout << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
|
||||
std::cout << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
|
||||
std::cout << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the file path where brig files is located. Value is
|
||||
// available only after an instance has been built.
|
||||
char* HsaRsrcFactory::GetBrigPath() { return HsaRsrcFactory::brig_path_; }
|
||||
|
||||
// Returns the number of compute units present on platform
|
||||
// Value is available only after an instance has been built.
|
||||
uint32_t HsaRsrcFactory::GetNumOfCUs() { return HsaRsrcFactory::num_cus_; }
|
||||
|
||||
// Returns the maximum number of waves that can be launched
|
||||
// per compute unit. The actual number that can be launched
|
||||
// is affected by resource availability
|
||||
//
|
||||
// Value is available only after an instance has been built.
|
||||
uint32_t HsaRsrcFactory::GetNumOfWavesPerCU() { return HsaRsrcFactory::num_waves_; }
|
||||
|
||||
// Returns the number of work-items that can execute per wave
|
||||
// Value is available only after an instance has been built.
|
||||
uint32_t HsaRsrcFactory::GetNumOfWorkItemsPerWave() { return HsaRsrcFactory::num_workitems_; }
|
||||
|
||||
// Returns the number of times kernel loop body should execute.
|
||||
// Value is available only after an instance has been built.
|
||||
uint32_t HsaRsrcFactory::GetKernelLoopCount() { return HsaRsrcFactory::kernel_loop_count_; }
|
||||
|
||||
// Returns boolean flag to indicate if debug info should be printed
|
||||
// Value is available only after an instance has been built.
|
||||
uint32_t HsaRsrcFactory::GetPrintDebugInfo() { return HsaRsrcFactory::print_debug_info_; }
|
||||
|
||||
// Process command line arguments. The method will capture
|
||||
// various user command line parameters for tests to use
|
||||
void HsaRsrcFactory::ProcessCmdline() {
|
||||
// Command line arguments are given
|
||||
uint32_t idx;
|
||||
uint32_t arg_idx;
|
||||
for (idx = 1; idx < hsa_cmdline_arg_cnt; idx += 2) {
|
||||
arg_idx = GetArgIndex((char*)hsa_cmdline_arg_list[idx]);
|
||||
switch (arg_idx) {
|
||||
case 0:
|
||||
HsaRsrcFactory::brig_path_ = hsa_cmdline_arg_list[idx + 1];
|
||||
break;
|
||||
case 1:
|
||||
HsaRsrcFactory::num_cus_ = atoi(hsa_cmdline_arg_list[idx + 1]);
|
||||
break;
|
||||
case 2:
|
||||
HsaRsrcFactory::num_waves_ = atoi(hsa_cmdline_arg_list[idx + 1]);
|
||||
break;
|
||||
case 3:
|
||||
HsaRsrcFactory::num_workitems_ = atoi(hsa_cmdline_arg_list[idx + 1]);
|
||||
break;
|
||||
case 4:
|
||||
HsaRsrcFactory::kernel_loop_count_ = atoi(hsa_cmdline_arg_list[idx + 1]);
|
||||
break;
|
||||
case 5:
|
||||
HsaRsrcFactory::print_debug_info_ = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Maps a command line key to the index used by ProcessCmdline:
//   0 - Brig file path          1 - Number of Compute Units
//   2 - Number of Waves per CU  3 - Number of Workitems per Wave
//   4 - Kernel Loop Count       5 - print debug info
//
// @param arg_value Key string taken from the command line
//
// @return uint32_t Index of the first matching key, 108 if none matches
//
uint32_t HsaRsrcFactory::GetArgIndex(char* arg_value) {
  // Keys listed in the order of the indices they map to.
  const char* const known_keys[] = {
      HsaRsrcFactory::brig_path_key_,     HsaRsrcFactory::num_cus_key_,
      HsaRsrcFactory::num_waves_key_,     HsaRsrcFactory::num_workitems_key_,
      HsaRsrcFactory::kernel_loop_count_key_, HsaRsrcFactory::print_debug_key_,
  };
  const uint32_t key_count = sizeof(known_keys) / sizeof(known_keys[0]);

  for (uint32_t pos = 0; pos < key_count; ++pos) {
    if (strcmp(known_keys[pos], arg_value) == 0) {
      return pos;
    }
  }

  return 108;
}
|
||||
|
||||
// Prints the help banner listing every command line key that GetArgIndex
// recognises, including the print-debug key.
void HsaRsrcFactory::PrintHelpMsg() {
  std::cout << "Key for passing Brig filepath: " << HsaRsrcFactory::brig_path_key_ << std::endl;
  std::cout << "Key for passing Number of Compute Units: " << HsaRsrcFactory::num_cus_key_
            << std::endl;
  std::cout << "Key for passing Number of Waves per CU: " << HsaRsrcFactory::num_waves_key_
            << std::endl;
  std::cout << "Key for passing Number of Workitems per Wave: "
            << HsaRsrcFactory::num_workitems_key_ << std::endl;
  std::cout << "Key for passing Kernel Loop Count: " << HsaRsrcFactory::kernel_loop_count_key_
            << std::endl;
  std::cout << "Key for enabling Print Debug Info: " << HsaRsrcFactory::print_debug_key_
            << std::endl;
}
|
||||
@@ -0,0 +1,262 @@
|
||||
#ifndef HSA_RSRC_FACTORY_H_
|
||||
#define HSA_RSRC_FACTORY_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "hsatimer.h"
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_finalize.h"
|
||||
|
||||
#define HSA_ARGUMENT_ALIGN_BYTES 16
|
||||
#define HSA_QUEUE_ALIGN_BYTES 64
|
||||
#define HSA_PACKET_ALIGN_BYTES 64
|
||||
|
||||
// Terminates the process with exit(1) when 'status' is not
// HSA_STATUS_SUCCESS, after printing 'msg' and the runtime's description of
// the status ("<unknown error>" if none is provided).
//
// 'status' is evaluated exactly once, so passing a call expression does not
// repeat the call on the failure path. The do/while(0) wrapper makes the
// macro a single statement; invoke it with a trailing semicolon.
#define check(msg, status)                                            \
  do {                                                                \
    const hsa_status_t check_status_ = (status);                      \
    if (check_status_ != HSA_STATUS_SUCCESS) {                        \
      const char* emsg = 0;                                           \
      hsa_status_string(check_status_, &emsg);                        \
      printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>");       \
      exit(1);                                                        \
    }                                                                 \
  } while (0)
|
||||
|
||||
// Terminates the process with exit(1), after printing 'msg', when 'status'
// differs from STATUS_SUCCESS (not defined in this header; it must be
// provided by the including code). The argument is parenthesized and the
// body is wrapped in do/while(0) so the macro behaves as one statement;
// invoke it with a trailing semicolon.
#define check_build(msg, status)          \
  do {                                    \
    if ((status) != STATUS_SUCCESS) {     \
      printf("%s\n", msg);                \
      exit(1);                            \
    }                                     \
  } while (0)
|
||||
|
||||
// Provide access to command line arguments passed in by user
|
||||
extern uint32_t hsa_cmdline_arg_cnt;
|
||||
extern char** hsa_cmdline_arg_list;
|
||||
|
||||
// Encapsulates information about a Hsa Agent such as its
|
||||
// handle, name, max queue size, max wavefront size, etc.
|
||||
typedef struct {
|
||||
// Handle of Agent
|
||||
hsa_agent_t dev_id;
|
||||
|
||||
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
|
||||
uint32_t dev_type;
|
||||
|
||||
// Name of Agent whose length is less than 64
|
||||
char name[64];
|
||||
|
||||
// Max size of Wavefront size
|
||||
uint32_t max_wave_size;
|
||||
|
||||
// Max size of Queue buffer
|
||||
uint32_t max_queue_size;
|
||||
|
||||
// Hsail profile supported by agent
|
||||
hsa_profile_t profile;
|
||||
|
||||
// Memory region supporting kernel parameters
|
||||
hsa_region_t coarse_region;
|
||||
|
||||
// Memory region supporting kernel arguments
|
||||
hsa_region_t kernarg_region;
|
||||
|
||||
} AgentInfo;
|
||||
|
||||
class HsaRsrcFactory {
|
||||
public:
|
||||
// Constructor of the class. Will initialize the Hsa Runtime and
|
||||
// query the system topology to get the list of Cpu and Gpu devices
|
||||
HsaRsrcFactory();
|
||||
|
||||
// Destructor of the class
|
||||
~HsaRsrcFactory();
|
||||
|
||||
// Get the count of Hsa Gpu Agents available on the platform
|
||||
//
|
||||
// @return uint32_t Number of Gpu agents on platform
|
||||
//
|
||||
uint32_t GetCountOfGpuAgents();
|
||||
|
||||
// Get the count of Hsa Cpu Agents available on the platform
|
||||
//
|
||||
// @return uint32_t Number of Cpu agents on platform
|
||||
//
|
||||
uint32_t GetCountOfCpuAgents();
|
||||
|
||||
// Get the AgentInfo handle of a Gpu device
|
||||
//
|
||||
// @param idx Gpu Agent at specified index
|
||||
//
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
|
||||
|
||||
// Get the AgentInfo handle of a Cpu device
|
||||
//
|
||||
// @param idx Cpu Agent at specified index
|
||||
//
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
|
||||
|
||||
// Create a Queue object and return its handle. The queue object is expected
|
||||
// to support user requested number of Aql dispatch packets.
|
||||
//
|
||||
// @param agent_info Gpu Agent on which to create a queue object
|
||||
//
|
||||
// @param num_Pkts Number of packets to be held by queue
|
||||
//
|
||||
// @param queue Output parameter updated with handle of queue object
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
|
||||
|
||||
// Create a Signal object and return its handle.
|
||||
//
|
||||
// @param value Initial value of signal object
|
||||
//
|
||||
// @param signal Output parameter updated with handle of signal object
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool CreateSignal(uint32_t value, hsa_signal_t* signal);
|
||||
|
||||
// Allocate memory for use by a kernel of specified size in specified
|
||||
// agent's memory region. Currently supports Global segment whose Kernarg
|
||||
// flag set.
|
||||
//
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
//
|
||||
// @param size Size of memory in terms of bytes
|
||||
//
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
//
|
||||
uint8_t* AllocateLocalMemory(AgentInfo* agent_info, size_t size);
|
||||
uint8_t* AllocateMemory(AgentInfo* agent_info, size_t size);
|
||||
|
||||
bool TransferData(uint8_t* dest_buff, uint8_t* src_buff, uint32_t length, bool host_to_dev);
|
||||
|
||||
// Allocate memory tp pass kernel parameters.
|
||||
//
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
//
|
||||
// @param size Size of memory in terms of bytes
|
||||
//
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
//
|
||||
uint8_t* AllocateSysMemory(AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Loads an Assembled Brig file and Finalizes it into Device Isa
|
||||
//
|
||||
// @param agent_info Gpu device for which to finalize
|
||||
//
|
||||
// @param brig_path File path of the Assembled Brig file
|
||||
//
|
||||
// @param kernel_name Name of the kernel to finalize
|
||||
//
|
||||
// @param code_desc Handle of finalized Code Descriptor that could
|
||||
// be used to submit for execution
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, char* kernel_name,
|
||||
hsa_executable_symbol_t* code_desc);
|
||||
|
||||
// Add an instance of AgentInfo representing a Hsa Gpu agent
|
||||
void AddAgentInfo(AgentInfo* agent_info, bool gpu);
|
||||
|
||||
// Returns the file path where brig files is located
|
||||
static char* GetBrigPath();
|
||||
|
||||
// Returns the number of compute units present on platform
|
||||
static uint32_t GetNumOfCUs();
|
||||
|
||||
// Returns the maximum number of waves that can be launched
|
||||
// per compute unit. The actual number that can be launched
|
||||
// is affected by resource availability
|
||||
static uint32_t GetNumOfWavesPerCU();
|
||||
|
||||
// Returns the number of work-items that can execute per wave
|
||||
static uint32_t GetNumOfWorkItemsPerWave();
|
||||
|
||||
// Returns the number of times kernel loop body should execute.
|
||||
static uint32_t GetKernelLoopCount();
|
||||
|
||||
// Returns boolean flag to indicate if debug info should be printed
|
||||
static uint32_t GetPrintDebugInfo();
|
||||
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool PrintGpuAgents(const std::string& header);
|
||||
|
||||
private:
|
||||
// Number of queues to create
|
||||
uint32_t num_queues_;
|
||||
|
||||
// Used to maintain a list of Hsa Queue handles
|
||||
std::vector<hsa_queue_t*> queue_list_;
|
||||
|
||||
// Number of Signals to create
|
||||
uint32_t num_signals_;
|
||||
|
||||
// Used to maintain a list of Hsa Signal handles
|
||||
std::vector<hsa_signal_t*> signal_list_;
|
||||
|
||||
// Number of agents reported by platform
|
||||
uint32_t num_agents_;
|
||||
|
||||
// Used to maintain a list of Hsa Gpu Agent Info
|
||||
std::vector<AgentInfo*> gpu_list_;
|
||||
|
||||
// Used to maintain a list of Hsa Cpu Agent Info
|
||||
std::vector<AgentInfo*> cpu_list_;
|
||||
|
||||
// Records the file path where Brig file is located.
|
||||
// Value is available only after an instance has been built.
|
||||
static char* brig_path_;
|
||||
static char* brig_path_key_;
|
||||
|
||||
// Records the number of Compute units present on system.
|
||||
// Value is available only after an instance has been built.
|
||||
static uint32_t num_cus_;
|
||||
static char* num_cus_key_;
|
||||
|
||||
// Records the number of waves that can be launched per Compute unit
|
||||
// Value is available only after an instance has been built.
|
||||
static uint32_t num_waves_;
|
||||
static char* num_waves_key_;
|
||||
|
||||
// Records the number of work-items that can be packed into a wave
|
||||
// Value is available only after an instance has been built.
|
||||
static uint32_t num_workitems_;
|
||||
static char* num_workitems_key_;
|
||||
|
||||
// Records the number of times kernel loop body should run. Value
|
||||
// is available only after an instance has been built.
|
||||
static uint32_t kernel_loop_count_;
|
||||
static char* kernel_loop_count_key_;
|
||||
|
||||
// Records the number of times kernel loop body should run. Value
|
||||
// is available only after an instance has been built.
|
||||
static bool print_debug_info_;
|
||||
static char* print_debug_key_;
|
||||
|
||||
// Process command line arguments. The method will capture
|
||||
// various user command line parameters for tests to use
|
||||
static void ProcessCmdline();
|
||||
|
||||
// Prints the help banner on user arg keys
|
||||
static void PrintHelpMsg();
|
||||
|
||||
// Maps an index for the user argument
|
||||
static uint32_t GetArgIndex(char* arg_value);
|
||||
};
|
||||
|
||||
#endif // HSA_RSRC_FACTORY_H_
|
||||
@@ -0,0 +1,168 @@
|
||||
#include "hsatimer.h"
|
||||
|
||||
// Measures the TSC frequency once at construction and stores it (in the
// units returned by MeasureTSCFreqHz) for later tick conversion.
PerfTimer::PerfTimer() : freq_in_100mhz(MeasureTSCFreqHz()) {}
|
||||
|
||||
// Releases every Timer created through CreateTimer, last created first.
PerfTimer::~PerfTimer() {
  for (std::vector<Timer*>::reverse_iterator it = _timers.rbegin(); it != _timers.rend(); ++it) {
    delete *it;
  }
  _timers.clear();
}
|
||||
|
||||
// a new cretaed timer instantance index will be returned
|
||||
int PerfTimer::CreateTimer() {
|
||||
Timer* newTimer = new Timer;
|
||||
newTimer->_start = 0;
|
||||
newTimer->_clocks = 0;
|
||||
|
||||
#ifdef _WIN32
|
||||
QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->_freq);
|
||||
#else
|
||||
newTimer->_freq = (long long)1.0E3;
|
||||
#endif
|
||||
|
||||
/* Push back the address of new Timer instance created */
|
||||
_timers.push_back(newTimer);
|
||||
return (int)(_timers.size() - 1);
|
||||
}
|
||||
|
||||
int PerfTimer::StartTimer(int index) {
|
||||
if (index >= (int)_timers.size()) {
|
||||
Error("Cannot reset timer. Invalid handle.");
|
||||
return HSA_FAILURE;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// General Windows timing method
|
||||
#ifndef _AMD
|
||||
long long tmpStart;
|
||||
QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart));
|
||||
_timers[index]->_start = (double)tmpStart;
|
||||
#else
|
||||
// AMD Windows timing method
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
// General Linux timing method
|
||||
#ifndef _AMD
|
||||
struct timeval s;
|
||||
gettimeofday(&s, 0);
|
||||
_timers[index]->_start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3));
|
||||
#else
|
||||
|
||||
// AMD timing method
|
||||
|
||||
unsigned int unused;
|
||||
_timers[index]->_start = __rdtscp(&unused);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
return HSA_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int PerfTimer::StopTimer(int index) {
|
||||
double n = 0;
|
||||
if (index >= (int)_timers.size()) {
|
||||
Error("Cannot reset timer. Invalid handle.");
|
||||
return HSA_FAILURE;
|
||||
}
|
||||
#ifdef _WIN32
|
||||
#ifndef _AMD
|
||||
long long n1;
|
||||
QueryPerformanceCounter((LARGE_INTEGER*)&(n1));
|
||||
n = (double)n1;
|
||||
#else
|
||||
|
||||
// AMD Window Timing
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
// General Linux timing method
|
||||
#ifndef _AMD
|
||||
struct timeval s;
|
||||
gettimeofday(&s, 0);
|
||||
n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3);
|
||||
#else
|
||||
// AMD Linux timing
|
||||
|
||||
unsigned int unused;
|
||||
n = __rdtscp(&unused);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
n -= _timers[index]->_start;
|
||||
_timers[index]->_start = 0;
|
||||
|
||||
#ifndef _AMD
|
||||
_timers[index]->_clocks += n;
|
||||
#else
|
||||
//_timers[index]->_clocks += 10 * n /freq_in_100mhz; // unit is ns
|
||||
_timers[index]->_clocks += 1.0E-6 * 10 * n / freq_in_100mhz; // convert to ms
|
||||
cout << "_AMD is enabled!!!" << endl;
|
||||
#endif
|
||||
|
||||
return HSA_SUCCESS;
|
||||
}
|
||||
|
||||
void PerfTimer::Error(string str) { cout << str << endl; }
|
||||
|
||||
|
||||
double PerfTimer::ReadTimer(int index) {
|
||||
if (index >= (int)_timers.size()) {
|
||||
Error("Cannot read timer. Invalid handle.");
|
||||
return HSA_FAILURE;
|
||||
}
|
||||
|
||||
double reading = double(_timers[index]->_clocks);
|
||||
|
||||
reading = double(reading / _timers[index]->_freq);
|
||||
|
||||
return reading;
|
||||
}
|
||||
|
||||
|
||||
// Returns a coarse timestamp in microseconds, taken from the performance
// counter on Windows and from CLOCK_MONOTONIC_RAW elsewhere.
uint64_t PerfTimer::CoarseTimestampUs() {
#ifdef _WIN32
  uint64_t counter_hz, counter_now;
  QueryPerformanceFrequency((LARGE_INTEGER*)&counter_hz);
  QueryPerformanceCounter((LARGE_INTEGER*)&counter_now);

  // Scale numerator and divisor until (counter_now * 1000000) fits in
  // uint64_t.
  while (counter_now > (1ULL << 44)) {
    counter_now /= 16;
    counter_hz /= 16;
  }

  return (counter_now * 1000000) / counter_hz;
#else
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC_RAW, &now);
  const uint64_t whole_seconds_us = uint64_t(now.tv_sec) * 1000000;
  return whole_seconds_us + now.tv_nsec / 1000;
#endif
}
|
||||
|
||||
uint64_t PerfTimer::MeasureTSCFreqHz() {
|
||||
// Make a coarse interval measurement of TSC ticks for 1 gigacycles.
|
||||
unsigned int unused;
|
||||
uint64_t tscTicksEnd;
|
||||
|
||||
uint64_t coarseBeginUs = CoarseTimestampUs();
|
||||
uint64_t tscTicksBegin = __rdtscp(&unused);
|
||||
do {
|
||||
tscTicksEnd = __rdtscp(&unused);
|
||||
} while (tscTicksEnd - tscTicksBegin < 1000000000);
|
||||
|
||||
uint64_t coarseEndUs = CoarseTimestampUs();
|
||||
|
||||
// Compute the TSC frequency and round to nearest 100MHz.
|
||||
uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000;
|
||||
uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin;
|
||||
return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs;
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
#ifndef __MYTIME__
|
||||
#define __MYTIME__
|
||||
|
||||
// Uses either the AMD timer or the general OS timer, selected by a compilation flag.
|
||||
// The implementation also depends on whether the platform is Windows or Linux.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <time.h>
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__GNUC__)
|
||||
#include <sys/time.h>
|
||||
#include <x86intrin.h>
|
||||
#endif // __GNUC__
|
||||
#endif //_MSC_VER
|
||||
|
||||
#define HSA_FAILURE 1
|
||||
#define HSA_SUCCESS 0
|
||||
|
||||
// Collection of independent timers addressed by the integer handles that
// CreateTimer returns.
class PerfTimer {
 private:
  // State kept for one timer
  struct Timer {
    std::string name;  // name of the timer object
    long long _freq;   // frequency used to scale _clocks when reading
    double _clocks;    // accumulated ticks
    double _start;     // start point ticks
  };

  std::vector<Timer*> _timers;  // owned Timer objects, indexed by handle
  double freq_in_100mhz;        // value measured by MeasureTSCFreqHz

 public:
  PerfTimer();
  ~PerfTimer();

 private:
  // AMD timing method
  uint64_t CoarseTimestampUs();
  uint64_t MeasureTSCFreqHz();

 public:
  // Timer lifecycle: create a timer, then start / stop it by handle
  int CreateTimer();
  int StartTimer(int index);
  int StopTimer(int index);

  // retrieve time
  double ReadTimer(int index);
  // write into a file
  double WriteTimer(int index);

  // Report an error message
  void Error(std::string str);
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,91 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
#include "simple_convolution.h"
|
||||
#include "test_hsa.h"
|
||||
#include "test_pgen_pmc.h"
|
||||
#include "test_pgen_sqtt.h"
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
#if defined(NDEBUG)
|
||||
clog.rdbuf(NULL);
|
||||
#endif
|
||||
|
||||
bool ret_val = true;
|
||||
|
||||
// Create SimpleConvolution test object
|
||||
TestKernel* test_kernel = new SimpleConvolution();
|
||||
TestAql* test_aql = new TestHSA(test_kernel);
|
||||
|
||||
const bool pmc_enable = (getenv("ROCR_ENABLE_PMC") != NULL);
|
||||
const bool sqtt_enable = (getenv("ROCR_ENABLE_SQTT") != NULL);
|
||||
if (pmc_enable)
|
||||
test_aql = new TestPGenPMC(test_aql);
|
||||
else if (sqtt_enable)
|
||||
test_aql = new TestPGenSQTT(test_aql);
|
||||
assert(test_aql != NULL);
|
||||
if (test_aql == NULL) return 1;
|
||||
|
||||
// Initialization of Hsa Runtime
|
||||
ret_val = test_aql->initialize(argc, argv);
|
||||
if (ret_val == false) {
|
||||
std::cout << "Error in the test initialization" << std::endl;
|
||||
assert(ret_val);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Setup Hsa resources needed for execution
|
||||
ret_val = test_aql->setup();
|
||||
if (ret_val == false) {
|
||||
std::cout << "Error in creating hsa resources" << std::endl;
|
||||
assert(ret_val);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Run SimpleConvolution kernel
|
||||
ret_val = test_aql->run();
|
||||
if (ret_val == false) {
|
||||
std::cout << "Error in running the test kernel" << std::endl;
|
||||
assert(ret_val);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Verify the results of the execution
|
||||
ret_val = test_aql->verify_results();
|
||||
if (ret_val) {
|
||||
std::cout << "Test : Passed" << std::endl;
|
||||
} else {
|
||||
std::cout << "Test : Failed" << std::endl;
|
||||
}
|
||||
|
||||
// Print time taken by sample
|
||||
test_aql->print_time();
|
||||
test_aql->cleanup();
|
||||
|
||||
return (ret_val) ? 0 : 1;
|
||||
}
|
||||
@@ -0,0 +1,87 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TESTAQL_H_
|
||||
#define _TESTAQL_H_
|
||||
|
||||
#include "hsa.h"
|
||||
#include "hsa_rsrc_factory.hpp"
|
||||
#include "hsa_ext_amd_aql_profile.h"
|
||||
|
||||
// Prints "ASSERT FAILED: <cond> : <file>(<line>)" and aborts when 'cond'
// evaluates to true.
// NOTE(review): unlike assert(), this fires when the condition HOLDS; the
// message suggests the opposite polarity may have been intended - check the
// call sites before changing it.
#define test_assert(cond)                                                   \
  {                                                                         \
    if (cond) {                                                             \
      std::cout << "ASSERT FAILED: " << #cond << " : " << __FILE__ << "(";  \
      std::cout << __LINE__ << ")" << std::endl;                            \
      abort();                                                              \
    }                                                                       \
  }
|
||||
|
||||
// Test AQL interface
|
||||
class TestAql {
|
||||
TestAql* const test_aql;
|
||||
|
||||
public:
|
||||
TestAql(TestAql* t = 0) : test_aql(t) {}
|
||||
virtual ~TestAql() {}
|
||||
|
||||
TestAql* testAql() { return test_aql; }
|
||||
virtual AgentInfo* getAgentInfo() { return (test_aql) ? test_aql->getAgentInfo() : 0; }
|
||||
virtual hsa_queue_t* getQueue() { return (test_aql) ? test_aql->getQueue() : 0; }
|
||||
virtual HsaRsrcFactory* getRsrcFactory() { return (test_aql) ? test_aql->getRsrcFactory() : 0; }
|
||||
|
||||
// Initialize application environment including setting
|
||||
// up of various configuration parameters based on
|
||||
// command line arguments
|
||||
// @return bool true on success and false on failure
|
||||
virtual bool initialize(int argc, char** argv) {
|
||||
return (test_aql) ? test_aql->initialize(argc, argv) : true;
|
||||
}
|
||||
|
||||
// Setup application parameters for exectuion
|
||||
// @return bool true on success and false on failure
|
||||
virtual bool setup() { return (test_aql) ? test_aql->setup() : true; }
|
||||
|
||||
// Run the kernel
|
||||
// @return bool true on success and false on failure
|
||||
virtual bool run() { return (test_aql) ? test_aql->run() : true; }
|
||||
|
||||
// Verify results
|
||||
// @return bool true on success and false on failure
|
||||
virtual bool verify_results() { return (test_aql) ? test_aql->verify_results() : true; }
|
||||
|
||||
// Print to console the time taken to execute kernel
|
||||
virtual void print_time() {
|
||||
if (test_aql) test_aql->print_time();
|
||||
}
|
||||
|
||||
// Release resources e.g. memory allocations
|
||||
// @return bool true on success and false on failure
|
||||
virtual bool cleanup() { return (test_aql) ? test_aql->cleanup() : true; }
|
||||
};
|
||||
|
||||
#endif // _TESTAQL_H_
|
||||
@@ -0,0 +1,234 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "os.h"
|
||||
#include "helper_funcs.hpp"
|
||||
#include "hsa_rsrc_factory.hpp"
|
||||
#include "test_hsa.h"
|
||||
|
||||
bool TestHSA::initialize(int arg_cnt, char** arg_list) {
|
||||
std::cout << "TestHSA::initialize :" << std::endl;
|
||||
// Initialize command line arguments
|
||||
hsa_cmdline_arg_cnt = arg_cnt;
|
||||
hsa_cmdline_arg_list = arg_list;
|
||||
|
||||
// Instantiate a Timer object
|
||||
setup_timer_idx_ = hsa_timer_.CreateTimer();
|
||||
dispatch_timer_idx_ = hsa_timer_.CreateTimer();
|
||||
|
||||
// Instantiate an instance of Hsa Resources Factory
|
||||
hsa_rsrc_ = new HsaRsrcFactory();
|
||||
|
||||
// Print properties of the agents
|
||||
hsa_rsrc_->PrintGpuAgents("> GPU agents");
|
||||
|
||||
// Create an instance of Gpu agent
|
||||
const char* p = getenv("ROCR_AGENT_IND");
|
||||
const uint32_t agent_ind = (p == NULL) ? 0 : atol(p);
|
||||
if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) {
|
||||
std::cout << "> error: agent[" << agent_ind << "] is not found" << std::endl;
|
||||
return false;
|
||||
}
|
||||
std::cout << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl;
|
||||
|
||||
// Create an instance of Aql Queue
|
||||
uint32_t num_pkts = 128;
|
||||
hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_);
|
||||
|
||||
// Obtain handle of signal
|
||||
hsa_rsrc_->CreateSignal(1, &hsa_signal_);
|
||||
|
||||
// Obtain the code object file name
|
||||
std::string agentName(agent_info_->name);
|
||||
if (agentName.compare(0, 4, "gfx8") == 0) {
|
||||
brig_path_obj_.append("gfx8");
|
||||
} else if (agentName.compare(0, 4, "gfx9") == 0) {
|
||||
brig_path_obj_.append("gfx9");
|
||||
} else {
|
||||
assert(false);
|
||||
return false;
|
||||
}
|
||||
brig_path_obj_.append("_" + name_ + ".hsaco");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TestHSA::setup() {
|
||||
std::cout << "TestHSA::setup :" << std::endl;
|
||||
|
||||
// Start the timer object
|
||||
hsa_timer_.StartTimer(setup_timer_idx_);
|
||||
|
||||
mem_map_t& mem_map = test_->get_mem_map();
|
||||
for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) {
|
||||
mem_descr_t& des = it->second;
|
||||
void* ptr = (des.local) ? hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size)
|
||||
: hsa_rsrc_->AllocateSysMemory(agent_info_, des.size);
|
||||
des.ptr = ptr;
|
||||
assert(ptr != NULL);
|
||||
if (ptr == NULL) return false;
|
||||
}
|
||||
test_->init();
|
||||
|
||||
// Load and Finalize Kernel Code Descriptor
|
||||
char* brig_path = (char*)brig_path_obj_.c_str();
|
||||
const bool ret_val =
|
||||
hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, strdup(name_.c_str()), &kernel_code_desc_);
|
||||
if (ret_val == false) {
|
||||
std::cout << "Error in loading and finalizing Kernel" << std::endl;
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
// Stop the timer object
|
||||
hsa_timer_.StopTimer(setup_timer_idx_);
|
||||
setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_);
|
||||
total_time_taken_ = setup_time_taken_;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TestHSA::run() {
|
||||
std::cout << "TestHSA::run :" << std::endl;
|
||||
|
||||
const uint32_t work_group_size = 64;
|
||||
const uint32_t work_grid_size = test_->get_elements_count();
|
||||
uint32_t group_segment_size = 0;
|
||||
uint32_t private_segment_size = 0;
|
||||
const size_t kernarg_segment_size = test_->get_kernarg_size();
|
||||
uint64_t code_handle = 0;
|
||||
|
||||
// Retrieve the amount of group memory needed
|
||||
hsa_executable_symbol_get_info(
|
||||
kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
|
||||
|
||||
// Retrieve the amount of private memory needed
|
||||
hsa_executable_symbol_get_info(kernel_code_desc_,
|
||||
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
|
||||
&private_segment_size);
|
||||
|
||||
// Check the kernel args size
|
||||
size_t size_info = 0;
|
||||
hsa_executable_symbol_get_info(
|
||||
kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info);
|
||||
assert(kernarg_segment_size == size_info);
|
||||
if (kernarg_segment_size != size_info) return false;
|
||||
|
||||
// Retrieve handle of the code block
|
||||
hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
|
||||
&code_handle);
|
||||
|
||||
// Initialize the dispatch packet.
|
||||
hsa_kernel_dispatch_packet_t aql;
|
||||
memset(&aql, 0, sizeof(aql));
|
||||
// Set the packet's type, acquire and release fences
|
||||
aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
// Populate Aql packet with default values
|
||||
aql.setup = 1;
|
||||
aql.grid_size_x = work_grid_size;
|
||||
aql.grid_size_y = 1;
|
||||
aql.grid_size_z = 1;
|
||||
aql.workgroup_size_x = work_group_size;
|
||||
aql.workgroup_size_y = 1;
|
||||
aql.workgroup_size_z = 1;
|
||||
// Bind the kernel code descriptor and arguments
|
||||
aql.kernel_object = code_handle;
|
||||
aql.kernarg_address = test_->get_kernarg_ptr();
|
||||
aql.group_segment_size = group_segment_size;
|
||||
aql.private_segment_size = private_segment_size;
|
||||
// Initialize Aql packet with handle of signal
|
||||
aql.completion_signal = hsa_signal_;
|
||||
|
||||
// Compute the write index of queue and copy Aql packet into it
|
||||
const uint64_t que_idx = hsa_queue_load_write_index_relaxed(hsa_queue_);
|
||||
const uint32_t mask = hsa_queue_->size - 1;
|
||||
|
||||
std::cout << "> Executing kernel: \"" << name_ << "\"" << std::endl;
|
||||
|
||||
// Start the timer object
|
||||
hsa_timer_.StartTimer(dispatch_timer_idx_);
|
||||
|
||||
// Disable packet so that submission to HW is complete
|
||||
const auto header = aql.header;
|
||||
const uint8_t packet_type_mask = (1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
|
||||
aql.header &= (~packet_type_mask) << HSA_PACKET_HEADER_TYPE;
|
||||
aql.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
// Copy Aql packet into queue buffer
|
||||
((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask] = aql;
|
||||
|
||||
// After AQL packet is fully copied into queue buffer
|
||||
// update packet header from invalid state to valid state
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask].header = header;
|
||||
|
||||
// Increment the write index and ring the doorbell to dispatch the kernel.
|
||||
hsa_queue_store_write_index_relaxed(hsa_queue_, (que_idx + 1));
|
||||
hsa_signal_store_relaxed(hsa_queue_->doorbell_signal, que_idx);
|
||||
|
||||
std::cout << "> Waiting on kernel dispatch signal" << std::endl;
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished.
|
||||
// Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling
|
||||
hsa_signal_value_t value = hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1,
|
||||
(uint64_t)-1, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Stop the timer object
|
||||
hsa_timer_.StopTimer(dispatch_timer_idx_);
|
||||
dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_);
|
||||
total_time_taken_ += dispatch_time_taken_;
|
||||
|
||||
// Copy kernel buffers from local memory into system memory
|
||||
hsa_rsrc_->TransferData((uint8_t*)test_->get_output_ptr(), (uint8_t*)test_->get_local_ptr(),
|
||||
test_->get_output_size(), false);
|
||||
test_->print_output();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TestHSA::verify_results() {
|
||||
// Compare the results and see if they match
|
||||
const int32_t cmp_val =
|
||||
memcmp(test_->get_output_ptr(), test_->get_refout_ptr(), test_->get_output_size());
|
||||
return (cmp_val == 0);
|
||||
}
|
||||
|
||||
void TestHSA::print_time() {
|
||||
std::cout << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_
|
||||
<< std::endl;
|
||||
std::cout << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_
|
||||
<< std::endl;
|
||||
std::cout << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
bool TestHSA::cleanup() {
|
||||
// shutdown Hsa Runtime system
|
||||
hsa_status_t ret_val = hsa_shut_down();
|
||||
return (HSA_STATUS_SUCCESS == ret_val);
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TEST_HSA_H_
|
||||
#define _TEST_HSA_H_
|
||||
|
||||
#include "test_aql.h"
|
||||
#include "test_kernel.h"
|
||||
#include "hsa_rsrc_factory.hpp"
|
||||
|
||||
// Class implements HSA test
|
||||
class TestHSA : public TestAql {
|
||||
public:
|
||||
// Constructor
|
||||
TestHSA(TestKernel* test) : test_(test), name_(test->Name()) {
|
||||
total_time_taken_ = 0;
|
||||
setup_time_taken_ = 0;
|
||||
dispatch_time_taken_ = 0;
|
||||
}
|
||||
|
||||
// Get methods for Agent Info, HAS queue, HSA Resourcse Manager
|
||||
AgentInfo* getAgentInfo() { return agent_info_; }
|
||||
hsa_queue_t* getQueue() { return hsa_queue_; }
|
||||
HsaRsrcFactory* getRsrcFactory() { return hsa_rsrc_; }
|
||||
|
||||
// Initialize application environment including setting
|
||||
// up of various configuration parameters based on
|
||||
// command line arguments
|
||||
// @return bool true on success and false on failure
|
||||
bool initialize(int argc, char** argv);
|
||||
|
||||
// Setup application parameters for exectuion
|
||||
// @return bool true on success and false on failure
|
||||
bool setup();
|
||||
|
||||
// Run the BinarySearch kernel
|
||||
// @return bool true on success and false on failure
|
||||
bool run();
|
||||
|
||||
// Verify against reference implementation
|
||||
// @return bool true on success and false on failure
|
||||
bool verify_results();
|
||||
|
||||
// Print to console the time taken to execute kernel
|
||||
void print_time();
|
||||
|
||||
// Release resources e.g. memory allocations
|
||||
// @return bool true on success and false on failure
|
||||
bool cleanup();
|
||||
|
||||
private:
|
||||
typedef TestKernel::mem_descr_t mem_descr_t;
|
||||
typedef TestKernel::mem_map_t mem_map_t;
|
||||
typedef TestKernel::mem_it_t mem_it_t;
|
||||
|
||||
// Test object
|
||||
TestKernel* test_;
|
||||
|
||||
// Path of Brig file
|
||||
std::string brig_path_obj_;
|
||||
|
||||
// Used to track time taken to run the sample
|
||||
double total_time_taken_;
|
||||
double setup_time_taken_;
|
||||
double dispatch_time_taken_;
|
||||
|
||||
// Handle to an Hsa Gpu Agent
|
||||
AgentInfo* agent_info_;
|
||||
|
||||
// Handle to an Hsa Queue
|
||||
hsa_queue_t* hsa_queue_;
|
||||
|
||||
// Handle of signal
|
||||
hsa_signal_t hsa_signal_;
|
||||
|
||||
// Handle of Kernel Code Descriptor
|
||||
hsa_executable_symbol_t kernel_code_desc_;
|
||||
|
||||
// Instance of timer object
|
||||
uint32_t setup_timer_idx_;
|
||||
uint32_t dispatch_timer_idx_;
|
||||
PerfTimer hsa_timer_;
|
||||
|
||||
// Instance of Hsa Resources Factory
|
||||
HsaRsrcFactory* hsa_rsrc_;
|
||||
|
||||
// Test kernel name
|
||||
std::string name_;
|
||||
};
|
||||
|
||||
#endif // _TEST_HSA_H_
|
||||
@@ -0,0 +1,105 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TEST_KERNEL_H_
|
||||
#define _TEST_KERNEL_H_
|
||||
|
||||
#include <map>
|
||||
#include <stdint.h>
|
||||
|
||||
// Class implements Kernel test: an abstract description of a test kernel and
// of the memory buffers it needs. Derived classes register their buffers with
// set_sys_descr()/set_local_descr() and implement the pure virtual methods.
class TestKernel {
 public:
  // Memory descriptors IDs
  enum { INPUT_DES_ID, OUTPUT_DES_ID, LOCAL_DES_ID, MASK_DES_ID, KERNARG_DES_ID, REFOUT_DES_ID };

  // Memory descriptor declaration
  struct mem_descr_t {
    void* ptr;      // buffer address, NULL until the buffer is allocated
    uint32_t size;  // buffer size
    bool local;     // true for a local memory buffer, false for system memory
  };

  // Memory map declaration
  typedef std::map<uint32_t, mem_descr_t> mem_map_t;
  typedef mem_map_t::iterator mem_it_t;
  typedef mem_map_t::const_iterator mem_const_it_t;

  // Virtual destructor: the class is used polymorphically, so derived objects
  // must be destructible through a TestKernel pointer
  virtual ~TestKernel() {}

  // Initialize method
  virtual void init() = 0;

  // Return kernel memory map
  mem_map_t& get_mem_map() { return mem_map_; }

  // Return NULL descriptor
  static mem_descr_t null_descriptor() { return {NULL, 0, false}; }

  // Methods to get the kernel attributes; an unregistered descriptor yields
  // a NULL pointer and a zero size
  void* get_kernarg_ptr() const { return get_descr(KERNARG_DES_ID).ptr; }
  uint32_t get_kernarg_size() const { return get_descr(KERNARG_DES_ID).size; }
  void* get_output_ptr() const { return get_descr(OUTPUT_DES_ID).ptr; }
  uint32_t get_output_size() const { return get_descr(OUTPUT_DES_ID).size; }
  void* get_local_ptr() const { return get_descr(LOCAL_DES_ID).ptr; }
  void* get_refout_ptr() const { return get_descr(REFOUT_DES_ID).ptr; }
  virtual uint32_t get_elements_count() const = 0;

  // Print output
  virtual void print_output() const = 0;

  // Return name
  virtual std::string Name() const = 0;

 protected:
  // Set system memory descriptor
  // @return bool true if registered, false if the id is already in the map
  bool set_sys_descr(const uint32_t& id, const uint32_t& size) {
    return set_mem_descr(id, size, false);
  }

  // Set local memory descriptor
  // @return bool true if registered, false if the id is already in the map
  bool set_local_descr(const uint32_t& id, const uint32_t& size) {
    return set_mem_descr(id, size, true);
  }

  // Get memory descriptor; returns the NULL descriptor for an unknown id
  mem_descr_t get_descr(const uint32_t& id) const {
    mem_const_it_t it = mem_map_.find(id);
    return (it != mem_map_.end()) ? it->second : null_descriptor();
  }

 private:
  // Set memory descriptor; an existing entry with the same id is kept as is
  bool set_mem_descr(const uint32_t& id, const uint32_t& size, const bool& local) {
    const mem_descr_t des = {NULL, size, local};
    auto ret = mem_map_.insert(mem_map_t::value_type(id, des));
    return ret.second;
  }

  // Kernel memory map object
  mem_map_t mem_map_;
};
|
||||
|
||||
#endif // _TEST_KERNEL_H_
|
||||
@@ -0,0 +1,46 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TEST_PGEN_H_
|
||||
#define _TEST_PGEN_H_
|
||||
|
||||
#include "test_pmgr.h"
|
||||
#include "hsa_ext_amd_aql_profile.h"
|
||||
|
||||
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
|
||||
class TestPGen : public TestPMgr {
|
||||
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
|
||||
|
||||
protected:
|
||||
packet_t* PrePacket() { return reinterpret_cast<packet_t*>(&prePacket); }
|
||||
packet_t* PostPacket() { return reinterpret_cast<packet_t*>(&postPacket); }
|
||||
|
||||
public:
|
||||
TestPGen(TestAql* t) : TestPMgr(t) {}
|
||||
};
|
||||
|
||||
#endif // _TEST_PGEN_H_
|
||||
@@ -0,0 +1,142 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TEST_PGEN_PMC_H_
|
||||
#define _TEST_PGEN_PMC_H_
|
||||
|
||||
#include "test_pgen.h"
|
||||
|
||||
// Data iteration callback: appends a copy of the passed info record to the
// std::vector<hsa_ext_amd_aql_profile_info_data_t> given as callback_data.
// Declared inline because it is defined in a header, which would otherwise
// produce duplicate symbols when the header is included in several sources.
// @param info_type type of the passed record (not used)
// @param info_data record to collect
// @param callback_data pointer to the vector collecting the records
// @return HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR if a pointer is NULL
inline hsa_status_t TestPGenPMC_Callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                         hsa_ext_amd_aql_profile_info_data_t* info_data,
                                         void* callback_data) {
  typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> passed_data_t;
  (void)info_type;
  if (info_data == NULL || callback_data == NULL) return HSA_STATUS_ERROR;
  static_cast<passed_data_t*>(callback_data)->push_back(*info_data);
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
|
||||
class TestPGenPMC : public TestPGen {
|
||||
const static uint32_t buffer_alignment = 0x1000; // 4K
|
||||
|
||||
hsa_agent_t agent;
|
||||
hsa_ext_amd_aql_profile_profile_t profile;
|
||||
hsa_ext_amd_aql_profile_event_t events[2];
|
||||
|
||||
bool buildPackets() { return true; }
|
||||
|
||||
bool dumpData() {
|
||||
std::cout << "TestPGenPMC::dumpData :" << std::endl;
|
||||
|
||||
typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> callback_data_t;
|
||||
|
||||
callback_data_t data;
|
||||
hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenPMC_Callback, &data);
|
||||
for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) {
|
||||
std::cout << "> sample(" << dec << it->sample_id << ") block("
|
||||
<< it->pmc_data.event.block_name << "_" << it->pmc_data.event.block_index
|
||||
<< ") result(" << hex << it->pmc_data.result << ")" << std::endl;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
TestPGenPMC(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen PMC" << std::endl; }
|
||||
|
||||
bool initialize(int arg_cnt, char** arg_list) {
|
||||
if (!TestPMgr::initialize(arg_cnt, arg_list)) return false;
|
||||
|
||||
hsa_status_t status;
|
||||
hsa_agent_t agent;
|
||||
uint32_t command_buffer_alignment;
|
||||
uint32_t command_buffer_size;
|
||||
uint32_t output_buffer_alignment;
|
||||
uint32_t output_buffer_size;
|
||||
|
||||
// GPU identificator
|
||||
agent = getAgentInfo()->dev_id;
|
||||
|
||||
// Instantiation of the profile object
|
||||
// //////////////////////////////////////////////////////////////
|
||||
// Set the event fields
|
||||
events[0].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ;
|
||||
events[0].block_index = 0;
|
||||
events[0].counter_id = 0x4; // SQ_SQ_PERF_SEL_WAVES
|
||||
events[1].block_name = HSA_EXT_AQL_PROFILE_BLOCK_SQ;
|
||||
events[1].block_index = 0;
|
||||
events[1].counter_id = 0xe; // SQ_SQ_PERF_SEL_ITEMS
|
||||
|
||||
// Initialization the profile
|
||||
memset(&profile, 0, sizeof(profile));
|
||||
profile.agent = agent;
|
||||
profile.type = HSA_EXT_AQL_PROFILE_EVENT_PMC;
|
||||
|
||||
// set enabled events list
|
||||
profile.events = events;
|
||||
profile.event_count = 2;
|
||||
|
||||
// Profile buffers attributes
|
||||
command_buffer_alignment = buffer_alignment;
|
||||
status = hsa_ext_amd_aql_profile_get_info(
|
||||
&profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
output_buffer_alignment = buffer_alignment;
|
||||
status = hsa_ext_amd_aql_profile_get_info(&profile, HSA_EXT_AQL_PROFILE_INFO_PMC_DATA_SIZE,
|
||||
&output_buffer_size);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
// Application is allocating the command buffer
|
||||
// Allocate(command_buffer_alignment, command_buffer_size,
|
||||
// MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA)
|
||||
profile.command_buffer.ptr =
|
||||
getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size);
|
||||
profile.command_buffer.size = command_buffer_size;
|
||||
|
||||
// Application is allocating the output buffer
|
||||
// Allocate(output_buffer_alignment, output_buffer_size,
|
||||
// MODE_HOST_ACC|MODE_DEV_ACC)
|
||||
profile.output_buffer.ptr =
|
||||
getRsrcFactory()->AllocateSysMemory(getAgentInfo(), output_buffer_size);
|
||||
profile.output_buffer.size = output_buffer_size;
|
||||
memset(profile.output_buffer.ptr, 0x77, output_buffer_size);
|
||||
|
||||
// Populating the AQL start packet
|
||||
status = hsa_ext_amd_aql_profile_start(&profile, PrePacket());
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
if (status != HSA_STATUS_SUCCESS) return false;
|
||||
|
||||
// Populating the AQL stop packet
|
||||
status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket());
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // _TEST_PGEN_PMC_H_
|
||||
@@ -0,0 +1,160 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TEST_PGEN_SQTT_H_
|
||||
#define _TEST_PGEN_SQTT_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fstream>
|
||||
|
||||
#include "test_pgen.h"
|
||||
|
||||
// Data iteration callback: appends a copy of the passed info record to the
// std::vector<hsa_ext_amd_aql_profile_info_data_t> given as callback_data.
// Declared inline because it is defined in a header, which would otherwise
// produce duplicate symbols when the header is included in several sources.
// @param info_type type of the passed record (not used)
// @param info_data record to collect
// @param callback_data pointer to the vector collecting the records
// @return HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR if a pointer is NULL
inline hsa_status_t TestPGenSQTT_Callback(hsa_ext_amd_aql_profile_info_type_t info_type,
                                          hsa_ext_amd_aql_profile_info_data_t* info_data,
                                          void* callback_data) {
  typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> passed_data_t;
  (void)info_type;
  if (info_data == NULL || callback_data == NULL) return HSA_STATUS_ERROR;
  static_cast<passed_data_t*>(callback_data)->push_back(*info_data);
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
|
||||
class TestPGenSQTT : public TestPGen {
|
||||
const static uint32_t buffer_alignment = 0x1000; // 4K
|
||||
const static uint32_t buffer_size = 0x2000000; // 32M
|
||||
|
||||
hsa_agent_t agent;
|
||||
hsa_ext_amd_aql_profile_profile_t profile;
|
||||
|
||||
bool buildPackets() { return true; }
|
||||
|
||||
bool dumpData() {
|
||||
std::cout << "TestPGenSQTT::dumpData :" << std::endl;
|
||||
|
||||
typedef std::vector<hsa_ext_amd_aql_profile_info_data_t> callback_data_t;
|
||||
|
||||
callback_data_t data;
|
||||
hsa_ext_amd_aql_profile_iterate_data(&profile, TestPGenSQTT_Callback, &data);
|
||||
for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) {
|
||||
std::cout << "> sample(" << dec << it->sample_id << ") ptr(" << hex << it->sqtt_data.ptr
|
||||
<< ") size(" << dec << it->sqtt_data.size << ")" << std::endl;
|
||||
|
||||
void* sys_buf = getRsrcFactory()->AllocateSysMemory(getAgentInfo(), it->sqtt_data.size);
|
||||
assert(sys_buf != NULL);
|
||||
if (sys_buf == NULL) return HSA_STATUS_ERROR;
|
||||
|
||||
hsa_status_t status = hsa_memory_copy(sys_buf, it->sqtt_data.ptr, it->sqtt_data.size);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
if (status != HSA_STATUS_SUCCESS) return status;
|
||||
|
||||
std::string file_name;
|
||||
file_name.append("sqtt_dump_");
|
||||
file_name.append(std::to_string(it->sample_id));
|
||||
file_name.append(".txt");
|
||||
std::ofstream out_file;
|
||||
out_file.open(file_name);
|
||||
|
||||
// Write the buffer in terms of shorts (16 bits)
|
||||
short* sqtt_data = (short*)sys_buf;
|
||||
for (int i = 0; i < (it->sqtt_data.size / sizeof(short)); ++i) {
|
||||
out_file << std::setw(4) << std::setfill('0') << std::hex << sqtt_data[i] << "\n";
|
||||
}
|
||||
|
||||
out_file.close();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
TestPGenSQTT(TestAql* t) : TestPGen(t) { std::cout << "Test: PGen SQTT" << std::endl; }
|
||||
|
||||
bool initialize(int arg_cnt, char** arg_list) {
|
||||
if (!TestPMgr::initialize(arg_cnt, arg_list)) return false;
|
||||
|
||||
hsa_status_t status;
|
||||
hsa_agent_t agent;
|
||||
uint32_t command_buffer_alignment;
|
||||
uint32_t command_buffer_size;
|
||||
uint32_t output_buffer_alignment;
|
||||
uint32_t output_buffer_size;
|
||||
|
||||
// GPU identificator
|
||||
agent = getAgentInfo()->dev_id;
|
||||
|
||||
// Instantiation of the profile object
|
||||
// //////////////////////////////////////////////////////////////
|
||||
// Set the parameters
|
||||
// parameters = ....;
|
||||
|
||||
// Initialization the profile
|
||||
memset(&profile, 0, sizeof(profile));
|
||||
profile.agent = agent;
|
||||
profile.type = HSA_EXT_AQL_PROFILE_EVENT_SQTT;
|
||||
|
||||
// set parameters
|
||||
// profile.parameters = &event;
|
||||
// profile.parameter_count = 1;
|
||||
|
||||
// Profile buffers attributes
|
||||
command_buffer_alignment = buffer_alignment;
|
||||
status = hsa_ext_amd_aql_profile_get_info(
|
||||
&profile, HSA_EXT_AQL_PROFILE_INFO_COMMAND_BUFFER_SIZE, &command_buffer_size);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
output_buffer_alignment = buffer_alignment;
|
||||
output_buffer_size = buffer_size;
|
||||
|
||||
// Application is allocating the command buffer
|
||||
// AllocateSystem(command_buffer_alignment, command_buffer_size,
|
||||
// MODE_HOST_ACC|MODE_DEV_ACC|MODE_EXEC_DATA)
|
||||
profile.command_buffer.ptr =
|
||||
getRsrcFactory()->AllocateSysMemory(getAgentInfo(), command_buffer_size);
|
||||
profile.command_buffer.size = command_buffer_size;
|
||||
|
||||
// Application is allocating the output buffer
|
||||
// AllocateLocal(output_buffer_alignment, output_buffer_size,
|
||||
// MODE_DEV_ACC)
|
||||
profile.output_buffer.ptr =
|
||||
getRsrcFactory()->AllocateLocalMemory(getAgentInfo(), output_buffer_size);
|
||||
profile.output_buffer.size = output_buffer_size;
|
||||
|
||||
// Populating the AQL start packet
|
||||
status = hsa_ext_amd_aql_profile_start(&profile, PrePacket());
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
if (status != HSA_STATUS_SUCCESS) return false;
|
||||
|
||||
// Populating the AQL stop packet
|
||||
status = hsa_ext_amd_aql_profile_stop(&profile, PostPacket());
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // _TEST_PGEN_SQTT_H_
|
||||
@@ -0,0 +1,98 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <atomic>
|
||||
#include <assert.h>
|
||||
|
||||
#include "test_pmgr.h"
|
||||
|
||||
bool TestPMgr::addPacket(const packet_t* packet) {
|
||||
packet_t aql_packet = *packet;
|
||||
|
||||
// Compute the write index of queue and copy Aql packet into it
|
||||
uint64_t que_idx = hsa_queue_load_write_index_relaxed(getQueue());
|
||||
const uint32_t mask = getQueue()->size - 1;
|
||||
|
||||
// Disable packet so that submission to HW is complete
|
||||
const auto header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
|
||||
aql_packet.header &= (~((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) << HSA_PACKET_HEADER_TYPE;
|
||||
aql_packet.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
|
||||
|
||||
// Copy Aql packet into queue buffer
|
||||
((packet_t*)(getQueue()->base_address))[que_idx & mask] = aql_packet;
|
||||
|
||||
// After AQL packet is fully copied into queue buffer
|
||||
// update packet header from invalid state to valid state
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
((packet_t*)(getQueue()->base_address))[que_idx & mask].header = header;
|
||||
|
||||
// Increment the write index and ring the doorbell to dispatch the kernel.
|
||||
hsa_queue_store_write_index_relaxed(getQueue(), (que_idx + 1));
|
||||
hsa_signal_store_relaxed(getQueue()->doorbell_signal, que_idx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TestPMgr::run() {
|
||||
// Build Aql Pkts
|
||||
const bool active = buildPackets();
|
||||
if (active) {
|
||||
// Submit Pre-Dispatch Aql packet
|
||||
addPacket(&prePacket);
|
||||
}
|
||||
|
||||
testAql()->run();
|
||||
|
||||
if (active) {
|
||||
// Set post packet completion signal
|
||||
postPacket.completion_signal = postSignal;
|
||||
|
||||
// Submit Post-Dispatch Aql packet
|
||||
addPacket(&postPacket);
|
||||
|
||||
// Wait for Post-Dispatch packet to complete
|
||||
hsa_signal_wait_acquire(postSignal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
|
||||
HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Dumping profiling data
|
||||
dumpData();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TestPMgr::initialize(int argc, char** argv) {
|
||||
TestAql::initialize(argc, argv);
|
||||
hsa_status_t status = hsa_signal_create(1, 0, NULL, &postSignal);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Wraps the given test and sets both signals to a zero handle; the real
// post-dispatch signal is created later in initialize().
TestPMgr::TestPMgr(TestAql* t) : TestAql(t) {
  dummySignal.handle = 0;
  // Until initialize() runs, postSignal is the same zero-handle signal
  postSignal = dummySignal;
}
|
||||
@@ -0,0 +1,57 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _TEST_SMGR_H_
|
||||
#define _TEST_SMGR_H_
|
||||
|
||||
#include "test_aql.h"
|
||||
#include "amd_aql_pm4_ib_packet.h"
|
||||
|
||||
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
|
||||
class TestPMgr : public TestAql {
|
||||
public:
|
||||
typedef amd_aql_pm4_ib_packet_t packet_t;
|
||||
|
||||
private:
|
||||
bool addPacket(const packet_t* packet);
|
||||
|
||||
protected:
|
||||
packet_t prePacket;
|
||||
packet_t postPacket;
|
||||
hsa_signal_t dummySignal;
|
||||
hsa_signal_t postSignal;
|
||||
|
||||
virtual bool buildPackets() { return false; }
|
||||
virtual bool dumpData() { return false; }
|
||||
virtual bool initialize(int argc, char** argv);
|
||||
|
||||
public:
|
||||
TestPMgr(TestAql* t);
|
||||
bool run();
|
||||
};
|
||||
|
||||
#endif // _TEST_SMGR_H_
|
||||
@@ -0,0 +1,81 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
********************************************************************************/
|
||||
|
||||
/**
 * SimpleConvolution: every output pixel is the weighted sum of the input
 * pixels around it. The mask dimensions define the neighborhood and the
 * mask values are the weights. The neighborhood is clipped at the image
 * borders. One work-item computes one output pixel.
 * @param output           output matrix, one value per input pixel
 * @param input            input matrix the convolution is applied to
 * @param mask             mask matrix holding the weights
 * @param inputDimensions  width (x) and height (y) of the input matrix
 * @param maskDimensions   width (x) and height (y) of the mask matrix
 */
__kernel void simpleConvolution(__global uint * output,
                                __global uint * input,
                                __global float * mask,
                                const uint2 inputDimensions,
                                const uint2 maskDimensions) {
    const uint tid = get_global_id(0);

    const uint width  = inputDimensions.x;
    const uint height = inputDimensions.y;
    const uint maskWidth  = maskDimensions.x;
    const uint maskHeight = maskDimensions.y;

    // Pixel handled by this work-item
    const uint x = tid % width;
    const uint y = tid / width;

    // Half extents of the mask
    const uint vstep = (maskWidth - 1) / 2;
    const uint hstep = (maskHeight - 1) / 2;

    // Window around (x, y), clipped so that it stays inside the image
    const uint left   = (x >= vstep) ? (x - vstep) : 0;
    const uint right  = ((x + vstep) < width) ? (x + vstep) : (width - 1);
    const uint top    = (y >= hstep) ? (y - hstep) : 0;
    const uint bottom = ((y + hstep) < height) ? (y + hstep) : (height - 1);

    // Weighted sum over the clipped window
    float sumFX = 0;
    for (uint i = left; i <= right; ++i) {
        // Mask column that lines up with image column i
        const uint maskCol = i - (x - vstep);
        for (uint j = top; j <= bottom; ++j) {
            const uint maskIndex = (j - (y - hstep)) * maskWidth + maskCol;
            const uint index = j * width + i;
            sumFX += ((float)input[index] * mask[maskIndex]);
        }
    }

    // Add one half so the truncating conversion rounds to nearest
    sumFX += 0.5f;
    output[tid] = (uint)sumFX;
}
|
||||
@@ -0,0 +1,157 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include "helper_funcs.hpp"
|
||||
#include "simple_convolution.h"
|
||||
|
||||
// Chooses the problem size (64x64 input, 3x3 mask), normalizes it and
// registers the size of every buffer the test needs.
SimpleConvolution::SimpleConvolution()
    : width_(64), height_(64), mask_width_(3), mask_height_(mask_width_) {
  // Input dimensions are rounded to a power of two
  if (!isPowerOf2(width_)) width_ = roundToPowerOf2(width_);
  if (!isPowerOf2(height_)) height_ = roundToPowerOf2(height_);

  // Mask dimensions are made odd
  if ((mask_width_ % 2) == 0) ++mask_width_;
  if ((mask_height_ % 2) == 0) ++mask_height_;

  // Inputs with fewer than 256 elements fall back to 64x64
  if (width_ * height_ < 256) {
    width_ = 64;
    height_ = 64;
  }

  const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t);
  const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float);

  // Buffer descriptors: kernel arguments, input, output, local output,
  // mask and reference output
  set_sys_descr(KERNARG_DES_ID, sizeof(kernel_args_t));
  set_sys_descr(INPUT_DES_ID, input_size_bytes);
  set_sys_descr(OUTPUT_DES_ID, input_size_bytes);
  set_local_descr(LOCAL_DES_ID, input_size_bytes);
  set_sys_descr(MASK_DES_ID, mask_size_bytes);
  set_sys_descr(REFOUT_DES_ID, input_size_bytes);
}
|
||||
|
||||
void SimpleConvolution::init() {
|
||||
std::cout << "SimpleConvolution::init :" << std::endl;
|
||||
|
||||
mem_descr_t input_des = get_descr(INPUT_DES_ID);
|
||||
mem_descr_t local_des = get_descr(LOCAL_DES_ID);
|
||||
mem_descr_t mask_des = get_descr(MASK_DES_ID);
|
||||
mem_descr_t refout_des = get_descr(REFOUT_DES_ID);
|
||||
mem_descr_t kernarg_des = get_descr(KERNARG_DES_ID);
|
||||
|
||||
uint32_t* input = (uint32_t*)input_des.ptr;
|
||||
uint32_t* output_local = (uint32_t*)local_des.ptr;
|
||||
float* mask = (float*)mask_des.ptr;
|
||||
kernel_args_t* kernel_args = (kernel_args_t*)kernarg_des.ptr;
|
||||
|
||||
// random initialisation of input
|
||||
fillRandom<uint32_t>(input, width_, height_, 0, 255);
|
||||
|
||||
// Fill a blurr filter or some other filter of your choice
|
||||
const float val = 1.0f / (mask_width_ * 2.0f - 1.0f);
|
||||
for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) {
|
||||
mask[i] = 0;
|
||||
}
|
||||
for (uint32_t i = 0; i < mask_width_; i++) {
|
||||
uint32_t y = mask_height_ / 2;
|
||||
mask[y * mask_width_ + i] = val;
|
||||
}
|
||||
for (uint32_t i = 0; i < mask_height_; i++) {
|
||||
uint32_t x = mask_width_ / 2;
|
||||
mask[i * mask_width_ + x] = val;
|
||||
}
|
||||
|
||||
// Print the INPUT array.
|
||||
printArray<uint32_t>("> Input[0]", input, width_, 1);
|
||||
printArray<float>("> Mask", mask, mask_width_, mask_height_);
|
||||
|
||||
// Fill the kernel args
|
||||
kernel_args->arg1 = output_local;
|
||||
kernel_args->arg2 = input;
|
||||
kernel_args->arg3 = mask;
|
||||
kernel_args->arg4 = width_;
|
||||
kernel_args->arg41 = height_;
|
||||
kernel_args->arg5 = mask_width_;
|
||||
kernel_args->arg51 = mask_height_;
|
||||
|
||||
// Calculate the reference output
|
||||
memset(refout_des.ptr, 0, refout_des.size);
|
||||
reference_impl((uint32_t*)refout_des.ptr, input, mask, width_, height_, mask_width_,
|
||||
mask_height_);
|
||||
}
|
||||
|
||||
// Prints the first row (width_ elements) of the output buffer.
void SimpleConvolution::print_output() const {
  uint32_t* const out = (uint32_t*)get_output_ptr();
  printArray<uint32_t>("> Output[0]", out, width_, 1);
}
|
||||
|
||||
// CPU reference of the convolution: every output pixel is the weighted sum
// of the input pixels inside the mask window around it, with the window
// clipped at the image borders and the sum rounded to the nearest integer.
//
// @param output       result matrix, width * height elements
// @param input        input matrix, width * height elements
// @param mask         weights, mask_width * mask_height elements
// @param width        width of the input matrix
// @param height       height of the input matrix
// @param mask_width   width of the mask matrix
// @param mask_height  height of the mask matrix
// @return always true
bool SimpleConvolution::reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
                                       const uint32_t width, const uint32_t height,
                                       const uint32_t mask_width, const uint32_t mask_height) {
  // Half extents of the mask
  const uint32_t half_w = (mask_width - 1) / 2;
  const uint32_t half_h = (mask_height - 1) / 2;

  // For each pixel in the input
  for (uint32_t col = 0; col < width; ++col) {
    // Column range of the window, kept inside the image
    const uint32_t first_col = (col >= half_w) ? (col - half_w) : 0;
    const uint32_t last_col = ((col + half_w) < width) ? (col + half_w) : (width - 1);

    for (uint32_t row = 0; row < height; ++row) {
      // Row range of the window, kept inside the image
      const uint32_t first_row = (row >= half_h) ? (row - half_h) : 0;
      const uint32_t last_row = ((row + half_h) < height) ? (row + half_h) : (height - 1);

      // Weighted sum within the mask boundaries
      float acc = 0;
      for (uint32_t i = first_col; i <= last_col; ++i) {
        // Mask column that lines up with image column i
        const uint32_t mask_col = i - (col - half_w);
        for (uint32_t j = first_row; j <= last_row; ++j) {
          const uint32_t mask_idx = (j - (row - half_h)) * mask_width + mask_col;
          const uint32_t index = j * width + i;
          acc += ((float)input[index] * mask[mask_idx]);
        }
      }

      // Add one half so the truncating conversion rounds to nearest
      acc += 0.5f;
      output[row * width + col] = uint32_t(acc);
    }
  }

  return true;
}
|
||||
@@ -0,0 +1,90 @@
|
||||
/******************************************************************************
|
||||
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
Redistributions of source code must retain the above copyright notice, this list
|
||||
of conditions and the following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#ifndef _SIMPLE_CONVOLUTION_H_
|
||||
#define _SIMPLE_CONVOLUTION_H_
|
||||
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
#include "test_kernel.h"
|
||||
|
||||
// SimpleConvolution: Class implements OpenCL SimpleConvolution sample
|
||||
class SimpleConvolution : public TestKernel {
|
||||
public:
|
||||
// Constructor
|
||||
SimpleConvolution();
|
||||
|
||||
// Initialize method
|
||||
void init();
|
||||
|
||||
// Return number of compute elements
|
||||
uint32_t get_elements_count() const { return width_ * height_; }
|
||||
|
||||
// Print output
|
||||
void print_output() const;
|
||||
|
||||
// Return name
|
||||
std::string Name() const { return std::string("simpleConvolution"); }
|
||||
|
||||
private:
|
||||
// Local kernel arguments declaration
|
||||
struct kernel_args_t {
|
||||
void* arg1;
|
||||
void* arg2;
|
||||
void* arg3;
|
||||
uint32_t arg4;
|
||||
uint32_t arg41;
|
||||
uint32_t arg5;
|
||||
uint32_t arg51;
|
||||
};
|
||||
|
||||
// Width of the Input array
|
||||
uint32_t width_;
|
||||
|
||||
// Height of the Input array
|
||||
uint32_t height_;
|
||||
|
||||
// Mask dimensions
|
||||
uint32_t mask_width_;
|
||||
|
||||
// Mask dimensions
|
||||
uint32_t mask_height_;
|
||||
|
||||
// Reference CPU implementation of Simple Convolution
|
||||
// @param output Output matrix after performing convolution
|
||||
// @param input Input matrix on which convolution is to be performed
|
||||
// @param mask mask matrix using which convolution was to be performed
|
||||
// @param input_dimensions dimensions of the input matrix
|
||||
// @param mask_dimensions dimensions of the mask matrix
|
||||
// @return bool true on success and false on failure
|
||||
bool reference_impl(uint32_t* output, const uint32_t* input, const float* mask,
|
||||
const uint32_t width, const uint32_t height, const uint32_t maskWidth,
|
||||
const uint32_t maskHeight);
|
||||
};
|
||||
|
||||
#endif // _SIMPLE_CONVOLUTION_H_
|
||||
@@ -0,0 +1,154 @@
|
||||
// HSAIL text for the simple convolution kernel. Register roles noted below
// are read off the instruction sequence.
module &m:1:0:$full:$large:$default;
|
||||
extension "amd:gcn";
|
||||
extension "IMAGE";
|
||||
|
||||
decl prog function &abort()();
|
||||
|
||||
// Kernel arguments: a global offset for dimension 0, the output, input and
// mask pointers, then the two-element input and mask dimensions.
prog kernel &__OpenCL_SimpleConvolution(kernarg_u64 %__global_offset_0,
|
||||
kernarg_u64 %output,
|
||||
kernarg_u64 %input,
|
||||
kernarg_u64 %mask,
|
||||
kernarg_u32 %inputDimensions[2],
|
||||
kernarg_u32 %maskDimensions[2]) {
|
||||
|
||||
pragma "AMD RTI", "ARGSTART:__OpenCL_SimpleConvolution";
|
||||
pragma "AMD RTI", "version:3:1:104";
|
||||
pragma "AMD RTI", "device:generic";
|
||||
pragma "AMD RTI", "uniqueid:1024";
|
||||
pragma "AMD RTI", "memory:private:0";
|
||||
pragma "AMD RTI", "memory:region:0";
|
||||
pragma "AMD RTI", "memory:local:0";
|
||||
pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0";
|
||||
pragma "AMD RTI", "pointer:output:u32:1:1:96:uav:7:4:RW:0:0:0";
|
||||
pragma "AMD RTI", "pointer:input:u32:1:1:112:uav:7:4:RW:0:0:0";
|
||||
pragma "AMD RTI", "pointer:mask:float:1:1:128:uav:7:4:RW:0:0:0";
|
||||
pragma "AMD RTI", "value:inputDimensions:u32:2:1:144";
|
||||
pragma "AMD RTI", "constarg:4:inputDimensions";
|
||||
pragma "AMD RTI", "value:maskDimensions:u32:2:1:160";
|
||||
pragma "AMD RTI", "constarg:5:maskDimensions";
|
||||
pragma "AMD RTI", "function:1:0";
|
||||
pragma "AMD RTI", "memory:64bitABI";
|
||||
pragma "AMD RTI", "privateid:8";
|
||||
pragma "AMD RTI", "enqueue_kernel:0";
|
||||
pragma "AMD RTI", "kernel_index:0";
|
||||
pragma "AMD RTI", "reflection:0:size_t";
|
||||
pragma "AMD RTI", "reflection:1:uint*";
|
||||
pragma "AMD RTI", "reflection:2:uint*";
|
||||
pragma "AMD RTI", "reflection:3:float*";
|
||||
pragma "AMD RTI", "reflection:4:uint2";
|
||||
pragma "AMD RTI", "reflection:5:uint2";
|
||||
pragma "AMD RTI", "ARGEND:__OpenCL_SimpleConvolution";
|
||||
|
||||
@__OpenCL_SimpleConvolution_Entry:
|
||||
|
||||
// BB#0: // %entry
|
||||
|
||||
// $d0 = work-item absolute id (dim 0) + global offset; $s5 = its low 32 bits
workitemabsid_u32 $s6, 0;
|
||||
cvt_u64_u32 $d0, $s6;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d4, [%__global_offset_0];
|
||||
add_u64 $d0, $d0, $d4;
|
||||
cvt_u32_u64 $s5, $d0;
|
||||
// ($s0, $s4) = inputDimensions, ($s1, $s9) = maskDimensions
ld_v2_kernarg_align(4)_width(all)_u32 ($s0, $s4), [%inputDimensions];
|
||||
ld_v2_kernarg_align(4)_width(all)_u32 ($s1, $s9), [%maskDimensions];
|
||||
// $s7 = id % inputDimensions[0]; $s8 = (maskDimensions[0] - 1) >> 1
rem_u32 $s7, $s5, $s0;
|
||||
add_u32 $s2, $s1, 4294967295;
|
||||
shr_u32 $s8, $s2, 1;
|
||||
// $s2 = upper bound of the outer loop: $s7 + $s8, clamped to $s0 - 1
add_u32 $s2, $s7, $s8;
|
||||
add_u32 $s3, $s0, 4294967295;
|
||||
cmp_ge_b1_u32 $c0, $s2, $s0;
|
||||
cmov_b32 $s2, $c0, $s3, $s2;
|
||||
// $s3 = lower bound of the outer loop: $s7 - $s8, or 0 when $s7 < $s8
sub_u32 $s3, $s7, $s8;
|
||||
cmp_lt_b1_u32 $c0, $s7, $s8;
|
||||
cmov_b32 $s3, $c0, 0, $s3;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d1, [%output];
|
||||
// Enter the loop nest only when lower bound <= upper bound
cmp_le_b1_u32 $c0, $s3, $s2;
|
||||
cbr_b1 $c0, @BB0_2;
|
||||
|
||||
// BB#1:
|
||||
|
||||
// Empty range: the accumulator $s6 is 0, go straight to the store
mov_b32 $s6, 0;
|
||||
br @BB0_6;
|
||||
|
||||
// @BB0_2: // %for.cond32.preheader.lr.ph
|
||||
|
||||
@BB0_2:
|
||||
|
||||
// $s5 = id / inputDimensions[0]; $s9 = (maskDimensions[1] - 1) >> 1
div_u32 $s5, $s5, $s0;
|
||||
add_u32 $s9, $s9, 4294967295;
|
||||
shr_u32 $s9, $s9, 1;
|
||||
// $s4 = upper bound of the inner loop: $s5 + $s9, clamped to inputDimensions[1] - 1
add_u32 $s10, $s5, $s9;
|
||||
add_u32 $s11, $s4, 4294967295;
|
||||
cmp_ge_b1_u32 $c0, $s10, $s4;
|
||||
cmov_b32 $s4, $c0, $s11, $s10;
|
||||
// $s5 = lower bound of the inner loop: $s5 - $s9, or 0 when $s5 < $s9
sub_u32 $s10, $s5, $s9;
|
||||
cmp_lt_b1_u32 $c0, $s5, $s9;
|
||||
cmov_b32 $s5, $c0, 0, $s10;
|
||||
ld_kernarg_align(8)_width(all)_u64 $d2, [%mask];
|
||||
ld_kernarg_align(8)_width(all)_u64 $d3, [%input];
|
||||
// Recompute id / inputDimensions[0] into $s6
cvt_u64_u32 $d5, $s6;
|
||||
add_u64 $d4, $d4, $d5;
|
||||
cvt_u32_u64 $s6, $d4;
|
||||
div_u32 $s6, $s6, $s0;
|
||||
max_u32 $s10, $s9, $s6;
|
||||
sub_u32 $s12, $s10, $s6;
|
||||
max_u32 $s11, $s7, $s8;
|
||||
// $s6 becomes the f32 accumulator, starting at 0
mov_b32 $s6, 0;
|
||||
// $s7 = starting element index into the mask, $s8 = starting element index into the input
mad_u32 $s12, $s1, $s12, $s11;
|
||||
sub_u32 $s7, $s12, $s7;
|
||||
sub_u32 $s9, $s10, $s9;
|
||||
mad_u32 $s9, $s0, $s9, $s11;
|
||||
sub_u32 $s8, $s9, $s8;
|
||||
|
||||
// @BB0_3: // %for.cond32.preheader
|
||||
|
||||
@BB0_3:
|
||||
|
||||
// Outer loop body: reset the running mask index ($s9), input index ($s10)
// and inner counter ($s11); skip the inner loop when its range is empty
cmp_gt_b1_u32 $c0, $s5, $s4;
|
||||
mov_b32 $s9, $s7;
|
||||
mov_b32 $s10, $s8;
|
||||
mov_b32 $s11, $s5;
|
||||
cbr_b1 $c0, @BB0_5;
|
||||
|
||||
// @BB0_4: // %for.body35
|
||||
|
||||
@BB0_4:
|
||||
|
||||
// Inner loop body: $s12 = f32 loaded from mask + 4 * $s9
cvt_u64_u32 $d4, $s9;
|
||||
shl_u64 $d4, $d4, 2;
|
||||
add_u64 $d4, $d2, $d4;
|
||||
ld_global_align(4)_f32 $s12, [$d4];
|
||||
// $s13 = u32 loaded from input + 4 * $s10, converted to f32
cvt_u64_u32 $d4, $s10;
|
||||
shl_u64 $d4, $d4, 2;
|
||||
add_u64 $d4, $d3, $d4;
|
||||
ld_global_align(4)_u32 $s13, [$d4];
|
||||
cvt_f32_u32 $s13, $s13;
|
||||
mul_ftz_f32 $s12, $s13, $s12;
|
||||
// Advance the mask index by maskDimensions[0] and the input index by inputDimensions[0]
add_u32 $s9, $s9, $s1;
|
||||
add_u32 $s10, $s10, $s0;
|
||||
add_u32 $s11, $s11, 1;
|
||||
// Accumulate the product and repeat while the inner counter <= $s4
add_ftz_f32 $s6, $s6, $s12;
|
||||
cmp_le_b1_u32 $c0, $s11, $s4;
|
||||
cbr_b1 $c0, @BB0_4;
|
||||
|
||||
// @BB0_5: // %for.inc48
|
||||
|
||||
@BB0_5:
|
||||
|
||||
// Outer loop increment: advance both starting indices and the outer
// counter $s3 by one, repeat while $s3 <= $s2
add_u32 $s7, $s7, 1;
|
||||
add_u32 $s8, $s8, 1;
|
||||
add_u32 $s3, $s3, 1;
|
||||
cmp_le_b1_u32 $c0, $s3, $s2;
|
||||
cbr_b1 $c0, @BB0_3;
|
||||
|
||||
// @BB0_6: // %for.end50
|
||||
|
||||
@BB0_6:
|
||||
|
||||
// Address of the result: output + 4 * (low 32 bits of the id in $d0)
and_b64 $d0, $d0, 4294967295;
|
||||
shl_u64 $d0, $d0, 2;
|
||||
add_u64 $d0, $d1, $d0;
|
||||
// Add 0.5 (0F3f000000), convert the sum to u32 and store it
add_ftz_f32 $s0, $s6, 0F3f000000;
|
||||
cvt_ftz_u32_f32 $s0, $s0;
|
||||
st_global_align(4)_u32 $s0, [$d0];
|
||||
ret;
|
||||
};
|
||||
Ссылка в новой задаче
Block a user