diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..5649152cb4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,100 @@ +################################################################################ +## +## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. +## +## MIT LICENSE: +## Permission is hereby granted, free of charge, to any person obtaining a copy of +## this software and associated documentation files (the "Software"), to deal in +## the Software without restriction, including without limitation the rights to +## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +## of the Software, and to permit persons to whom the Software is furnished to do +## so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. +## +################################################################################ + +cmake_minimum_required ( VERSION 3.5.0 ) + +## Verbose output. +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +## Set module name and project name. +set ( ROCPROFILER_NAME "rocprofiler" ) +set ( ROCPROFILER_TARGET "${ROCPROFILER_NAME}64" ) +set ( ROCPROFILER_LIBRARY "lib${ROCPROFILER_TARGET}" ) +project ( ${ROCPROFILER_TARGET} ) + +## Adding default path cmake modules +list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" ) +## Include common cmake modules +include ( utils ) +## Set build environment +include ( env ) + +## Setup the package version. +get_version ( "1.0.0" ) +message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) + message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## Set target and root/lib/test directory +set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) +set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) +set ( LIB_DIR "${ROOT_DIR}/src" ) +set ( TEST_DIR "${ROOT_DIR}/test" ) + +## Build library +include ( ${LIB_DIR}/CMakeLists.txt ) + +## Set the VERSION and SOVERSION values +set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION "${LIB_VERSION_STRING}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) + +## If the library is a release, strip the target library +if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) + add_custom_command ( TARGET ${ROCPROFILER_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} *.so ) +endif () + +## Build tests +add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) + +## Add the install directives for the runtime library. +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib COMPONENT ${ROCPROFILER_LIBRARY} ) + +## Add the packaging directives for the runtime library. +set ( CPACK_PACKAGE_NAME ${ROCPROFILER_NAME} ) +set ( CPACK_PACKAGE_VENDOR "AMD" ) +set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) +set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) +set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) +set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) +set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCPROFILER library for AMD HSA runtime API extension support" ) +set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) + +## Debian package specific variables +set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" ) +set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) + +## RPM package specific variables +set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) +set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) + +include ( CPack ) diff --git a/DEBIAN/postinst b/DEBIAN/postinst new file mode 100644 index 0000000000..3d0228844a --- /dev/null +++ b/DEBIAN/postinst @@ -0,0 +1,19 @@ +#/bin/bash + +set -e + +do_ldconfig() { + echo /opt/rocm/librocprofiler/lib > /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +} + +case "$1" in + configure) + do_ldconfig + ;; + abort-upgrade|abort-remove|abort-deconfigure) + echo "$1" + ;; + *) + exit 0 + ;; +esac diff --git a/DEBIAN/prerm b/DEBIAN/prerm new file mode 100644 index 0000000000..b3f509a9fa --- /dev/null +++ b/DEBIAN/prerm @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +rm_ldconfig() { + rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +} + +case "$1" in + remove) + rm_ldconfig + ;; + purge) + ;; + *) + exit 0 + ;; +esac diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000..c68a4aa40d --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,39 @@ +HSA ROC Profiler LICENSE + +The University of Illinois/NCSA +Open Source License (NCSA) + +Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. + +Developed by: + + AMD Research and AMD HSA Software Development + + Advanced Micro Devices, Inc. + + www.amd.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal with the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimers in + the documentation and/or other materials provided with the distribution. + - Neither the names of Advanced Micro Devices, Inc, + nor the names of its contributors may be used to endorse or promote + products derived from this Software without specific prior written + permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS WITH THE SOFTWARE. diff --git a/RPM/rpm_post b/RPM/rpm_post new file mode 100644 index 0000000000..57c5c8113e --- /dev/null +++ b/RPM/rpm_post @@ -0,0 +1 @@ +echo /opt/rocm/librocprofiler/lib > /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig diff --git a/RPM/rpm_postun b/RPM/rpm_postun new file mode 100644 index 0000000000..6b3c8f2887 --- /dev/null +++ b/RPM/rpm_postun @@ -0,0 +1 @@ +rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig diff --git a/Readme.txt b/Readme.txt new file mode 100644 index 0000000000..1dd2a6ec23 --- /dev/null +++ b/Readme.txt @@ -0,0 +1,50 @@ +ROC Profiler library. +Profiling with metrics and traces based perfcounters (PMC) and SQ threadtraces (SQTT) +and AqlProfile HSA extension. + +Library supports GFX8/GFX9 APIs. +The library source tree: + - doc - Documantation + - inc/rocprofiler.h - Library public API + - src - Library sources + - core - Library API sources + - util - Library utils sources + - xml - XML parser + - test - Library test suite + - ctrl - Test controll + - util - Test utils + - simple_convolution - Simple convolution test kernel + +Build environment: + +$ export CMAKE_PREFIX_PATH=";" +$ export CMAKE_BUILD_TYPE= # release by default +$ export CMAKE_DEBUG_TRACE=1 # 1 to enable debug tracing + +To build with the current installed ROCM: + +$ cd .../rocprofiler +$ mkdir build +$ cd build + +$ export CMAKE_PREFIX_PATH=/home/evgeny/git/compute/out/ubuntu-16.04/16.04 +$ cmake .. +$ make + +$ cmake -DCMAKE_PREFIX_PATH=/opt/rocm .. +$ make + +To run the test: + +$ cd .../rocprofiler/build +$ export LD_LIBRARY_PATH=$PWD +$ export HSA_TOOLS_LIB=librocprofiler64.so +$ run.sh + +To enabled error messages logging to '/tmp/rocprofiler_log.txt': + +$ export ROCPROFILER_LOG=1 + +To enable verbose tracing: + +$ export ROCPROFILER_TRACE=1 diff --git a/_clang-format b/_clang-format new file mode 100644 index 0000000000..0c81671e05 --- /dev/null +++ b/_clang-format @@ -0,0 +1,60 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +ConstructorInitializerIndentWidth: 4 +AlignEscapedNewlinesLeft: false +AlignTrailingComments: true +AlignConsecutiveAssignments: false +AlignOperands: false +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AlwaysBreakAfterDefinitionReturnType: false +AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: true +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BinPackParameters: true +ColumnLimit: 100 +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: true +IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false +MaxEmptyLinesToKeep: 2 +KeepEmptyLinesAtTheStartOfBlocks: false +NamespaceIndentation: None +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakString: 1000 +PenaltyBreakFirstLessLess: 120 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +DerivePointerAlignment: false +PointerAlignment: Left +SpacesBeforeTrailingComments: 2 +Cpp11BracedListStyle: true +Standard: Auto +IndentWidth: 2 +TabWidth: 8 +UseTab: Never +BreakBeforeBraces: Attach +SpacesInParentheses: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpacesInContainerLiterals: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 4 +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +SpaceBeforeParens: ControlStatements +DisableFormat: false +SortIncludes: false +... diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake new file mode 100644 index 0000000000..58e9dc5fbc --- /dev/null +++ b/cmake_modules/env.cmake @@ -0,0 +1,112 @@ +## Build is not supported on Windows plaform +if ( WIN32 ) + message ( FATAL_ERROR "Windows build is not supported." ) +endif () + +## Compiler Preprocessor definitions. +add_definitions ( -D__linux__ ) +add_definitions ( -DUNIX_OS ) +add_definitions ( -DLINUX ) +add_definitions ( -D__AMD64__ ) +add_definitions ( -D__x86_64__ ) +add_definitions ( -DAMD_INTERNAL_BUILD ) +add_definitions ( -DLITTLEENDIAN_CPU=1 ) +add_definitions ( -DHSA_LARGE_MODEL= ) +add_definitions ( -DHSA_DEPRECATED= ) + +## Linux Compiler options +set ( CMAKE_CXX_FLAGS "-std=c++11") +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) + +set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) + +set ( CMAKE_SKIP_BUILD_RPATH TRUE ) + +## CLANG options +if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" ) +endif() + +## Enable debug trace +if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) + add_definitions ( -DDEBUG_TRACE=1 ) +endif() + +## Enable HSA APIs intersepting +if ( NOT "$ENV{CMAKE_HSA_INTERCEPT}" STREQUAL "no" ) + add_definitions ( -DROCP_HSA_INTERCEPT=1 ) +endif() + +## Enable HSA proxy queue +if ( NOT "$ENV{CMAKE_HSA_PROXY}" STREQUAL "no" ) + add_definitions ( -DROCP_HSA_PROXY=1 ) +endif() + +## Check env vars +if ( NOT DEFINED CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "" ) + if ( DEFINED ENV{CMAKE_BUILD_TYPE} ) + set ( CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE} ) + endif() +endif() +if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) + set ( CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH} ) +endif() + +## Extend Compiler flags based on build type +string ( TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE ) +if ( "${CMAKE_BUILD_TYPE}" STREQUAL debug ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" ) + set ( CMAKE_BUILD_TYPE "debug" ) +else () + set ( CMAKE_BUILD_TYPE "release" ) +endif () + +## Extend Compiler flags based on Processor architecture +if ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" ) + set ( NBIT 64 ) + set ( NBITSTR "64" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" ) +elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) + set ( NBIT 32 ) + set ( NBITSTR "" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" ) +endif () + +## Find hsa-runtime headers/lib +find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa.h" ) + get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) +endif() + +set ( HSA_RUNTIME_NAME "hsa-runtime64" ) +find_library ( HSA_RUNTIME_LIB "lib${HSA_RUNTIME_NAME}.so" ) +get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) + +set ( HSA_KMT_NAME "hsakmt" ) +find_library ( HSA_KMT_LIB "lib${HSA_KMT_NAME}.so" ) +get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) + +set ( API_PATH ${HSA_RUNTIME_INC_PATH} ) + +## Basic Tool Chain Information +message ( "----------------NBIT: ${NBIT}" ) +message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" ) +message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) +message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) +message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) +message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) +message ( "------------API-path: ${API_PATH}" ) +message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) +message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake new file mode 100644 index 0000000000..3e4d77357d --- /dev/null +++ b/cmake_modules/utils.cmake @@ -0,0 +1,116 @@ +################################################################################ +## +## The University of Illinois/NCSA +## Open Source License (NCSA) +## +## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved. +## +## Developed by: +## +## AMD Research and AMD HSA Software Development +## +## Advanced Micro Devices, Inc. +## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and#or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. +## - Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimers in +## the documentation and#or other materials provided with the distribution. +## - Neither the names of Advanced Micro Devices, Inc, +## nor the names of its contributors may be used to endorse or promote +## products derived from this Software without specific prior written +## permission. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +## DEALINGS WITH THE SOFTWARE. +## +################################################################################ + +## Parses the VERSION_STRING variable and places +## the first, second and third number values in +## the major, minor and patch variables. +function( parse_version VERSION_STRING ) + + string ( FIND ${VERSION_STRING} "-" STRING_INDEX ) + + if ( ${STRING_INDEX} GREATER -1 ) + math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" ) + string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 VERSION_BUILD ) + endif () + + string ( REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING} ) + list ( LENGTH VERSIONS VERSION_COUNT ) + + if ( ${VERSION_COUNT} GREATER 0) + list ( GET VERSIONS 0 MAJOR ) + set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${MAJOR}" ) + endif () + + if ( ${VERSION_COUNT} GREATER 1 ) + list ( GET VERSIONS 1 MINOR ) + set ( VERSION_MINOR ${MINOR} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" ) + endif () + + if ( ${VERSION_COUNT} GREATER 2 ) + list ( GET VERSIONS 2 PATCH ) + set ( VERSION_PATCH ${PATCH} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" ) + endif () + + if ( DEFINED VERSION_BUILD ) + set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + endif () + + set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE ) + +endfunction () + +## Gets the current version of the repository +## using versioning tags and git describe. +## Passes back a packaging version string +## and a library version string. +function ( get_version DEFAULT_VERSION_STRING ) + + parse_version ( ${DEFAULT_VERSION_STRING} ) + + find_program ( GIT NAMES git ) + + if ( GIT ) + + execute_process ( COMMAND "git describe --dirty --long --match [0-9]* 2>/dev/null" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GIT_TAG_STRING + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RESULT ) + + if ( ${RESULT} EQUAL 0 ) + + parse_version ( ${GIT_TAG_STRING} ) + + endif () + + endif () + + set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) + set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE ) + set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE ) + set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE ) + set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + +endfunction() diff --git a/doc/ROC_profiler_spec_v_1_2_0.docx b/doc/ROC_profiler_spec_v_1_2_0.docx new file mode 100644 index 0000000000..7b24060f16 Binary files /dev/null and b/doc/ROC_profiler_spec_v_1_2_0.docx differ diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h new file mode 100644 index 0000000000..207dc48950 --- /dev/null +++ b/inc/rocprofiler.h @@ -0,0 +1,272 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// ROC Profiler API +// +// The goal of the implementation is to provide a HW specific low-level +// performance analysis interface for profiling of GPU compute applications. +// The profiling includes HW performance counters (PMC) with complex +// performance metrics and thread traces (SQTT). The profiling is supported +// by the SQTT, PMC and Callback APIs. +// +// The library can be used by a tool library loaded by HSA runtime or by +// higher level HW independent performance analysis API like PAPI. +// +// The library is written on C and will be based on AQLprofile AMD specific +// HSA extension. The library implementation requires HSA API intercepting and +// a profiling queue supporting a submit callback interface. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef INC_ROCPROFILER_H_ +#define INC_ROCPROFILER_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//////////////////////////////////////////////////////////////////////////////// +// Profiling info +// +// Profiling info objects have profiling feature info, type, parameters and data +// Also profiling data samplaes can be iterated using a callback + +// Profiling feature type +typedef enum { + ROCPROFILER_TYPE_METRIC = 0, + ROCPROFILER_TYPE_TRACE = 1 +} rocprofiler_type_t; + +// Profiling feture parameter +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_parameter_t; + +// Profiling data kind +typedef enum { + ROCPROFILER_UNINIT = 0, + ROCPROFILER_INT32 = 1, + ROCPROFILER_INT64 = 2, + ROCPROFILER_FLOAT = 3, + ROCPROFILER_DOUBLE = 4, + ROCPROFILER_BYTES = 5 +} rocprofiler_metric_kind_t; + +// Profiling data type +typedef struct { + rocprofiler_metric_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; // 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + struct { + void* ptr; + uint32_t size; + uint32_t instance_count; + bool copy; + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +// Profiling feature info +typedef struct { + rocprofiler_type_t type; // feature type + const char* name; // [in] feature name + const rocprofiler_parameter_t* parameters; // feature parameters array + uint32_t parameter_count; // feature parameters count + rocprofiler_data_t data; // [out] profiling data +} rocprofiler_info_t; + +//////////////////////////////////////////////////////////////////////////////// +// Profiling context +// +// Profiling context object accumuate all profiling information + +// Profiling context object +typedef void rocprofiler_t; + +// Profiling group object +typedef struct { + unsigned index; // group index + rocprofiler_info_t** info; // profiling info array + uint32_t info_count; // profiling info count + rocprofiler_t* context; // context object +} rocprofiler_group_t; + +// Profiling mode +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, + ROCPROFILER_MODE_CREATEQUEUE = 2, +} rocprofiler_mode_t; + +// Profiling preperties +typedef struct { + hsa_queue_t* queue; // queue for STANDALONE mode + // the queue is created and returned in CREATEQUEUE mode + uint32_t queue_depth; // created queue depth +} rocprofiler_properties_t; + +// Create new profiling context +hsa_status_t rocprofiler_open( + unsigned gpu_index, // GPU index + rocprofiler_info_t* info, // [in] profiling info array + uint32_t info_count, // profiling info count + rocprofiler_t** context, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiling properties + +// Delete profiling info +hsa_status_t rocprofiler_close( + rocprofiler_t* context); // [in] profiling context + +//////////////////////////////////////////////////////////////////////////////// +// Runtime API observer +// +// Runtime API observer is called on enter and exit for the API + +// Profiling callback data +typedef struct { + uint64_t kernel_object; + uint64_t queue_index; + uint32_t gpu_index; +} rocprofiler_callback_data_t; + +// Profiling callback type +typedef hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // [in] callback data union, data depends on + // the callback API id + void* user_data, // [in/out] user data passed to the callback + rocprofiler_group_t** group); // [out] profiling group + +// Provided standard profiling callback +static inline hsa_status_t rocprofiler_set_dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t** group) { + *group = reinterpret_cast(user_data); + return HSA_STATUS_SUCCESS; +} + +// Set/remove kernel dispatch observer +hsa_status_t rocprofiler_set_dispatch_observer( + rocprofiler_callback_t callback, // observer callback + void* data); // [in/out] passed callback data + +hsa_status_t rocprofiler_remove_dispatch_observer(); + +//////////////////////////////////////////////////////////////////////////////// +// Start/stop profiling +// +// Start/stop the context profiling invocation, have to be as many as +// contect.invocations' to collect all profiling data + +// Start profiling +hsa_status_t rocprofiler_start( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +// Stop profiling +hsa_status_t rocprofiler_stop( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +// Read profiling data +hsa_status_t rocprofiler_get_data( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +// Get profiling groups +hsa_status_t rocprofiler_get_groups( + rocprofiler_t* context, // [in] profiling context + rocprofiler_group_t** groups, // [out] profiling groups + uint32_t* group_count); // [out] profiling groups count + +// Start profiling +hsa_status_t rocprofiler_start_group( + rocprofiler_group_t* group); // [in/out] profiling group + +// Stop profiling +hsa_status_t rocprofiler_stop_group( + rocprofiler_group_t* group); // [in/out] profiling group + +// Get profiling data +hsa_status_t rocprofiler_get_group_data( + rocprofiler_group_t* group); // [in/out] profiling group + +// Get metrics data +hsa_status_t rocprofiler_get_metrics_data( + const rocprofiler_t* context); // [in/out] profiling context + +// Definition of output data iterator callback +typedef hsa_ven_amd_aqlprofile_data_callback_t rocprofiler_trace_data_callback_t; + +// Method for iterating the events output data +hsa_status_t rocprofiler_iterate_trace_data( + rocprofiler_t* context, // [in] profiling context + rocprofiler_trace_data_callback_t callback, // [in] callback to iterate the output data + void* data); // [in/out] callback data + +//////////////////////////////////////////////////////////////////////////////// +// Returning the error string method + +hsa_status_t rocprofiler_error_string ( + const char** str); // [out] the API error string pointer returning + +//////////////////////////////////////////////////////////////////////////////// +// HSA-runtime tool on-load method +bool OnLoad( + HsaApiTable* table, + uint64_t runtime_version, + uint64_t failed_tool_count, + const char* const * failed_tool_names); + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // INC_ROCPROFILER_H_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..4e1bcc43f7 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,14 @@ +# +# Build dynamic Library object +# +set ( TARGET_LIB "${TARGET_NAME}" ) +set ( LIB_SRC + ${LIB_DIR}/core/rocprofiler.cpp + ${LIB_DIR}/core/proxy_queue.cpp + ${LIB_DIR}/core/simple_proxy_queue.cpp + ${LIB_DIR}/core/intercept_queue.cpp + ${LIB_DIR}/util/hsa_rsrc_factory.cpp +) +add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++) diff --git a/src/core/context.h b/src/core/context.h new file mode 100644 index 0000000000..15a85166d0 --- /dev/null +++ b/src/core/context.h @@ -0,0 +1,399 @@ +#ifndef SRC_CORE_CONTEXT_H_ +#define SRC_CORE_CONTEXT_H_ + +#include "inc/rocprofiler.h" + +#include +#include +#include + +#include "core/metrics.h" +#include "core/profile.h" +#include "core/queue.h" +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +struct rocprofiler_contex_t; +class Context; + +inline unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } + +// Block descriptor +struct block_des_t { + uint32_t id; + uint32_t index; +}; + +// block_des_t less-then functor +struct lt_block_des { + bool operator()(const block_des_t& a1, const block_des_t& a2) const { + return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); + } +}; + +// Block status +struct block_status_t { + uint32_t max_counters; + uint32_t counter_index; + uint32_t group_index; +}; + +// Metrics arguments +template +class MetricArgs : public xml::args_cache_t { + public: + MetricArgs(const Map& map) : map_(map) {} + bool Lookup(const std::string& name, uint64_t& result) const { + rocprofiler_info_t* info = NULL; + auto it = map_.find(name); + if (it == map_.end()) EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is not found"); + info = it->second; + if (info) { + result = info->data.result_int64; + if (info->data.kind == ROCPROFILER_UNINIT) EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is uninitialized"); + if (info->data.kind != ROCPROFILER_INT64) EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is of incompatible type, not INT64"); + } else EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' info is NULL"); + return (info != NULL); + } + private: + const Map& map_; +}; + +// Profiling group +class Group { + public: + Group(const util::AgentInfo* agent_info, Context *context, const uint32_t& index) : + pmc_profile_(agent_info), + sqtt_profile_(agent_info), + context_(context), + index_(index) + {} + + void Insert(const profile_info_t& info) { + const rocprofiler_type_t type = info.rinfo->type; + info_vector_.push_back(info.rinfo); + switch (type) { + case ROCPROFILER_TYPE_METRIC: + pmc_profile_.Insert(info); + break; + case ROCPROFILER_TYPE_TRACE: + sqtt_profile_.Insert(info); + break; + default: + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler type (" << type << ")"); + } + } + + hsa_status_t Finalize() { + hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_); + if (status == HSA_STATUS_SUCCESS) { + status = sqtt_profile_.Finalize(start_vector_, stop_vector_); + } + return status; + } + + void GetProfiles(profile_vector_t& vec) { + pmc_profile_.GetProfiles(vec); + sqtt_profile_.GetProfiles(vec); + } + + void GetTraceProfiles(profile_vector_t& vec) { + sqtt_profile_.GetProfiles(vec); + } + + info_vector_t& GetInfoVector() { return info_vector_; } + const pkt_vector_t& GetStartVector() const { return start_vector_; } + const pkt_vector_t& GetStopVector() const { return stop_vector_; } + Context* GetContext() { return context_; } + uint32_t GetIndex() const { return index_; } + + private: + PmcProfile pmc_profile_; + SqttProfile sqtt_profile_; + info_vector_t info_vector_; + pkt_vector_t start_vector_; + pkt_vector_t stop_vector_; + Context* const context_; + const uint32_t index_; +}; + +// Profiling context +class Context { + public: + typedef std::map info_map_t; + + Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_info_t* info, const uint32_t info_count) : + agent_(agent_info->dev_id), + agent_info_(agent_info), + queue_(queue), + hsa_rsrc_(&util::HsaRsrcFactory::Instance()), + api_(hsa_rsrc_->AqlProfileApi()), + metrics_(agent_info) + { + Initialize(info, info_count); + Finalize(); + } + + ~Context() { + for (const auto& v : info_map_) { + const std::string& name = v.first; + const rocprofiler_info_t* info = v.second; + if ((info->type == ROCPROFILER_TYPE_METRIC) && (metrics_map_.find(name) == metrics_map_.end())) { + delete info; + } + } + } + + void Initialize(rocprofiler_info_t* info_array, const uint32_t info_count) { + info_map_t input_map; + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_info_t* info = &info_array[i]; + input_map[info->name] = info; + info->data.kind = ROCPROFILER_UNINIT; + } + + if (info_count) set_.push_back(Group(agent_info_, this, 0)); + + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_info_t* info = &info_array[i]; + info_map_[info->name] = info; + const rocprofiler_type_t type = info->type; + const char* name = info->name; + + if (type == ROCPROFILER_TYPE_METRIC) { + const Metric* metric = metrics_.Get(name); + if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "' is not found"); + auto ret = metrics_map_.insert({name, metric}); + if (!ret.second) EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "' is registered more then once"); + + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.empty()) EXC_RAISING(HSA_STATUS_ERROR, "metric name '" << name << "' is not found"); + + for (const counter_t* counter : counters_vec) { + if (metric->GetExpr()) { + auto it = input_map.find(counter->name); + if (it != input_map.end()) { + continue; + } else { + info = NewCounterInfo(counter); + info_map_[info->name] = info; + } + } + + const event_t* event = &(counter->event); + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = groups_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (block_status.max_counters == 0) { + profile_t query = {}; + query.agent = agent_; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; + hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info(&query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + if (block_status.counter_index >= block_status.max_counters) { + block_status.counter_index = 0; + block_status.group_index += 1; + } + if (block_status.group_index >= set_.size()) { + set_.push_back(Group(agent_info_, this, block_status.group_index)); + } + const uint32_t group_index = block_status.group_index; + set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); + } + } else if (type == ROCPROFILER_TYPE_TRACE) { + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler type (" << type << ")"); + } + } + } + + void Finalize() { + for (unsigned index = 0; index < set_.size(); ++index) { + const hsa_status_t status = set_[index].Finalize(); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); + } + } + + uint32_t GetGroupCount() const { return set_.size(); } + + rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + rocprofiler::info_vector_t& info_vector = set_[index].GetInfoVector(); + rocprofiler_group_t group = {}; + group.info_count = info_vector.size(); + group.info = &info_vector[0]; + group.context = reinterpret_cast(this); + group.index = index; + return group; + } + + const pkt_vector_t& StartPackets(const uint32_t& group_index) const { return set_[group_index].GetStartVector(); } + const pkt_vector_t& StopPackets(const uint32_t& group_index) const { return set_[group_index].GetStopVector(); } + + void Start(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& start_packets = StartPackets(group_index); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&start_packets[0], start_packets.size()); + } + void Stop(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& stop_packets = StopPackets(group_index); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&stop_packets[0], stop_packets.size()); + } + void Submit(const uint32_t& group_index, const packet_t* packet, Queue* const queue = NULL) { + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + Start(group_index, submit_queue); + submit_queue->Submit(packet); + Stop(group_index, submit_queue); + } + + struct callback_data_t { + info_vector_t* info_vector; + size_t index; + char* ptr; + }; + + void GetData(const uint32_t& group_index) { + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Wait for stop packet to complete + hsa_signal_wait_scacquire( + tuple.completion_signal, + HSA_SIGNAL_CONDITION_LT, + 1, + (uint64_t)-1, + HSA_WAIT_STATE_BLOCKED); + callback_data_t callback_data{tuple.info_vector, tuple.info_vector->size(), NULL}; + const hsa_status_t status = api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, DataCallback, &callback_data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + } + } + + void GetMetricsData() const { + const MetricArgs args(info_map_); + for (const auto v : metrics_map_) { + const std::string& name = v.first; + const Metric* metric = v.second; + const xml::Expr* expr = metric->GetExpr(); + if (expr) { + auto it = info_map_.find(name); + if (it == info_map_.end()) EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "', rocprofiler info is not found"); + rocprofiler_info_t* info = it->second; + info->data.result_int64 = expr->Eval(args); + info->data.kind = ROCPROFILER_INT64; + } + } + } + + void IterateTraceData(rocprofiler_trace_data_callback_t callback, void *data) { + profile_vector_t profile_vector; + set_[0].GetTraceProfiles(profile_vector); + for (auto& tuple : profile_vector) { + const hsa_status_t status = api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + } + } + + private: + // Getting profling packets + profile_vector_t GetProfiles(const uint32_t& index) { + profile_vector_t vec; + if (index >= set_.size()) { + EXC_RAISING(HSA_STATUS_ERROR, "index exceeding the maximum " << set_.size()); + } + set_[index].GetProfiles(vec); + return vec; + } + + static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type, + hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, + void* data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + callback_data_t* callback_data = reinterpret_cast(data); + info_vector_t& info_vector = *(callback_data->info_vector); + uint32_t index = callback_data->index; + const uint32_t sample_id = ainfo_data->sample_id; + if (info_vector.size() == index) { + index = 0; + } else { + if (sample_id == 0) index += 1; + } + callback_data->index = index; + + if (index < info_vector.size()) { + rocprofiler_info_t* rinfo = info_vector[index]; + if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { + if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; + rinfo->data.result_int64 += ainfo_data->pmc_data.result; + rinfo->data.kind = ROCPROFILER_INT64; + } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + if (rinfo->data.result_bytes.copy) { + char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); + const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; + const char* src = reinterpret_cast(ainfo_data->sqtt_data.ptr); + const uint32_t size = ainfo_data->sqtt_data.size; + char* ptr = (sample_id == 0) ? result_bytes_ptr : callback_data->ptr; + uint64_t* header = reinterpret_cast(ptr); + char* dest = ptr + sizeof(*header); + + if ((dest + size) < end) { + hsa_status_t status = hsa_memory_copy(dest, src, size); + if (status == HSA_STATUS_SUCCESS) { + *header = size; + rinfo->data.kind = ROCPROFILER_BYTES; + rinfo->data.result_bytes.instance_count = sample_id + 1; + callback_data->ptr = dest + align_size(size, sizeof(uint64_t)); + } + } else status = HSA_STATUS_ERROR; + } else { + if (sample_id == 0) { + rinfo->data.kind = ROCPROFILER_BYTES; + rinfo->data.result_bytes.ptr = ainfo_data->sqtt_data.ptr; + rinfo->data.result_bytes.instance_count = UINT32_MAX; + } + rinfo->data.result_bytes.instance_count += 1; + } + } else status = HSA_STATUS_ERROR; + } else status = HSA_STATUS_ERROR; + + return status; + } + + rocprofiler_info_t* NewCounterInfo(const counter_t* counter) { + rocprofiler_info_t* info = new rocprofiler_info_t{}; + info->type = ROCPROFILER_TYPE_METRIC; + info->name = counter->name.c_str(); + return info; + } + + // GPU handel + const hsa_agent_t agent_; + const util::AgentInfo* agent_info_; + // Profiling queue + Queue* queue_; + // HSA resources factory + util::HsaRsrcFactory* hsa_rsrc_; + // aqlprofile API table + const pfn_t* api_; + // Profile group set + std::vector set_; + // Metrics dictionary + MetricsDict metrics_; + // Groups map + std::map groups_map_; + // Info map + info_map_t info_map_; + // Metrics map + std::map metrics_map_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_CONTEXT_H_ diff --git a/src/core/hsa_proxy_queue.h b/src/core/hsa_proxy_queue.h new file mode 100644 index 0000000000..b1a8167e66 --- /dev/null +++ b/src/core/hsa_proxy_queue.h @@ -0,0 +1,49 @@ +#ifndef _SRC_CORE_HSA_PROXY_QUEUE_H +#define _SRC_CORE_HSA_PROXY_QUEUE_H + +#include +#include +#include +#include + +#include "core/proxy_queue.h" +#include "util/exception.h" + +namespace rocprofiler { +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +extern decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; +extern decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + +class HsaProxyQueue : public ProxyQueue { + public: + hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) { + return hsa_amd_queue_intercept_register_fn(queue_, on_submit_cb, data); + } + + void Submit(const packet_t* packet) { EXC_RAISING(HSA_STATUS_ERROR, "HsaProxyQueue::Submit() is not supported"); } + + private: + hsa_status_t Init( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue) + { + printf("HsaProxyQueue::Init()\n"); + const auto status = hsa_amd_queue_intercept_create_fn(agent, size, type, callback, data, private_segment_size, group_segment_size, &queue_); + *queue = queue_; + return status; + } + + hsa_status_t Cleanup() const { return hsa_queue_destroy_fn(queue_); } + + hsa_queue_t* queue_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_PROXY_QUEUE_H diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h new file mode 100644 index 0000000000..992dae8ac1 --- /dev/null +++ b/src/core/hsa_queue.h @@ -0,0 +1,60 @@ +#ifndef _SRC_CORE_HSA_QUEUE_H +#define _SRC_CORE_HSA_QUEUE_H + +#include + +#include "core/queue.h" +#include "core/types.h" + +namespace rocprofiler { + +class HsaQueue : public Queue { + public: + typedef void (HsaQueue::*submit_fptr_t)(const packet_t* packet); + enum { + LEGACY_SLOT_SIZE_W = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_word_t), + LEGACY_SLOT_SIZE_P = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t) + }; + struct slot_pm4_t { + packet_word_t words[LEGACY_SLOT_SIZE_W]; + }; + + HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : + queue_(queue) + {} + + void Submit(const packet_t* packet) { + // Compute the write index of queue and copy Aql packet into it + const uint64_t que_idx = hsa_queue_load_write_index_relaxed(queue_); + // Increment the write index + hsa_queue_store_write_index_relaxed(queue_, que_idx + 1); + + const uint32_t mask = queue_->size - 1; + + // Copy packet to the queue + const packet_word_t* src = reinterpret_cast(packet); + packet_t* slot = reinterpret_cast(queue_->base_address) + (que_idx & mask); + packet_word_t* dst = reinterpret_cast(slot); + const uint32_t nwords = sizeof(packet_t) / sizeof(packet_word_t); + for (unsigned i = 1; i < nwords; ++i) { + dst[i] = src[i]; + } + + // To maintain global order to ensure the prior copy of the packet contents is made visible + // before the header is updated. + // With in-order CP it will wait until the first packet in the blob will be valid + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&dst[0]); + header_atomic_ptr->store(src[0], std::memory_order_release); + + // Doorbell signaling + hsa_signal_store_relaxed(queue_->doorbell_signal, que_idx); + } + + private: + hsa_queue_t* queue_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_QUEUE_H diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp new file mode 100644 index 0000000000..29c3a00aa2 --- /dev/null +++ b/src/core/intercept_queue.cpp @@ -0,0 +1,15 @@ +#include "core/intercept_queue.h" + +namespace rocprofiler { +void InterceptQueue::HsaIntercept(HsaApiTable* table) { + table->core_->hsa_queue_create_fn = rocprofiler::InterceptQueue::QueueCreate; + table->core_->hsa_queue_destroy_fn = rocprofiler::InterceptQueue::QueueDestroy; +} + +InterceptQueue::mutex_t InterceptQueue::mutex_; +rocprofiler_callback_t InterceptQueue::on_dispatch_cb_ = NULL; +void* InterceptQueue::on_dispatch_cb_data_ = NULL; +const char* InterceptQueue::tool_lib_ = NULL; +void* InterceptQueue::tool_handle_ = NULL; +InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; +} // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h new file mode 100644 index 0000000000..bcc2160bd8 --- /dev/null +++ b/src/core/intercept_queue.h @@ -0,0 +1,161 @@ +#ifndef _SRC_CORE_INTERCEPT_QUEUE_H +#define _SRC_CORE_INTERCEPT_QUEUE_H + +#include +#include +#include +#include +#include + +#include "core/context.h" +#include "core/proxy_queue.h" +#include "core/types.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +extern decltype(hsa_queue_create)* hsa_queue_create_fn; +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + +class InterceptQueue { + public: + typedef std::recursive_mutex mutex_t; + typedef std::map obj_map_t; + + static void HsaIntercept(HsaApiTable* table); + + static void SetTool(const char* tool) { tool_lib_ = tool; } + + static void UnloadTool() { if (tool_handle_) dlclose(tool_handle_); } + + static hsa_status_t QueueCreate( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue) + { + std::lock_guard lck(mutex_); + + hsa_status_t status = HSA_STATUS_ERROR; + + if (tool_lib_) { + tool_handle_ = dlopen(tool_lib_, RTLD_NOW); + if (tool_handle_ == NULL) { + fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib_); + fprintf(stderr, "%s\n", dlerror()); + exit(1); + } + tool_lib_ = NULL; + } + + if (!obj_map_) obj_map_ = new obj_map_t; + + ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, &status); + if (status != HSA_STATUS_SUCCESS) { + InterceptQueue* obj = new InterceptQueue(agent, proxy); + (*obj_map_)[(uint64_t)(*queue)] = obj; + status = proxy->SetInterceptCB(OnSubmitCB, obj); + } + + return status; + } + + static hsa_status_t QueueDestroy(hsa_queue_t *queue) { + std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + + obj_map_t::iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + const InterceptQueue* obj = it->second; + delete obj; + obj_map_->erase(it); + status = HSA_STATUS_SUCCESS; + } + + return status; + } + + static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + for (uint64_t j = 0; j < count; ++j) { + bool to_submit = true; + const packet_t* packet = &packets_arr[j]; + + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (on_dispatch_cb_ != NULL)) { + rocprofiler_group_t* group = NULL; + const hsa_kernel_dispatch_packet_t* dispatch_packet = reinterpret_cast(packet); + rocprofiler_callback_data_t data = {dispatch_packet->kernel_object, user_que_idx, obj->agent_info_->dev_index}; + hsa_status_t status = on_dispatch_cb_(&data, on_dispatch_cb_data_, &group); + if ((status == HSA_STATUS_SUCCESS) && (group != NULL)) { + Context* context = reinterpret_cast(group->context); + const pkt_vector_t& start_vector = context->StartPackets(group->index); + const pkt_vector_t& stop_vector = context->StopPackets(group->index); + + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + + packet += 1; + } + } + + static void SetDispatchCB(rocprofiler_callback_t on_dispatch_cb, void* data) { + std::lock_guard lck(mutex_); + on_dispatch_cb_ = on_dispatch_cb; + on_dispatch_cb_data_ = data; + } + + static void UnsetDispatchCB() { + std::lock_guard lck(mutex_); + on_dispatch_cb_ = NULL; + on_dispatch_cb_data_ = NULL; + } + + private: + InterceptQueue(const hsa_agent_t& agent, ProxyQueue* proxy) : proxy_(proxy) { + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + } + ~InterceptQueue() { ProxyQueue::Destroy(proxy_); } + + static packet_word_t GetHeaderType(const packet_t* packet) { + const packet_word_t* header = reinterpret_cast(packet); + return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask; + } + + static mutex_t mutex_; + static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; + static rocprofiler_callback_t on_dispatch_cb_; + static void* on_dispatch_cb_data_; + static const char* tool_lib_; + static void* tool_handle_; + static obj_map_t* obj_map_; + + ProxyQueue* const proxy_; + const util::AgentInfo* agent_info_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_INTERCEPT_QUEUE_H diff --git a/src/core/metrics.h b/src/core/metrics.h new file mode 100644 index 0000000000..95e3536b67 --- /dev/null +++ b/src/core/metrics.h @@ -0,0 +1,169 @@ +#ifndef SRC_CORE_METRICS_H_ +#define SRC_CORE_METRICS_H_ + +#include +#include +#include + +#include +#include +#include +#include + +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "xml/expr.h" +#include "xml/xml.h" + +namespace rocprofiler { +struct counter_t { + std::string name; + event_t event; +}; +typedef std::vector counters_vec_t; + +class Metric { + public: + Metric(const std::string& name) : name_(name) {} + std::string GetName() const { return name_; } + virtual void GetCounters(counters_vec_t &vec) const = 0; + counters_vec_t GetCounters() const { + counters_vec_t counters; + GetCounters(counters); + return counters; + } + virtual const xml::Expr* GetExpr() const = 0; + private: + std::string name_; +}; + +class BaseMetric : public Metric { + public: + BaseMetric(const std::string& name, const counter_t& counter) : Metric(name), counter_(counter) {} + void GetCounters(counters_vec_t &vec) const { vec.push_back(&counter_); } + const xml::Expr* GetExpr() const { return NULL; } + private: + const counter_t counter_; +}; + +class ExprMetric : public Metric { + public: + ExprMetric(const std::string& name, const counters_vec_t& counters, const xml::Expr* expr) : Metric(name), counters_(counters), expr_(expr) {} + void GetCounters(counters_vec_t& vec) const { vec.insert(vec.end(), counters_.begin(), counters_.end()); } + const xml::Expr* GetExpr() const { return expr_; } + private: + const counters_vec_t counters_; + const xml::Expr* expr_; +}; + + +class MetricsDict { + public: + typedef std::map cache_t; + typedef cache_t::const_iterator const_iterator_t; + + class ExprCache : public xml::expr_cache_t { + public: + ExprCache(const cache_t* cache) : cache_(cache) {} + bool Lookup(const std::string& name, std::string& result) const { + bool ret = false; + auto it = cache_->find(name); + if (it != cache_->end()) { + ret = true; + const rocprofiler::ExprMetric* expr_metric = dynamic_cast(it->second); + if (expr_metric) result = expr_metric->GetExpr()->GetStr(); + } + return ret; + } + private: + const cache_t* const cache_; + }; + + MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL) { + const char* xml_name = getenv("ROCP_METRICS"); + if (xml_name != NULL) { + xml_ = new xml::Xml(xml_name); + std::cout << "ROCProfiler: importing metrics from '" << xml_name << "':" << std::endl; + ImportMetrics(agent_info, agent_info->gfxip); + ImportMetrics(agent_info, "global"); + } + } + + const Metric* Get(const std::string& name) const { + const Metric* metric = NULL; + auto it = cache_.find(name); + if (it != cache_.end()) metric = it->second; + return metric; + } + + private: + void ImportMetrics(const util::AgentInfo* agent_info, const char* scope) { + auto scope_list = xml_->GetNodes("top." + std::string(scope) + ".metric"); + if (!scope_list.empty()) { + std::cout << " " << scope_list.size() << " " << scope << " metrics found" << std::endl; + + for (auto node : scope_list) { + const std::string name = node->opts["name"]; + if (cache_.find(name) != cache_.end()) EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: metrics redefined '" << name << "'"); + + const std::string expr_str = node->opts["expr"]; + if (expr_str.empty()) { + const std::string block_name = node->opts["block"]; + const uint32_t event_id = atoi(node->opts["event"].c_str()); + + hsa_ven_amd_aqlprofile_profile_t profile; + profile.agent = agent_info->dev_id; + hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; + hsa_status_t status = util::HsaRsrcFactory::Instance().AqlProfileApi()-> + hsa_ven_amd_aqlprofile_get_info(&profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); + if (status == HSA_STATUS_SUCCESS) { + const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if (query.instance_count > 1) { + for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) { + std::ostringstream os; + os << name << '[' << block_index << ']'; + const std::string full_name = os.str(); + const counter_t counter = {full_name, {block_id, block_index, event_id}}; + cache_[full_name] = new BaseMetric(full_name, counter); + } + } else { + const counter_t counter = {name, {block_id, 0, event_id}}; + cache_[name] = new BaseMetric(name, counter); + } + } else AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name << "'"); + } else { + xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); + std::cout << " " << name << " = " << expr_obj->String() << std::endl; + counters_vec_t counters_vec; + for (const std::string var : expr_obj->GetVars()) { + auto it = cache_.find(var); + if (it == cache_.end()) EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found"); + it->second->GetCounters(counters_vec); + } + cache_[name] = new ExprMetric(name, counters_vec, expr_obj); + } + } + +#if 0 + for (auto& v : cache_) { + const Metric* metric = v.second; + counters_vec_t counters_vec; + printf("> Metric '%s'\n", metric->GetName().c_str()); + metric->GetCounters(counters_vec); + for (auto c : counters_vec) { + printf(" counter %s, b(%u), i (%u), e (%u)\n", c->name.c_str(), c->event.block_name, c->event.block_index, c->event.counter_id); + } + } +#endif + } + } + + // Metrics map + xml::Xml* xml_; + cache_t cache_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_METRICS_H_ diff --git a/src/core/profile.h b/src/core/profile.h new file mode 100644 index 0000000000..95a1a09f0a --- /dev/null +++ b/src/core/profile.h @@ -0,0 +1,206 @@ +#ifndef SRC_CORE_PROFILE_H_ +#define SRC_CORE_PROFILE_H_ + +#include "inc/rocprofiler.h" + +#include +#include + +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +struct profile_info_t { + const event_t* event; + const parameter_t* parameters; + uint32_t parameter_count; + rocprofiler_info_t* rinfo; +}; +typedef std::vector info_vector_t; +typedef std::vector pkt_vector_t; +struct profile_tuple_t { + const profile_t* profile; + info_vector_t* info_vector; + hsa_signal_t completion_signal; +}; +typedef std::vector profile_vector_t; + +template class ConfigBase {}; + +template<> class ConfigBase { + public: + ConfigBase(profile_t *profile) : profile_(profile) {} + + protected: + void* Array() { return const_cast(profile_->events); } + unsigned Count() const { return profile_->event_count; } + void Set(event_t* events, const unsigned& count) { + profile_->events = events; + profile_->event_count = count; + } + profile_t* profile_; +}; + +template<> class ConfigBase { + public: + ConfigBase(profile_t *profile) : profile_(profile) {} + + protected: + void* Array() { return const_cast(profile_->parameters); } + unsigned Count() const { return profile_->parameter_count; } + void Set(parameter_t* parameters, const unsigned& count) { + profile_->parameters = parameters; + profile_->parameter_count = count; + } + profile_t* profile_; +}; + +template +class Config : protected ConfigBase { + typedef ConfigBase Parent; + public: + Config(profile_t *profile) : Parent(profile) {} + void Insert(const Item& item) { + auto count = Parent::Count(); + count += 1; + Item* array = reinterpret_cast(realloc(const_cast(Parent::Array()), count * sizeof(Item))); + array[count - 1] = item; + Parent::Set(array, count); + } +}; + +class Profile { + public: + static const uint32_t LEGACY_SLOT_SIZE_PKT = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t); + + Profile(const util::AgentInfo* agent_info) : agent_info_(agent_info) { + profile_ = {}; + profile_.agent = agent_info->dev_id; + is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); + } + virtual ~Profile() { + hsa_memory_free(profile_.command_buffer.ptr); + hsa_memory_free(profile_.output_buffer.ptr); + free(const_cast(profile_.events)); + free(const_cast(profile_.parameters)); + } + + virtual void Insert(const profile_info_t& info) { + info_vector_.push_back(info.rinfo); + } + + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector) { + hsa_status_t status = HSA_STATUS_SUCCESS; + + if (!info_vector_.empty()) { + util::HsaRsrcFactory* rsrc = &util::HsaRsrcFactory::Instance(); + const pfn_t* api = rsrc->AqlProfileApi(); + packet_t start{}; + packet_t stop{}; + + // Check the profile buffer sizes + status = api->hsa_ven_amd_aqlprofile_start(&profile_, NULL); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start(NULL)"); + Allocate(rsrc); + // Generate start/stop profiling packets + status = api->hsa_ven_amd_aqlprofile_start(&profile_, &start); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start"); + status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); + // Set completion signals + hsa_signal_t dummy_signal{}; + dummy_signal.handle = 0; + start.completion_signal = dummy_signal; + hsa_signal_t post_signal; + status = hsa_signal_create(1, 0, NULL, &post_signal); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "hsa_signal_create"); + stop.completion_signal = post_signal; + completion_signal_ = post_signal; + + if (is_legacy_) { + const uint32_t start_index = start_vector.size(); + const uint32_t stop_index = stop_vector.size(); + + start_vector.insert(start_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + stop_vector.insert(stop_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(&start, reinterpret_cast(&start_vector[start_index])); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(&stop, reinterpret_cast(&stop_vector[stop_index])); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + } else { + start_vector.push_back(start); + stop_vector.push_back(stop); + } + } + + return status; + } + + void GetProfiles(profile_vector_t& vec) { + if (!info_vector_.empty()) { + vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + } + } + + protected: + virtual hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) = 0; + + const util::AgentInfo* const agent_info_; + bool is_legacy_; + profile_t profile_; + info_vector_t info_vector_; + hsa_signal_t completion_signal_; +}; + +class PmcProfile : public Profile { + public: + PmcProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + } + + void Insert(const profile_info_t& info) { + Profile::Insert(info); + Config(&profile_).Insert(*(info.event)); + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.ptr = rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + } +}; + +class SqttProfile : public Profile { + public: + static const uint32_t output_buffer_size = 0x2000000; // 32M + + SqttProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_SQTT; + } + + void Insert(const profile_info_t& info) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + + info.rinfo->data.result_bytes.size = output_buffer_size; + if (info.rinfo->data.result_bytes.copy) { + const uint32_t output_buffer_size64 = output_buffer_size / sizeof(uint64_t); + info.rinfo->data.result_bytes.ptr = calloc(output_buffer_size64, sizeof(uint64_t)); + memset(info.rinfo->data.result_bytes.ptr, 0, output_buffer_size); + } + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.output_buffer.size = output_buffer_size; + profile_.command_buffer.ptr = rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.ptr = rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + } +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/proxy_queue.cpp b/src/core/proxy_queue.cpp new file mode 100644 index 0000000000..166d84be9e --- /dev/null +++ b/src/core/proxy_queue.cpp @@ -0,0 +1,48 @@ +#include "core/proxy_queue.h" + +#ifdef ROCP_HSA_PROXY +#include "core/hsa_proxy_queue.h" +#endif +#include "core/simple_proxy_queue.h" + +namespace rocprofiler { +void ProxyQueue::HsaIntercept(HsaApiTable* table) { + if (rocp_type_) SimpleProxyQueue::HsaIntercept(table); +} + +ProxyQueue* ProxyQueue::Create( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue, + hsa_status_t* status) +{ + hsa_status_t suc = HSA_STATUS_ERROR; +#ifdef ROCP_HSA_PROXY + ProxyQueue* instance = (rocp_type_) ? (ProxyQueue*) new SimpleProxyQueue() : (ProxyQueue*) new HsaProxyQueue(); +#else + ProxyQueue* instance = new SimpleProxyQueue(); +#endif + if (instance != NULL) { + const auto suc = instance->Init(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); + if (suc != HSA_STATUS_SUCCESS) { + delete instance; + instance = NULL; + } + } + *status = suc; + return instance; +} + +hsa_status_t ProxyQueue::Destroy(const ProxyQueue* obj) { + auto suc = obj->Cleanup(); + delete obj; + return suc; +} + +bool ProxyQueue::rocp_type_ = false; +} // namespace rocprofiler diff --git a/src/core/proxy_queue.h b/src/core/proxy_queue.h new file mode 100644 index 0000000000..eb97402eb5 --- /dev/null +++ b/src/core/proxy_queue.h @@ -0,0 +1,70 @@ +#ifndef _SRC_CORE_PROXY_QUEUE_H +#define _SRC_CORE_PROXY_QUEUE_H + +#include +#include +#include +#include +#include + +#include "core/queue.h" +#include "core/types.h" + +struct HsaApiTable; + +namespace rocprofiler { +typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* packets, uint64_t count); +typedef void (*on_submit_cb_t)(const void* packet, uint64_t count, uint64_t que_idx, void* data, hsa_amd_queue_intercept_packet_writer writer); + +class ProxyQueue : public Queue { + public: + static void InitFactory() { +#ifdef ROCP_HSA_PROXY + const char* type = getenv("ROCP_PROXY_QUEUE"); + if (type != NULL) { + if (strncmp(type, "rocp", 4) == 0) rocp_type_ = true; + } +#else + rocp_type_ = true; +#endif + } + + static void HsaIntercept(HsaApiTable* table); + + static ProxyQueue* Create( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue, + hsa_status_t* status); + + static hsa_status_t Destroy(const ProxyQueue* obj); + + virtual hsa_status_t Init( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue + ) = 0; + virtual hsa_status_t Cleanup() const = 0; + virtual hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) = 0; + virtual void Submit(const packet_t* packet) = 0; + + protected: + virtual ~ProxyQueue() {}; + + private: + static bool rocp_type_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_PROXY_QUEUE_H diff --git a/src/core/queue.h b/src/core/queue.h new file mode 100644 index 0000000000..9d7393fdaf --- /dev/null +++ b/src/core/queue.h @@ -0,0 +1,20 @@ +#ifndef _SRC_CORE_QUEUE_H +#define _SRC_CORE_QUEUE_H + +#include "core/types.h" + +namespace rocprofiler { + +class Queue { + public: + Queue() {} + virtual ~Queue() {} + virtual void Submit(const packet_t* packet) = 0; + virtual void Submit(const packet_t* packet, const size_t& count) { + for (const packet_t* p = packet; p < packet + count; ++p) Submit(p); + } +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_QUEUE_H diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp new file mode 100644 index 0000000000..1206e59064 --- /dev/null +++ b/src/core/rocprofiler.cpp @@ -0,0 +1,251 @@ +#include "inc/rocprofiler.h" + +#include +#include +#include +#include + +#include "core/context.h" +#include "core/hsa_queue.h" +#include "core/intercept_queue.h" +#include "core/proxy_queue.h" +#include "core/simple_proxy_queue.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "util/logger.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +#define API_METHOD_PREFIX \ + hsa_status_t status = HSA_STATUS_SUCCESS; \ + try { + +#define API_METHOD_SUFFIX \ + } catch (std::exception& e) { \ + ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \ + status = rocprofiler::GetExcStatus(e); \ + } \ + return status; + +namespace rocprofiler { +decltype(hsa_queue_create)* hsa_queue_create_fn; +decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; +decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +#ifdef ROCP_HSA_PROXY +decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; +decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; +#endif + +::HsaApiTable* kHsaApiTable; + +void SaveHsaApi(::HsaApiTable* table) { + kHsaApiTable = table; + hsa_queue_create_fn = table->core_->hsa_queue_create_fn; + hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn; + hsa_signal_store_relaxed_fn = table->core_->hsa_signal_store_relaxed_fn; + hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn; +#ifdef ROCP_HSA_PROXY + hsa_amd_queue_intercept_create_fn = table->amd_ext_->hsa_amd_queue_intercept_create_fn; + hsa_amd_queue_intercept_register_fn = table->amd_ext_->hsa_amd_queue_intercept_register_fn; +#endif +} + +void RestoreHsaApi() { + ::HsaApiTable* table = kHsaApiTable; + table->core_->hsa_queue_create_fn = hsa_queue_create_fn; + table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn; + table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn; + table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn; + table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn; +#ifdef ROCP_HSA_PROXY + table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn; + table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; +#endif +} + +CONSTRUCTOR_API void constructor() { + util::Logger::Create(); + util::HsaRsrcFactory::Create(); +} + +DESTRUCTOR_API void destructor() { + util::HsaRsrcFactory::Destroy(); + util::Logger::Destroy(); +} + +hsa_status_t GetExcStatus(const std::exception& e) { + const util::exception* rocprofiler_exc_ptr = dynamic_cast(&e); + return (rocprofiler_exc_ptr) ? static_cast(rocprofiler_exc_ptr->status()) : HSA_STATUS_ERROR; +} + +util::Logger::mutex_t util::Logger::mutex_; +util::Logger* util::Logger::instance_ = NULL; + +} + +extern "C" { + +// Returns the last error message +PUBLIC_API hsa_status_t rocprofiler_error_string(const char** str) { + API_METHOD_PREFIX + *str = rocprofiler::util::Logger::LastMessage().c_str(); + API_METHOD_SUFFIX +} + +// Create new profiling context +PUBLIC_API hsa_status_t rocprofiler_open( + unsigned agent_id, + rocprofiler_info_t* info, + uint32_t info_count, + rocprofiler_t** handle, + uint32_t mode, + rocprofiler_properties_t* properties) +{ + API_METHOD_PREFIX + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info; + if (!hsa_rsrc->GetGpuAgentInfo(agent_id, &agent_info)) { + EXC_RAISING(HSA_STATUS_ERROR, "agent[" << agent_id << "] is not found"); + } + + rocprofiler::Queue* queue = NULL; + if (mode != 0) { + if (mode & ROCPROFILER_MODE_STANDALONE) { + if (mode & ROCPROFILER_MODE_CREATEQUEUE) { + if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) { + EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed"); + } + } + queue = new rocprofiler::HsaQueue(agent_info, properties->queue); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "invalid mode (" << mode << ")"); + } + } + + *handle = (void*) new rocprofiler::Context(agent_info, queue, info, info_count); + API_METHOD_SUFFIX +} + +// Delete profiling info +PUBLIC_API hsa_status_t rocprofiler_close(rocprofiler_t* handle) +{ + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + if (context) delete context; + API_METHOD_SUFFIX +} + +// Get profiling groups +PUBLIC_API hsa_status_t rocprofiler_get_groups(rocprofiler_t* handle, rocprofiler_group_t** group_array, uint32_t* group_count) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + const uint32_t count = context->GetGroupCount(); + rocprofiler_group_t* groups = (rocprofiler_group_t*) calloc(count, sizeof(rocprofiler_group_t)); + for (unsigned i = 0; i < count; ++i) groups[i] = context->GetGroupInfo(i); + *group_array = groups; + *group_count = count; + API_METHOD_SUFFIX +} + +// Start profiling +PUBLIC_API hsa_status_t rocprofiler_start(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Start(group_index); + API_METHOD_SUFFIX +} + +// Stop profiling +PUBLIC_API hsa_status_t rocprofiler_stop(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Stop(group_index); + API_METHOD_SUFFIX +} + +// Get profiling data +PUBLIC_API hsa_status_t rocprofiler_get_data(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->GetData(group_index); + API_METHOD_SUFFIX +} + +// Start profiling +PUBLIC_API hsa_status_t rocprofiler_group_start(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_start(group->context, group->index); + API_METHOD_SUFFIX +} + +// Stop profiling +PUBLIC_API hsa_status_t rocprofiler_group_stop(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_stop(group->context, group->index); + API_METHOD_SUFFIX +} + +// Get profiling data +PUBLIC_API hsa_status_t rocprofiler_get_group_data(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(group->context); + context->GetData(group->index); + API_METHOD_SUFFIX +} + +// Get metrics data +PUBLIC_API hsa_status_t rocprofiler_get_metrics_data(const rocprofiler_t* handle) { + API_METHOD_PREFIX + const rocprofiler::Context* context = reinterpret_cast(handle); + context->GetMetricsData(); + API_METHOD_SUFFIX +} + +// Set kernel dispatch observer +PUBLIC_API hsa_status_t rocprofiler_set_dispatch_observer(rocprofiler_callback_t callback, void* data) { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::SetDispatchCB(callback, data); + API_METHOD_SUFFIX +} + +// Set kernel dispatch observer +PUBLIC_API hsa_status_t rocprofiler_remove_dispatch_observer() { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::UnsetDispatchCB(); + API_METHOD_SUFFIX +} + +// Method for iterating the events output data +PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data(rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->IterateTraceData(callback, data); + API_METHOD_SUFFIX +} + +PUBLIC_API bool OnLoad( + HsaApiTable* table, + uint64_t runtime_version, + uint64_t failed_tool_count, + const char* const * failed_tool_names) { + rocprofiler::SaveHsaApi(table); + rocprofiler::ProxyQueue::InitFactory(); + rocprofiler::InterceptQueue::SetTool(getenv("ROCP_TOOL_LIB")); + // HSA intercepting + if (getenv("ROCP_HSA_INTERCEPT") != NULL) { + rocprofiler::InterceptQueue::HsaIntercept(table); + rocprofiler::ProxyQueue::HsaIntercept(table); + } + return true; +} + +PUBLIC_API void OnUnload() { + rocprofiler::RestoreHsaApi(); +} + +} // extern "C" diff --git a/src/core/simple_proxy_queue.cpp b/src/core/simple_proxy_queue.cpp new file mode 100644 index 0000000000..a4dadf4e5e --- /dev/null +++ b/src/core/simple_proxy_queue.cpp @@ -0,0 +1,11 @@ +#include "core/simple_proxy_queue.h" + +namespace rocprofiler { +void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) { + table->core_->hsa_signal_store_relaxed_fn = rocprofiler::SimpleProxyQueue::SignalStore; + table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::LoadIndex; + table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::StoreIndex; +} + +std::map SimpleProxyQueue::queue_map_; +} // namespace rocprofiler diff --git a/src/core/simple_proxy_queue.h b/src/core/simple_proxy_queue.h new file mode 100644 index 0000000000..9a9f6221c9 --- /dev/null +++ b/src/core/simple_proxy_queue.h @@ -0,0 +1,187 @@ +#ifndef _SRC_CORE_SIMPLE_PROXY_QUEUE_H +#define _SRC_CORE_SIMPLE_PROXY_QUEUE_H + +#include +#include +#include +#include + +#include "core/proxy_queue.h" +#include "core/types.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +extern decltype(hsa_queue_create)* hsa_queue_create_fn; +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +extern decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; +extern decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +typedef decltype(hsa_signal_t::handle) signal_handle_t; + + +class SimpleProxyQueue : public ProxyQueue { + public: + static void HsaIntercept(HsaApiTable* table); + + static void SignalStore( + hsa_signal_t signal, + hsa_signal_value_t que_idx) + { + auto it = queue_map_.find(signal.handle); + if (it != queue_map_.end()) { + SimpleProxyQueue* instance = it->second; + const uint64_t begin = instance->submit_index_; + const uint64_t end = que_idx + 1; + instance->submit_index_ = end; + for (uint64_t j = begin; j < end; ++j) { + // Submited packet + const uint32_t idx = j & instance->queue_mask_; + packet_t* packet = reinterpret_cast(instance->queue_->base_address) + idx; + if (instance->on_submit_cb_ != NULL) instance->on_submit_cb_(packet, 1, j, instance->on_submit_cb_data_, NULL); + else instance->Submit(packet); + } + } else { + hsa_signal_store_relaxed_fn(signal, que_idx); + } + } + + static uint64_t LoadIndex( + const hsa_queue_t *queue) + { + uint64_t index = 0; + auto it = queue_map_.find(queue->doorbell_signal.handle); + if (it != queue_map_.end()) { + SimpleProxyQueue* instance = it->second; + instance->mutex_.lock(); + index = instance->queue_index_; + } else { + index = hsa_queue_load_write_index_relaxed_fn(queue); + } + return index; + } + + static void StoreIndex( + const hsa_queue_t *queue, + uint64_t value) + { + auto it = queue_map_.find(queue->doorbell_signal.handle); + if (it != queue_map_.end()) { + SimpleProxyQueue* instance = it->second; + instance->queue_index_ = value; + instance->mutex_.unlock(); + } else { + hsa_queue_store_write_index_relaxed_fn(queue, value); + } + } + + hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) { + on_submit_cb_ = on_submit_cb; + on_submit_cb_data_ = data; + return HSA_STATUS_SUCCESS; + } + + void Submit(const packet_t* packet) { + // Compute the write index of queue and copy Aql packet into it + const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_); + // Increment the write index and ring the doorbell to submit the packet. + hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1); + + const uint32_t mask = queue_->size - 1; + const uint32_t idx = que_idx & mask; + + // Copy packet to the queue + const packet_word_t* src = reinterpret_cast(packet); + packet_word_t* dst = reinterpret_cast(base_address_ + idx); + for (unsigned i = 1; i < sizeof(packet_t) / sizeof(packet_word_t); ++i) { + dst[i] = src[i]; + } + + // To maintain global order to ensure the prior copy of the packet contents is made visible + // before the header is updated. + // With in-order CP it will wait until the first packet in the blob will be valid + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&dst[0]); + header_atomic_ptr->store(src[0], std::memory_order_release); + + // Doorbell signaling + hsa_signal_store_relaxed_fn(doorbell_signal_, que_idx); + } + + SimpleProxyQueue() : + agent_info_(NULL), + queue_(NULL), + base_address_(NULL), + doorbell_signal_({}), + queue_index_(0), + queue_mask_(0), + submit_index_(0), + on_submit_cb_(0), + on_submit_cb_data_(0) + {} + + ~SimpleProxyQueue() {} + + private: + hsa_status_t Init( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue) + { + auto status = Init(agent, size); + *queue = queue_; + return status; + } + + hsa_status_t Init(hsa_agent_t agent, uint32_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + if (agent_info_ != NULL) { + if (agent_info_->dev_type == HSA_DEVICE_TYPE_GPU) { + status = hsa_queue_create_fn(agent, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue_); + if (status == HSA_STATUS_SUCCESS) { + base_address_ = reinterpret_cast(queue_->base_address); + doorbell_signal_ = queue_->doorbell_signal; + data_array_ = calloc(size + 1, sizeof(packet_t)); + uintptr_t addr = (uintptr_t)data_array_; + queue_->base_address = (void*) ((addr + align_mask_) & ~align_mask_); + status = hsa_signal_create(1, 0, NULL, &(queue_->doorbell_signal)); + queue_mask_ = size - 1; + queue_map_[queue_->doorbell_signal.handle] = this; + } + } + } + return status; + } + + hsa_status_t Cleanup() const { + hsa_status_t status = HSA_STATUS_SUCCESS; + queue_->base_address = base_address_; + queue_->doorbell_signal = doorbell_signal_; + status = hsa_queue_destroy_fn(queue_); + free(data_array_); + return status; + } + + static std::map queue_map_; + const util::AgentInfo* agent_info_; + hsa_queue_t* queue_; + static const uintptr_t align_mask_ = sizeof(packet_t) - 1; + packet_t* base_address_; + hsa_signal_t doorbell_signal_; + uint64_t queue_index_; + uint64_t queue_mask_; + uint64_t submit_index_; + std::mutex mutex_; + on_submit_cb_t on_submit_cb_; + void* on_submit_cb_data_; + void* data_array_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_SIMPLE_PROXY_QUEUE_H diff --git a/src/core/types.h b/src/core/types.h new file mode 100644 index 0000000000..a179e5584f --- /dev/null +++ b/src/core/types.h @@ -0,0 +1,15 @@ +#ifndef SRC_CORE_TYPES_H_ +#define SRC_CORE_TYPES_H_ + +#include + +namespace rocprofiler { +typedef hsa_ven_amd_aqlprofile_1_00_pfn_t pfn_t; +typedef hsa_ven_amd_aqlprofile_event_t event_t; +typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t; +typedef hsa_ven_amd_aqlprofile_profile_t profile_t; +typedef hsa_ext_amd_aql_pm4_packet_t packet_t; +typedef uint32_t packet_word_t; +} // namespace rocprofiler + +#endif // SRC_CORE_TYPES_H_ diff --git a/src/util/exception.h b/src/util/exception.h new file mode 100644 index 0000000000..82b71d8557 --- /dev/null +++ b/src/util/exception.h @@ -0,0 +1,39 @@ +#ifndef SRC_UTIL_EXCEPTION_H_ +#define SRC_UTIL_EXCEPTION_H_ + +#include +#include +#include + +#include + +#define EXC_RAISING(error, stream) { \ + std::ostringstream oss; oss << __FUNCTION__ << "(), " << stream; \ + throw rocprofiler::util::exception(error, oss.str()); \ +} + +#define AQL_EXC_RAISING(error, stream) { \ + const char* error_string = NULL; \ + const rocprofiler::pfn_t* api = util::HsaRsrcFactory::Instance().AqlProfileApi(); \ + api->hsa_ven_amd_aqlprofile_error_string(&error_string); \ + EXC_RAISING(error, stream << ", " << error_string); \ +} + +namespace rocprofiler { +namespace util { + +class exception : public std::exception { + public: + explicit exception(const uint32_t &status, const std::string& msg) : status_(status), str_(msg) {} + const char* what() const throw() { return str_.c_str(); } + uint32_t status() const throw() { return status_; } + + protected: + const uint32_t status_; + const std::string str_; +}; + +} // namespace util +} // namespace rocprofiler + +#endif // SRC_UTIL_EXCEPTION_H_ diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp new file mode 100644 index 0000000000..18ce098ce2 --- /dev/null +++ b/src/util/hsa_rsrc_factory.cpp @@ -0,0 +1,392 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include "util/hsa_rsrc_factory.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace rocprofiler { +namespace util { + +// Callback function to get available in the system agents +hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + hsa_status_t status = HSA_STATUS_ERROR; + HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); + const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); + if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + return status; +} + +// Callback function to find and bind kernarg region of an agent +hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) { + hsa_region_global_flag_t flags; + hsa_region_segment_t segment_id; + + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); + if (segment_id != HSA_REGION_SEGMENT_GLOBAL) { + return HSA_STATUS_SUCCESS; + } + + AgentInfo* agent_info = (AgentInfo*)data; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { + agent_info->coarse_region = region; + } + + if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { + agent_info->kernarg_region = region; + } + + return HSA_STATUS_SUCCESS; +} + +// Constructor of the class +HsaRsrcFactory::HsaRsrcFactory() { + // Initialize the Hsa Runtime + hsa_status_t status = hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + + // Discover the set of Gpu devices available on the platform + status = hsa_iterate_agents(GetHsaAgentsCallback, this); + CHECK_STATUS("Error Calling hsa_iterate_agents", status); + + // Get AqlProfile API table + status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); + CHECK_STATUS("aqlprofile API table query failed", status); +} + +// Destructor of the class +HsaRsrcFactory::~HsaRsrcFactory() { + hsa_status_t status = hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); +} + +// Add system agent info +const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { + // Determine if device is a Gpu agent + hsa_status_t status; + AgentInfo* agent_info = NULL; + + hsa_device_type_t type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + CHECK_STATUS("Error Calling hsa_agent_get_info", status); + + if (type == HSA_DEVICE_TYPE_CPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + agent_info->dev_index = cpu_list_.size(); + cpu_list_.push_back(agent_info); + } + + if (type == HSA_DEVICE_TYPE_GPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + strncpy(agent_info->gfxip, agent_info->name, 4); + agent_info->gfxip[4] = '\0'; + agent_info->max_wave_size = 0; + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + agent_info->max_queue_size = 0; + hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + agent_info->profile = hsa_profile_t(108); + hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + + // Initialize memory regions to zero + agent_info->kernarg_region.handle = 0; + agent_info->coarse_region.handle = 0; + + // Find and Bind Memory regions of the Gpu agent + hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info); + + // Set GPU index + agent_info->dev_index = gpu_list_.size(); + gpu_list_.push_back(agent_info); + } + + if (agent_info) agent_map_[agent.handle] = agent_info; + + return agent_info; +} + +// Return systen agent info +const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { + const AgentInfo* agent_info = NULL; + auto it = agent_map_.find(agent.handle); + if (it != agent_map_.end()) { + agent_info = it->second; + } + return agent_info; +} + +// Get the count of Hsa Gpu Agents available on the platform +// +// @return uint32_t Number of Gpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } + +// Get the count of Hsa Cpu Agents available on the platform +// +// @return uint32_t Number of Cpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } + +// Get the AgentInfo handle of a Gpu device +// +// @param idx Gpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(gpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = gpu_list_[idx]; + + return true; +} + +// Get the AgentInfo handle of a Cpu device +// +// @param idx Cpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(cpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = cpu_list_[idx]; + return true; +} + +// Create a Queue object and return its handle. The queue object is expected +// to support user requested number of Aql dispatch packets. +// +// @param agent_info Gpu Agent on which to create a queue object +// +// @param num_Pkts Number of packets to be held by queue +// +// @param queue Output parameter updated with handle of queue object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { + hsa_status_t status; + status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, queue); + return (status == HSA_STATUS_SUCCESS); +} + +// Create a Signal object and return its handle. +// +// @param value Initial value of signal object +// +// @param signal Output parameter updated with handle of signal object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { + hsa_status_t status; + status = hsa_signal_create(value, 0, NULL, signal); + return (status == HSA_STATUS_SUCCESS); +} + +// Allocate memory for use by a kernel of specified size in specified +// agent's memory region. Currently supports Global segment whose Kernarg +// flag set. +// +// @param agent_info Agent from whose memory region to allocate +// +// @param size Size of memory in terms of bytes +// +// @return uint8_t* Pointer to buffer, null if allocation fails. +// +uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + + if (agent_info->coarse_region.handle != 0) { + // Allocate in local memory if it is available + status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW); + } + } else { + // Allocate in system memory if local memory is not available + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + } + + return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; +} + +// Allocate memory tp pass kernel parameters. +// +// @param agent_info Agent from whose memory region to allocate +// +// @param size Size of memory in terms of bytes +// +// @return uint8_t* Pointer to buffer, null if allocation fails. +// +uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + + uint8_t* buffer = NULL; + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; +} + +// Transfer data method +bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t length, + bool host_to_dev) { + hsa_status_t status; + status = hsa_memory_copy(dest_buff, src_buff, length); + return (status == HSA_STATUS_SUCCESS); +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// +// @param agent_info Gpu device for which to finalize +// +// @param brig_path File path of the Assembled Brig file +// +// @param kernel_name Name of the kernel to finalize +// +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + char* kernel_name, hsa_executable_symbol_t* code_desc) { + // Finalize the Hsail object into code object + hsa_status_t status; + hsa_code_object_t code_object; + + // Build the code object filename + std::string filename(brig_path); + std::clog << "Code object filename: " << filename << std::endl; + + // Open the file containing code object + std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate); + if (!codeStream) { + std::cerr << "Error: failed to load " << filename << std::endl; + assert(false); + return false; + } + + // Allocate memory to read in code object from file + size_t size = std::string::size_type(codeStream.tellg()); + char* codeBuff = (char*)AllocateSysMemory(agent_info, size); + if (!codeBuff) { + std::cerr << "Error: failed to allocate memory for code object." << std::endl; + assert(false); + return false; + } + + // Read the code object into allocated memory + codeStream.seekg(0, std::ios::beg); + std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), codeBuff); + + // De-Serialize the code object that has been read into memory + status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object); + if (status != HSA_STATUS_SUCCESS) { + std::cerr << "Failed to deserialize code object" << std::endl; + return false; + } + + // Create executable. + hsa_executable_t hsaExecutable; + status = + hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable); + CHECK_STATUS("Error in creating executable object", status); + + // Load code object. + status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, ""); + CHECK_STATUS("Error in loading executable object", status); + + // Freeze executable. + status = hsa_executable_freeze(hsaExecutable, ""); + CHECK_STATUS("Error in freezing executable object", status); + + // Get symbol handle. + hsa_executable_symbol_t kernelSymbol; + status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + CHECK_STATUS("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::clog << header << " :" << std::endl; + + const AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::clog << "> agent[" << idx << "] :" << std::endl; + std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; + } + return true; +} + +HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; + +} // namespace util +} // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h new file mode 100644 index 0000000000..8b4e2227a0 --- /dev/null +++ b/src/util/hsa_rsrc_factory.h @@ -0,0 +1,262 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ +#define SRC_UTIL_HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + exit(1); \ + } + +namespace rocprofiler { +namespace util { +static const unsigned MEM_PAGE_BYTES = 0x1000; +static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +struct AgentInfo { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // Memory region supporting kernel parameters + hsa_region_t coarse_region; + + // Memory region supporting kernel arguments + hsa_region_t kernarg_region; +}; + +class HsaRsrcFactory { + public: + typedef std::recursive_mutex mutex_t; + + static HsaRsrcFactory* Create() { return NULL; } + + static HsaRsrcFactory* CreateInstance() { + std::lock_guard lck(mutex_); + if (instance_ == NULL) { + instance_ = new HsaRsrcFactory(); + } + return instance_; + } + + static HsaRsrcFactory& Instance() { + CreateInstance(); + hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status); + return *instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_; + instance_ = NULL; + } + + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); + + // Get the count of Hsa Gpu Agents available on the platform + // + // @return uint32_t Number of Gpu agents on platform + // + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // + // @return uint32_t Number of Cpu agents on platform + // + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // + // @param idx Gpu Agent at specified index + // + // @param agent_info Output parameter updated with AgentInfo + // + // @return bool true if successful, false otherwise + // + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // + // @param idx Cpu Agent at specified index + // + // @param agent_info Output parameter updated with AgentInfo + // + // @return bool true if successful, false otherwise + // + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // + // @param agent_info Gpu Agent on which to create a queue object + // + // @param num_Pkts Number of packets to be held by queue + // + // @param queue Output parameter updated with handle of queue object + // + // @return bool true if successful, false otherwise + // + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. + // + // @param value Initial value of signal object + // + // @param signal Output parameter updated with handle of signal object + // + // @return bool true if successful, false otherwise + // + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate memory for use by a kernel of specified size in specified + // agent's memory region. Currently supports Global segment whose Kernarg + // flag set. + // + // @param agent_info Agent from whose memory region to allocate + // + // @param size Size of memory in terms of bytes + // + // @return uint8_t* Pointer to buffer, null if allocation fails. + // + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters. + // + // @param agent_info Agent from whose memory region to allocate + // + // @param size Size of memory in terms of bytes + // + // @return uint8_t* Pointer to buffer, null if allocation fails. + // + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Transfer data method + bool TransferData(void* dest_buff, void* src_buff, uint32_t length, bool host_to_dev); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // + // @param agent_info Gpu device for which to finalize + // + // @param brig_path File path of the Assembled Brig file + // + // @param kernel_name Name of the kernel to finalize + // + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // + // @return bool true if successful, false otherwise + // + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, char* kernel_name, + hsa_executable_symbol_t* code_desc); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + // Return AqlProfile API table + const hsa_ven_amd_aqlprofile_1_00_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } + + private: + // System agents iterating callback + static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); + + // Callback function to find and bind kernarg region of an agent + static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); + + // Constructor of the class. Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + HsaRsrcFactory(); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + + static HsaRsrcFactory* instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + + // System agents map + std::map agent_map_; + + // AqlProfile API table + hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_api_; +}; + +} // namespace util +} // namespace rocprofiler + +#endif // SRC_UTIL_HSA_RSRC_FACTORY_H_ diff --git a/src/util/logger.h b/src/util/logger.h new file mode 100644 index 0000000000..1688dd735f --- /dev/null +++ b/src/util/logger.h @@ -0,0 +1,163 @@ +#ifndef SRC_UTIL_LOGGER_H_ +#define SRC_UTIL_LOGGER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofiler { +namespace util { + +class Logger { + public: + typedef std::recursive_mutex mutex_t; + + template Logger& operator<<(const T& m) { + std::ostringstream oss; + oss << m; + if (!streaming_) + Log(oss.str()); + else + Put(oss.str()); + streaming_ = true; + return *this; + } + + typedef void (*manip_t)(); + Logger& operator<<(manip_t f) { + f(); + return *this; + } + + static void begm() { Instance().ResetStreaming(true); } + static void endl() { Instance().ResetStreaming(false); } + + static const std::string& LastMessage() { + Logger& logger = Instance(); + std::lock_guard lck(mutex_); + return logger.message_[GetTid()]; + } + + static Logger* Create() { + std::lock_guard lck(mutex_); + if (instance_ == NULL) instance_ = new Logger(); + return instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_ != NULL) delete instance_; + instance_ = NULL; + } + + static Logger& Instance() { + Create(); + return *instance_; + } + + private: + static uint32_t GetPid() { return syscall(__NR_getpid); } + static uint32_t GetTid() { return syscall(__NR_gettid); } + + Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { + const char* path = getenv("ROCPROFILER_LOG"); + if (path != NULL) { + file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + } + ResetStreaming(false); + } + + ~Logger() { + if (file_ != NULL) { + if (dirty_) Put("\n"); + fclose(file_); + } + } + + void ResetStreaming(const bool messaging) { + std::lock_guard lck(mutex_); + if (messaging) { + message_[GetTid()] = ""; + } + messaging_ = messaging; + streaming_ = messaging; + } + + void Put(const std::string& m) { + std::lock_guard lck(mutex_); + if (messaging_) { + message_[GetTid()] += m; + } + if (file_ != NULL) { + dirty_ = true; + flock(fileno(file_), LOCK_EX); + fprintf(file_, "%s", m.c_str()); + fflush(file_); + flock(fileno(file_), LOCK_UN); + } + } + + void Log(const std::string& m) { + const time_t rawtime = time(NULL); + tm tm_info; + localtime_r(&rawtime, &tm_info); + char tm_str[26]; + strftime(tm_str, 26, "%Y-%m-%d %H:%M:%S", &tm_info); + std::ostringstream oss; + oss << "<" << tm_str << std::dec << " pid" << GetPid() << " tid" << GetTid() << "> " << m; + Put(oss.str()); + } + + FILE* file_; + bool dirty_; + bool streaming_; + bool messaging_; + + static mutex_t mutex_; + static Logger* instance_; + std::map message_; +}; + +} // namespace util +} // namespace rocprofiler + +#define ERR_LOGGING(stream) { \ + rocprofiler::util::Logger::Instance() \ + << "error: " \ + << rocprofiler::util::Logger::begm \ + << stream \ + << rocprofiler::util::Logger::endl; \ +} + +#define INFO_LOGGING(stream) { \ + rocprofiler::util::Logger::Instance() \ + << "info: " \ + << rocprofiler::util::Logger::begm \ + << stream \ + << rocprofiler::util::Logger::endl; \ +} + +#ifdef DEBUG +# define DBG_LOGGING(stream) { \ + rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::begm \ + << "debug: \"" << stream << "\"" << \ + << " in " << __FUNCTION__ \ + << " at " << __FILE__ << " line " << __LINE__ \ + << rocprofiler::util::Logger::endl; \ +} +#endif + +#endif // SRC_UTIL_LOGGER_H_ diff --git a/src/xml/expr.h b/src/xml/expr.h new file mode 100644 index 0000000000..1f450d186e --- /dev/null +++ b/src/xml/expr.h @@ -0,0 +1,279 @@ +#ifndef _SRC_XML_EXPR_H +#define _SRC_XML_EXPR_H + +#include +#include +#include +#include +#include +#include + +namespace xml { +class exception_t : public std::exception { + public: + explicit exception_t(const std::string& msg) : str_(msg) {} + const char* what() const throw() { return str_.c_str(); } + protected: + const std::string str_; +}; + +typedef uint64_t args_t; +typedef std::map args_map_t; +class Expr; + +template +class any_cache_t { + public: + virtual ~any_cache_t() {} + virtual bool Lookup(const std::string& name, T& result) const = 0; +}; + +typedef any_cache_t expr_cache_t; +typedef any_cache_t args_cache_t; + +class bin_expr_t { + public: + static const bin_expr_t* CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, const char op); + static const bin_expr_t* CreateArg(Expr *obj, const std::string str); + + bin_expr_t() : arg1_(NULL), arg2_(NULL) {} + bin_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : arg1_(arg1), arg2_(arg2) {} + + virtual args_t Eval(const args_cache_t& args) const = 0; + virtual std::string Symbol() const = 0; + + std::string String() const { + std::string str; + if (arg1_) { str = "(" + arg1_->String() + " " + Symbol() + " " + arg2_->String() + ")"; } + else str = Symbol(); + return str; + } + + protected: + const bin_expr_t* arg1_; + const bin_expr_t* arg2_; +}; + +class Expr { + public: + explicit Expr(const std::string& expr, const expr_cache_t* cache) : + expr_(expr), + pos_(0), + sub_count_(0), + cache_(cache) + { + sub_vec_ = new std::vector; + var_vec_ = new std::vector; + tree_ = ParseExpr(); + } + + explicit Expr(const std::string& expr, const Expr* obj) : + expr_(expr), + pos_(0), + sub_count_(0), + cache_(obj->cache_), + sub_vec_(obj->sub_vec_), + var_vec_(obj->var_vec_) + { + sub_vec_->push_back(this); + tree_ = ParseExpr(); + if (!SubCheck()) throw exception_t("expr '" + expr_ + "', bad parenthesis count"); + } + + ~Expr() { + delete cache_; + for (auto it : *sub_vec_) delete it; + delete sub_vec_; + delete var_vec_; + } + + std::string GetStr() const { return expr_; } + const expr_cache_t* GetCache() const { return cache_; } + const bin_expr_t* GetTree() const { return tree_; } + + args_t Eval(const args_cache_t& args) const { return tree_->Eval(args); } + + std::string Lookup(const std::string& str) const { + std::string result; + if (cache_ && !(cache_->Lookup(str, result))) throw exception_t("expr '" + expr_ + "', lookup '" + str + "' failed"); + return result; + } + + void AddVar(const std::string& str) { + bool found = false; + for (std::string s : *var_vec_) if (s == str) found = true; + if (!found) var_vec_->push_back(str); + } + + const std::vector& GetVars() const { return *var_vec_; } + + std::string String() const { return tree_->String(); } + + private: + const bin_expr_t* ParseExpr() { + const bin_expr_t* expr = ParseArg(); + while (!IsEnd()) { + const char op = Symb(); + const bin_expr_t* second_arg = NULL; + if (IsSymb(')')) { + Next(); + SubClose(); + break; + } if (IsSymb('*') || IsSymb('/')) { + Next(); + second_arg = ParseArg(); + expr = bin_expr_t::CreateExpr(expr, second_arg, op); + } else if (IsSymb('+') || IsSymb('-')) { + Next(); + second_arg = ParseExpr(); + expr = bin_expr_t::CreateExpr(expr, second_arg, op); + break; + } else { + throw exception_t("expr '" + expr_ + "', bad operator '" + op + "'"); + } + } + return expr; + } + + const bin_expr_t* ParseArg() { + const bin_expr_t* arg = NULL; + if (IsSymb('(')) { + Next(); + SubOpen(); + arg = ParseExpr(); + } else { + const unsigned pos = FindOp(); + const std::string str = CutTill(pos); + arg = bin_expr_t::CreateArg(this, str); + if (arg == NULL) throw exception_t("expr '" + expr_ + "', bad argument '" + str + "'"); + } + return arg; + } + + char Symb() const { return Symb(pos_); } + char Symb(const unsigned ind) const { return expr_[ind]; } + bool IsEnd() const { return (pos_ >= expr_.length()); } + bool IsSymb(const char c) const { return IsSymb(pos_, c); } + bool IsSymb(const unsigned ind, const char c) const { return (expr_[ind] == c); } + void Next() { ++pos_; } + void SubOpen() { ++sub_count_; } + void SubClose() { --sub_count_; } + bool SubCheck() const { return (sub_count_ == 0); } + unsigned FindOp() const { + unsigned i = pos_; + while (i < expr_.length()) { + switch (Symb(i)) { + case '*': + case '/': + case '+': + case '-': + case '(': + case ')': + goto end; + } + ++i; + } + end: + return i; + } + std::string CutTill(const unsigned pos) { + const std::string str = (pos > pos_) ? expr_.substr(pos_, pos - pos_) : ""; + pos_ = pos; + return str; + } + + const std::string expr_; + unsigned pos_; + unsigned sub_count_; + const bin_expr_t* tree_; + const expr_cache_t* const cache_; + std::vector* sub_vec_; + std::vector* var_vec_; +}; + +class add_expr_t : public bin_expr_t { + public: + add_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) + arg2_->Eval(args)); } + std::string Symbol() const { return "+"; } +}; +class sub_expr_t : public bin_expr_t { + public: + sub_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) - arg2_->Eval(args)); } + std::string Symbol() const { return "-"; } +}; +class mul_expr_t : public bin_expr_t { + public: + mul_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) * arg2_->Eval(args)); } + std::string Symbol() const { return "*"; } +}; +class div_expr_t : public bin_expr_t { + public: + div_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) / arg2_->Eval(args)); } + std::string Symbol() const { return "/"; } +}; +class const_expr_t : public bin_expr_t { + public: + const_expr_t(const args_t value) : value_(value) {} + args_t Eval(const args_cache_t&) const { return value_; } + std::string Symbol() const { std::ostringstream os; os << value_; return os.str(); } + private: + const args_t value_; +}; +class var_expr_t : public bin_expr_t { + public: + var_expr_t(const std::string name) : name_(name) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + if (!args.Lookup(name_, result)) throw exception_t("expr arg lookup '" + name_ + "' failed"); + return result; + } + std::string Symbol() const { return name_; } + private: + const std::string name_; +}; + +inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, const char op) { + const bin_expr_t* expr = NULL; + switch (op) { + case '+': + expr = new add_expr_t(arg1, arg2); + break; + case '-': + expr = new sub_expr_t(arg1, arg2); + break; + case '*': + expr = new mul_expr_t(arg1, arg2); + break; + case '/': + expr = new div_expr_t(arg1, arg2); + break; + } + return expr; +} + +inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) { + const bin_expr_t* arg = NULL; + const unsigned i = strspn(str.c_str(), "1234567890"); + if (i == str.length()) { + const unsigned value = atoi(str.c_str()); + arg = new const_expr_t(value); + } else { + const std::string sub_expr = obj->Lookup(str); + if (sub_expr.empty()) { + arg = new var_expr_t(str); + obj->AddVar(str); + } else { + const Expr* expr = new Expr(sub_expr, obj); + arg = expr->GetTree(); + } + } + return arg; +} + +} // namespace xml + +#endif // _SRC_XML_EXPR_H diff --git a/src/xml/xml.h b/src/xml/xml.h new file mode 100644 index 0000000000..f1a8410ab9 --- /dev/null +++ b/src/xml/xml.h @@ -0,0 +1,221 @@ +#ifndef SRC_XML_XML_H_ +#define SRC_XML_XML_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace xml { + +class Xml { + public: + typedef std::vector token_t; + struct level_t { + std::string tag; + std::vector nodes; + std::map opts; + }; + typedef std::vector nodes_vec_t; + + enum { + DECL_STATE, + BODY_STATE + }; + + Xml(const char* file_name) : + file_name_(file_name), + file_line_(0), + data_size_(0), + index_(0), + state_(BODY_STATE), + level_(NULL), + comment_(false) + { + AddLevel("top"); + + fd_ = open(file_name, O_RDONLY); + if (fd_ == -1) { + std::cout << "XML file not found: " << file_name << std::endl; + return; + } + + token_t remainder; + while (1) { + token_t token = (remainder.size()) ? remainder : NextToken(); + remainder.clear(); + // End of file + if (token.size() == 0) break; + +// token_t token1 = token; +// token1.push_back('\0'); +// std::cout << "> " << &token1[0] << std::endl; + + switch(state_) { + case BODY_STATE: + if (token[0] == '<') { + bool node_begin = true; + unsigned ind = 1; + if (token[1] == '/') { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while (i < token.size()) { if (token[i] == '>') break; ++i; } + for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]); + + if (i == token.size()) { + if (node_begin) state_ = DECL_STATE; + else BadFormat(token); + token.push_back('\0'); + } else token[i] = '\0'; + + const char* tag = strdup(&token[ind]); + if (node_begin) { + AddLevel(tag); + } else { + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag))) { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } else BadFormat(token); + break; + case DECL_STATE: + if (token[0] == '>') { + state_ = BODY_STATE; + for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]); + continue; + } else { + token.push_back('\0'); + unsigned j = 0; + for (j = 0; j < token.size(); ++j) if (token[j] == '=') break; + if (j == token.size()) BadFormat(token); + token[j] = '\0'; + const char* key = &token[0]; + const char* value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "Wrong state: " << state_ << std::endl; + exit(1); + } + } + } + + std::vector GetNodes(std::string global_tag) { + return map_[global_tag]; + } + + void Print() const { + for(auto& elem : map_) { + for (auto node : elem.second) { + if (node->opts.size()) { + std::cout << elem.first << ":" << std::endl; + for (auto& opt : node->opts) { + std::cout << " " << opt.first << " = " << opt.second << std::endl; + } + } + } + } + } + + private: + bool LineEndCheck() { + bool found = false; + if (buffer_[index_] == '\n') { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } else if (comment_ || (buffer_[index_] == '#')) { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() { + token_t token; + + while (1) { + if (data_size_ == 0) { + data_size_ = read(fd_, buffer_, buf_size_); + if (data_size_ <= 0) break; + } + if (token.empty()) while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) { + ++index_; + } + while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) { + token.push_back(buffer_[index_++]); + } + if (index_ == data_size_) { + index_ = 0; + data_size_ = 0; + } else break; + } + + return token; + } + + void BadFormat(token_t token) { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" << &token[0] << "'" << std::endl; + exit(1); + } + + void AddLevel(const std::string& tag) { + level_t* level = new level_t; + level->tag = tag; + if (level_) { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag; + for (level_t* level : stack_) { global_tag += level->tag + "."; } + global_tag += tag; + map_[global_tag].push_back(level_); + } + + void UpLevel() { + level_ = stack_.back(); + stack_.pop_back(); + } + + std::string CurrentLevel() const { + return level_->tag; + } + + void AddOption(const std::string& key, const std::string& value) { + level_->opts[key] = value; + } + + const char* file_name_; + unsigned file_line_; + int fd_; + static const unsigned buf_size_ = 256; + char buffer_[buf_size_]; + unsigned data_size_; + unsigned index_; + unsigned state_; + level_t* level_; + std::vector stack_; + std::map map_; + bool comment_; +}; + +} // namespace xml + +#endif // SRC_XML_XML_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000000..8dc2c57efa --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,41 @@ +cmake_minimum_required ( VERSION 3.5.0 ) +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +set ( EXE_NAME "ctrl" ) + +if ( NOT DEFINED TEST_DIR ) + set ( TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) + project ( ${EXE_NAME} ) + ## Set build environment + include ( env ) + set ( ROCPROFILER_TARGET "rocprofiler64" ) +endif () + +## Util sources +file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) + +## Test control sources +set ( CTRL_SRC + ${TEST_DIR}/ctrl/test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + +## Test kernels sources +set ( TEST_NAME simple_convolution ) +set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) + +## Building test executable +add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} ) +target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt atomic ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/*.xml ${PROJECT_BINARY_DIR}" ) + +## Build test library +set ( TEST_LIB "tool" ) +set ( TEST_LIB_SRC ${TEST_DIR}/ctrl/tool.cpp ) +add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) +target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt atomic ) + diff --git a/test/ctrl/run_kernel.h b/test/ctrl/run_kernel.h new file mode 100644 index 0000000000..4eed641056 --- /dev/null +++ b/test/ctrl/run_kernel.h @@ -0,0 +1,85 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_RUN_KERNEL_H_ +#define TEST_CTRL_RUN_KERNEL_H_ + +#include "ctrl/test_hsa.h" +#include "util/test_assert.h" + +template bool RunKernel(int argc, char* argv[]) { + bool ret_val = false; + + // Create test kernel object + Kernel test_kernel; + TestAql* test_aql = new TestHsa(&test_kernel); + test_aql = new Test(test_aql); + TEST_ASSERT(test_aql != NULL); + if (test_aql == NULL) return 1; + + // Initialization of Hsa Runtime + ret_val = test_aql->Initialize(argc, argv); + if (ret_val == false) { + std::cerr << "Error in the test initialization" << std::endl; + // TEST_ASSERT(ret_val); + return false; + } + + // Setup Hsa resources needed for execution + ret_val = test_aql->Setup(); + if (ret_val == false) { + std::cerr << "Error in creating hsa resources" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Run test kernel + ret_val = test_aql->Run(); + if (ret_val == false) { + std::cerr << "Error in running the test kernel" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Verify the results of the execution + ret_val = test_aql->VerifyResults(); + if (ret_val) { + std::clog << "Test : Passed" << std::endl; + } else { + std::clog << "Test : Failed" << std::endl; + } + + // Print time taken by sample + test_aql->PrintTime(); + + test_aql->Cleanup(); + delete test_aql; + + return ret_val; +} + +#endif // TEST_CTRL_RUN_KERNEL_H_ diff --git a/test/ctrl/standalone_test.cpp b/test/ctrl/standalone_test.cpp new file mode 100644 index 0000000000..5f3d9147e6 --- /dev/null +++ b/test/ctrl/standalone_test.cpp @@ -0,0 +1,126 @@ +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +int main(int argc, char** argv) { + bool ret_val = false; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // Profiling context + rocprofiler_t* context = NULL; + // Profiling properties + rocprofiler_properties_t properties; + // Number of context invocation + uint32_t invocation = 0; + +#if 0 + // Profiling info objects + const unsigned info_count = 1; + rocprofiler_info_t info[info_count]; + // PMC events + memset(info, 0, sizeof(info)); + info[0].type = ROCPROFILER_TYPE_METRIC; + info[0].name = "SQ_WAVES"; +#else + // Profiling info objects + const unsigned info_count = 3; + rocprofiler_info_t info[info_count]; + // PMC events + memset(info, 0, sizeof(info)); + info[0].type = ROCPROFILER_TYPE_METRIC; + info[0].name = "SQ_WAVES"; + info[1].type = ROCPROFILER_TYPE_METRIC; + info[1].name = "SQ_ITEMS"; + // Tracing parameters + const unsigned parameter_count = 2; + rocprofiler_parameter_t parameters[parameter_count]; + info[2].name = "THREAD_TRACE"; + info[2].type = ROCPROFILER_TYPE_TRACE; + info[2].parameters = parameters; + info[2].parameter_count = parameter_count; + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters[0].value = 0; + parameters[1].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters[1].value = 0; +#endif + + // Creating profiling context + properties = {}; + properties.queue_depth = 128; + status = rocprofiler_open(TestHsa::HsaAgentId(), info, info_count, &context, ROCPROFILER_MODE_STANDALONE|ROCPROFILER_MODE_OWNQUEUE, &properties); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + TestHsa::SetQueue(properties.queue); + + // Adding dispatch observer + status = rocprofiler_dispatch_observer(rocprofiler_dispatch_callback, context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + // Querying the number of context invocation + status = rocprofiler_invocation(context, &invocation); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + // Dispatching profiled kernel n-times to collect all counter groups data + unsigned n = 0; + while(1) { + std::cout << "> " << n << "/" << invocation << std::endl; +#if 0 + status = rocprofiler_start(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + ret_val = RunKernel(argc, argv); + status = rocprofiler_stop(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); +#else + ret_val = RunKernel(argc, argv); +#endif + status = rocprofiler_sample(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + for (rocprofiler_info_t* p = info; p < info + info_count; ++p) { + std::cout << (p - info) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_INT64: + std::cout << std::dec << " result64 (" << p->data.result64 << ")" << std::endl; + break; + case ROCPROFILER_BYTES: { + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + uint64_t size = 0; + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + size = *reinterpret_cast(ptr); + const char* data = ptr + sizeof(size); + std::cout << std::endl; + std::cout << std::hex << " data (" << (void*)data << ")" << std::endl; + std::cout << std::dec << " size (" << size << ")" << std::endl; + ptr = data + size; + } + break; + } + default: + std::cout << "result kind (" << p->data.kind << ")" << std::endl; + TEST_ASSERT(false); + } + } + + ++n; + if (n < invocation) { + status = rocprofiler_next(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + continue; + } + break; + } + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + return (ret_val) ? 0 : 1; +} diff --git a/test/ctrl/test.cpp b/test/ctrl/test.cpp new file mode 100644 index 0000000000..c9e8e71f23 --- /dev/null +++ b/test/ctrl/test.cpp @@ -0,0 +1,14 @@ +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "simple_convolution/simple_convolution.h" + +int main(int argc, char** argv) { + TestHsa::HsaInstantiate(); + for (int i = 0; i < 3; ++i) RunKernel(argc, argv); + TestHsa::HsaShutdown(); + return 0; +} diff --git a/test/ctrl/test_aql.h b/test/ctrl/test_aql.h new file mode 100644 index 0000000000..38909e6acd --- /dev/null +++ b/test/ctrl/test_aql.h @@ -0,0 +1,80 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_AQL_H_ +#define TEST_CTRL_TEST_AQL_H_ + +#include +#include + +#include "util/hsa_rsrc_factory.h" + +// Test AQL interface +class TestAql { + public: + explicit TestAql(TestAql* t = 0) : test_(t) {} + virtual ~TestAql() { if (test_) delete test_; } + + TestAql* Test() { return test_; } + virtual AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; } + virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; } + virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? test_->GetRsrcFactory() : 0; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + virtual bool Initialize(int argc, char** argv) { + return (test_) ? test_->Initialize(argc, argv) : true; + } + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + virtual bool Setup() { return (test_) ? test_->Setup() : true; } + + // Run the kernel + // @return bool true on success and false on failure + virtual bool Run() { return (test_) ? test_->Run() : true; } + + // Verify results + // @return bool true on success and false on failure + virtual bool VerifyResults() { return (test_) ? test_->VerifyResults() : true; } + + // Print to console the time taken to execute kernel + virtual void PrintTime() { + if (test_) test_->PrintTime(); + } + + // Release resources e.g. memory allocations + // @return bool true on success and false on failure + virtual bool Cleanup() { return (test_) ? test_->Cleanup() : true; } + + private: + TestAql* const test_; +}; + +#endif // TEST_CTRL_TEST_AQL_H_ diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp new file mode 100644 index 0000000000..39d606f405 --- /dev/null +++ b/test/ctrl/test_hsa.cpp @@ -0,0 +1,252 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#include "ctrl/test_hsa.h" + +#include + +#include "util/test_assert.h" +#include "util/helper_funcs.h" +#include "util/hsa_rsrc_factory.h" + +HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; +AgentInfo* TestHsa::agent_info_ = NULL; +hsa_queue_t* TestHsa::hsa_queue_ = NULL; +uint32_t TestHsa::agent_id_ = 0; + +HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { + // Instantiate an instance of Hsa Resources Factory + if (hsa_rsrc_ == NULL) { + agent_id_ = agent_ind; + + hsa_rsrc_ = HsaRsrcFactory::Create(); + + // Print properties of the agents + hsa_rsrc_->PrintGpuAgents("> GPU agents"); + + // Create an instance of Gpu agent + if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) { + agent_info_ = NULL; + std::cerr << "> error: agent[" << agent_ind << "] is not found" << std::endl; + return NULL; + } + std::clog << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + if (hsa_queue_ == NULL) { + uint32_t num_pkts = 128; + if(hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { + hsa_queue_ = NULL; + } + } + } + return hsa_rsrc_; +} + +void TestHsa::HsaShutdown() { if (hsa_rsrc_) hsa_rsrc_->Destroy(); } + +bool TestHsa::Initialize(int arg_cnt, char** arg_list) { + std::clog << "TestHsa::Initialize :" << std::endl; + + // Instantiate a Timer object + setup_timer_idx_ = hsa_timer_.CreateTimer(); + dispatch_timer_idx_ = hsa_timer_.CreateTimer(); + + hsa_rsrc_ = HsaInstantiate(agent_id_); + if (hsa_rsrc_ == NULL) { + TEST_ASSERT(false); + return false; + } + + // Obtain handle of signal + hsa_rsrc_->CreateSignal(1, &hsa_signal_); + + // Obtain the code object file name + std::string agentName(agent_info_->name); + if (agentName.compare(0, 4, "gfx8") == 0) { + brig_path_obj_.append("gfx8"); + } else if (agentName.compare(0, 4, "gfx9") == 0) { + brig_path_obj_.append("gfx9"); + } else { + TEST_ASSERT(false); + return false; + } + brig_path_obj_.append("_" + name_ + ".hsaco"); + + return true; +} + +bool TestHsa::Setup() { + std::clog << "TestHsa::setup :" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(setup_timer_idx_); + + mem_map_t& mem_map = test_->GetMemMap(); + for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { + mem_descr_t& des = it->second; + void* ptr = (des.local) ? hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size) + : hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + des.ptr = ptr; + TEST_ASSERT(ptr != NULL); + if (ptr == NULL) return false; + } + test_->Init(); + + // Load and Finalize Kernel Code Descriptor + char* brig_path = (char*)brig_path_obj_.c_str(); + const bool ret_val = + hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, strdup(name_.c_str()), &kernel_code_desc_); + if (ret_val == false) { + std::cerr << "Error in loading and finalizing Kernel" << std::endl; + return ret_val; + } + + // Stop the timer object + hsa_timer_.StopTimer(setup_timer_idx_); + setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_); + total_time_taken_ = setup_time_taken_; + + return true; +} + +bool TestHsa::Run() { + std::clog << "TestHsa::run :" << std::endl; + + const uint32_t work_group_size = 64; + const uint32_t work_grid_size = test_->GetGridSize(); + uint32_t group_segment_size = 0; + uint32_t private_segment_size = 0; + const size_t kernarg_segment_size = test_->GetKernargSize(); + uint64_t code_handle = 0; + + // Retrieve the amount of group memory needed + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + + // Retrieve the amount of private memory needed + hsa_executable_symbol_get_info(kernel_code_desc_, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size); + + // Check the kernel args size + size_t size_info = 0; + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info); + TEST_ASSERT(kernarg_segment_size == size_info); + if (kernarg_segment_size != size_info) return false; + + // Retrieve handle of the code block + hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &code_handle); + + // Initialize the dispatch packet. + hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + // Set the packet's type, barrier bit, acquire and release fences + aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; + // Populate Aql packet with default values + aql.setup = 1; + aql.grid_size_x = work_grid_size; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.workgroup_size_x = work_group_size; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + // Bind the kernel code descriptor and arguments + aql.kernel_object = code_handle; + aql.kernarg_address = test_->GetKernargPtr(); + aql.group_segment_size = group_segment_size; + aql.private_segment_size = private_segment_size; + // Initialize Aql packet with handle of signal + aql.completion_signal = hsa_signal_; + + // Compute the write index of queue and copy Aql packet into it + const uint64_t que_idx = hsa_queue_load_write_index_relaxed(hsa_queue_); + const uint32_t mask = hsa_queue_->size - 1; + + std::clog << "> Executing kernel: \"" << name_ << "\"" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(dispatch_timer_idx_); + + // Disable packet so that submission to HW is complete + const auto header = aql.header; + aql.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; + + // Copy Aql packet into queue buffer + ((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask] = aql; + + // After AQL packet is fully copied into queue buffer + // update packet header from invalid state to valid state + std::atomic_thread_fence(std::memory_order_release); + ((hsa_kernel_dispatch_packet_t*)(hsa_queue_->base_address))[que_idx & mask].header = header; + + // Increment the write index and ring the doorbell to dispatch the kernel. + hsa_queue_store_write_index_relaxed(hsa_queue_, (que_idx + 1)); + hsa_signal_store_relaxed(hsa_queue_->doorbell_signal, que_idx); + + std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl; + + // Wait on the dispatch signal until the kernel is finished. + // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling + hsa_signal_wait_acquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_BLOCKED); + + // Stop the timer object + hsa_timer_.StopTimer(dispatch_timer_idx_); + dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_); + total_time_taken_ += dispatch_time_taken_; + + // Copy kernel buffers from local memory into system memory + hsa_rsrc_->TransferData(test_->GetOutputPtr(), test_->GetLocalPtr(), test_->GetOutputSize(), + false); + test_->PrintOutput(); + + return true; +} + +bool TestHsa::VerifyResults() { + // Compare the results and see if they match + const void* const refout_ptr = test_->GetRefoutPtr(); + const int32_t cmp_val = + (refout_ptr != NULL) ? memcmp(test_->GetOutputPtr(), refout_ptr, test_->GetOutputSize()) : 0; + return (cmp_val == 0); +} + +void TestHsa::PrintTime() { + std::clog << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_ + << std::endl; + std::clog << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_ + << std::endl; + std::clog << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_ + << std::endl; +} + +bool TestHsa::Cleanup() { return true; } diff --git a/test/ctrl/test_hsa.h b/test/ctrl/test_hsa.h new file mode 100644 index 0000000000..ceb33b4ca9 --- /dev/null +++ b/test/ctrl/test_hsa.h @@ -0,0 +1,125 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_HSA_H_ +#define TEST_CTRL_TEST_HSA_H_ + +#include "ctrl/test_aql.h" +#include "ctrl/test_kernel.h" +#include "util/hsa_rsrc_factory.h" +#include "util/perf_timer.h" + +// Class implements HSA test +class TestHsa : public TestAql { + public: + // Instantiate HSA resources + static HsaRsrcFactory* HsaInstantiate(const uint32_t agent_ind = agent_id_); + static void HsaShutdown(); + static void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } + static uint32_t HsaAgentId() { return agent_id_; } + + // Constructor + explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()) { + total_time_taken_ = 0; + setup_time_taken_ = 0; + dispatch_time_taken_ = 0; + } + + // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + AgentInfo* GetAgentInfo() { return agent_info_; } + hsa_queue_t* GetQueue() { return hsa_queue_; } + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + bool Initialize(int argc, char** argv); + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + bool Setup(); + + // Run the BinarySearch kernel + // @return bool true on success and false on failure + bool Run(); + + // Verify against reference implementation + // @return bool true on success and false on failure + bool VerifyResults(); + + // Print to console the time taken to execute kernel + void PrintTime(); + + // Release resources e.g. memory allocations + // @return bool true on success and false on failure + bool Cleanup(); + + private: + typedef TestKernel::mem_descr_t mem_descr_t; + typedef TestKernel::mem_map_t mem_map_t; + typedef TestKernel::mem_it_t mem_it_t; + + // Test object + TestKernel* test_; + + // Path of Brig file + std::string brig_path_obj_; + + // Used to track time taken to run the sample + double total_time_taken_; + double setup_time_taken_; + double dispatch_time_taken_; + + // Handle of signal + hsa_signal_t hsa_signal_; + + // Handle of Kernel Code Descriptor + hsa_executable_symbol_t kernel_code_desc_; + + // Instance of timer object + uint32_t setup_timer_idx_; + uint32_t dispatch_timer_idx_; + PerfTimer hsa_timer_; + + // Instance of Hsa Resources Factory + static HsaRsrcFactory* hsa_rsrc_; + + // GPU id + static uint32_t agent_id_; + + // Handle to an Hsa Gpu Agent + static AgentInfo* agent_info_; + + // Handle to an Hsa Queue + static hsa_queue_t* hsa_queue_; + + // Test kernel name + std::string name_; +}; + +#endif // TEST_CTRL_TEST_HSA_H_ diff --git a/test/ctrl/test_kernel.h b/test/ctrl/test_kernel.h new file mode 100644 index 0000000000..5427ba3144 --- /dev/null +++ b/test/ctrl/test_kernel.h @@ -0,0 +1,107 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_KERNEL_H_ +#define TEST_CTRL_TEST_KERNEL_H_ + +#include +#include + +// Class implements kernel test +class TestKernel { + public: + // Memory descriptors IDs + enum { INPUT_DES_ID, OUTPUT_DES_ID, LOCAL_DES_ID, MASK_DES_ID, KERNARG_DES_ID, REFOUT_DES_ID }; + + // Memory descriptors vector declaration + struct mem_descr_t { + void* ptr; + uint32_t size; + bool local; + }; + + // Memory map declaration + typedef std::map mem_map_t; + typedef mem_map_t::iterator mem_it_t; + typedef mem_map_t::const_iterator mem_const_it_t; + + virtual ~TestKernel() {} + + // Initialize method + virtual void Init() = 0; + + // Return kernel memory map + mem_map_t& GetMemMap() { return mem_map_; } + + // Return NULL descriptor + static mem_descr_t NullDescriptor() { return {NULL, 0, 0}; } + + // Methods to get the kernel attributes + void* GetKernargPtr() const { return GetDescr(KERNARG_DES_ID).ptr; } + uint32_t GetKernargSize() const { return GetDescr(KERNARG_DES_ID).size; } + void* GetOutputPtr() const { return GetDescr(OUTPUT_DES_ID).ptr; } + uint32_t GetOutputSize() const { return GetDescr(OUTPUT_DES_ID).size; } + void* GetLocalPtr() const { return GetDescr(LOCAL_DES_ID).ptr; } + void* GetRefoutPtr() const { return GetDescr(REFOUT_DES_ID).ptr; } + virtual uint32_t GetGridSize() const = 0; + + // Print output + virtual void PrintOutput() const = 0; + + // Return name + virtual std::string Name() const = 0; + + protected: + // Set system memory descriptor + bool SetSysDescr(const uint32_t& id, const uint32_t& size) { + return SetMemDescr(id, size, false); + } + + // Set local memory descriptor + bool SetLocalDescr(const uint32_t& id, const uint32_t& size) { + return SetMemDescr(id, size, true); + } + + // Get memory descriptor + mem_descr_t GetDescr(const uint32_t& id) const { + mem_const_it_t it = mem_map_.find(id); + return (it != mem_map_.end()) ? it->second : NullDescriptor(); + } + + private: + // Set memory descriptor + bool SetMemDescr(const uint32_t& id, const uint32_t& size, const bool& local) { + const mem_descr_t des = {NULL, size, local}; + auto ret = mem_map_.insert(mem_map_t::value_type(id, des)); + return ret.second; + } + + // Kernel memory map object + mem_map_t mem_map_; +}; + +#endif // TEST_CTRL_TEST_KERNEL_H_ diff --git a/test/ctrl/test_pgen.h b/test/ctrl/test_pgen.h new file mode 100644 index 0000000000..36e36bd2ef --- /dev/null +++ b/test/ctrl/test_pgen.h @@ -0,0 +1,45 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_PGEN_H_ +#define TEST_CTRL_TEST_PGEN_H_ + +#include "ctrl/test_pmgr.h" + +// SimpleConvolution: Class implements OpenCL SimpleConvolution sample +class TestPGen : public TestPMgr { + protected: + typedef hsa_ext_amd_aql_pm4_packet_t packet_t; + + packet_t* PrePacket() { return reinterpret_cast(&pre_packet_); } + packet_t* PostPacket() { return reinterpret_cast(&post_packet_); } + + public: + explicit TestPGen(TestAql* t) : TestPMgr(t) {} +}; + +#endif // TEST_CTRL_TEST_PGEN_H_ diff --git a/test/ctrl/test_pgen_rocp.h b/test/ctrl/test_pgen_rocp.h new file mode 100644 index 0000000000..06ed2edfd7 --- /dev/null +++ b/test/ctrl/test_pgen_rocp.h @@ -0,0 +1,78 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_PGEN_ROCP_H_ +#define TEST_CTRL_TEST_PGEN_ROCP_H_ + +#include +#include + +#include "ctrl/test_pgen.h" +#include "util/test_assert.h" + +hsa_status_t TestPGenRocpCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* callback_data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + typedef std::vector passed_data_t; + reinterpret_cast(callback_data)->push_back(*info_data); + return status; +} + +// Class implements PMC profiling +class TestPGenRocp : public TestPGen { + public: + explicit TestPGenRocp(TestAql* t) : TestPGen(t) { std::clog << "Test: PGen ROCP" << std::endl; } + + bool Initialize(int /*arg_cnt*/, char** /*arg_list*/) { + status = rocprofiler_on_dispatch(&profile_, PrePacket(), PostPacket()); + TEST_STATUS(status != HSA_STATUS_SUCCESS); + return (status == HSA_STATUS_SUCCESS); + } + + private: + bool BuildPackets() { return true; } + + bool DumpData() { + std::clog << "TestPGenRocp::DumpData :" << std::endl; + + typedef std::vector callback_data_t; + + callback_data_t data; + api_.hsa_ven_amd_aqlprofile_iterate_data(&profile_, TestPGenRocpCallback, &data); + for (callback_data_t::iterator it = data.begin(); it != data.end(); ++it) { + std::cout << std::dec << "event(block(" << it->pmc_data.event.block_name << "_" + << it->pmc_data.event.block_index << "), id(" << it->pmc_data.event.counter_id + << ")), sample(" << it->sample_id << "), result(" << it->pmc_data.result << ")" + << std::endl; + } + + return true; + } +}; + +#endif // TEST_CTRL_TEST_PGEN_ROCP_H_ diff --git a/test/ctrl/test_pmgr.cpp b/test/ctrl/test_pmgr.cpp new file mode 100644 index 0000000000..87fba5676f --- /dev/null +++ b/test/ctrl/test_pmgr.cpp @@ -0,0 +1,144 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#include "ctrl/test_pmgr.h" + +#include + +#include "ctrl/test_assert.h" + +bool TestPMgr::AddPacketGfx9(const packet_t* packet) { + packet_t aql_packet = *packet; + + // Compute the write index of queue and copy Aql packet into it + uint64_t que_idx = hsa_queue_load_write_index_relaxed(GetQueue()); + const uint32_t mask = GetQueue()->size - 1; + packet_t* slot = (reinterpret_cast(GetQueue()->base_address)) + (que_idx & mask); + + // Disable packet so that submission to HW is complete + const auto header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + aql_packet.header &= (~((1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1)) << HSA_PACKET_HEADER_TYPE; + aql_packet.header |= HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; + + // Copy Aql packet into queue buffer + *slot = aql_packet; + // After AQL packet is fully copied into queue buffer + // update packet header from invalid state to valid state + auto header_atomic_ptr = + reinterpret_cast*>(&slot->header); + header_atomic_ptr->store(header, std::memory_order_release); + + // Increment the write index and ring the doorbell to dispatch the kernel. + hsa_queue_store_write_index_relaxed(GetQueue(), (que_idx + 1)); + hsa_signal_store_relaxed(GetQueue()->doorbell_signal, que_idx); + + return true; +} + +bool TestPMgr::AddPacketGfx8(const packet_t* packet) { + // Create legacy devices PM4 data + const hsa_ext_amd_aql_pm4_packet_t* aql_packet = (const hsa_ext_amd_aql_pm4_packet_t*)packet; + slot_pm4_t data; + api_.hsa_ven_amd_aqlprofile_legacy_get_pm4(aql_packet, reinterpret_cast(data.words)); + + // Compute the write index of queue and copy Aql packet into it + uint64_t que_idx = hsa_queue_load_write_index_relaxed(GetQueue()); + const uint32_t mask = GetQueue()->size - 1; + + // Copy Aql/Pm4 blob into queue buffer + packet_t* ptr = (reinterpret_cast(GetQueue()->base_address)) + (que_idx & mask); + slot_pm4_t* slot = reinterpret_cast(ptr); + for (unsigned i = 1; i < SLOT_PM4_SIZE_DW; ++i) { + slot->words[i] = data.words[i]; + } + // To maintain global order to ensure the prior copy of the packet contents is made visible + // before the header is updated. + // With in-order CP it will wait until the first packet in the blob will be valid + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&slot->words[0]); + header_atomic_ptr->store(data.words[0], std::memory_order_release); + + // Increment the write index and ring the doorbell to dispatch the kernel. + que_idx += SLOT_PM4_SIZE_AQLP - 1; + hsa_queue_store_write_index_relaxed(GetQueue(), (que_idx + 1)); + hsa_signal_store_relaxed(GetQueue()->doorbell_signal, que_idx); + + return true; +} + +bool TestPMgr::AddPacket(const packet_t* packet) { + const char* agent_name = GetAgentInfo()->name; + return (strncmp(agent_name, "gfx8", 4) == 0) ? AddPacketGfx8(packet) : AddPacketGfx9(packet); +} + +bool TestPMgr::Run() { + // Build Aql Pkts + const bool active = BuildPackets(); + if (active) { + // Submit Pre-Dispatch Aql packet + AddPacket(&pre_packet_); + } + + Test()->Run(); + + if (active) { + // Set post packet completion signal + post_packet_.completion_signal = post_signal_; + + // Submit Post-Dispatch Aql packet + AddPacket(&post_packet_); + + // Wait for Post-Dispatch packet to complete + hsa_signal_wait_acquire(post_signal_, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, + HSA_WAIT_STATE_BLOCKED); + + // Dumping profiling data + DumpData(); + } + + return true; +} + +bool TestPMgr::Initialize(int argc, char** argv) { + TestAql::Initialize(argc, argv); + + hsa_status_t status = HSA_STATUS_ERROR; + status = hsa_signal_create(1, 0, NULL, &post_signal_); + TEST_ASSERT(status == HSA_STATUS_SUCCESS); + status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &api_); + TEST_ASSERT(status == HSA_STATUS_SUCCESS); + + return true; +} + +TestPMgr::TestPMgr(TestAql* t) : TestAql(t), api_({0}) { + memset(&pre_packet_, 0, sizeof(pre_packet_)); + memset(&post_packet_, 0, sizeof(post_packet_)); + dummy_signal_.handle = 0; + post_signal_ = dummy_signal_; + memset(&api_, 0, sizeof(api_)); +} diff --git a/test/ctrl/test_pmgr.h b/test/ctrl/test_pmgr.h new file mode 100644 index 0000000000..3998dc1f03 --- /dev/null +++ b/test/ctrl/test_pmgr.h @@ -0,0 +1,70 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_PMGR_H_ +#define TEST_CTRL_TEST_PMGR_H_ + +#include +#include +#include + +#include "ctrl/test_aql.h" + +// Class implements profiling manager +class TestPMgr : public TestAql { + public: + typedef hsa_ext_amd_aql_pm4_packet_t packet_t; + explicit TestPMgr(TestAql* t); + bool Run(); + + protected: + packet_t pre_packet_; + packet_t post_packet_; + hsa_signal_t dummy_signal_; + hsa_signal_t post_signal_; + + hsa_ven_amd_aqlprofile_1_00_pfn_t api_; + + virtual bool BuildPackets() { return false; } + virtual bool DumpData() { return false; } + virtual bool Initialize(int argc, char** argv); + + private: + enum { + SLOT_PM4_SIZE_DW = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(uint32_t), + SLOT_PM4_SIZE_AQLP = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t) + }; + struct slot_pm4_t { + uint32_t words[SLOT_PM4_SIZE_DW]; + }; + + bool AddPacket(const packet_t* packet); + bool AddPacketGfx8(const packet_t* packet); + bool AddPacketGfx9(const packet_t* packet); +}; + +#endif // TEST_CTRL_TEST_PMGR_H_ diff --git a/test/ctrl/thr_tool.cpp b/test/ctrl/thr_tool.cpp new file mode 100644 index 0000000000..fa2f2abfa5 --- /dev/null +++ b/test/ctrl/thr_tool.cpp @@ -0,0 +1,297 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "inc/rocprofiler.h" +#include "util/xml.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +// Tool thread +pthread_t thread; +pthread_attr_t thr_attr; +bool thr_stop = false; + +struct dispatch_data_t { + rocprofiler_info_t* info; + unsigned info_count; + unsigned group_index; +}; + +struct context_entry_t { + rocprofiler_group_t* group; + rocprofiler_info_t* info; + unsigned info_count; + rocprofiler_callback_data_t data; +}; + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +unsigned context_array_size = 1; +context_entry_t* context_array = NULL; +unsigned context_array_index = 0; + +const char* file_name; +FILE* file_handle = NULL; + +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + exit(1); + } +} + +unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } + +void print_info(FILE* file, const rocprofiler_info_t* info, const unsigned info_count, const char* str) { + if (str) fprintf(file, "%s:\n", str); + for (unsigned i= 0; i < info_count; ++i) { + const rocprofiler_info_t* p = &info[i]; + fprintf(file, " %s ", p->name); + switch (p->data.kind) { + case ROCPROFILER_INT64: + fprintf(file, "(%lu)\n", p->data.result64); + break; + case ROCPROFILER_BYTES: { + fprintf(file, "(\n"); + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + uint64_t size = 0; + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + size = *reinterpret_cast(ptr); + const char* data = ptr + sizeof(size); + fprintf(file, " data (%p), size (%lu)\n", data, size); + size = align_size(size, sizeof(uint64_t)); + ptr = data + size; + } + fprintf(file, " )\n"); + break; + } + default: + std::cout << "Bad result kind (" << p->data.kind << ")" << std::endl; + } + } +} + +void print_group(FILE* file, const rocprofiler_group_t* group, const char* str) { + if (str) fprintf(file, "%s:\n", str); + for (unsigned i= 0; i < group->info_count; ++i) { + print_info(file, group->info[i], 1, NULL); + } +} + +void store_context(context_entry_t context_entry) { + if(pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + if ((context_array == NULL) || (context_array_index >= context_array_size)) { + context_array_size *= 2; + context_array = reinterpret_cast(realloc(context_array, context_array_size * sizeof(context_entry_t))); + } + context_array_index += 1; + context_array[context_array_index - 1] = context_entry; + if(pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } +} + +void dump_context(FILE *file, unsigned index) { + hsa_status_t status = HSA_STATUS_ERROR; + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + context_entry_t* entry = &context_array[index]; + rocprofiler_group_t* group = entry->group; + const rocprofiler_info_t* info = entry->info; + const unsigned info_count = entry->info_count; + fprintf(file, "Dispatch[%u], kernel_object(0x%lx):\n", index, entry->data.kernel_object); + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } + + status = rocprofiler_get_group_data(group); + check_status(status); + //print_group(file, group, "Group[0] data"); + + status = rocprofiler_get_metrics_data(group->context); + check_status(status); + print_info(file, info, info_count, NULL); + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + rocprofiler_close(group->context); +} + +// Provided standard profiling callback +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t** group) { + hsa_status_t status = HSA_STATUS_ERROR; + // Passed tool data + dispatch_data_t* tool_data = reinterpret_cast(user_data); + // Profiling context + rocprofiler_t* context = NULL; + + // Open profiling context + status = rocprofiler_open(0, tool_data->info, tool_data->info_count, &context, 0, NULL); + check_status(status); + + rocprofiler_group_t* groups = NULL; + uint32_t group_count = 0; + status = rocprofiler_get_groups(context, &groups, &group_count); + check_status(status); + assert(group_count == 1); + + *group = &groups[0]; + store_context({*group, tool_data->info, tool_data->info_count, *callback_data}); + + return status; +} + +void* dumping_data(void*) { + unsigned index = 0; + do { + while (index < context_array_index) { + dump_context(file_handle, index); + ++index; + } + } while (!thr_stop); + return NULL; +} + +CONSTRUCTOR_API void constructor() { + std::map parameters_dict; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; + +#ifdef TOOL_THREAD + int err = pthread_attr_init(&thr_attr); + if (err) { errno = err; perror("pthread_attr_init"); exit(1); } + err = pthread_create(&thread, &thr_attr, dumping_data, NULL); + if (err) { errno = err; perror("pthread_create"); exit(1); } +#endif + + // Set output file + file_name = getenv("ROCP_OUTPUT"); + if (file_name != NULL) { + file_handle = fopen(file_name, "w"); + if (file_handle == NULL) { + perror("fopen"); + exit(1); + } + } else file_handle = stdout; + + // Getting input + const char* xml_name = getenv("ROCP_INPUT"); + if (xml_name == NULL) { + fprintf(stderr, "ROCProfiler: input is not specified, ROCP_INPUT env"); + exit(1); + } + printf("ROCProfiler: input from \"%s\"\n", xml_name); + xml::Xml* xml = new xml::Xml(xml_name); + + // Getting metrics + auto metrics_list = xml->GetNodes("top.metric"); + std::vector metrics_vec; + for (auto* entry : metrics_list) { + const std::string entry_str = entry->opts["name"]; + size_t pos1 = 0; + while(pos1 < entry_str.length()) { + const size_t pos2 = entry_str.find(",", pos1); + const std::string metric_name = entry_str.substr(pos1, pos2 - pos1); + metrics_vec.push_back(metric_name); + if (pos2 == std::string::npos) break; + pos1 = pos2 + 1; + } + } + + // Getting traces + auto traces_list = xml->GetNodes("top.trace"); + + const unsigned info_count = metrics_vec.size() + traces_list.size(); + rocprofiler_info_t* info= new rocprofiler_info_t[info_count]; + memset(info, 0, info_count * sizeof(rocprofiler_info_t)); + + printf(" %d metrics\n", (int) metrics_vec.size()); + for (unsigned i = 0; i < metrics_vec.size(); ++i) { + const std::string& name = metrics_vec[i]; + printf("%s%s", (i == 0) ? " " : ", ", name.c_str()); + info[i] = {}; + info[i].type = ROCPROFILER_TYPE_METRIC; + info[i].name = strdup(name.c_str()); + } + if (metrics_vec.size()) printf("\n"); + + printf(" %d traces\n", (int) traces_list.size()); + unsigned index = metrics_vec.size(); + for (auto* entry : traces_list) { + auto params_list = xml->GetNodes("top.trace.parameters"); + if (params_list.size() != 1) { + fprintf(stderr, "ROCProfiler: Single input 'parameters' section is supported\n"); + exit(1); + } + const std::string& name = entry->opts["name"]; + printf(" %s (\n", name.c_str()); + info[index] = {}; + info[index].type = ROCPROFILER_TYPE_TRACE; + info[index].name = strdup(name.c_str()); + + for (auto* params : params_list) { + const unsigned parameter_count = params->opts.size(); + rocprofiler_parameter_t *parameters = new rocprofiler_parameter_t[parameter_count]; + unsigned p_index = 0; + for (auto& v : params->opts) { + const std::string parameter_name = v.first; + if (parameters_dict.find(parameter_name) == parameters_dict.end()) { + fprintf(stderr, "ROCProfiler: unknown trace parameter %s\n", parameter_name.c_str()); + exit(1); + } + const uint32_t value = strtol(v.second.c_str(), NULL, 0); + printf(" %s = 0x%x\n", parameter_name.c_str(), value); + parameters[p_index] = {}; + parameters[p_index].parameter_name = parameters_dict[parameter_name]; + parameters[p_index].value = value; + ++p_index; + } + + info[index].parameters = parameters; + info[index].parameter_count = parameter_count; + } + printf(" )\n"); + ++index; + } + + if (info_count) { + // Adding dispatch observer + dispatch_data_t* dispatch_data = new dispatch_data_t{}; + dispatch_data->info = info; + dispatch_data->info_count = info_count; + dispatch_data->group_index = 0; + rocprofiler_dispatch_observer(dispatch_callback, dispatch_data); + } +} + +DESTRUCTOR_API void destructor() { + printf("\nROCPRofiler: %u contexts collected", context_array_index); + thr_stop = true; +#ifdef TOOL_THREAD + pthread_join(thread, NULL); +#else + dumping_data(NULL); +#endif +} diff --git a/test/ctrl/tool.cpp b/test/ctrl/tool.cpp new file mode 100644 index 0000000000..570a738916 --- /dev/null +++ b/test/ctrl/tool.cpp @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "inc/rocprofiler.h" +#include "util/xml.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +struct dispatch_data_t { + rocprofiler_info_t* info; + unsigned info_count; + unsigned group_index; +}; + +struct context_entry_t { + rocprofiler_group_t* group; + rocprofiler_info_t* info; + unsigned info_count; + rocprofiler_callback_data_t data; +}; + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +unsigned context_array_size = 1; +context_entry_t* context_array = NULL; +unsigned context_array_index = 0; +unsigned dump_index = 0; + +const char* file_name = NULL; +FILE* file_handle = NULL; + +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + exit(1); + } +} + +hsa_status_t trace_data_cb( + hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* data) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + printf(" data ptr (%p), size(%u)\n", info_data->sqtt_data.ptr, info_data->sqtt_data.size); + } else status = HSA_STATUS_ERROR; + return status; +} + +unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } + +void print_info(FILE* file, const rocprofiler_info_t* info, const unsigned info_count, rocprofiler_t* context, const char* str) { + if (str) fprintf(file, "%s:\n", str); + for (unsigned i= 0; i < info_count; ++i) { + const rocprofiler_info_t* p = &info[i]; + fprintf(file, " %s ", p->name); + switch (p->data.kind) { + case ROCPROFILER_INT64: + fprintf(file, "(%lu)\n", p->data.result_int64); + break; + case ROCPROFILER_BYTES: { + fprintf(file, "(\n"); + if (p->data.result_bytes.copy) { + fprintf(file, " system memory copy\n"); + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + uint64_t size = 0; + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + size = *reinterpret_cast(ptr); + const char* data = ptr + sizeof(size); + fprintf(file, " data (%p), size (%lu)\n", data, size); + size = align_size(size, sizeof(uint64_t)); + ptr = data + size; + } + } else { + fprintf(file, " local memory buffer\n"); + rocprofiler_iterate_trace_data(context, trace_data_cb, NULL); + } + fprintf(file, " )\n"); + break; + } + default: + std::cout << "Bad result kind (" << p->data.kind << ")" << std::endl; + } + } +} + +void print_group(FILE* file, const rocprofiler_group_t* group, const char* str) { + if (str) fprintf(file, "%s:\n", str); + for (unsigned i= 0; i < group->info_count; ++i) { + print_info(file, group->info[i], 1, group->context, NULL); + } +} + +void store_entry(const context_entry_t& context_entry) { + if(pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + + if ((context_array == NULL) || (context_array_index >= context_array_size)) { + context_array_size *= 2; + context_array = reinterpret_cast(realloc(context_array, context_array_size * sizeof(context_entry_t))); + } + context_array[context_array_index] = context_entry; + context_array_index += 1; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } +} + +void dump_context(FILE *file, context_entry_t* entry, unsigned index) { + hsa_status_t status = HSA_STATUS_ERROR; + + rocprofiler_group_t* group = entry->group; + const rocprofiler_info_t* info = entry->info; + const unsigned info_count = entry->info_count; + fprintf(file, "Dispatch[%u], kernel_object(0x%lx):\n", index, entry->data.kernel_object); + + status = rocprofiler_get_group_data(group); + check_status(status); + //print_group(file, group, "Group[0] data"); + + status = rocprofiler_get_metrics_data(group->context); + check_status(status); + print_info(file, info, info_count, group->context, NULL); + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + rocprofiler_close(group->context); + + dump_index = index; +} + +void dumping_data() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + + for (unsigned index = 0; index < context_array_index; ++index) { + dump_context(file_handle, &context_array[index], index); + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } +} + +// profiling callback +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t** group) { + hsa_status_t status = HSA_STATUS_ERROR; + // Passed tool data + dispatch_data_t* tool_data = reinterpret_cast(user_data); + // Profiling context + rocprofiler_t* context = NULL; + // context properties + rocprofiler_properties_t properties{}; + + // Open profiling context + status = rocprofiler_open(0, tool_data->info, tool_data->info_count, &context, 0, &properties); + check_status(status); + + rocprofiler_group_t* groups = NULL; + uint32_t group_count = 0; + status = rocprofiler_get_groups(context, &groups, &group_count); + check_status(status); + assert(group_count == 1); + + *group = &groups[0]; + context_entry_t entry; + entry.group = *group; + entry.info = tool_data->info; + entry.info_count = tool_data->info_count; + entry.data = *callback_data; + store_entry(entry); + + return status; +} + +CONSTRUCTOR_API void constructor() { + std::map parameters_dict; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; + + // Set output file + file_name = getenv("ROCP_OUTPUT"); + if (file_name != NULL) { + file_handle = fopen(file_name, "w"); + if (file_handle == NULL) { + perror("fopen"); + exit(1); + } + } else file_handle = stdout; + + // Getting input + const char* xml_name = getenv("ROCP_INPUT"); + if (xml_name == NULL) { + fprintf(stderr, "ROCProfiler: input is not specified, ROCP_INPUT env"); + exit(1); + } + printf("ROCProfiler: input from \"%s\"\n", xml_name); + xml::Xml* xml = new xml::Xml(xml_name); + + // Getting metrics + auto metrics_list = xml->GetNodes("top.metric"); + std::vector metrics_vec; + for (auto* entry : metrics_list) { + const std::string entry_str = entry->opts["name"]; + size_t pos1 = 0; + while(pos1 < entry_str.length()) { + const size_t pos2 = entry_str.find(",", pos1); + const std::string metric_name = entry_str.substr(pos1, pos2 - pos1); + metrics_vec.push_back(metric_name); + if (pos2 == std::string::npos) break; + pos1 = pos2 + 1; + } + } + + // Getting traces + auto traces_list = xml->GetNodes("top.trace"); + + const unsigned info_count = metrics_vec.size() + traces_list.size(); + rocprofiler_info_t* info= new rocprofiler_info_t[info_count]; + memset(info, 0, info_count * sizeof(rocprofiler_info_t)); + + printf(" %d metrics\n", (int) metrics_vec.size()); + for (unsigned i = 0; i < metrics_vec.size(); ++i) { + const std::string& name = metrics_vec[i]; + printf("%s%s", (i == 0) ? " " : ", ", name.c_str()); + info[i] = {}; + info[i].type = ROCPROFILER_TYPE_METRIC; + info[i].name = strdup(name.c_str()); + } + if (metrics_vec.size()) printf("\n"); + + printf(" %d traces\n", (int) traces_list.size()); + unsigned index = metrics_vec.size(); + for (auto* entry : traces_list) { + auto params_list = xml->GetNodes("top.trace.parameters"); + if (params_list.size() != 1) { + fprintf(stderr, "ROCProfiler: Single input 'parameters' section is supported\n"); + exit(1); + } + const std::string& name = entry->opts["name"]; + const bool to_copy_data = (entry->opts["copy"] == "true"); + printf(" %s (\n", name.c_str()); + info[index] = {}; + info[index].type = ROCPROFILER_TYPE_TRACE; + info[index].name = strdup(name.c_str()); + info[index].data.result_bytes.copy = to_copy_data; + + for (auto* params : params_list) { + const unsigned parameter_count = params->opts.size(); + rocprofiler_parameter_t *parameters = new rocprofiler_parameter_t[parameter_count]; + unsigned p_index = 0; + for (auto& v : params->opts) { + const std::string parameter_name = v.first; + if (parameters_dict.find(parameter_name) == parameters_dict.end()) { + fprintf(stderr, "ROCProfiler: unknown trace parameter %s\n", parameter_name.c_str()); + exit(1); + } + const uint32_t value = strtol(v.second.c_str(), NULL, 0); + printf(" %s = 0x%x\n", parameter_name.c_str(), value); + parameters[p_index] = {}; + parameters[p_index].parameter_name = parameters_dict[parameter_name]; + parameters[p_index].value = value; + ++p_index; + } + + info[index].parameters = parameters; + info[index].parameter_count = parameter_count; + } + printf(" )\n"); + ++index; + } + + if (info_count) { + // Adding dispatch observer + dispatch_data_t* dispatch_data = new dispatch_data_t{}; + dispatch_data->info = info; + dispatch_data->info_count = info_count; + dispatch_data->group_index = 0; + rocprofiler_set_dispatch_observer(dispatch_callback, dispatch_data); + } +} + +DESTRUCTOR_API void destructor() { + printf("\nROCPRofiler: %u contexts collected", context_array_index); + if (file_name == NULL) { + printf("\n"); + } else { + printf(", dumping to %s\n", file_name); + } + dumping_data(); +} diff --git a/test/input.xml b/test/input.xml new file mode 100644 index 0000000000..9a51eb2b82 --- /dev/null +++ b/test/input.xml @@ -0,0 +1,7 @@ + + + + diff --git a/test/metrics.xml b/test/metrics.xml new file mode 100644 index 0000000000..87c5d12f0f --- /dev/null +++ b/test/metrics.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/test/run.sh b/test/run.sh new file mode 100755 index 0000000000..f1db179397 --- /dev/null +++ b/test/run.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +tbin=./test/ctrl + +#export HSA_LIB=/home/evgeny/pkg/compute-psdb-16453/lib +export HSA_LIB=/home/evgeny/git/compute/out/ubuntu-16.04/16.04/lib +export OCL_LIB=/home/evgeny/pkg/opencl_modified/opencl_x86_64/lib +#export OCL_LIB=/home/evgeny/Perforce/eshcherb_opencl/drivers/opencl/dist/linux/debug/lib/x86_64 + +export LD_LIBRARY_PATH=$PWD:$HSA_LIB:$OCL_LIB +export ROCPROFILER_LOG=1 + +export HSA_TOOLS_LIB=librocprofiler64.so +export ROCP_TOOL_LIB=test/libtool.so +export ROCP_HSA_INTERCEPT=1 +export ROCP_METRICS=metrics.xml +export ROCP_INPUT=input.xml +unset ROCP_PROXY_QUEUE + +echo "Run simple profiling test" +if [ -n "$1" ] ; then + eval "$*" +else + eval $tbin +fi + +exit 0 diff --git a/test/simple_convolution/gfx8_SimpleConvolution.hsaco b/test/simple_convolution/gfx8_SimpleConvolution.hsaco new file mode 100644 index 0000000000..831484c226 Binary files /dev/null and b/test/simple_convolution/gfx8_SimpleConvolution.hsaco differ diff --git a/test/simple_convolution/gfx9_SimpleConvolution.hsaco b/test/simple_convolution/gfx9_SimpleConvolution.hsaco new file mode 100644 index 0000000000..f65dde4252 Binary files /dev/null and b/test/simple_convolution/gfx9_SimpleConvolution.hsaco differ diff --git a/test/simple_convolution/simple_convolution.cl b/test/simple_convolution/simple_convolution.cl new file mode 100644 index 0000000000..9cf58d2008 --- /dev/null +++ b/test/simple_convolution/simple_convolution.cl @@ -0,0 +1,81 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +********************************************************************************/ + +/** + * SimpleConvolution is where each pixel of the output image + * is the weighted sum of the neighborhood pixels of the input image + * The neighborhood is defined by the dimensions of the mask and + * weight of each neighbor is defined by the mask itself. + * @param output Output matrix after performing convolution + * @param input Input matrix on which convolution is to be performed + * @param mask mask matrix using which convolution was to be performed + * @param inputDimensions dimensions of the input matrix + * @param maskDimensions dimensions of the mask matrix + */ +__kernel void SimpleConvolution(__global uint * output, + __global uint * input, + __global float * mask, + const uint2 inputDimensions, + const uint2 maskDimensions) { + + uint tid = get_global_id(0); + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint x = tid%width; + uint y = tid/width; + + uint maskWidth = maskDimensions.x; + uint maskHeight = maskDimensions.y; + + uint vstep = (maskWidth -1)/2; + uint hstep = (maskHeight -1)/2; + + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + uint left = (x < vstep) ? 0 : (x - vstep); + uint right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + uint top = (y < hstep) ? 0 : (y - hstep); + uint bottom = ((y + hstep) >= height)? height - 1: (y + hstep); + + // initializing wighted sum value + float sumFX = 0; + + for(uint i = left; i <= right; ++i) { + for(uint j = top ; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep)); + uint index = j * width + i; + sumFX += ((float)input[index] * mask[maskIndex]); + } + } + + // To round to the nearest integer + sumFX += 0.5f; + output[tid] = (uint)sumFX; +} diff --git a/test/simple_convolution/simple_convolution.cpp b/test/simple_convolution/simple_convolution.cpp new file mode 100644 index 0000000000..e5676d6d44 --- /dev/null +++ b/test/simple_convolution/simple_convolution.cpp @@ -0,0 +1,390 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#include "simple_convolution/simple_convolution.h" + +#include +#include + +#include "util/helper_funcs.h" +#include "util/test_assert.h" + +const uint32_t SimpleConvolution::input_data_[]{ + 15, 201, 51, 89, 92, 34, 96, 66, 11, 225, 161, 96, 81, 211, 108, 124, 202, 244, 182, + 90, 215, 92, 98, 20, 44, 225, 55, 247, 202, 0, 45, 218, 202, 97, 51, 39, 131, 147, + 105, 143, 116, 11, 239, 198, 222, 92, 67, 169, 81, 250, 3, 40, 86, 101, 60, 131, 70, + 116, 123, 17, 117, 168, 236, 64, 10, 31, 103, 142, 179, 209, 29, 40, 220, 13, 239, 187, + 105, 50, 100, 186, 44, 104, 227, 131, 205, 32, 6, 20, 149, 130, 38, 10, 43, 18, 75, + 53, 50, 178, 195, 230, 132, 225, 14, 96, 238, 253, 27, 88, 48, 128, 18, 92, 232, 246, + 224, 182, 23, 231, 203, 172, 105, 241, 183, 148, 4, 2, 202, 55, 181, 142, 29, 57, 111, + 43, 153, 93, 41, 181, 181, 89, 54, 200, 182, 31, 190, 150, 213, 213, 126, 160, 130, 232, + 146, 57, 125, 151, 59, 71, 206, 240, 213, 236, 42, 68, 24, 195, 162, 65, 121, 87, 155, + 175, 31, 81, 207, 222, 232, 164, 180, 102, 69, 55, 79, 216, 112, 204, 112, 171, 19, 63, + 156, 233, 43, 198, 46, 67, 138, 208, 132, 4, 39, 32, 180, 71, 113, 131, 38, 90, 40, + 219, 193, 109, 18, 16, 70, 131, 220, 182, 46, 240, 245, 203, 217, 32, 146, 7, 100, 28, + 216, 233, 32, 255, 9, 213, 71, 123, 88, 110, 213, 128, 74, 150, 238, 93, 166, 52, 224, + 131, 234, 15, 115, 224, 218, 76, 1, 108, 84, 101, 137, 44, 79, 170, 44, 88, 127, 116, + 211, 216, 226, 168, 88, 45, 63, 70, 138, 230, 123, 107, 105, 101, 122, 220, 70, 84, 41, + 71, 193, 125, 173, 75, 169, 252, 245, 213, 84, 117, 73, 40, 77, 44, 209, 166, 90, 16, + 237, 229, 246, 104, 80, 95, 206, 202, 60, 20, 31, 101, 92, 225, 226, 9, 44, 140, 5, + 34, 97, 89, 151, 171, 129, 229, 216, 82, 139, 51, 99, 120, 24, 89, 225, 104, 185, 175, + 50, 246, 196, 82, 91, 32, 51, 62, 42, 96, 202, 47, 130, 44, 137, 26, 215, 10, 255, + 176, 93, 138, 227, 193, 3, 251, 27, 229, 100, 212, 149, 151, 202, 89, 233, 38, 122, 29, + 100, 164, 125, 46, 212, 0, 90, 93, 26, 50, 103, 25, 226, 197, 164, 198, 135, 168, 194, + 162, 141, 38, 119, 34, 190, 66, 124, 167, 104, 247, 197, 204, 156, 67, 251, 112, 67, 85, + 205, 93, 135, 53, 119, 106, 251, 28, 49, 130, 196, 243, 36, 82, 26, 155, 117, 216, 221, + 241, 128, 70, 233, 70, 18, 133, 137, 14, 245, 204, 99, 195, 42, 235, 248, 161, 86, 243, + 190, 135, 118, 130, 123, 154, 213, 150, 54, 74, 111, 20, 60, 240, 90, 37, 54, 109, 171, + 191, 123, 161, 140, 222, 100, 182, 202, 93, 88, 32, 80, 23, 168, 198, 153, 36, 97, 111, + 187, 151, 185, 43, 172, 245, 27, 6, 27, 82, 115, 199, 18, 239, 104, 158, 206, 205, 85, + 152, 42, 174, 185, 123, 197, 98, 65, 95, 135, 163, 206, 66, 59, 136, 109, 231, 125, 137, + 237, 153, 219, 97, 96, 237, 81, 201, 140, 31, 150, 226, 183, 192, 144, 113, 59, 86, 212, + 125, 182, 91, 33, 132, 158, 92, 12, 12, 68, 138, 149, 50, 36, 113, 147, 133, 95, 229, + 78, 235, 4, 228, 206, 188, 165, 95, 45, 225, 181, 1, 94, 107, 93, 128, 240, 251, 220, + 252, 7, 32, 135, 156, 83, 171, 14, 230, 48, 109, 203, 126, 89, 208, 99, 39, 140, 9, + 134, 185, 234, 60, 187, 73, 167, 24, 201, 152, 20, 166, 148, 27, 199, 28, 184, 26, 199, + 198, 0, 248, 52, 204, 119, 141, 157, 218, 181, 41, 227, 59, 227, 206, 119, 159, 23, 31, + 184, 224, 183, 204, 134, 76, 231, 77, 105, 160, 103, 48, 103, 104, 41, 155, 53, 160, 41, + 210, 123, 222, 252, 95, 26, 223, 45, 146, 126, 68, 177, 54, 37, 105, 3, 171, 182, 235, + 249, 31, 139, 97, 80, 243, 202, 121, 143, 0, 26, 184, 210, 149, 151, 207, 244, 177, 174, + 34, 67, 45, 102, 245, 100, 140, 95, 104, 55, 21, 83, 49, 53, 223, 147, 134, 210, 93, + 0, 97, 93, 26, 26, 48, 175, 178, 255, 164, 99, 174, 198, 167, 220, 45, 156, 64, 185, + 252, 168, 241, 18, 252, 35, 71, 219, 182, 205, 173, 19, 206, 15, 113, 232, 42, 161, 152, + 220, 160, 60, 64, 79, 3, 231, 43, 49, 132, 108, 235, 128, 21, 220, 146, 17, 255, 218, + 236, 182, 168, 154, 201, 118, 170, 58, 94, 212, 220, 246, 177, 125, 51, 241, 204, 55, 216, + 248, 104, 92, 100, 83, 221, 121, 48, 111, 138, 47, 73, 119, 230, 241, 17, 175, 103, 187, + 234, 198, 144, 199, 188, 65, 68, 240, 51, 17, 39, 11, 9, 143, 104, 109, 227, 70, 231, + 19, 181, 113, 66, 255, 233, 41, 241, 250, 217, 89, 182, 196, 31, 71, 139, 220, 137, 208, + 204, 188, 225, 243, 200, 234, 131, 48, 88, 102, 119, 63, 121, 44, 177, 188, 44, 154, 229, + 29, 149, 190, 118, 76, 130, 150, 147, 14, 114, 28, 222, 62, 217, 191, 50, 161, 170, 181, + 210, 2, 28, 73, 66, 149, 117, 243, 81, 162, 141, 55, 191, 35, 245, 54, 111, 120, 204, + 2, 134, 62, 31, 100, 125, 248, 36, 175, 153, 206, 101, 107, 209, 129, 181, 19, 22, 43, + 7, 104, 205, 149, 159, 140, 184, 149, 195, 39, 14, 143, 42, 148, 205, 73, 249, 74, 66, + 30, 250, 219, 237, 96, 71, 190, 225, 253, 210, 248, 40, 218, 96, 245, 111, 0, 130, 39, + 150, 69, 79, 165, 212, 122, 57, 162, 195, 51, 237, 6, 82, 231, 225, 63, 71, 41, 253, + 41, 38, 208, 33, 78, 170, 130, 68, 26, 131, 198, 66, 26, 12, 145, 191, 224, 11, 249, + 130, 207, 44, 112, 213, 126, 88, 183, 190, 160, 225, 187, 201, 8, 140, 235, 87, 55, 109, + 155, 81, 241, 98, 147, 11, 110, 37, 202, 79, 49, 195, 210, 0, 240, 66, 214, 110, 154, + 142, 44, 58, 111, 232, 4, 119, 117, 239, 207, 172, 93, 106, 254, 78, 205, 145, 89, 59, + 183, 35, 138, 232, 230, 92, 233, 214, 159, 191, 69, 58, 78, 114, 116, 189, 91, 121, 53, + 208, 104, 4, 125, 198, 111, 123, 20, 60, 13, 109, 120, 196, 145, 3, 172, 119, 95, 150, + 78, 255, 85, 147, 57, 163, 6, 174, 97, 97, 39, 151, 50, 144, 155, 175, 86, 11, 43, + 107, 71, 56, 216, 191, 253, 105, 194, 170, 225, 34, 64, 47, 34, 150, 195, 91, 58, 201, + 10, 155, 43, 49, 50, 93, 194, 206, 13, 25, 217, 56, 132, 33, 112, 92, 225, 109, 198, + 164, 23, 167, 199, 88, 215, 234, 238, 155, 69, 40, 100, 80, 196, 144, 129, 246, 237, 68, + 197, 250, 93, 159, 51, 225, 193, 163, 62, 163, 17, 4, 71, 41, 172, 15, 130, 132, 249, + 112, 31, 63, 152, 132, 143, 92, 20, 17, 83, 1, 86, 25, 252, 179, 185, 47, 149, 122, + 211, 211, 29, 229, 216, 101, 15, 133, 117, 145, 9, 111, 1, 40, 175, 154, 173, 62, 247, + 193, 80, 75, 194, 166, 100, 191, 90, 29, 239, 239, 152, 194, 195, 182, 168, 156, 27, 183, + 33, 145, 73, 43, 0, 75, 83, 175, 229, 0, 238, 221, 194, 63, 40, 133, 230, 140, 68, + 64, 170, 51, 48, 66, 246, 243, 248, 159, 144, 20, 87, 177, 165, 160, 220, 166, 235, 48, + 86, 209, 49, 68, 174, 243, 132, 214, 120, 106, 99, 189, 170, 13, 241, 219, 80, 232, 207, + 72, 135, 95, 92, 223, 16, 2, 127, 237, 169, 107, 29, 255, 61, 79, 68, 236, 67, 200, + 194, 188, 50, 38, 121, 221, 52, 107, 184, 132, 84, 136, 204, 219, 231, 41, 186, 248, 44, + 58, 229, 213, 166, 3, 212, 227, 82, 25, 207, 150, 225, 146, 82, 20, 185, 204, 242, 237, + 55, 170, 113, 139, 50, 62, 103, 26, 103, 34, 18, 148, 93, 247, 105, 3, 251, 62, 231, + 77, 87, 182, 227, 57, 73, 54, 77, 2, 2, 63, 239, 57, 234, 97, 197, 29, 159, 44, + 55, 7, 79, 74, 155, 172, 66, 5, 175, 61, 67, 150, 139, 155, 77, 111, 212, 151, 165, + 34, 153, 167, 98, 137, 225, 77, 234, 166, 107, 138, 211, 163, 145, 34, 237, 45, 206, 47, + 50, 126, 108, 117, 21, 248, 17, 98, 103, 230, 249, 12, 9, 147, 179, 107, 29, 149, 185, + 7, 59, 37, 146, 14, 200, 35, 49, 182, 80, 0, 230, 130, 126, 83, 248, 148, 75, 9, + 247, 178, 240, 240, 190, 249, 132, 114, 101, 161, 7, 30, 169, 67, 68, 59, 82, 12, 95, + 131, 195, 176, 131, 169, 51, 2, 252, 44, 150, 72, 54, 141, 250, 38, 126, 185, 31, 3, + 44, 132, 165, 52, 163, 78, 120, 231, 138, 202, 244, 234, 77, 183, 155, 209, 97, 207, 212, + 94, 251, 107, 166, 49, 249, 161, 88, 120, 91, 120, 123, 135, 253, 33, 188, 160, 112, 52, + 136, 250, 254, 125, 229, 76, 53, 128, 30, 150, 79, 243, 244, 75, 95, 155, 125, 88, 60, + 213, 209, 152, 78, 77, 32, 75, 110, 220, 236, 222, 17, 117, 217, 15, 242, 190, 92, 39, + 63, 123, 190, 143, 111, 178, 219, 206, 78, 88, 38, 138, 46, 247, 34, 124, 69, 66, 199, + 179, 31, 179, 145, 48, 41, 106, 64, 27, 41, 157, 67, 105, 24, 1, 249, 135, 179, 212, + 86, 1, 44, 124, 140, 91, 116, 175, 215, 185, 242, 159, 108, 17, 83, 254, 66, 124, 105, + 131, 151, 146, 32, 218, 252, 57, 219, 245, 193, 143, 201, 23, 145, 246, 148, 30, 82, 8, + 206, 41, 194, 192, 201, 47, 210, 28, 46, 20, 152, 151, 151, 48, 42, 184, 11, 38, 241, + 231, 28, 179, 119, 230, 202, 8, 220, 94, 39, 46, 103, 245, 88, 42, 181, 33, 90, 136, + 62, 136, 156, 214, 31, 52, 7, 74, 237, 19, 113, 223, 250, 141, 146, 113, 115, 92, 122, + 80, 187, 161, 126, 35, 150, 215, 78, 76, 249, 168, 212, 55, 48, 113, 14, 80, 166, 21, + 154, 147, 40, 12, 114, 35, 153, 5, 148, 12, 98, 15, 92, 29, 176, 219, 65, 71, 179, + 143, 147, 172, 56, 104, 227, 104, 218, 241, 185, 128, 7, 84, 20, 47, 96, 135, 82, 249, + 140, 231, 6, 238, 246, 99, 12, 167, 63, 77, 238, 242, 221, 130, 158, 21, 235, 129, 126, + 197, 114, 56, 69, 121, 140, 90, 169, 237, 225, 252, 231, 109, 228, 237, 91, 219, 81, 104, + 130, 144, 181, 113, 130, 147, 244, 32, 169, 223, 162, 39, 164, 21, 95, 234, 143, 236, 68, + 57, 217, 37, 53, 192, 147, 25, 174, 239, 245, 0, 87, 119, 144, 13, 232, 19, 160, 220, + 51, 73, 188, 214, 113, 96, 235, 209, 75, 122, 190, 144, 179, 151, 181, 233, 88, 73, 3, + 7, 56, 248, 7, 143, 112, 152, 156, 89, 171, 61, 53, 223, 135, 242, 181, 248, 83, 161, + 202, 158, 28, 136, 46, 208, 32, 228, 186, 121, 45, 189, 128, 102, 182, 136, 246, 38, 32, + 147, 127, 204, 208, 181, 171, 87, 167, 97, 80, 250, 2, 26, 153, 31, 163, 200, 239, 195, + 172, 169, 60, 218, 103, 188, 65, 30, 69, 55, 68, 102, 202, 196, 50, 154, 121, 221, 242, + 33, 63, 67, 28, 66, 93, 181, 97, 0, 126, 81, 196, 43, 251, 0, 5, 98, 189, 70, + 128, 3, 126, 197, 105, 72, 137, 155, 227, 3, 121, 214, 36, 184, 25, 65, 250, 118, 247, + 91, 119, 117, 173, 60, 160, 168, 60, 166, 10, 250, 237, 139, 253, 107, 80, 102, 180, 217, + 2, 151, 221, 123, 109, 1, 52, 134, 66, 46, 253, 57, 138, 117, 175, 55, 178, 79, 223, + 239, 245, 234, 233, 226, 117, 231, 78, 198, 78, 2, 159, 80, 154, 124, 204, 7, 126, 0, + 142, 193, 47, 140, 251, 185, 2, 170, 241, 180, 249, 208, 163, 239, 186, 141, 210, 48, 116, + 32, 246, 195, 34, 150, 19, 188, 19, 224, 196, 146, 224, 83, 83, 15, 224, 78, 201, 226, + 249, 186, 151, 243, 139, 58, 226, 70, 199, 181, 118, 60, 213, 109, 255, 248, 3, 19, 181, + 23, 243, 122, 169, 212, 205, 252, 228, 173, 75, 173, 144, 68, 104, 39, 55, 243, 98, 26, + 57, 41, 207, 175, 102, 165, 29, 102, 158, 32, 121, 83, 56, 109, 205, 225, 66, 155, 222, + 38, 73, 42, 212, 218, 110, 60, 1, 166, 48, 99, 193, 105, 141, 145, 25, 244, 54, 54, + 90, 213, 87, 212, 40, 143, 66, 246, 112, 132, 146, 79, 171, 220, 121, 128, 182, 232, 189, + 184, 143, 237, 27, 80, 86, 169, 226, 112, 158, 25, 166, 248, 238, 253, 204, 23, 141, 15, + 13, 254, 147, 160, 77, 63, 124, 199, 191, 50, 175, 124, 234, 62, 105, 6, 143, 192, 176, + 113, 48, 78, 139, 215, 71, 121, 213, 20, 144, 98, 35, 158, 96, 183, 62, 174, 246, 187, + 117, 182, 237, 37, 50, 216, 99, 156, 223, 243, 93, 143, 101, 142, 222, 240, 101, 37, 106, + 58, 57, 250, 157, 93, 153, 254, 20, 216, 172, 10, 147, 34, 192, 129, 71, 243, 90, 171, + 144, 57, 159, 238, 201, 4, 124, 167, 244, 225, 205, 95, 28, 7, 89, 185, 100, 243, 184, + 121, 203, 100, 131, 95, 135, 68, 224, 207, 56, 58, 122, 201, 115, 25, 183, 61, 30, 51, + 229, 18, 21, 178, 113, 49, 186, 203, 235, 31, 191, 163, 152, 138, 8, 28, 233, 143, 97, + 202, 95, 153, 4, 217, 98, 120, 243, 26, 182, 17, 77, 155, 36, 99, 78, 150, 149, 8, + 98, 128, 39, 33, 36, 192, 172, 45, 220, 149, 189, 61, 96, 28, 215, 100, 246, 58, 221, + 233, 84, 147, 251, 162, 47, 31, 5, 125, 181, 154, 134, 23, 27, 174, 57, 64, 110, 229, + 109, 75, 123, 43, 136, 219, 71, 95, 64, 61, 154, 29, 39, 238, 177, 34, 145, 225, 65, + 150, 94, 247, 49, 229, 15, 77, 147, 72, 141, 2, 45, 251, 77, 169, 38, 213, 132, 110, + 53, 196, 172, 207, 226, 212, 190, 148, 246, 79, 117, 56, 230, 212, 48, 23, 185, 63, 100, + 76, 136, 242, 78, 181, 237, 156, 95, 20, 113, 227, 131, 167, 168, 47, 119, 139, 3, 53, + 31, 250, 133, 149, 50, 107, 105, 99, 130, 34, 162, 231, 111, 42, 217, 190, 224, 199, 90, + 63, 220, 204, 35, 95, 115, 203, 143, 234, 86, 147, 32, 118, 141, 165, 11, 192, 16, 117, + 35, 147, 152, 198, 123, 7, 240, 84, 198, 209, 28, 33, 17, 248, 237, 52, 88, 97, 255, + 231, 76, 86, 122, 109, 204, 8, 18, 216, 201, 35, 77, 237, 183, 229, 179, 50, 237, 164, + 135, 179, 118, 164, 213, 135, 157, 195, 187, 245, 36, 187, 220, 113, 18, 87, 222, 222, 96, + 241, 183, 42, 21, 4, 23, 205, 233, 203, 0, 214, 112, 136, 138, 230, 44, 95, 110, 201, + 34, 41, 191, 71, 229, 155, 185, 247, 243, 151, 214, 84, 137, 141, 126, 159, 146, 149, 108, + 124, 97, 109, 82, 209, 245, 221, 183, 34, 60, 37, 236, 95, 79, 171, 167, 53, 71, 96, + 45, 58, 248, 3, 142, 129, 145, 12, 33, 36, 162, 142, 160, 3, 251, 243, 213, 240, 208, + 141, 19, 13, 178, 255, 109, 2, 170, 20, 55, 241, 116, 101, 44, 108, 105, 186, 238, 251, + 199, 15, 31, 106, 157, 191, 110, 152, 178, 67, 137, 131, 208, 156, 144, 131, 155, 253, 134, + 70, 18, 190, 55, 134, 35, 99, 243, 140, 30, 225, 135, 230, 240, 166, 81, 142, 102, 191, + 39, 25, 3, 177, 156, 211, 77, 45, 87, 233, 43, 221, 48, 61, 155, 103, 195, 191, 203, + 182, 75, 233, 152, 211, 208, 136, 121, 33, 23, 224, 224, 62, 249, 227, 239, 149, 183, 61, + 195, 15, 39, 238, 236, 87, 43, 136, 191, 239, 71, 138, 166, 147, 116, 62, 102, 68, 199, + 224, 101, 223, 193, 70, 29, 186, 42, 13, 80, 225, 75, 19, 241, 115, 1, 221, 202, 45, + 102, 137, 29, 174, 20, 195, 66, 136, 2, 168, 205, 201, 137, 50, 168, 74, 121, 198, 4, + 163, 212, 85, 133, 31, 105, 118, 146, 106, 84, 93, 152, 187, 231, 181, 105, 251, 121, 171, + 132, 123, 84, 81, 69, 221, 132, 238, 40, 253, 181, 45, 161, 137, 130, 39, 169, 235, 158, + 59, 86, 242, 153, 239, 173, 128, 165, 23, 123, 30, 195, 0, 154, 23, 81, 224, 245, 214, + 206, 30, 212, 131, 75, 117, 12, 206, 157, 181, 186, 59, 241, 17, 45, 138, 0, 219, 11, + 165, 243, 135, 196, 182, 135, 95, 205, 217, 63, 195, 175, 14, 225, 131, 145, 45, 249, 158, + 251, 150, 84, 182, 209, 70, 199, 255, 209, 199, 219, 220, 109, 206, 99, 50, 132, 234, 146, + 82, 195, 209, 22, 114, 223, 247, 246, 113, 37, 239, 16, 33, 134, 100, 215, 88, 170, 158, + 87, 123, 102, 50, 88, 211, 1, 187, 6, 134, 165, 152, 216, 105, 106, 239, 220, 74, 231, + 210, 187, 12, 194, 204, 45, 72, 49, 4, 160, 219, 162, 248, 87, 8, 43, 176, 220, 44, + 107, 227, 178, 17, 124, 139, 122, 230, 122, 87, 48, 97, 42, 236, 110, 236, 185, 155, 53, + 234, 159, 214, 198, 66, 206, 30, 75, 249, 206, 40, 38, 57, 11, 217, 74, 136, 100, 197, + 110, 223, 29, 159, 65, 71, 140, 175, 51, 69, 74, 105, 48, 234, 63, 246, 45, 13, 20, + 121, 7, 226, 161, 46, 28, 173, 7, 103, 53, 108, 45, 164, 76, 74, 68, 141, 145, 208, + 61, 197, 22, 136, 46, 70, 115, 110, 60, 161, 124, 81, 26, 132, 51, 188, 178, 79, 106, + 186, 183, 160, 39, 228, 68, 115, 46, 136, 1, 192, 89, 62, 133, 112, 198, 180, 182, 58, + 34, 243, 219, 158, 69, 245, 34, 120, 178, 213, 200, 28, 143, 128, 188, 182, 100, 1, 41, + 146, 137, 43, 82, 227, 105, 216, 83, 48, 140, 10, 106, 175, 254, 70, 77, 67, 59, 112, + 188, 237, 69, 133, 10, 212, 5, 198, 138, 105, 199, 180, 252, 81, 223, 79, 53, 73, 39, + 137, 121, 180, 148, 228, 99, 146, 42, 177, 214, 102, 33, 147, 84, 102, 25, 94, 59, 31, + 37, 197, 137, 237, 122, 133, 63, 90, 213, 116, 163, 253, 253, 29, 177, 145, 2, 21, 36, + 45, 198, 251, 147, 231, 143, 232, 78, 168, 71, 137, 199, 108, 79, 80, 90, 201, 214, 153, + 35, 172, 13, 199, 169, 11, 228, 91, 157, 231, 112, 193, 20, 54, 189, 167, 30, 77, 144, + 108, 245, 215, 246, 189, 68, 69, 14, 158, 14, 228, 55, 50, 145, 69, 249, 58, 80, 222, + 149, 237, 198, 5, 175, 218, 60, 109, 130, 91, 186, 18, 200, 175, 234, 190, 109, 46, 3, + 123, 204, 18, 96, 4, 68, 241, 73, 62, 44, 154, 29, 193, 136, 227, 199, 55, 189, 4, + 164, 64, 95, 95, 82, 39, 15, 60, 230, 124, 107, 233, 248, 55, 251, 89, 60, 63, 75, + 134, 126, 119, 32, 156, 57, 168, 127, 0, 224, 61, 5, 133, 125, 100, 228, 208, 140, 243, + 12, 114, 111, 119, 92, 104, 175, 87, 193, 236, 151, 13, 114, 21, 132, 146, 177, 189, 59, + 49, 190, 27, 110, 195, 160, 236, 40, 132, 188, 181, 120, 201, 40, 232, 65, 132, 80, 241, + 220, 18, 221, 115, 31, 79, 137, 164, 226, 58, 98, 29, 108, 32, 57, 219, 228, 218, 199, + 13, 95, 132, 195, 215, 77, 235, 191, 143, 112, 16, 128, 76, 35, 93, 191, 66, 173, 73, + 231, 143, 132, 73, 173, 240, 106, 231, 203, 78, 193, 147, 92, 33, 23, 31, 248, 100, 11, + 184, 243, 123, 201, 115, 200, 236, 209, 135, 47, 126, 209, 22, 14, 85, 95, 188, 69, 202, + 163, 17, 24, 101, 164, 117, 134, 187, 148, 127, 31, 159, 55, 19, 27, 1, 135, 227, 237, + 89, 107, 28, 216, 60, 51, 230, 145, 147, 163, 215, 93, 70, 232, 118, 172, 140, 235, 50, + 71, 128, 177, 103, 32, 233, 123, 60, 234, 2, 31, 216, 91, 139, 244, 52, 200, 40, 26, + 90, 188, 189, 49, 25, 4, 25, 144, 176, 166, 124, 227, 237, 252, 148, 85, 29, 125, 208, + 89, 104, 210, 121, 64, 46, 4, 53, 99, 204, 93, 125, 38, 25, 59, 88, 51, 64, 113, + 195, 241, 23, 64, 212, 5, 60, 104, 90, 90, 230, 42, 179, 78, 253, 44, 143, 44, 49, + 196, 143, 254, 34, 13, 36, 60, 73, 125, 112, 137, 239, 52, 122, 7, 116, 79, 12, 177, + 183, 103, 11, 158, 146, 190, 237, 143, 235, 124, 188, 28, 65, 76, 26, 100, 89, 63, 160, + 163, 188, 17, 44, 172, 69, 167, 179, 185, 246, 191, 107, 174, 38, 118, 76, 184, 53, 58, + 72, 32, 182, 5, 61, 248, 81, 88, 92, 170, 152, 253, 77, 84, 14, 122, 1, 83, 34, + 180, 13, 25, 115, 120, 199, 154, 238, 20, 83, 36, 79, 155, 68, 5, 160, 130, 254, 242, + 218, 90, 156, 114, 87, 234, 199, 101, 101, 200, 185, 135, 124, 198, 160, 240, 62, 104, 138, + 45, 125, 222, 81, 204, 122, 150, 210, 26, 24, 208, 12, 242, 42, 169, 101, 130, 148, 44, + 232, 249, 245, 161, 128, 113, 103, 33, 98, 166, 137, 236, 212, 7, 202, 38, 211, 69, 188, + 165, 95, 212, 118, 108, 199, 161, 22, 45, 35, 170, 90, 11, 163, 79, 173, 36, 193, 20, + 69, 35, 187, 207, 16, 144, 214, 219, 182, 170, 32, 114, 79, 128, 71, 198, 237, 15, 103, + 4, 60, 139, 175, 150, 151, 82, 230, 68, 119, 168, 89, 188, 204, 20, 140, 220, 165, 98, + 184, 91, 12, 217, 205, 92, 90, 20, 35, 71, 36, 138, 76, 96, 22, 251, 247, 173, 78, + 222, 241, 197, 134, 75, 130, 83, 96, 14, 47, 5, 113, 232, 96, 126, 193, 45, 218, 28, + 66, 253, 99, 103, 136, 176, 200, 158, 171, 191, 76, 249, 158, 62, 190, 37, 137, 65, 120, + 233, 80, 168, 238, 193, 145, 79, 63, 82, 125, 26, 111, 191, 24, 210, 39, 161, 131, 239, + 64, 46, 175, 140, 39, 77, 202, 230, 115, 84, 40, 235, 62, 120, 148, 45, 57, 37, 124, + 121, 120, 249, 148, 231, 185, 172, 186, 224, 77, 61, 207, 141, 107, 126, 26, 147, 204, 229, + 121, 63, 58, 161, 43, 120, 25, 191, 165, 83, 228, 34, 205, 92, 27, 97, 67, 213, 13, + 253, 182, 91, 59, 133, 233, 166, 4, 4, 57, 209, 233, 179, 16, 35, 85, 59, 155, 111, + 250, 65, 194, 223, 99, 144, 59, 127, 241, 127, 85, 255, 125, 11, 90, 184, 145, 68, 95, + 150, 72, 153, 103, 49, 76, 120, 85, 161, 179, 241, 16, 174, 51, 211, 142, 150, 99, 201, + 22, 85, 73, 108, 84, 199, 120, 175, 128, 9, 243, 223, 160, 59, 120, 8, 109, 197, 128, + 194, 103, 52, 180, 119, 227, 231, 75, 113, 126, 175, 59, 148, 4, 132, 1, 89, 75, 121, + 8, 204, 131, 251, 171, 36, 55, 36, 44, 165, 233, 172, 103, 80, 224, 28, 200, 195, 3, + 20, 53, 129, 195, 112, 22, 200, 244, 23, 34, 64, 145, 42, 12, 20, 38, 184, 56, 94, + 220, 101, 3, 198, 17, 107, 22, 242, 135, 222, 182, 138, 243, 235, 11, 182, 91, 34, 127, + 80, 58, 161, 145, 203, 204, 158, 224, 242, 86, 24, 81, 51, 126, 84, 249, 143, 191, 15, + 130, 70, 238, 57, 209, 225, 36, 221, 152, 128, 255, 24, 208, 57, 186, 97, 4, 134, 255, + 229, 121, 86, 254, 202, 137, 124, 31, 130, 12, 222, 146, 142, 37, 129, 199, 247, 98, 236, + 212, 251, 108, 211, 20, 60, 13, 206, 158, 18, 84}; + +SimpleConvolution::SimpleConvolution() { + width_ = 64; + height_ = 64; + mask_width_ = 3; + mask_height_ = mask_width_; + randomize_seed_ = 0; + + if (!IsPowerOf2(width_)) { + width_ = RoundToPowerOf2(width_); + } + + if (!IsPowerOf2(height_)) { + height_ = RoundToPowerOf2(height_); + } + + if (!(mask_width_ % 2)) { + mask_width_++; + } + + if (!(mask_height_ % 2)) { + mask_height_++; + } + + if (width_ * height_ < 256) { + width_ = 64; + height_ = 64; + } + + const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t); + const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float); + + SetSysDescr(KERNARG_DES_ID, sizeof(kernel_args_t)); + SetSysDescr(INPUT_DES_ID, input_size_bytes); + SetSysDescr(OUTPUT_DES_ID, input_size_bytes); + SetLocalDescr(LOCAL_DES_ID, input_size_bytes); + SetSysDescr(MASK_DES_ID, mask_size_bytes); + SetSysDescr(REFOUT_DES_ID, input_size_bytes); + + if (!randomize_seed_) TEST_ASSERT(sizeof(input_data_) <= input_size_bytes); +} + +void SimpleConvolution::Init() { + std::clog << "SimpleConvolution::init :" << std::endl; + + mem_descr_t input_des = GetDescr(INPUT_DES_ID); + mem_descr_t local_des = GetDescr(LOCAL_DES_ID); + mem_descr_t mask_des = GetDescr(MASK_DES_ID); + mem_descr_t refout_des = GetDescr(REFOUT_DES_ID); + mem_descr_t kernarg_des = GetDescr(KERNARG_DES_ID); + + uint32_t* input = (uint32_t*)input_des.ptr; + uint32_t* output_local = (uint32_t*)local_des.ptr; + float* mask = (float*)mask_des.ptr; + kernel_args_t* kernel_args = (kernel_args_t*)kernarg_des.ptr; + + if (randomize_seed_) { + // random initialisation of input + FillRandom(input, width_, height_, 0, 255, randomize_seed_); + } else { + // initialization with preset values + memcpy(input, input_data_, width_ * height_ * sizeof(uint32_t)); + } + + // Fill a blurr filter or some other filter of your choice + const float val = 1.0f / (mask_width_ * 2.0f - 1.0f); + for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) { + mask[i] = 0; + } + for (uint32_t i = 0; i < mask_width_; i++) { + uint32_t y = mask_height_ / 2; + mask[y * mask_width_ + i] = val; + } + for (uint32_t i = 0; i < mask_height_; i++) { + uint32_t x = mask_width_ / 2; + mask[i * mask_width_ + x] = val; + } + + // Print the INPUT array. + std::clog << std::dec; + PrintArray("> Input[0]", input, width_, 1); + PrintArray("> Mask", mask, mask_width_, mask_height_); + + // Fill the kernel args + kernel_args->arg1 = output_local; + kernel_args->arg2 = input; + kernel_args->arg3 = mask; + kernel_args->arg4 = width_; + kernel_args->arg41 = height_; + kernel_args->arg5 = mask_width_; + kernel_args->arg51 = mask_height_; + + // Calculate the reference output + memset(refout_des.ptr, 0, refout_des.size); + ReferenceImplementation(reinterpret_cast(refout_des.ptr), input, mask, width_, height_, + mask_width_, mask_height_); +} + +void SimpleConvolution::PrintOutput() const { + PrintArray("> Output[0]", reinterpret_cast(GetOutputPtr()), width_, 1); +} + +bool SimpleConvolution::ReferenceImplementation(uint32_t* output, const uint32_t* input, + const float* mask, const uint32_t width, + const uint32_t height, const uint32_t mask_width, + const uint32_t mask_height) { + const uint32_t vstep = (mask_width - 1) / 2; + const uint32_t hstep = (mask_height - 1) / 2; + + // for each pixel in the input + for (uint32_t x = 0; x < width; x++) { + for (uint32_t y = 0; y < height; y++) { + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + const uint32_t left = (x < vstep) ? 0 : (x - vstep); + const uint32_t right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + const uint32_t top = (y < hstep) ? 0 : (y - hstep); + const uint32_t bottom = ((y + hstep) >= height) ? height - 1 : (y + hstep); + + // initializing wighted sum value + float sum_fx = 0; + for (uint32_t i = left; i <= right; ++i) { + for (uint32_t j = top; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint32_t mask_idx = (j - (y - hstep)) * mask_width + (i - (x - vstep)); + uint32_t index = j * width + i; + + // to round to the nearest integer + sum_fx += ((float)input[index] * mask[mask_idx]); + } + } + sum_fx += 0.5f; + output[y * width + x] = uint32_t(sum_fx); + } + } + + return true; +} diff --git a/test/simple_convolution/simple_convolution.h b/test/simple_convolution/simple_convolution.h new file mode 100644 index 0000000000..a5b75a6c30 --- /dev/null +++ b/test/simple_convolution/simple_convolution.h @@ -0,0 +1,96 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ +#define TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements SimpleConvolution kernel parameters +class SimpleConvolution : public TestKernel { + public: + // Constructor + SimpleConvolution(); + + // Initialize method + void Init(); + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output + void PrintOutput() const; + + // Return name + std::string Name() const { return std::string("SimpleConvolution"); } + + private: + // Local kernel arguments declaration + struct kernel_args_t { + void* arg1; + void* arg2; + void* arg3; + uint32_t arg4; + uint32_t arg41; + uint32_t arg5; + uint32_t arg51; + }; + + // Reference CPU implementation of Simple Convolution + // @param output Output matrix after performing convolution + // @param input Input matrix on which convolution is to be performed + // @param mask mask matrix using which convolution was to be performed + // @param input_dimensions dimensions of the input matrix + // @param mask_dimensions dimensions of the mask matrix + // @return bool true on success and false on failure + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight); + + // Width of the Input array + uint32_t width_; + + // Height of the Input array + uint32_t height_; + + // Mask dimensions + uint32_t mask_width_; + + // Mask dimensions + uint32_t mask_height_; + + // Randomize input data + unsigned randomize_seed_; + + // Input data + static const uint32_t input_data_[]; +}; + +#endif // TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ diff --git a/test/simple_convolution/simple_convolution.hsail b/test/simple_convolution/simple_convolution.hsail new file mode 100644 index 0000000000..223ef8eddb --- /dev/null +++ b/test/simple_convolution/simple_convolution.hsail @@ -0,0 +1,154 @@ +module &m:1:0:$full:$large:$default; +extension "amd:gcn"; +extension "IMAGE"; + +decl prog function &abort()(); + +prog kernel &__OpenCL_SimpleConvolution(kernarg_u64 %__global_offset_0, + kernarg_u64 %output, + kernarg_u64 %input, + kernarg_u64 %mask, + kernarg_u32 %inputDimensions[2], + kernarg_u32 %maskDimensions[2]) { + + pragma "AMD RTI", "ARGSTART:__OpenCL_SimpleConvolution"; + pragma "AMD RTI", "version:3:1:104"; + pragma "AMD RTI", "device:generic"; + pragma "AMD RTI", "uniqueid:1024"; + pragma "AMD RTI", "memory:private:0"; + pragma "AMD RTI", "memory:region:0"; + pragma "AMD RTI", "memory:local:0"; + pragma "AMD RTI", "value:__global_offset_0:u64:1:1:0"; + pragma "AMD RTI", "pointer:output:u32:1:1:96:uav:7:4:RW:0:0:0"; + pragma "AMD RTI", "pointer:input:u32:1:1:112:uav:7:4:RW:0:0:0"; + pragma "AMD RTI", "pointer:mask:float:1:1:128:uav:7:4:RW:0:0:0"; + pragma "AMD RTI", "value:inputDimensions:u32:2:1:144"; + pragma "AMD RTI", "constarg:4:inputDimensions"; + pragma "AMD RTI", "value:maskDimensions:u32:2:1:160"; + pragma "AMD RTI", "constarg:5:maskDimensions"; + pragma "AMD RTI", "function:1:0"; + pragma "AMD RTI", "memory:64bitABI"; + pragma "AMD RTI", "privateid:8"; + pragma "AMD RTI", "enqueue_kernel:0"; + pragma "AMD RTI", "kernel_index:0"; + pragma "AMD RTI", "reflection:0:size_t"; + pragma "AMD RTI", "reflection:1:uint*"; + pragma "AMD RTI", "reflection:2:uint*"; + pragma "AMD RTI", "reflection:3:float*"; + pragma "AMD RTI", "reflection:4:uint2"; + pragma "AMD RTI", "reflection:5:uint2"; + pragma "AMD RTI", "ARGEND:__OpenCL_SimpleConvolution"; + + @__OpenCL_SimpleConvolution_Entry: + + // BB#0: // %entry + + workitemabsid_u32 $s6, 0; + cvt_u64_u32 $d0, $s6; + ld_kernarg_align(8)_width(all)_u64 $d4, [%__global_offset_0]; + add_u64 $d0, $d0, $d4; + cvt_u32_u64 $s5, $d0; + ld_v2_kernarg_align(4)_width(all)_u32 ($s0, $s4), [%inputDimensions]; + ld_v2_kernarg_align(4)_width(all)_u32 ($s1, $s9), [%maskDimensions]; + rem_u32 $s7, $s5, $s0; + add_u32 $s2, $s1, 4294967295; + shr_u32 $s8, $s2, 1; + add_u32 $s2, $s7, $s8; + add_u32 $s3, $s0, 4294967295; + cmp_ge_b1_u32 $c0, $s2, $s0; + cmov_b32 $s2, $c0, $s3, $s2; + sub_u32 $s3, $s7, $s8; + cmp_lt_b1_u32 $c0, $s7, $s8; + cmov_b32 $s3, $c0, 0, $s3; + ld_kernarg_align(8)_width(all)_u64 $d1, [%output]; + cmp_le_b1_u32 $c0, $s3, $s2; + cbr_b1 $c0, @BB0_2; + + // BB#1: + + mov_b32 $s6, 0; + br @BB0_6; + + // @BB0_2: // %for.cond32.preheader.lr.ph + + @BB0_2: + + div_u32 $s5, $s5, $s0; + add_u32 $s9, $s9, 4294967295; + shr_u32 $s9, $s9, 1; + add_u32 $s10, $s5, $s9; + add_u32 $s11, $s4, 4294967295; + cmp_ge_b1_u32 $c0, $s10, $s4; + cmov_b32 $s4, $c0, $s11, $s10; + sub_u32 $s10, $s5, $s9; + cmp_lt_b1_u32 $c0, $s5, $s9; + cmov_b32 $s5, $c0, 0, $s10; + ld_kernarg_align(8)_width(all)_u64 $d2, [%mask]; + ld_kernarg_align(8)_width(all)_u64 $d3, [%input]; + cvt_u64_u32 $d5, $s6; + add_u64 $d4, $d4, $d5; + cvt_u32_u64 $s6, $d4; + div_u32 $s6, $s6, $s0; + max_u32 $s10, $s9, $s6; + sub_u32 $s12, $s10, $s6; + max_u32 $s11, $s7, $s8; + mov_b32 $s6, 0; + mad_u32 $s12, $s1, $s12, $s11; + sub_u32 $s7, $s12, $s7; + sub_u32 $s9, $s10, $s9; + mad_u32 $s9, $s0, $s9, $s11; + sub_u32 $s8, $s9, $s8; + + // @BB0_3: // %for.cond32.preheader + + @BB0_3: + + cmp_gt_b1_u32 $c0, $s5, $s4; + mov_b32 $s9, $s7; + mov_b32 $s10, $s8; + mov_b32 $s11, $s5; + cbr_b1 $c0, @BB0_5; + + // @BB0_4: // %for.body35 + + @BB0_4: + + cvt_u64_u32 $d4, $s9; + shl_u64 $d4, $d4, 2; + add_u64 $d4, $d2, $d4; + ld_global_align(4)_f32 $s12, [$d4]; + cvt_u64_u32 $d4, $s10; + shl_u64 $d4, $d4, 2; + add_u64 $d4, $d3, $d4; + ld_global_align(4)_u32 $s13, [$d4]; + cvt_f32_u32 $s13, $s13; + mul_ftz_f32 $s12, $s13, $s12; + add_u32 $s9, $s9, $s1; + add_u32 $s10, $s10, $s0; + add_u32 $s11, $s11, 1; + add_ftz_f32 $s6, $s6, $s12; + cmp_le_b1_u32 $c0, $s11, $s4; + cbr_b1 $c0, @BB0_4; + + // @BB0_5: // %for.inc48 + + @BB0_5: + + add_u32 $s7, $s7, 1; + add_u32 $s8, $s8, 1; + add_u32 $s3, $s3, 1; + cmp_le_b1_u32 $c0, $s3, $s2; + cbr_b1 $c0, @BB0_3; + + // @BB0_6: // %for.end50 + + @BB0_6: + + and_b64 $d0, $d0, 4294967295; + shl_u64 $d0, $d0, 2; + add_u64 $d0, $d1, $d0; + add_ftz_f32 $s0, $s6, 0F3f000000; + cvt_ftz_u32_f32 $s0, $s0; + st_global_align(4)_u32 $s0, [$d0]; + ret; +}; diff --git a/test/util/helper_funcs.h b/test/util/helper_funcs.h new file mode 100644 index 0000000000..758e3d874f --- /dev/null +++ b/test/util/helper_funcs.h @@ -0,0 +1,87 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +• Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +• Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ +#ifndef TEST_UTIL_HELPER_FUNCS_H_ +#define TEST_UTIL_HELPER_FUNCS_H_ + +#include +#include +#include +#include +#include + +static inline void Error(std::string error_msg) { + std::cerr << "Error: " << error_msg << std::endl; +} + +template +void PrintArray(const std::string header, const T* data, const int width, const int height) { + std::clog << header << " :\n"; + for (int i = 0; i < height; i++) { + std::clog << "> "; + for (int j = 0; j < width; j++) { + std::clog << data[i * width + j] << " "; + } + std::clog << "\n"; + } +} + +template +bool FillRandom(T* array_ptr, const int width, const int height, const T range_min, + const T range_max, unsigned int seed = 123) { + if (!array_ptr) { + Error("Cannot fill array. NULL pointer."); + return false; + } + + if (!seed) seed = (unsigned int)time(NULL); + + srand(seed); + double range = double(range_max - range_min) + 1.0; + + /* random initialisation of input */ + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int index = i * width + j; + array_ptr[index] = range_min + T(range * rand() / (RAND_MAX + 1.0)); + } + + return true; +} + +template T RoundToPowerOf2(T val) { + int bytes = sizeof(T); + + val--; + for (int i = 0; i < bytes; i++) val |= val >> (1 << i); + val++; + + return val; +} + +template bool IsPowerOf2(T val) { + long long long_val = val; + return (((long_val & (-long_val)) - long_val == 0) && (long_val != 0)); +} + +#endif // TEST_UTIL_HELPER_FUNCS_H_ diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp new file mode 100644 index 0000000000..139b618de9 --- /dev/null +++ b/test/util/hsa_rsrc_factory.cpp @@ -0,0 +1,372 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include "util/hsa_rsrc_factory.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// Callback function to find and bind kernarg region of an agent +static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) { + hsa_region_global_flag_t flags; + hsa_region_segment_t segment_id; + + hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id); + if (segment_id != HSA_REGION_SEGMENT_GLOBAL) { + return HSA_STATUS_SUCCESS; + } + + AgentInfo* agent_info = (AgentInfo*)data; + hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) { + agent_info->coarse_region = region; + } + + if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) { + agent_info->kernarg_region = region; + } + + return HSA_STATUS_SUCCESS; +} + +// Callback function to get the number of agents +static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + // Copy handle of agent and increment number of agents reported + HsaRsrcFactory* rsrcFactory = reinterpret_cast(data); + + // Determine if device is a Gpu agent + hsa_status_t status; + hsa_device_type_t type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + CHECK_STATUS("Error Calling hsa_agent_get_info", status); + if (type == HSA_DEVICE_TYPE_DSP) { + return HSA_STATUS_SUCCESS; + } + + if (type == HSA_DEVICE_TYPE_CPU) { + AgentInfo* agent_info = reinterpret_cast(malloc(sizeof(AgentInfo))); + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + rsrcFactory->AddAgentInfo(agent_info, false); + return HSA_STATUS_SUCCESS; + } + + // Device is a Gpu agent, build an instance of AgentInfo + AgentInfo* agent_info = reinterpret_cast(malloc(sizeof(AgentInfo))); + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + agent_info->max_wave_size = 0; + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + agent_info->max_queue_size = 0; + hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + agent_info->profile = hsa_profile_t(108); + hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + + // Initialize memory regions to zero + agent_info->kernarg_region.handle = 0; + agent_info->coarse_region.handle = 0; + + // Find and Bind Memory regions of the Gpu agent + hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info); + + // Save the instance of AgentInfo + rsrcFactory->AddAgentInfo(agent_info, true); + + return HSA_STATUS_SUCCESS; +} + +// Constructor of the class +HsaRsrcFactory::HsaRsrcFactory() { + // Initialize the Hsa Runtime + hsa_status_t status = hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + + // Discover the set of Gpu devices available on the platform + status = hsa_iterate_agents(GetHsaAgentsCallback, this); + CHECK_STATUS("Error Calling hsa_iterate_agents", status); +} + +// Destructor of the class +HsaRsrcFactory::~HsaRsrcFactory() { + hsa_status_t status = hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); +} + +// Get the count of Hsa Gpu Agents available on the platform +// +// @return uint32_t Number of Gpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } + +// Get the count of Hsa Cpu Agents available on the platform +// +// @return uint32_t Number of Cpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } + +// Get the AgentInfo handle of a Gpu device +// +// @param idx Gpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(gpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = gpu_list_[idx]; + return true; +} + +// Get the AgentInfo handle of a Cpu device +// +// @param idx Cpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(cpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = cpu_list_[idx]; + return true; +} + +// Create a Queue object and return its handle. The queue object is expected +// to support user requested number of Aql dispatch packets. +// +// @param agent_info Gpu Agent on which to create a queue object +// +// @param num_Pkts Number of packets to be held by queue +// +// @param queue Output parameter updated with handle of queue object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { + hsa_status_t status; + status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, queue); + return (status == HSA_STATUS_SUCCESS); +} + +// Create a Signal object and return its handle. +// +// @param value Initial value of signal object +// +// @param signal Output parameter updated with handle of signal object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { + hsa_status_t status; + status = hsa_signal_create(value, 0, NULL, signal); + return (status == HSA_STATUS_SUCCESS); +} + +// Allocate memory for use by a kernel of specified size in specified +// agent's memory region. Currently supports Global segment whose Kernarg +// flag set. +// +// @param agent_info Agent from whose memory region to allocate +// +// @param size Size of memory in terms of bytes +// +// @return uint8_t* Pointer to buffer, null if allocation fails. +// +uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + + if (agent_info->coarse_region.handle != 0) { + // Allocate in local memory if it is available + status = hsa_memory_allocate(agent_info->coarse_region, size, (void**)&buffer); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_memory_assign_agent(buffer, agent_info->dev_id, HSA_ACCESS_PERMISSION_RW); + } + } else { + // Allocate in system memory if local memory is not available + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + } + + return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; +} + +// Allocate memory tp pass kernel parameters. +// +// @param agent_info Agent from whose memory region to allocate +// +// @param size Size of memory in terms of bytes +// +// @return uint8_t* Pointer to buffer, null if allocation fails. +// +uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + + uint8_t* buffer = NULL; + status = hsa_memory_allocate(agent_info->kernarg_region, size, (void**)&buffer); + return (status == HSA_STATUS_SUCCESS) ? buffer : NULL; +} + +// Transfer data method +bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t length, + bool host_to_dev) { + hsa_status_t status; + status = hsa_memory_copy(dest_buff, src_buff, length); + return (status == HSA_STATUS_SUCCESS); +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// +// @param agent_info Gpu device for which to finalize +// +// @param brig_path File path of the Assembled Brig file +// +// @param kernel_name Name of the kernel to finalize +// +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, + char* kernel_name, hsa_executable_symbol_t* code_desc) { + // Finalize the Hsail object into code object + hsa_status_t status; + hsa_code_object_t code_object; + + // Build the code object filename + std::string filename(brig_path); + std::clog << "Code object filename: " << filename << std::endl; + + // Open the file containing code object + std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate); + if (!codeStream) { + std::cerr << "Error: failed to load " << filename << std::endl; + assert(false); + return false; + } + + // Allocate memory to read in code object from file + size_t size = std::string::size_type(codeStream.tellg()); + char* codeBuff = (char*)AllocateSysMemory(agent_info, size); + if (!codeBuff) { + std::cerr << "Error: failed to allocate memory for code object." << std::endl; + assert(false); + return false; + } + + // Read the code object into allocated memory + codeStream.seekg(0, std::ios::beg); + std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), codeBuff); + + // De-Serialize the code object that has been read into memory + status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object); + if (status != HSA_STATUS_SUCCESS) { + std::cerr << "Failed to deserialize code object" << std::endl; + return false; + } + + // Create executable. + hsa_executable_t hsaExecutable; + status = + hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable); + CHECK_STATUS("Error in creating executable object", status); + + // Load code object. + status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, ""); + CHECK_STATUS("Error in loading executable object", status); + + // Freeze executable. + status = hsa_executable_freeze(hsaExecutable, ""); + CHECK_STATUS("Error in freezing executable object", status); + + // Get symbol handle. + hsa_executable_symbol_t kernelSymbol; + status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + CHECK_STATUS("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Add an instance of AgentInfo representing a Hsa Gpu agent +void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) { + // Add input to Gpu list + if (gpu) { + gpu_list_.push_back(agent_info); + return; + } + + // Add input to Cpu list + cpu_list_.push_back(agent_info); +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::clog << header << " :" << std::endl; + + AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::clog << "> agent[" << idx << "] :" << std::endl; + std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; + } + return true; +} + +HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h new file mode 100644 index 0000000000..88235e62bf --- /dev/null +++ b/test/util/hsa_rsrc_factory.h @@ -0,0 +1,234 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ +#define TEST_UTIL_HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + exit(1); \ + } + +static const unsigned MEM_PAGE_BYTES = 0x1000; +static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +typedef struct { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // Memory region supporting kernel parameters + hsa_region_t coarse_region; + + // Memory region supporting kernel arguments + hsa_region_t kernarg_region; + +} AgentInfo; + +class HsaRsrcFactory { + public: + typedef std::recursive_mutex mutex_t; + + static HsaRsrcFactory* Create() { + std::lock_guard lck(mutex_); + if (HsaRsrcFactory::instance_ == NULL) { + HsaRsrcFactory::instance_ = new HsaRsrcFactory(); + } + return instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_; + instance_ = NULL; + } + + static HsaRsrcFactory& Instance() { + hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance()", status); + return *instance_; + } + + // Get the count of Hsa Gpu Agents available on the platform + // + // @return uint32_t Number of Gpu agents on platform + // + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // + // @return uint32_t Number of Cpu agents on platform + // + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // + // @param idx Gpu Agent at specified index + // + // @param agent_info Output parameter updated with AgentInfo + // + // @return bool true if successful, false otherwise + // + bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // + // @param idx Cpu Agent at specified index + // + // @param agent_info Output parameter updated with AgentInfo + // + // @return bool true if successful, false otherwise + // + bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // + // @param agent_info Gpu Agent on which to create a queue object + // + // @param num_Pkts Number of packets to be held by queue + // + // @param queue Output parameter updated with handle of queue object + // + // @return bool true if successful, false otherwise + // + bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. + // + // @param value Initial value of signal object + // + // @param signal Output parameter updated with handle of signal object + // + // @return bool true if successful, false otherwise + // + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate memory for use by a kernel of specified size in specified + // agent's memory region. Currently supports Global segment whose Kernarg + // flag set. + // + // @param agent_info Agent from whose memory region to allocate + // + // @param size Size of memory in terms of bytes + // + // @return uint8_t* Pointer to buffer, null if allocation fails. + // + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters. + // + // @param agent_info Agent from whose memory region to allocate + // + // @param size Size of memory in terms of bytes + // + // @return uint8_t* Pointer to buffer, null if allocation fails. + // + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Transfer data method + bool TransferData(void* dest_buff, void* src_buff, uint32_t length, bool host_to_dev); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // + // @param agent_info Gpu device for which to finalize + // + // @param brig_path File path of the Assembled Brig file + // + // @param kernel_name Name of the kernel to finalize + // + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // + // @return bool true if successful, false otherwise + // + bool LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, char* kernel_name, + hsa_executable_symbol_t* code_desc); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + void AddAgentInfo(AgentInfo* agent_info, bool gpu); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + private: + // Constructor of the class. Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + HsaRsrcFactory(); + + // Destructor of the class + ~HsaRsrcFactory(); + + static HsaRsrcFactory* instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Queue handles + std::vector queue_list_; + + // Used to maintain a list of Hsa Signal handles + std::vector signal_list_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; +}; + +#endif // TEST_UTIL_HSA_RSRC_FACTORY_H_ diff --git a/test/util/perf_timer.cpp b/test/util/perf_timer.cpp new file mode 100644 index 0000000000..591e4d5801 --- /dev/null +++ b/test/util/perf_timer.cpp @@ -0,0 +1,181 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include "util/perf_timer.h" + +PerfTimer::PerfTimer() { freq_in_100mhz_ = MeasureTSCFreqHz(); } + +PerfTimer::~PerfTimer() { + while (!timers_.empty()) { + Timer* temp = timers_.back(); + timers_.pop_back(); + delete temp; + } +} + +// New cretaed timer instantance index will be returned +int PerfTimer::CreateTimer() { + Timer* newTimer = new Timer; + newTimer->start = 0; + newTimer->clocks = 0; + +#ifdef _WIN32 + QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->freq); +#else + newTimer->freq = (long long)1.0E3; +#endif + + /* Push back the address of new Timer instance created */ + timers_.push_back(newTimer); + return (int)(timers_.size() - 1); +} + +int PerfTimer::StartTimer(int index) { + if (index >= (int)timers_.size()) { + Error("Cannot reset timer. Invalid handle."); + return FAILURE; + } + +#ifdef _WIN32 +// General Windows timing method +#ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart)); + timers_[index]->start = (double)tmpStart; +#else +// AMD Windows timing method +#endif +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + timers_[index]->start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3)); +#else + // AMD timing method + unsigned int unused; + timers_[index]->start = __rdtscp(&unused); +#endif +#endif + + return SUCCESS; +} + + +int PerfTimer::StopTimer(int index) { + double n = 0; + if (index >= (int)timers_.size()) { + Error("Cannot reset timer. Invalid handle."); + return FAILURE; + } +#ifdef _WIN32 +#ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER*)&(n1)); + n = (double)n1; +#else +// AMD Window Timing +#endif + +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3); +#else + // AMD Linux timing + unsigned int unused; + n = __rdtscp(&unused); +#endif +#endif + + n -= timers_[index]->start; + timers_[index]->start = 0; + +#ifndef _AMD + timers_[index]->clocks += n; +#else + // timers_[index]->clocks += 10 * n / freq_in_100mhz_; // unit is ns + timers_[index]->clocks += 1.0E-6 * 10 * n / freq_in_100mhz_; // convert to ms +#endif + + return SUCCESS; +} + +void PerfTimer::Error(std::string str) { std::cout << str << std::endl; } + + +double PerfTimer::ReadTimer(int index) { + if (index >= (int)timers_.size()) { + Error("Cannot read timer. Invalid handle."); + return FAILURE; + } + + double reading = double(timers_[index]->clocks); + + reading = double(reading / timers_[index]->freq); + + return reading; +} + + +uint64_t PerfTimer::CoarseTimestampUs() { +#ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER*)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; +#endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() { + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do { + tscTicksEnd = __rdtscp(&unused); + } while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. + uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} diff --git a/test/util/perf_timer.h b/test/util/perf_timer.h new file mode 100644 index 0000000000..5c77246c1c --- /dev/null +++ b/test/util/perf_timer.h @@ -0,0 +1,85 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#ifndef TEST_UTIL_PERF_TIMER_H_ +#define TEST_UTIL_PERF_TIMER_H_ + +// Will use AMD timer or general Linux timer based on compilation flag +// Need to consider platform is Windows or Linux + +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#include +#else +#if defined(__GNUC__) +#include +#include +#endif // __GNUC__ +#endif // _MSC_VER + +#include +#include +#include + +class PerfTimer { + public: + enum { SUCCESS = 0, FAILURE = 1 }; + + PerfTimer(); + ~PerfTimer(); + + // General Linux timing method + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + + // retrieve time + double ReadTimer(int index); + // write into a file + double WriteTimer(int index); + + private: + struct Timer { + std::string name; /* name of time object */ + long long freq; /* frequency */ + double clocks; /* number of ticks at end */ + double start; /* start point ticks */ + }; + + std::vector timers_; /* vector to Timer objects */ + double freq_in_100mhz_; + + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + void Error(std::string str); +}; + +#endif // TEST_UTIL_PERF_TIMER_H_ diff --git a/test/util/test_assert.h b/test/util/test_assert.h new file mode 100644 index 0000000000..101dc03434 --- /dev/null +++ b/test/util/test_assert.h @@ -0,0 +1,52 @@ +/****************************************************************************** + +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_ASSERT_H_ +#define TEST_CTRL_TEST_ASSERT_H_ + +#define TEST_ASSERT(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Assert failed(" << #cond << ") at " << __FILE__ << ", line " << __LINE__ \ + << std::endl; \ + exit(-1); \ + } \ + } + +#define TEST_STATUS(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Test error at " << __FILE__ << ", line " << __LINE__ \ + << std::endl; \ + const char* message; \ + rocprofiler_error_string(&message); \ + std::cerr << "ERROR: " << message << std::endl; \ + exit(-1); \ + } \ + } + +#endif // TEST_CTRL_TEST_ASSERT_H_ diff --git a/test/util/xml.h b/test/util/xml.h new file mode 100644 index 0000000000..ef84fb4e36 --- /dev/null +++ b/test/util/xml.h @@ -0,0 +1,221 @@ +#ifndef TEST_UTIL_XML_H_ +#define TEST_UTIL_XML_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace xml { + +class Xml { + public: + typedef std::vector token_t; + struct level_t { + std::string tag; + std::vector nodes; + std::map opts; + }; + typedef std::vector nodes_vec_t; + + enum { + DECL_STATE, + BODY_STATE + }; + + Xml(const char* file_name) : + file_name_(file_name), + file_line_(0), + data_size_(0), + index_(0), + state_(BODY_STATE), + level_(NULL), + comment_(false) + { + AddLevel("top"); + + fd_ = open(file_name, O_RDONLY); + if (fd_ == -1) { + std::cout << "XML file not found: " << file_name << std::endl; + return; + } + + token_t remainder; + while (1) { + token_t token = (remainder.size()) ? remainder : NextToken(); + remainder.clear(); + // End of file + if (token.size() == 0) break; + +// token_t token1 = token; +// token1.push_back('\0'); +// std::cout << "> " << &token1[0] << std::endl; + + switch(state_) { + case BODY_STATE: + if (token[0] == '<') { + bool node_begin = true; + unsigned ind = 1; + if (token[1] == '/') { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while (i < token.size()) { if (token[i] == '>') break; ++i; } + for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]); + + if (i == token.size()) { + if (node_begin) state_ = DECL_STATE; + else BadFormat(token); + token.push_back('\0'); + } else token[i] = '\0'; + + const char* tag = strdup(&token[ind]); + if (node_begin) { + AddLevel(tag); + } else { + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag))) { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } else BadFormat(token); + break; + case DECL_STATE: + if (token[0] == '>') { + state_ = BODY_STATE; + for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]); + continue; + } else { + token.push_back('\0'); + unsigned j = 0; + for (j = 0; j < token.size(); ++j) if (token[j] == '=') break; + if (j == token.size()) BadFormat(token); + token[j] = '\0'; + const char* key = &token[0]; + const char* value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "Wrong state: " << state_ << std::endl; + exit(1); + } + } + } + + std::vector GetNodes(std::string global_tag) { + return map_[global_tag]; + } + + void Print() const { + for(auto& elem : map_) { + for (auto node : elem.second) { + if (node->opts.size()) { + std::cout << elem.first << ":" << std::endl; + for (auto& opt : node->opts) { + std::cout << " " << opt.first << " = " << opt.second << std::endl; + } + } + } + } + } + + private: + bool LineEndCheck() { + bool found = false; + if (buffer_[index_] == '\n') { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } else if (comment_ || (buffer_[index_] == '#')) { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() { + token_t token; + + while (1) { + if (data_size_ == 0) { + data_size_ = read(fd_, buffer_, buf_size_); + if (data_size_ <= 0) break; + } + if (token.empty()) while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) { + ++index_; + } + while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) { + token.push_back(buffer_[index_++]); + } + if (index_ == data_size_) { + index_ = 0; + data_size_ = 0; + } else break; + } + + return token; + } + + void BadFormat(token_t token) { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" << &token[0] << "'" << std::endl; + exit(1); + } + + void AddLevel(const std::string& tag) { + level_t* level = new level_t; + level->tag = tag; + if (level_) { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag; + for (level_t* level : stack_) { global_tag += level->tag + "."; } + global_tag += tag; + map_[global_tag].push_back(level_); + } + + void UpLevel() { + level_ = stack_.back(); + stack_.pop_back(); + } + + std::string CurrentLevel() const { + return level_->tag; + } + + void AddOption(const std::string& key, const std::string& value) { + level_->opts[key] = value; + } + + const char* file_name_; + unsigned file_line_; + int fd_; + static const unsigned buf_size_ = 256; + char buffer_[buf_size_]; + unsigned data_size_; + unsigned index_; + unsigned state_; + level_t* level_; + std::vector stack_; + std::map map_; + bool comment_; +}; + +} // namespace xml + +#endif // TEST_UTIL_XML_H_