Initial commit
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
################################################################################
|
||||
##
|
||||
## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
##
|
||||
## MIT LICENSE:
|
||||
## Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
## this software and associated documentation files (the "Software"), to deal in
|
||||
## the Software without restriction, including without limitation the rights to
|
||||
## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
## of the Software, and to permit persons to whom the Software is furnished to do
|
||||
## so, subject to the following conditions:
|
||||
##
|
||||
## The above copyright notice and this permission notice shall be included in all
|
||||
## copies or substantial portions of the Software.
|
||||
##
|
||||
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
## SOFTWARE.
|
||||
##
|
||||
################################################################################
|
||||
|
||||
cmake_minimum_required ( VERSION 3.5.0 )
|
||||
|
||||
## Verbose output.
|
||||
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
|
||||
|
||||
## Set module name and project name.
|
||||
set ( ROCTRACER_NAME "roctracer" )
|
||||
set ( ROCTRACER_TARGET "${ROCTRACER_NAME}64" )
|
||||
set ( ROCTRACER_LIBRARY "lib${ROCTRACER_TARGET}" )
|
||||
project ( ${ROCTRACER_TARGET} )
|
||||
|
||||
## Adding default path cmake modules
|
||||
list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" )
|
||||
## Include common cmake modules
|
||||
include ( utils )
|
||||
## Set build environment
|
||||
include ( env )
|
||||
|
||||
## Setup the package version.
|
||||
get_version ( "1.0.0" )
|
||||
message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" )
|
||||
|
||||
set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} )
|
||||
set ( BUILD_VERSION_MINOR ${VERSION_MINOR} )
|
||||
set ( BUILD_VERSION_PATCH ${VERSION_PATCH} )
|
||||
set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
|
||||
## The expansion is quoted so that a defined-but-empty VERSION_BUILD is compared
## as an empty string instead of vanishing from the if() expression.
if ( DEFINED VERSION_BUILD AND NOT "${VERSION_BUILD}" STREQUAL "" )
|
||||
message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" )
|
||||
set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" )
|
||||
endif ()
|
||||
set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
|
||||
|
||||
## Set target and root/lib/test directory
|
||||
set ( TARGET_NAME "${ROCTRACER_TARGET}" )
|
||||
set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" )
|
||||
set ( LIB_DIR "${ROOT_DIR}/src" )
|
||||
#set ( TEST_DIR "${ROOT_DIR}/test" )
|
||||
|
||||
## Build library
|
||||
include ( ${LIB_DIR}/CMakeLists.txt )
|
||||
|
||||
## Set the VERSION and SOVERSION values
|
||||
set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION "${LIB_VERSION_STRING}" )
|
||||
set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" )
|
||||
|
||||
## If the library is a release, strip the target library
|
||||
## The library file is named through $<TARGET_FILE:...> so that only the target
## built here is stripped and the command does not rely on shell globbing.
if ( "${CMAKE_BUILD_TYPE}" STREQUAL "release" )
  add_custom_command ( TARGET ${ROCTRACER_TARGET} POST_BUILD
                       COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${ROCTRACER_TARGET}>
                       VERBATIM )
endif ()
|
||||
|
||||
## Build tests
|
||||
#add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test )
|
||||
|
||||
## Install information
|
||||
install ( TARGETS ${ROCTRACER_TARGET} LIBRARY DESTINATION ${ROCTRACER_NAME}/lib )
|
||||
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/roctracer.h DESTINATION ${ROCTRACER_NAME}/include )
|
||||
|
||||
## Packaging directives
|
||||
set ( CPACK_PACKAGE_NAME "${ROCTRACER_NAME}-dev" )
|
||||
set ( CPACK_PACKAGE_VENDOR "AMD" )
|
||||
set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} )
|
||||
set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} )
|
||||
set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} )
|
||||
set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." )
|
||||
set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCTRACER library for AMD HSA runtime API extension support" )
|
||||
set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" )
|
||||
|
||||
## Debian package specific variables
|
||||
set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" )
|
||||
set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" )
|
||||
set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" )
|
||||
|
||||
## RPM package specific variables
|
||||
## The RPM generator takes its dependency list from CPACK_RPM_PACKAGE_REQUIRES.
set ( CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev" )
|
||||
## rpm_post is registered as the post-install scriptlet, mirroring the
## post-uninstall scriptlet below.
## NOTE(review): presumably rpm_post runs ldconfig for the installed library - confirm.
set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
|
||||
set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" )
|
||||
|
||||
include ( CPack )
|
||||
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
do_ldconfig() {
|
||||
echo /opt/rocm/libroctracer/lib > /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
configure)
|
||||
do_ldconfig
|
||||
;;
|
||||
abort-upgrade|abort-remove|abort-deconfigure)
|
||||
echo "$1"
|
||||
;;
|
||||
*)
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
rm_ldconfig() {
|
||||
rm -f /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
remove)
|
||||
rm_ldconfig
|
||||
;;
|
||||
purge)
|
||||
;;
|
||||
*)
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2017 ROCm Core Technology
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1 @@
|
||||
echo /opt/rocm/libroctracer/lib > /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
|
||||
@@ -0,0 +1 @@
|
||||
rm -f /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
|
||||
@@ -0,0 +1,9 @@
|
||||
ROC Profiler/Traces library.
|
||||
Callback and Activity APIs
|
||||
|
||||
The library source tree:
|
||||
- doc - Documentation
|
||||
- inc/roctracer.h - Library public API
|
||||
- src - Library sources
|
||||
- core - Library API sources
|
||||
- util - Library utils sources
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Google
|
||||
AccessModifierOffset: -1
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
AlignEscapedNewlinesLeft: false
|
||||
AlignTrailingComments: true
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignOperands: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortIfStatementsOnASingleLine: true
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AlwaysBreakAfterDefinitionReturnType: false
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
BreakBeforeBinaryOperators: false
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BinPackParameters: true
|
||||
ColumnLimit: 100
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
IndentCaseLabels: true
|
||||
IndentWrappedFunctionNames: false
|
||||
IndentFunctionDeclarationAfterType: false
|
||||
MaxEmptyLinesToKeep: 2
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
NamespaceIndentation: None
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: false
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
DerivePointerAlignment: false
|
||||
PointerAlignment: Left
|
||||
SpacesBeforeTrailingComments: 2
|
||||
Cpp11BracedListStyle: true
|
||||
Standard: Auto
|
||||
IndentWidth: 2
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
BreakBeforeBraces: Attach
|
||||
SpacesInParentheses: false
|
||||
SpacesInAngles: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
ContinuationIndentWidth: 4
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
|
||||
SpaceBeforeParens: ControlStatements
|
||||
DisableFormat: false
|
||||
SortIncludes: false
|
||||
...
|
||||
@@ -0,0 +1,107 @@
|
||||
## Build is not supported on the Windows platform
|
||||
if ( WIN32 )
|
||||
message ( FATAL_ERROR "Windows build is not supported." )
|
||||
endif ()
|
||||
|
||||
## Compiler Preprocessor definitions.
|
||||
add_definitions ( -D__linux__ )
|
||||
add_definitions ( -DUNIX_OS )
|
||||
add_definitions ( -DLINUX )
|
||||
add_definitions ( -D__AMD64__ )
|
||||
add_definitions ( -D__x86_64__ )
|
||||
add_definitions ( -DAMD_INTERNAL_BUILD )
|
||||
add_definitions ( -DLITTLEENDIAN_CPU=1 )
|
||||
add_definitions ( -DHSA_LARGE_MODEL= )
|
||||
add_definitions ( -DHSA_DEPRECATED= )
|
||||
add_definitions ( -D__HIP_PLATFORM_HCC__ )
|
||||
|
||||
## Linux Compiler options
|
||||
## Append to the existing value so that flags supplied by the user or through
## the environment are kept instead of being overwritten.
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" )
|
||||
|
||||
set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" )
|
||||
|
||||
set ( CMAKE_SKIP_BUILD_RPATH TRUE )
|
||||
|
||||
## CLANG options
|
||||
## Use the compiler CMake actually detected rather than comparing the CXX
## environment variable with one hard-coded installation path.
## NOTE(review): requires project() to have enabled CXX before this file is included.
if ( "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" )
|
||||
endif()
|
||||
|
||||
## Enable debug trace
|
||||
if ( DEFINED ENV{CMAKE_DEBUG_TRACE} )
|
||||
add_definitions ( -DDEBUG_TRACE=1 )
|
||||
endif()
|
||||
|
||||
## Enable direct loading of AQL-profile HSA extension
|
||||
if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} )
|
||||
add_definitions ( -DROCP_LD_AQLPROFILE=1 )
|
||||
endif()
|
||||
|
||||
## Make env vars
|
||||
if ( NOT DEFINED CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "" )
|
||||
if ( DEFINED ENV{CMAKE_BUILD_TYPE} )
|
||||
set ( CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE} )
|
||||
endif()
|
||||
endif()
|
||||
if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} )
|
||||
set ( CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH} )
|
||||
endif()
|
||||
|
||||
set ( HIP_INC_DIR $ENV{HIP_INC_DIR} )
|
||||
|
||||
## Extend Compiler flags based on build type
|
||||
string ( TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE )
|
||||
if ( "${CMAKE_BUILD_TYPE}" STREQUAL debug )
|
||||
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" )
|
||||
set ( CMAKE_BUILD_TYPE "debug" )
|
||||
else ()
|
||||
set ( CMAKE_BUILD_TYPE "release" )
|
||||
endif ()
|
||||
|
||||
## Extend Compiler flags based on Processor architecture
|
||||
## The processor name is quoted so that an empty CMAKE_SYSTEM_PROCESSOR compares
## as an empty string instead of producing a malformed if() expression.
## NBIT, NBITSTR and the -m flags are only set for the two architectures below.
if ( "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" )
  set ( NBIT 64 )
  set ( NBITSTR "64" )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
elseif ( "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86" )
  set ( NBIT 32 )
  set ( NBITSTR "" )
  set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" )
endif ()
|
||||
|
||||
## Find hsa-runtime headers/lib
|
||||
## Look for hsa.h directly on the search path first and fall back to the
## "hsa/" sub-directory layout only when the first lookup found nothing.
## The fallback tests the find_file() result itself.
find_file ( HSA_RUNTIME_INC "hsa.h" )
if ( NOT HSA_RUNTIME_INC )
  find_file ( HSA_RUNTIME_INC "hsa/hsa.h" )
endif ()
|
||||
find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" )
|
||||
get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY )
|
||||
get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY )
|
||||
|
||||
find_library ( HSA_KMT_LIB "libhsakmt.so" )
|
||||
get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY )
|
||||
|
||||
set ( API_PATH ${HSA_RUNTIME_INC_PATH} )
|
||||
|
||||
## Basic Tool Chain Information
|
||||
message ( "----------------NBIT: ${NBIT}" )
|
||||
message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" )
|
||||
message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" )
|
||||
message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" )
|
||||
message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" )
|
||||
message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" )
|
||||
message ( "------------API-path: ${API_PATH}" )
|
||||
message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" )
|
||||
message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" )
|
||||
message ( "---------HIP_INC_DIR: ${HIP_INC_DIR}" )
|
||||
@@ -0,0 +1,116 @@
|
||||
################################################################################
|
||||
##
|
||||
## The University of Illinois/NCSA
|
||||
## Open Source License (NCSA)
|
||||
##
|
||||
## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved.
|
||||
##
|
||||
## Developed by:
|
||||
##
|
||||
## AMD Research and AMD HSA Software Development
|
||||
##
|
||||
## Advanced Micro Devices, Inc.
|
||||
##
|
||||
## www.amd.com
|
||||
##
|
||||
## Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
## of this software and associated documentation files (the "Software"), to
|
||||
## deal with the Software without restriction, including without limitation
|
||||
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
## and/or sell copies of the Software, and to permit persons to whom the
|
||||
## Software is furnished to do so, subject to the following conditions:
|
||||
##
|
||||
## - Redistributions of source code must retain the above copyright notice,
|
||||
## this list of conditions and the following disclaimers.
|
||||
## - Redistributions in binary form must reproduce the above copyright
|
||||
## notice, this list of conditions and the following disclaimers in
|
||||
## the documentation and/or other materials provided with the distribution.
|
||||
## - Neither the names of Advanced Micro Devices, Inc,
|
||||
## nor the names of its contributors may be used to endorse or promote
|
||||
## products derived from this Software without specific prior written
|
||||
## permission.
|
||||
##
|
||||
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
## DEALINGS WITH THE SOFTWARE.
|
||||
##
|
||||
################################################################################
|
||||
|
||||
## Parses the VERSION_STRING variable and places
|
||||
## the first, second and third number values in
|
||||
## the major, minor and patch variables.
|
||||
function( parse_version VERSION_STRING )

  ## Splits VERSION_STRING into numeric components and an optional build suffix.
  ## Results are handed back to the caller through PARENT_SCOPE:
  ##   VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH
  ##       first, second and third run of digits found in the string; each one
  ##       is set only when the string contains that many runs
  ##   VERSION_BUILD
  ##       the text following the first "-"
  ##   VERSION_STRING
  ##       the numeric components that were found, joined with "."

  ## VERSION_STRING is always expanded inside quotes so that it reaches the
  ## string() commands as exactly one argument, even when it is empty or
  ## contains a ";".
  string ( FIND "${VERSION_STRING}" "-" STRING_INDEX )

  if ( ${STRING_INDEX} GREATER -1 )
    ## Skip the "-" itself; the remainder of the string is the build suffix.
    math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
    string ( SUBSTRING "${VERSION_STRING}" ${STRING_INDEX} -1 VERSION_BUILD )
  endif ()

  string ( REGEX MATCHALL "[0123456789]+" VERSIONS "${VERSION_STRING}" )
  list ( LENGTH VERSIONS VERSION_COUNT )

  if ( ${VERSION_COUNT} GREATER 0 )
    list ( GET VERSIONS 0 MAJOR )
    set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
    set ( TEMP_VERSION_STRING "${MAJOR}" )
  endif ()

  if ( ${VERSION_COUNT} GREATER 1 )
    list ( GET VERSIONS 1 MINOR )
    set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
    set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" )
  endif ()

  if ( ${VERSION_COUNT} GREATER 2 )
    list ( GET VERSIONS 2 PATCH )
    set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
    set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" )
  endif ()

  ## DEFINED is also true for a VERSION_BUILD inherited from the calling scope,
  ## in which case that value is passed back unchanged.
  if ( DEFINED VERSION_BUILD )
    set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
  endif ()

  set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE )

endfunction ()
|
||||
|
||||
## Gets the current version of the repository
|
||||
## using versioning tags and git describe.
|
||||
## Passes back a packaging version string
|
||||
## and a library version string.
|
||||
function ( get_version DEFAULT_VERSION_STRING )

  ## Determines the version of the source tree.
  ## DEFAULT_VERSION_STRING is parsed first and supplies the fallback values.
  ## When git is available and `git describe` succeeds for a tag that starts
  ## with a digit, the description of that tag is parsed on top of them.
  ## Returned through PARENT_SCOPE: VERSION_STRING, VERSION_MAJOR,
  ## VERSION_MINOR, VERSION_PATCH and VERSION_BUILD.

  parse_version ( "${DEFAULT_VERSION_STRING}" )

  find_program ( GIT NAMES git )

  if ( GIT )

    ## execute_process() launches the program directly, without a shell, so the
    ## executable and each argument are passed as separate CMake arguments.
    ## stderr is silenced with ERROR_QUIET instead of a shell redirection.
    execute_process ( COMMAND "${GIT}" describe --dirty --long --match "[0-9]*"
                      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                      OUTPUT_VARIABLE GIT_TAG_STRING
                      OUTPUT_STRIP_TRAILING_WHITESPACE
                      ERROR_QUIET
                      RESULT_VARIABLE RESULT )

    ## RESULT holds either the exit code or an error text when the process
    ## could not be started, so it is compared as a quoted string.
    if ( "${RESULT}" STREQUAL "0" AND NOT "${GIT_TAG_STRING}" STREQUAL "" )
      parse_version ( "${GIT_TAG_STRING}" )
    endif ()

  endif ()

  ## parse_version() wrote its results into this function's scope; forward
  ## them one level further up to the caller of get_version().
  set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE )
  set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE )
  set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE )
  set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE )
  set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )

endfunction()
|
||||
Plik binarny nie jest wyświetlany.
Plik binarny nie jest wyświetlany.
@@ -0,0 +1,230 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// ROC Profiler API
|
||||
//
|
||||
// The goal of the implementation is to provide a HW specific low-level
|
||||
// performance analysis interface for profiling of GPU compute applications.
|
||||
// The profiling includes HW performance counters (PMC) with complex
|
||||
// performance metrics and thread traces (SQTT). The profiling is supported
|
||||
// by the SQTT, PMC and Callback APIs.
|
||||
//
|
||||
// The library can be used by a tool library loaded by HSA runtime or by
|
||||
// higher level HW independent performance analysis API like PAPI.
|
||||
//
|
||||
// The library is written in C and will be based on AQLprofile AMD specific
|
||||
// HSA extension. The library implementation requires HSA API intercepting and
|
||||
// a profiling queue supporting a submit callback interface.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef INC_ROCTRACER_H_
|
||||
#define INC_ROCTRACER_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_cbstr.h>
|
||||
|
||||
#define ROCTRACER_VERSION_MAJOR 1
|
||||
#define ROCTRACER_VERSION_MINOR 0
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Returning library version
|
||||
uint32_t roctracer_version_major();
|
||||
uint32_t roctracer_version_minor();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Library errors enumeration
|
||||
typedef enum {
|
||||
ROCTRACER_STATUS_SUCCESS = 0,
|
||||
ROCTRACER_STATUS_ERROR = 1,
|
||||
ROCTRACER_STATUS_UNINIT = 2,
|
||||
ROCTRACER_STATUS_BREAK = 3,
|
||||
ROCTRACER_STATUS_BAD_DOMAIN = 4,
|
||||
ROCTRACER_STATUS_HIP_API_ERR = 5,
|
||||
} roctracer_status_t;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Returning the last error
|
||||
const char* roctracer_error_string();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Traced runtime API domains
|
||||
|
||||
// Traced API domains
|
||||
typedef enum {
|
||||
ROCTRACER_API_DOMAIN_ANY = 0, // Any API domain
|
||||
ROCTRACER_API_DOMAIN_HIP = 1, // HIP API domain
|
||||
} roctracer_api_domain_t;
|
||||
|
||||
// Traced calls ID enumeration
|
||||
typedef hip_cb_id_t roctracer_hip_api_cid_t;
|
||||
|
||||
// Correlation ID type
|
||||
typedef uint64_t roctracer_correletion_id_t;
|
||||
|
||||
// Return method name by given API domain and call ID
|
||||
// NULL returned on the error and the library errno is set
|
||||
const char* roctracer_get_api_name(
|
||||
roctracer_api_domain_t domain, // API domain
|
||||
roctracer_hip_api_cid_t cid); // API call ID
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Callback API
|
||||
//
|
||||
// ROC profiler frontend provides support for runtime API callbacks and activity
|
||||
// records logging. The API callbacks provide the API calls arguments and are
|
||||
// called on different phases, on enter, on exit, on kernel completion.
|
||||
// Methods return non-zero on error and library errno is set.
|
||||
|
||||
// API callback phase
|
||||
typedef enum {
|
||||
ROCTRACER_API_PHASE_ENTER = 0,
|
||||
ROCTRACER_API_PHASE_EXIT = 1,
|
||||
ROCTRACER_API_PHASE_COMPLETE = 2,
|
||||
} roctracer_feature_kind_t;
|
||||
|
||||
// API callback function type
|
||||
typedef hip_cb_fun_t roctracer_api_callback_t;
|
||||
|
||||
// Enable runtime API callbacks
|
||||
int roctracer_enable_api_callback(
|
||||
roctracer_api_domain_t domain, // runtime API domain
|
||||
uint32_t cid, // API call ID
|
||||
roctracer_api_callback_t callback, // callback function pointer
|
||||
void* arg); // [in/out] callback arg
|
||||
|
||||
// Disable runtime API callbacks
|
||||
int roctracer_disable_api_callback(
|
||||
roctracer_api_domain_t domain, // runtime API domain
|
||||
uint32_t cid); // API call ID
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Activity API
|
||||
//
|
||||
// The activity records are asynchronously logged to the pool and can be associated
|
||||
// with the respective API callbacks using the correlation ID. Activity API can
|
||||
// be used to enable collecting of the records with timestamping data for API
|
||||
// calls and the kernel submits.
|
||||
// Methods return non zero on error and library errno is set.
|
||||
|
||||
// Roctracer pool type
|
||||
typedef void roctracer_pool_t;
|
||||
|
||||
// Activity record
|
||||
typedef hip_act_record_t roctracer_record_t;
|
||||
|
||||
// Return next record
|
||||
static inline int roctracer_next_record(
|
||||
const roctracer_record_t* record, // [in] record ptr
|
||||
const roctracer_record_t** next) // [out] next record ptr
|
||||
{
|
||||
*next = (record + 1);
|
||||
return ROCTRACER_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Tracer allocator type
|
||||
typedef void (*roctracer_allocator_t)(
|
||||
char** ptr, // memory pointer
|
||||
size_t size, // memory size
|
||||
void* arg); // allocator arg
|
||||
|
||||
// Pool callback type
|
||||
typedef void (*roctracer_buffer_callback_t)(
|
||||
const char* begin, // [in] available buffered trace records
|
||||
const char* end, // [in] end of buffered trace records
|
||||
void* arg); // [in/out] callback arg
|
||||
|
||||
// Tracer properties
|
||||
typedef struct {
|
||||
uint32_t mode; // roctracer mode
|
||||
size_t buffer_size; // buffer size
|
||||
roctracer_allocator_t alloc_fun; // memory allocator function pointer
|
||||
void* alloc_arg; // memory allocator argument
|
||||
roctracer_buffer_callback_t buffer_callback_fun; // tracer record callback function
|
||||
void* buffer_callback_arg; // tracer record callback arg
|
||||
} roctracer_properties_t;
|
||||
|
||||
// Create tracer memory pool
|
||||
// The first invocation sets the default pool
|
||||
int roctracer_open_pool(
|
||||
const roctracer_properties_t* properties, // tracer pool properties
|
||||
roctracer_pool_t** pool = NULL); // [out] returns tracer pool if not NULL,
|
||||
// otherwise sets the default one if it is not set yet
|
||||
// otherwise the error is generated
|
||||
|
||||
// Close tracer memory pool
|
||||
int roctracer_close_pool(
|
||||
roctracer_pool_t* pool = NULL); // [in] memory pool, NULL is a default one
|
||||
|
||||
// Return current default pool
|
||||
// Set new default pool if the argument is not NULL
|
||||
roctracer_pool_t* roctracer_default_pool(
|
||||
roctracer_pool_t* pool = NULL); // [in] new default pool if not NULL
|
||||
|
||||
// Enable activity records logging
|
||||
int roctracer_enable_api_activity(
|
||||
roctracer_api_domain_t domain, // runtime API domain
|
||||
uint32_t activity_kind, // activity kind
|
||||
roctracer_pool_t* pool = NULL); // memory pool, NULL is a default one
|
||||
|
||||
// Disable activity records logging
|
||||
int roctracer_disable_api_activity(
|
||||
roctracer_api_domain_t domain, // runtime API domain
|
||||
uint32_t activity_kind); // activity kind
|
||||
|
||||
// Flush available activity records
|
||||
int roctracer_flush_api_activity(
|
||||
roctracer_pool_t* pool = NULL); // memory pool, NULL is a default one
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C" block
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // INC_ROCTRACER_H_
|
||||
Executable
+189
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/python
|
||||
import os, sys, re
|
||||
|
||||
HEADER = "hip_cbstr.h"
|
||||
REC_MAX_LEN = 1024
|
||||
|
||||
def fill_api_map(out, api_name, args_str):
  """Store the parsed argument list of one API function in `out`.

  out      -- dict updated in place; out[api_name] becomes a list of
              (type, name) tuples in declaration order
  api_name -- name of the API function
  args_str -- raw text found between the parentheses of the declaration

  An argument that does not split into a type and a name (for example a
  lone "void" or an empty string) is skipped.
  """
  args_list = []

  # Normalize whitespace: trim both ends, drop blanks around commas and
  # collapse every remaining run of whitespace into one space.
  args_str = args_str.strip()
  args_str = re.sub(r'\s*,\s*', ',', args_str)
  args_str = re.sub(r'\s+', ' ', args_str)

  for arg_pair in args_str.split(','):
    # Drop a trailing default value, written with or without blanks
    # around the '=' sign.
    arg_pair = re.sub(r'\s*=\s*\S+$', '', arg_pair)
    # The last whitespace-separated token is the name; everything
    # before it is the type.
    m = re.match(r'^(.*)\s(\S+)$', arg_pair)
    if m:
      args_list.append((m.group(1), m.group(2)))

  out[api_name] = args_list
|
||||
|
||||
# hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
|
||||
def parse_api(inp, out):
  """Collect the HIP API declarations of an open header file in `out`.

  inp -- open, readable text file object holding the header
  out -- dict filled through fill_api_map(), keyed by API function name

  A declaration starts on a line beginning with "hipError_t" and may
  span several lines; lines are accumulated until the closing
  parenthesis of the argument list has been read. Parsing stops at the
  first complete declaration whose text contains "Texture". If an
  accumulated record grows beyond REC_MAX_LEN characters, an error is
  written to stderr and parsing stops, leaving `out` with the
  declarations collected so far.
  """
  end_pattern = re.compile(r"Texture")
  beg_pattern = re.compile(r"^hipError_t")
  api_pattern = re.compile(r"^hipError_t\s+([^\(]+)\(([^\)]*)\)")

  found = False
  record = ""

  # Line numbers are counted from 1 for the diagnostic below.
  for line_num, line in enumerate(inp.readlines(), 1):
    # Remove only the line terminator, so a final line without one keeps
    # its last character; leading indentation collapses to one space so
    # that continuation lines join cleanly.
    record += re.sub(r'^\s+', ' ', line.rstrip('\n'))

    if len(record) > REC_MAX_LEN:
      # Name the input through the file object itself, not a global.
      file_name = getattr(inp, 'name', '<input>')
      sys.stderr.write("Error: bad record \"" + record + "\"\nfile '" +
                       str(file_name) + "', line (" + str(line_num) + ")\n")
      break

    if beg_pattern.match(record):
      found = True

    if found:
      m = api_pattern.match(record)
      if not m:
        # Declaration still incomplete: keep accumulating lines.
        continue
      found = False
      if end_pattern.search(record):
        break
      fill_api_map(out, m.group(1), m.group(2))

    record = ""
|
||||
|
||||
#############################################################
|
||||
|
||||
if (len(sys.argv) != 2):
|
||||
print >>sys.stderr, "Usage:", sys.argv[0], " <input HIP API .h file>"
|
||||
sys.exit(1)
|
||||
|
||||
hfile = sys.argv[1]
|
||||
if not os.path.isfile(hfile):
|
||||
print >>sys.stderr, "Error: input file '" + hfile + "' not found"
|
||||
sys.exit(1)
|
||||
|
||||
inp = open(hfile, 'r')
|
||||
api_map = {}
|
||||
parse_api(inp, api_map)
|
||||
|
||||
api_map['hipLaunchKernel'] = [('void*', 'kernel'), ('hipStream_t', 'stream')]
|
||||
|
||||
# Open the generated header and emit the banner plus the include guard.
f = open(HEADER, 'w')
f.write(
  '// automatically generated sources\n'
  '#ifndef HIP__CBSTR_H__\n'
  '#define HIP__CBSTR_H__\n'
)

# Forward declaration of the callback data and the callback function type.
f.write(
  '\n// HIP API callbacks function type\n'
  'struct hip_cb_data_t;\n'
  'typedef void (*hip_cb_fun_t)(const hip_cb_data_t* data, void* arg);\n'
)
|
||||
|
||||
# Emit the callback ID enumeration: one HIP_API_ID_<name> per parsed API,
# numbered in api_map iteration order, followed by the total count.
f.write('\n// HIP API callbacks ID enumaration\n')
f.write('enum hip_cb_id_t {\n')
api_names = list(api_map.keys())
for api_index, name in enumerate(api_names):
  f.write(' HIP_API_ID_' + name + ' = ' + str(api_index) + ',\n')
cb_id = len(api_names)
f.write(' HIP_API_ID_NUMBER = ' + str(cb_id) + ',\n')
f.write('};\n')
|
||||
|
||||
# Emit hip_cb_data_t: fixed fields plus a union holding one argument struct
# per API (APIs without arguments get no struct).
f.write('\n// HIP API callbacks data structure\n')
struct_head = [
  'struct hip_cb_data_t {\n',
  ' const char* name;\n',
  ' hip_cb_id_t id;\n',
  ' uint32_t correlation_id;\n',
  ' bool on_enter;\n',
  ' union {\n',
]
f.write(''.join(struct_head))
for name, args in api_map.items():
  if len(args) == 0:
    continue
  f.write(' struct {\n')
  for arg_tuple in args:
    f.write(' ' + arg_tuple[0] + ' ' + arg_tuple[1] + ';\n')
  f.write(' } ' + name + ';\n')
f.write(''.join([' } args;\n', '};\n']))
|
||||
|
||||
# Emit one INIT_<api>_CB_ARGS_DATA macro per API that copies each call
# argument into the matching union member, plus the dispatching macro.
f.write('\n// HIP API callbacks args data filling macros\n')
for name, args in api_map.items():
  macro_lines = ['#define INIT_' + name + '_CB_ARGS_DATA(cb_data) { \\\n']
  for arg_tuple in args:
    arg_type = arg_tuple[0]
    arg_name = arg_tuple[1]
    macro_lines.append(' cb_data.args.' + name + '.' + arg_name + ' = (' + arg_type + ')' + arg_name + '; \\\n')
  macro_lines.append('};\n')
  for macro_line in macro_lines:
    f.write(macro_line)
f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n')
|
||||
|
||||
# Emit the callback table type, the macro that defines the table object and
# the two inline setters (activity callback / user callback).
f.write('\n// HIP API callbacks table\n')
f.write(
  'struct hip_cb_table_t {\n'
  'struct { hip_cb_fun_t act; hip_cb_fun_t fun; void* arg; } arr[HIP_API_ID_NUMBER];\n'
  '};\n'
  '#define HIP_CALLBACKS_TABLE hip_cb_table_t HIP_API_callbacks_table{};\n'
)
f.write(
  'inline bool HIP_SET_ACTIVITY(uint32_t id, hip_cb_fun_t fun, void* arg = NULL) {\n'
  '(void)arg;\n'
  'extern hip_cb_table_t HIP_API_callbacks_table;\n'
  'if (id < HIP_API_ID_NUMBER) {\n'
  'HIP_API_callbacks_table.arr[id].act = fun;\n'
  'return true;\n'
  '}\n'
  'return false;\n'
  '}\n'
)
f.write(
  'inline bool HIP_SET_CALLBACK(uint32_t id, hip_cb_fun_t fun, void* arg) {\n'
  'extern hip_cb_table_t HIP_API_callbacks_table; \n'
  'if (id < HIP_API_ID_NUMBER) {\n'
  'HIP_API_callbacks_table.arr[id].fun = fun;\n'
  'HIP_API_callbacks_table.arr[id].arg = arg;\n'
  'return true;\n'
  '}\n'
  'return false;\n'
  '}\n'
)
|
||||
|
||||
# Emit CB_SPAWNER_OBJECT: a macro defining a local class whose constructor
# fires the registered callbacks with on_enter = true and whose destructor
# fires them again with on_enter = false.
f.write(
  '\n// HIP API callbacks spawning class macro\n'
  '#define CB_SPAWNER_OBJECT(cb_id) \\\n'
  'class api_callbacks_spawner_t { \\\n'
  'public: \\\n'
  'api_callbacks_spawner_t(hip_cb_data_t& cb_data) : cb_data_(cb_data) { \\\n'
  'hip_cb_id_t id = HIP_API_ID_##cb_id; \\\n'
  'cb_data_.id = id; \\\n'
  'cb_data_.correlation_id = UINT_MAX; \\\n'
  'cb_data_.name = #cb_id; \\\n'
  'extern const hip_cb_table_t* getApiCallbackTabel(); \\\n'
  'const hip_cb_table_t* cb_table = getApiCallbackTabel(); \\\n'
  'cb_act_ = cb_table->arr[id].act; \\\n'
  'cb_fun_ = cb_table->arr[id].fun; \\\n'
  'cb_arg_ = cb_table->arr[id].arg; \\\n'
  'cb_data_.on_enter = true; \\\n'
  'if (cb_act_ != NULL) cb_act_(&cb_data_, NULL); \\\n'
  'if (cb_fun_ != NULL) cb_fun_(&cb_data_, cb_arg_); \\\n'
  '} \\\n'
  '~api_callbacks_spawner_t() { \\\n'
  'cb_data_.on_enter = false; \\\n'
  'if (cb_act_ != NULL) cb_act_(&cb_data_, NULL); \\\n'
  'if (cb_fun_ != NULL) cb_fun_(&cb_data_, cb_arg_); \\\n'
  '} \\\n'
  'private: \\\n'
  'hip_cb_data_t& cb_data_; \\\n'
  'hip_cb_fun_t cb_act_; \\\n'
  'hip_cb_fun_t cb_fun_; \\\n'
  'void* cb_arg_; \\\n'
  '}; \\\n'
  'hip_cb_data_t cb_data{}; \\\n'
  'INIT_CB_ARGS_DATA(cb_id, cb_data); \\\n'
  'api_callbacks_spawner_t api_callbacks_spawner(cb_data); \n'
)
|
||||
|
||||
# Close the include guard (comment matches the HIP__CBSTR_H__ guard name),
# end the file with a newline and flush it to disk before reporting success.
f.write('#endif // HIP__CBSTR_H__\n')
f.close()

print("Header '" + HEADER + "' is generated")
|
||||
#############################################################
|
||||
@@ -0,0 +1,11 @@
|
||||
#
|
||||
# Build dynamic Library object
|
||||
#
|
||||
# Name of the shared library target built in this directory.
set ( TARGET_LIB "${TARGET_NAME}" )
# Translation units compiled into the library.
set ( LIB_SRC
  ${LIB_DIR}/core/roctracer.cpp
  ${LIB_DIR}/util/hsa_rsrc_factory.cpp
)
add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} )
target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} )
# roctracer.cpp starts a consumer thread with pthread_create() and
# hsa_rsrc_factory.cpp calls dlopen()/dlsym(), so link the thread library
# and the dynamic-loader library explicitly instead of relying on the
# final executable to provide them.
find_package ( Threads REQUIRED )
target_link_libraries ( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} Threads::Threads ${CMAKE_DL_LIBS} c stdc++ )
|
||||
@@ -0,0 +1,420 @@
|
||||
#include "inc/roctracer.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <mutex>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include "util/exception.h"
|
||||
#include "util/hsa_rsrc_factory.h"
|
||||
#include "util/logger.h"
|
||||
|
||||
#define PUBLIC_API __attribute__((visibility("default")))
|
||||
#define CONSTRUCTOR_API __attribute__((constructor))
|
||||
#define DESTRUCTOR_API __attribute__((destructor))
|
||||
|
||||
// Evaluates a pthread call once; on a non-zero result stores it in errno,
// prints the call text with perror() and aborts the process.
#define PTHREAD_CALL(call) \
|
||||
do { \
|
||||
int err = call; \
|
||||
if (err != 0) { \
|
||||
errno = err; \
|
||||
perror(#call); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Evaluates an HSA runtime call once; if the status is not HSA_STATUS_SUCCESS
// prints the call text and the status (hex) to std::cerr and aborts the process.
#define HSART_CALL(call) \
|
||||
do { \
|
||||
hsa_status_t status = call; \
|
||||
if (status != HSA_STATUS_SUCCESS) { \
|
||||
std::cerr << "HSA-rt call '" << #call << "' error(" << std::hex << status << ")" \
|
||||
<< std::dec << std::endl << std::flush; \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Opens the body of a public API function: declares the 'err' result
// (initially 0) and starts a try block. Must be paired with API_METHOD_SUFFIX.
#define API_METHOD_PREFIX \
|
||||
int err = 0; \
|
||||
try {
|
||||
|
||||
// Closes the try block opened by API_METHOD_PREFIX: any std::exception is
// logged with ERR_LOGGING, converted to a status via roctracer::GetExcStatus,
// and 'err' is returned from the enclosing function.
#define API_METHOD_SUFFIX \
|
||||
} \
|
||||
catch (std::exception & e) { \
|
||||
ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \
|
||||
err = roctracer::GetExcStatus(e); \
|
||||
} \
|
||||
return err;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Internal library methods
|
||||
//
|
||||
namespace roctracer {
|
||||
|
||||
int GetExcStatus(const std::exception& e) {
|
||||
const util::exception* roctracer_exc_ptr = dynamic_cast<const util::exception*>(&e);
|
||||
return (roctracer_exc_ptr) ? static_cast<int>(roctracer_exc_ptr->status()) : 1;
|
||||
}
|
||||
|
||||
// Process-wide monotonically increasing counter protected by a mutex.
class GlobalCounter {
 public:
  typedef std::mutex mutex_t;
  typedef uint64_t counter_t;

  // Advances the counter by one and returns the new value (first call returns 1).
  static counter_t Increment() {
    std::lock_guard<mutex_t> guard(mutex_);
    counter_ += 1;
    return counter_;
  }

 private:
  static mutex_t mutex_;
  static counter_t counter_;
};

// Storage for the static members.
GlobalCounter::mutex_t GlobalCounter::mutex_;
GlobalCounter::counter_t GlobalCounter::counter_ = 0;
|
||||
|
||||
class MemoryPool {
|
||||
public:
|
||||
typedef std::mutex mutex_t;
|
||||
|
||||
static void allocator_default(char** ptr, size_t size, void* arg) {
|
||||
(void)arg;
|
||||
if (*ptr == NULL) {
|
||||
*ptr = reinterpret_cast<char*>(malloc(size));
|
||||
} else if (size != 0) {
|
||||
*ptr = reinterpret_cast<char*>(realloc(ptr, size));
|
||||
} else {
|
||||
free(*ptr);
|
||||
*ptr = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
MemoryPool(const roctracer_properties_t& properties) {
|
||||
// Assigning pool allocator
|
||||
alloc_fun_ = allocator_default;
|
||||
alloc_arg_ = NULL;
|
||||
if (properties.alloc_fun != NULL) {
|
||||
alloc_fun_ = properties.alloc_fun;
|
||||
alloc_arg_ = properties.alloc_arg;
|
||||
}
|
||||
|
||||
// Pool definition
|
||||
buffer_size_ = properties.buffer_size;
|
||||
const size_t pool_size = 2 * buffer_size_;
|
||||
pool_begin_ = NULL;
|
||||
alloc_fun_(&pool_begin_, pool_size, alloc_arg_);
|
||||
if (pool_begin_ == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "pool allocator failed");
|
||||
pool_end_ = pool_begin_ + pool_size;
|
||||
buffer_begin_ = pool_begin_;
|
||||
buffer_end_ = buffer_begin_ + buffer_size_;
|
||||
write_ptr_ = buffer_begin_;
|
||||
|
||||
// Consuming read thread
|
||||
read_callback_fun_ = properties.buffer_callback_fun;
|
||||
read_callback_arg_ = properties.buffer_callback_arg;
|
||||
consumer_arg_ = consumer_arg_t{this, true, NULL, NULL};
|
||||
PTHREAD_CALL(pthread_mutex_init(&read_mutex_, NULL));
|
||||
PTHREAD_CALL(pthread_cond_init(&read_cond_, NULL));
|
||||
PTHREAD_CALL(pthread_create(&consumer_thread_, NULL, reader_fun, &consumer_arg_));
|
||||
}
|
||||
|
||||
~MemoryPool() {
|
||||
Flush();
|
||||
allocator_default(&pool_begin_, 0, alloc_arg_);
|
||||
}
|
||||
|
||||
template <typename Record>
|
||||
void* Write(const Record& record) {
|
||||
std::lock_guard<mutex_t> lock(write_mutex_);
|
||||
char* next = write_ptr_ + sizeof(Record);
|
||||
if (next > buffer_end_) {
|
||||
if (write_ptr_ == buffer_begin_) EXC_ABORT(ROCTRACER_STATUS_ERROR, "buffer size(" << buffer_size_ << ") is less then the record(" << sizeof(Record) << ")");
|
||||
spawn_reader(buffer_begin_, buffer_end_);
|
||||
buffer_begin_ = (buffer_end_ == pool_end_) ? pool_begin_ : buffer_end_;
|
||||
buffer_end_ = buffer_begin_ + buffer_size_;
|
||||
write_ptr_ = buffer_begin_;
|
||||
next = write_ptr_ + sizeof(Record);
|
||||
}
|
||||
Record* ptr = reinterpret_cast<Record*>(write_ptr_);
|
||||
*ptr = record;
|
||||
write_ptr_ = next;
|
||||
return reinterpret_cast<void*>(ptr);
|
||||
}
|
||||
|
||||
void Flush() {
|
||||
if (write_ptr_ > buffer_begin_) {
|
||||
spawn_reader(buffer_begin_, write_ptr_);
|
||||
sync_reader(&consumer_arg_);
|
||||
buffer_begin_ = write_ptr_;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
struct consumer_arg_t {
|
||||
MemoryPool* obj;
|
||||
bool valid;
|
||||
const char* begin;
|
||||
const char* end;
|
||||
};
|
||||
|
||||
static void reset_reader(consumer_arg_t* arg) {
|
||||
reinterpret_cast<std::atomic<bool>*>(&(arg->valid))->store(false, std::memory_order_release);
|
||||
}
|
||||
|
||||
static void sync_reader(const consumer_arg_t* arg) {
|
||||
while(arg->valid) PTHREAD_CALL(pthread_yield());
|
||||
}
|
||||
|
||||
static void* reader_fun(void* consumer_arg) {
|
||||
consumer_arg_t* arg = reinterpret_cast<consumer_arg_t*>(consumer_arg);
|
||||
roctracer::MemoryPool* obj = arg->obj;
|
||||
|
||||
reset_reader(arg);
|
||||
|
||||
while (1) {
|
||||
PTHREAD_CALL(pthread_mutex_lock(&(obj->read_mutex_)));
|
||||
while (arg->valid == false) {
|
||||
PTHREAD_CALL(pthread_cond_wait(&(obj->read_cond_), &(obj->read_mutex_)));
|
||||
}
|
||||
obj->read_callback_fun_(arg->begin, arg->end, obj->read_callback_arg_);
|
||||
reset_reader(arg);
|
||||
PTHREAD_CALL(pthread_mutex_unlock(&(obj->read_mutex_)));
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void spawn_reader(const char* data_begin, const char* data_end) {
|
||||
sync_reader(&consumer_arg_);
|
||||
PTHREAD_CALL(pthread_mutex_lock(&read_mutex_));
|
||||
consumer_arg_ = consumer_arg_t{this, true, data_begin, data_end};
|
||||
PTHREAD_CALL(pthread_cond_signal(&read_cond_));
|
||||
PTHREAD_CALL(pthread_mutex_unlock(&read_mutex_));
|
||||
}
|
||||
|
||||
// pool allocator
|
||||
roctracer_allocator_t alloc_fun_;
|
||||
void* alloc_arg_;
|
||||
|
||||
// Pool definition
|
||||
size_t buffer_size_;
|
||||
char* pool_begin_;
|
||||
char* pool_end_;
|
||||
char* buffer_begin_;
|
||||
char* buffer_end_;
|
||||
char* write_ptr_;
|
||||
mutex_t write_mutex_;
|
||||
|
||||
// Consuming read thread
|
||||
roctracer_buffer_callback_t read_callback_fun_;
|
||||
void* read_callback_arg_;
|
||||
consumer_arg_t consumer_arg_;
|
||||
pthread_t consumer_thread_;
|
||||
pthread_mutex_t read_mutex_;
|
||||
pthread_cond_t read_cond_;
|
||||
};
|
||||
|
||||
class Timer {
|
||||
public:
|
||||
typedef uint64_t timestamp_t;
|
||||
typedef long double freq_t;
|
||||
|
||||
Timer() {
|
||||
timestamp_t timestamp_hz = 0;
|
||||
HSART_CALL(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ×tamp_hz));
|
||||
timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz;
|
||||
}
|
||||
|
||||
// Return timestamp in 'ns'
|
||||
timestamp_t timestamp_ns() {
|
||||
timestamp_t timestamp;
|
||||
HSART_CALL(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, ×tamp));
|
||||
return timestamp_t((freq_t)timestamp * timestamp_factor_);
|
||||
}
|
||||
|
||||
private:
|
||||
// Timestamp frequency factor
|
||||
freq_t timestamp_factor_;
|
||||
};
|
||||
|
||||
CONSTRUCTOR_API void constructor() {
|
||||
util::Logger::Create();
|
||||
}
|
||||
|
||||
DESTRUCTOR_API void destructor() {
|
||||
util::HsaRsrcFactory::Destroy();
|
||||
util::Logger::Destroy();
|
||||
}
|
||||
|
||||
// Activity callback to generate an activity record
|
||||
// Activity callback: fills 'record' on API entry and commits it to the pool
// on API exit.
//   record        - record storage supplied by the caller
//   activity_kind - value stored into the record
//   callback_data - API callback data (hip_cb_data_t)
//   arg           - the MemoryPool receiving completed records; must not be NULL
void ActivityCallback(
    roctracer_record_t* record,
    uint32_t activity_kind,
    const void* callback_data,
    void* arg)
{
  static Timer timer;

  const hip_cb_data_t* data = reinterpret_cast<const hip_cb_data_t*>(callback_data);
  MemoryPool* pool = reinterpret_cast<MemoryPool*>(arg);
  if (pool == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback pool is NULL");

  if (data->phase != ROCTRACER_API_PHASE_ENTER) {
    // Exit phase: stamp the end time and store the finished record.
    record->end_ns = timer.timestamp_ns();
    pool->Write<roctracer_record_t>(*record);
    return;
  }

  // Enter phase: start a fresh record.
  *record = {};
  record->name = data->name;
  record->activity_kind = activity_kind;
  record->begin_ns = timer.timestamp_ns();
  // A new correlation ID is stored both in the record and in the callback data.
  const auto correlation_id = GlobalCounter::Increment();
  record->correlation_id = correlation_id;
  const_cast<hip_cb_data_t*>(data)->correlation_id = correlation_id;
}
|
||||
|
||||
util::Logger::mutex_t util::Logger::mutex_;
|
||||
util::Logger* util::Logger::instance_ = NULL;
|
||||
MemoryPool* memory_pool = NULL;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Public library methods
|
||||
//
|
||||
extern "C" {
|
||||
|
||||
// Returns library version
|
||||
// Major component of the library version.
PUBLIC_API uint32_t roctracer_version_major() {
  return ROCTRACER_VERSION_MAJOR;
}

// Minor component of the library version.
PUBLIC_API uint32_t roctracer_version_minor() {
  return ROCTRACER_VERSION_MINOR;
}
|
||||
|
||||
// Returns the last error
|
||||
// Returns a heap-allocated copy (strdup) of the logger's last message.
PUBLIC_API const char* roctracer_error_string() {
  const auto& last_message = roctracer::util::Logger::LastMessage();
  return strdup(last_message.c_str());
}
|
||||
|
||||
// Return method name by given API domain and call ID
|
||||
// NULL returned on the error and the library errno is set
|
||||
// Name lookup is not implemented here: both arguments are ignored and NULL
// is always returned.
PUBLIC_API const char* roctracer_get_api_name(roctracer_api_domain_t domain, roctracer_hip_api_cid_t cid) {
  (void)domain;
  (void)cid;
  return NULL;
}
|
||||
|
||||
// Enable runtime API callbacks
|
||||
// Registers 'callback' (with 'user_data') for API call 'cid' of 'domain'.
// Only the HIP domain is accepted. Returns 0 on success, otherwise the status
// of the raised error.
PUBLIC_API int roctracer_enable_api_callback(
    roctracer_api_domain_t domain,
    uint32_t cid,
    roctracer_api_callback_t callback,
    void* user_data)
{
  API_METHOD_PREFIX
  if (domain != ROCTRACER_API_DOMAIN_HIP) {
    EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
  }
  const hipError_t hip_err = hipRegisterApiCallback(cid, callback, user_data);
  if (hip_err != hipSuccess) {
    HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRegisterApiCallback error(" << hip_err << ")");
  }
  API_METHOD_SUFFIX
}
|
||||
|
||||
// Disable runtime API callbacks
|
||||
// Removes the callback registered for API call 'cid' of 'domain'.
// Only the HIP domain is accepted. Returns 0 on success, otherwise the status
// of the raised error.
PUBLIC_API int roctracer_disable_api_callback(
    roctracer_api_domain_t domain,
    uint32_t cid)
{
  API_METHOD_PREFIX
  if (domain != ROCTRACER_API_DOMAIN_HIP) {
    EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
  }
  const hipError_t hip_err = hipRemoveApiCallback(cid);
  if (hip_err != hipSuccess) {
    HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRemoveApiCallback error(" << hip_err << ")");
  }
  API_METHOD_SUFFIX
}
|
||||
|
||||
// Return default pool and set new one if parameter pool is not NULL.
|
||||
// Returns the current default pool; when 'pool' is not NULL it is installed
// as the new default first. Raises ROCTRACER_STATUS_UNINIT if there was no
// default pool before the call (the new pool, if any, stays installed).
roctracer_pool_t* roctracer_default_pool(roctracer_pool_t* pool) {
  roctracer_pool_t* const previous = reinterpret_cast<roctracer_pool_t*>(roctracer::memory_pool);
  if (pool != NULL) {
    roctracer::memory_pool = reinterpret_cast<roctracer::MemoryPool*>(pool);
  }
  if (previous == NULL) {
    EXC_RAISING(ROCTRACER_STATUS_UNINIT, "default pool is not initialized");
  }
  return previous;
}
|
||||
|
||||
// Open memory pool
|
||||
// Open memory pool.
//   properties - pool configuration; must not be NULL
//   pool       - receives the new pool; when NULL the new pool becomes the
//                default pool, which is an error if a default already exists
// Returns 0 on success, otherwise the status of the raised error.
PUBLIC_API int roctracer_open_pool(
    const roctracer_properties_t* properties,
    roctracer_pool_t** pool)
{
  API_METHOD_PREFIX
  // Report a missing configuration as an error instead of dereferencing NULL.
  if (properties == NULL) {
    EXC_RAISING(ROCTRACER_STATUS_ERROR, "pool properties are NULL");
  }
  if ((pool == NULL) && (roctracer::memory_pool != NULL)) {
    EXC_RAISING(ROCTRACER_STATUS_ERROR, "default pool already set");
  }
  // A failing 'new' throws std::bad_alloc, which API_METHOD_SUFFIX turns into
  // an error status, so no NULL check is needed on the result.
  roctracer::MemoryPool* p = new roctracer::MemoryPool(*properties);
  if (pool != NULL) *pool = p;
  else roctracer::memory_pool = p;
  API_METHOD_SUFFIX
}
|
||||
|
||||
// Close memory pool
|
||||
// Close memory pool.
//   pool - pool to destroy; NULL selects the default pool
// Returns 0 on success, otherwise the status of the raised error.
PUBLIC_API int roctracer_close_pool(roctracer_pool_t* pool) {
  API_METHOD_PREFIX
  roctracer_pool_t* ptr = (pool == NULL) ? roctracer_default_pool() : pool;
  roctracer::MemoryPool* memory_pool = reinterpret_cast<roctracer::MemoryPool*>(ptr);
  // Forget the default pool whenever it is the one being destroyed, including
  // when the caller passed its handle explicitly; otherwise the global would
  // keep pointing at freed memory.
  if (memory_pool == roctracer::memory_pool) roctracer::memory_pool = NULL;
  delete(memory_pool);
  API_METHOD_SUFFIX
}
|
||||
|
||||
// Enable activity records logging
|
||||
// Starts logging activity records of 'activity_kind' for 'domain' into
// 'pool' (NULL selects the default pool). Only the HIP domain is accepted.
// Returns 0 on success, otherwise the status of the raised error.
PUBLIC_API int roctracer_enable_api_activity(
    roctracer_api_domain_t domain,
    uint32_t activity_kind,
    roctracer_pool_t* pool)
{
  API_METHOD_PREFIX
  // The pool is resolved before the domain is validated.
  if (pool == NULL) {
    pool = roctracer_default_pool();
  }
  if (domain != ROCTRACER_API_DOMAIN_HIP) {
    EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
  }
  const hipError_t hip_err = hipRegisterActivityCallback(activity_kind, roctracer::ActivityCallback, pool);
  if (hip_err != hipSuccess) {
    HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRegisterActivityCallback error(" << hip_err << ")");
  }
  API_METHOD_SUFFIX
}
|
||||
|
||||
// Disable activity records logging
|
||||
// Stops logging activity records of 'activity_kind' for 'domain'.
// Only the HIP domain is accepted. Returns 0 on success, otherwise the status
// of the raised error.
PUBLIC_API int roctracer_disable_api_activity(
    roctracer_api_domain_t domain,
    uint32_t activity_kind)
{
  API_METHOD_PREFIX
  if (domain != ROCTRACER_API_DOMAIN_HIP) {
    EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
  }
  const hipError_t hip_err = hipRemoveActivityCallback(activity_kind);
  if (hip_err != hipSuccess) {
    HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRemoveActivityCallback error(" << hip_err << ")");
  }
  API_METHOD_SUFFIX
}
|
||||
|
||||
// Flush available activity records
|
||||
// Flushes the records pending in 'pool' (NULL selects the default pool).
// Returns 0 on success, otherwise the status of the raised error.
PUBLIC_API int roctracer_flush_api_activity(roctracer_pool_t* pool) {
  API_METHOD_PREFIX
  roctracer_pool_t* target = (pool != NULL) ? pool : roctracer_default_pool();
  reinterpret_cast<roctracer::MemoryPool*>(target)->Flush();
  API_METHOD_SUFFIX
}
|
||||
|
||||
} // extern "C"
|
||||
@@ -0,0 +1,47 @@
|
||||
#ifndef SRC_UTIL_EXCEPTION_H_
|
||||
#define SRC_UTIL_EXCEPTION_H_
|
||||
|
||||
#include <hsa_ven_amd_aqlprofile.h>
|
||||
|
||||
#include <exception>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
// Formats "<function>(), <stream>" into a string, prints it to std::cout and
// aborts the process. The 'error' argument is not used by the expansion.
#define EXC_ABORT(error, stream) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << __FUNCTION__ << "(), " << stream; \
|
||||
std::cout << oss.str() << std::endl; \
|
||||
abort(); \
|
||||
}
|
||||
|
||||
// Formats "<function>(), <stream>" into a string and throws it as a
// roctracer::util::exception carrying the status 'error'.
#define EXC_RAISING(error, stream) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << __FUNCTION__ << "(), " << stream; \
|
||||
throw roctracer::util::exception(error, oss.str()); \
|
||||
}
|
||||
|
||||
// Same as EXC_RAISING with the message prefixed by "HIP error: ".
#define HIP_EXC_RAISING(error, stream) \
|
||||
{ \
|
||||
EXC_RAISING(error, "HIP error: " << stream); \
|
||||
}
|
||||
|
||||
namespace roctracer {
|
||||
namespace util {
|
||||
|
||||
// Exception type carrying a numeric status together with a message.
class exception : public std::exception {
 public:
  // Stores copies of the status code and the message.
  explicit exception(const uint32_t& status, const std::string& msg)
      : status_(status),
        str_(msg) {}

  // Message text; the pointer stays valid while this object is alive.
  const char* what() const throw() {
    return str_.c_str();
  }

  // Status code given at construction.
  uint32_t status() const throw() {
    return status_;
  }

 protected:
  const uint32_t status_;
  const std::string str_;
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace roctracer
|
||||
|
||||
#endif // SRC_UTIL_EXCEPTION_H_
|
||||
@@ -0,0 +1,572 @@
|
||||
/**********************************************************************
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
provided that the following conditions are met:
|
||||
|
||||
• Redistributions of source code must retain the above copyright notice, this list of
|
||||
conditions and the following disclaimer.
|
||||
• Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
#include "util/hsa_rsrc_factory.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <fcntl.h>
|
||||
#include <hsa.h>
|
||||
#include <hsa_ext_amd.h>
|
||||
#include <hsa_ext_finalize.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifndef AQL_PROFILE_READ_API_ENABLE
|
||||
#define AQL_PROFILE_READ_API_ENABLE 0
|
||||
#endif
|
||||
|
||||
namespace roctracer {
|
||||
namespace util {
|
||||
|
||||
// Callback function to get available in the system agents
|
||||
// hsa_iterate_agents() callback: 'data' is the HsaRsrcFactory instance.
// Registers the agent and returns HSA_STATUS_SUCCESS if an AgentInfo was
// created for it, HSA_STATUS_ERROR otherwise.
hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
  HsaRsrcFactory* factory = reinterpret_cast<HsaRsrcFactory*>(data);
  const AgentInfo* added = factory->AddAgentInfo(agent);
  if (added == NULL) return HSA_STATUS_ERROR;
  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// This function checks to see if the provided
|
||||
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
|
||||
// the function adds an additional requirement that the pool have the
|
||||
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false,
|
||||
// pools must NOT have this property.
|
||||
// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
|
||||
// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
|
||||
// no pool was found meeting the requirements. If an error is encountered, we
|
||||
// return that error.
|
||||
// Memory-pool filter. Stores 'pool' into *data (an hsa_amd_memory_pool_t) and
// returns HSA_STATUS_INFO_BREAK when the pool is in the global segment and its
// kernarg-init flag matches 'kern_arg'; returns HSA_STATUS_SUCCESS when the
// pool does not match, and HSA_STATUS_ERROR_INVALID_ARGUMENT when data is null.
static hsa_status_t
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
  if (data == nullptr) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Only pools of the global segment are candidates.
  hsa_amd_segment_t segment;
  hsa_status_t err =
      hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
  if (segment != HSA_AMD_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }

  uint32_t flag;
  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);

  // Skip the pool unless its kernarg property is exactly what was asked for.
  const bool is_kern_arg_pool = (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
  if (is_kern_arg_pool != kern_arg) {
    return HSA_STATUS_SUCCESS;
  }

  *(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
  return HSA_STATUS_INFO_BREAK;
}
|
||||
|
||||
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
|
||||
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
|
||||
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
|
||||
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
  // Global-segment pool without the kernarg-init flag.
  const bool want_kern_arg = false;
  return FindGlobalPool(pool, data, want_kern_arg);
}

// Callback for hsa_amd_agent_iterate_memory_pools(): selects a pool of the
// global segment that HAS the HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT flag.
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
  const bool want_kern_arg = true;
  return FindGlobalPool(pool, data, want_kern_arg);
}
|
||||
#if 0
|
||||
// Callback function to find and bind kernarg region of an agent
|
||||
hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) {
|
||||
hsa_region_global_flag_t flags;
|
||||
hsa_region_segment_t segment_id;
|
||||
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
|
||||
if (segment_id != HSA_REGION_SEGMENT_GLOBAL) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
AgentInfo* agent_info = (AgentInfo*)data;
|
||||
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
|
||||
if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
|
||||
agent_info->coarse_region = region;
|
||||
}
|
||||
|
||||
if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
|
||||
agent_info->kernarg_region = region;
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
// Constructor of the class
|
||||
// Constructor: optionally initializes the HSA runtime, enumerates the agents
// and loads the AqlProfile and Loader API tables.
HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
  hsa_status_t status;

  // The runtime is initialized here only when the caller asked for it.
  if (initialize_hsa_) {
    status = hsa_init();
    CHECK_STATUS("Error in hsa_init", status);
  }

  // Every agent found is passed to GetHsaAgentsCallback with this object.
  status = hsa_iterate_agents(GetHsaAgentsCallback, this);
  CHECK_STATUS("Error Calling hsa_iterate_agents", status);

  // AqlProfile API table: loaded from the library directly when
  // ROCP_LD_AQLPROFILE is defined, otherwise queried as an HSA extension.
  aqlprofile_api_ = {0};
#ifdef ROCP_LD_AQLPROFILE
  status = LoadAqlProfileLib(&aqlprofile_api_);
#else
  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
#endif
  CHECK_STATUS("aqlprofile API table load failed", status);

  // Loader API table
  loader_api_ = {0};
  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
  CHECK_STATUS("loader API table query failed", status);
}
|
||||
|
||||
// Destructor of the class
|
||||
// Destructor: frees the AgentInfo objects and shuts the HSA runtime down if
// this object initialized it.
HsaRsrcFactory::~HsaRsrcFactory() {
  for (auto* cpu_info : cpu_list_) {
    delete cpu_info;
  }
  for (auto* gpu_info : gpu_list_) {
    delete gpu_info;
  }
  if (!initialize_hsa_) return;
  hsa_status_t status = hsa_shut_down();
  CHECK_STATUS("Error in hsa_shut_down", status);
}
|
||||
|
||||
// Opens the AqlProfile library with dlopen() and fills 'api' with the
// addresses returned by dlsym() for each entry point. Returns
// HSA_STATUS_ERROR if the library cannot be opened, HSA_STATUS_SUCCESS
// otherwise; individual dlsym() results are not checked.
hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
  void* handle = dlopen(kAqlProfileLib, RTLD_NOW);
  if (handle == NULL) {
    fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror());
    return HSA_STATUS_ERROR;
  }
  dlerror(); /* Clear any existing error */

// Resolves one entry point by name and stores it in the same-named table slot.
#define ROCTRACER_LOAD_AQLPROFILE_SYM(sym) \
  api->sym = (decltype(::sym)*)dlsym(handle, #sym)

  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_error_string);
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_validate_event);
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_start);
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_stop);
#if AQL_PROFILE_READ_API_ENABLE
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_read);
#endif  // AQL_PROFILE_READ_API_ENABLE
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_legacy_get_pm4);
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_get_info);
  ROCTRACER_LOAD_AQLPROFILE_SYM(hsa_ven_amd_aqlprofile_iterate_data);

#undef ROCTRACER_LOAD_AQLPROFILE_SYM

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
// Add system agent info
|
||||
// Creates and registers an AgentInfo for a CPU or GPU agent.
// Returns the new entry, or NULL when the agent is neither a CPU nor a GPU.
const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
  // Query the device type to decide which list the agent belongs to.
  hsa_device_type_t type;
  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
  CHECK_STATUS("Error Calling hsa_agent_get_info", status);

  AgentInfo* agent_info = NULL;

  if (type == HSA_DEVICE_TYPE_CPU) {
    agent_info = new AgentInfo{};
    agent_info->dev_id = agent;
    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
    agent_info->dev_index = cpu_list_.size();

    // A CPU agent gets a standard pool and a kernarg pool, no GPU pool.
    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status);
    status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status);
    agent_info->gpu_pool = {};

    cpu_list_.push_back(agent_info);
    cpu_agents_.push_back(agent);
  } else if (type == HSA_DEVICE_TYPE_GPU) {
    agent_info = new AgentInfo{};
    agent_info->dev_id = agent;
    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;

    // gfxip is the first four characters of the agent name.
    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
    strncpy(agent_info->gfxip, agent_info->name, 4);
    agent_info->gfxip[4] = '\0';

    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num);
    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se);

    // A GPU agent gets only a standard (GPU) pool.
    agent_info->cpu_pool = {};
    agent_info->kern_arg_pool = {};
    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
#if 0
    // Initialize memory regions to zero
    agent_info->kernarg_region.handle = 0;
    agent_info->coarse_region.handle = 0;
    // Find and Bind Memory regions of the Gpu agent
    hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info);
#endif

    // The GPU index is the position in the GPU list.
    agent_info->dev_index = gpu_list_.size();
    gpu_list_.push_back(agent_info);
    gpu_agents_.push_back(agent);
  }

  // Index the new entry by agent handle for GetAgentInfo().
  if (agent_info != NULL) {
    agent_map_[agent.handle] = agent_info;
  }

  return agent_info;
}
|
||||
|
||||
// Return systen agent info
|
||||
const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
|
||||
const AgentInfo* agent_info = NULL;
|
||||
auto it = agent_map_.find(agent.handle);
|
||||
if (it != agent_map_.end()) {
|
||||
agent_info = it->second;
|
||||
}
|
||||
return agent_info;
|
||||
}
|
||||
|
||||
// Get the count of Hsa Gpu Agents available on the platform
|
||||
//
|
||||
// @return uint32_t Number of Gpu agents on platform
|
||||
//
|
||||
uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); }
|
||||
|
||||
// Get the count of Hsa Cpu Agents available on the platform
|
||||
//
|
||||
// @return uint32_t Number of Cpu agents on platform
|
||||
//
|
||||
uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); }
|
||||
|
||||
// Get the AgentInfo handle of a Gpu device
|
||||
//
|
||||
// @param idx Gpu Agent at specified index
|
||||
//
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
|
||||
// Determine if request is valid
|
||||
uint32_t size = uint32_t(gpu_list_.size());
|
||||
if (idx >= size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy AgentInfo from specified index
|
||||
*agent_info = gpu_list_[idx];
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Get the AgentInfo handle of a Cpu device
|
||||
//
|
||||
// @param idx Cpu Agent at specified index
|
||||
//
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
|
||||
// Determine if request is valid
|
||||
uint32_t size = uint32_t(cpu_list_.size());
|
||||
if (idx >= size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy AgentInfo from specified index
|
||||
*agent_info = cpu_list_[idx];
|
||||
return true;
|
||||
}
|
||||
|
||||
// Create a Queue object and return its handle. The queue object is expected
|
||||
// to support user requested number of Aql dispatch packets.
|
||||
//
|
||||
// @param agent_info Gpu Agent on which to create a queue object
|
||||
//
|
||||
// @param num_Pkts Number of packets to be held by queue
|
||||
//
|
||||
// @param queue Output parameter updated with handle of queue object
|
||||
//
|
||||
// @return bool true if successful, false otherwise
|
||||
//
|
||||
bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
|
||||
hsa_queue_t** queue) {
|
||||
hsa_status_t status;
|
||||
status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
|
||||
UINT32_MAX, UINT32_MAX, queue);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Create a Signal object and return its handle.
|
||||
// @param value Initial value of signal object
|
||||
// @param signal Output parameter updated with handle of signal object
|
||||
// @return bool true if successful, false otherwise
|
||||
bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
|
||||
hsa_status_t status;
|
||||
status = hsa_signal_create(value, 0, NULL, signal);
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
// Allocate memory for use by a kernel of specified size in specified
|
||||
// agent's memory region.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
uint8_t* buffer = NULL;
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, (void**)&buffer);
|
||||
// Only GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
hsa_agent_t agents_list[1] = {agent_info->dev_id};
|
||||
status = hsa_amd_agents_allow_access(1, agents_list, NULL, buffer);
|
||||
}
|
||||
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
printf("AllocateLocalMemory %p\n", ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Allocate memory to pass kernel parameters.
|
||||
// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
uint8_t* buffer = NULL;
|
||||
if (!cpu_agents_.empty()) {
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, (void**)&buffer);
|
||||
// Both the CPU and GPU can access the kernel arguments
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
auto agents_vec = cpu_agents_;
|
||||
agents_vec.push_back(agent_info->dev_id);
|
||||
status = hsa_amd_agents_allow_access(agents_vec.size(), &agents_vec[0], NULL, buffer);
|
||||
}
|
||||
}
|
||||
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
printf("AllocateKernargMemory %p\n", ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Allocate system memory accessible by both CPU and GPU
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
uint8_t* buffer = NULL;
|
||||
if (!cpu_agents_.empty()) {
|
||||
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
|
||||
status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, (void**)&buffer);
|
||||
// Both the CPU and GPU can access the memory
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
auto agents_vec = cpu_agents_;
|
||||
agents_vec.push_back(agent_info->dev_id);
|
||||
status = hsa_amd_agents_allow_access(agents_vec.size(), &agents_vec[0], NULL, buffer);
|
||||
}
|
||||
}
|
||||
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
|
||||
printf("AllocateSysMemory %p\n", ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Copy data from GPU to host memory
|
||||
bool HsaRsrcFactory::CopyToHost(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
if (!cpu_agents_.empty()) {
|
||||
hsa_signal_t s = {};
|
||||
hsa_status_t status = hsa_signal_create(1, 0, NULL, &s);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
|
||||
status = HSA_STATUS_ERROR;
|
||||
}
|
||||
}
|
||||
status = hsa_signal_destroy(s);
|
||||
}
|
||||
}
|
||||
return (status == HSA_STATUS_SUCCESS);
|
||||
}
|
||||
bool HsaRsrcFactory::CopyToHost(const AgentInfo* agent_info, void* dst, const void* src, size_t size) {
|
||||
return CopyToHost(agent_info->dev_id, dst, src, size);
|
||||
}
|
||||
|
||||
// Loads an Assembled Brig file and Finalizes it into Device Isa
|
||||
// @param agent_info Gpu device for which to finalize
|
||||
// @param brig_path File path of the Assembled Brig file
|
||||
// @param kernel_name Name of the kernel to finalize
|
||||
// @param code_desc Handle of finalized Code Descriptor that could
|
||||
// be used to submit for execution
|
||||
// @return bool true if successful, false otherwise
|
||||
bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
|
||||
const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
|
||||
// Build the code object filename
|
||||
std::string filename(brig_path);
|
||||
std::clog << "Code object filename: " << filename << std::endl;
|
||||
|
||||
// Open the file containing code object
|
||||
hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
|
||||
if (file_handle == -1) {
|
||||
std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
|
||||
assert(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create code object reader
|
||||
hsa_code_object_reader_t code_obj_rdr = {0};
|
||||
status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create executable.
|
||||
status = hsa_executable_create_alt(HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
|
||||
CHECK_STATUS("Error in creating executable object", status);
|
||||
|
||||
// Load code object.
|
||||
status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
|
||||
code_obj_rdr, NULL, NULL);
|
||||
CHECK_STATUS("Error in loading executable object", status);
|
||||
|
||||
// Freeze executable.
|
||||
status = hsa_executable_freeze(*executable, "");
|
||||
CHECK_STATUS("Error in freezing executable object", status);
|
||||
|
||||
// Get symbol handle.
|
||||
hsa_executable_symbol_t kernelSymbol;
|
||||
status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
|
||||
&kernelSymbol);
|
||||
CHECK_STATUS("Error in looking up kernel symbol", status);
|
||||
|
||||
// Update output parameter
|
||||
*code_desc = kernelSymbol;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
|
||||
std::clog << header << " :" << std::endl;
|
||||
|
||||
const AgentInfo* agent_info;
|
||||
int size = uint32_t(gpu_list_.size());
|
||||
for (int idx = 0; idx < size; idx++) {
|
||||
agent_info = gpu_list_[idx];
|
||||
|
||||
std::clog << "> agent[" << idx << "] :" << std::endl;
|
||||
std::clog << ">> Name : " << agent_info->name << std::endl;
|
||||
std::clog << ">> APU : " << agent_info->is_apu << std::endl;
|
||||
std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
|
||||
std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
|
||||
std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
|
||||
// std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
|
||||
std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
|
||||
std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
|
||||
std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
|
||||
std::clog << ">> SE number : " << agent_info->se_num << std::endl;
|
||||
std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Submit one packet to the given queue.
// Copies 0x40 bytes from packet into the queue slot selected by the reserved
// write index and signals the queue doorbell with that index.
// @param queue Queue to submit to
// @param packet Pointer to the packet data to copy into the queue
// @return uint64_t Write index that was used for the packet
// NOTE(review): the write index is loaded and then stored as two separate
// relaxed operations - confirm callers never submit to one queue concurrently.
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, void* packet) {
  // Size of one queue slot in bytes
  const uint32_t slot_size_b = 0x40;

  // Advance the command queue write index, then wait for the slot to be free
  const uint64_t reserved_idx = hsa_queue_load_write_index_relaxed(queue);
  hsa_queue_store_write_index_relaxed(queue, reserved_idx + 1);
  for (;;) {
    const uint64_t read_idx = hsa_queue_load_read_index_relaxed(queue);
    if ((reserved_idx - read_idx) < queue->size) break;
    sched_yield();
  }

  // Locate the destination slot inside the queue ring buffer
  const uint32_t slot_idx = (uint32_t)(reserved_idx % queue->size);
  uint32_t* const dst_words =
      (uint32_t*)((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
  const uint32_t* const src_words = (const uint32_t*)packet;

  // Copy everything except the first dword, then publish the header dword last
  // with release ordering. This prevents the slot from being read until it is
  // fully written.
  memcpy(&dst_words[1], &src_words[1], slot_size_b - sizeof(uint32_t));
  std::atomic<uint32_t>* header_word = reinterpret_cast<std::atomic<uint32_t>*>(&dst_words[0]);
  header_word->store(src_words[0], std::memory_order_release);

  // Ring the doorbell
  hsa_signal_store_relaxed(queue->doorbell_signal, reserved_idx);

  return reserved_idx;
}
|
||||
|
||||
// Storage for the singleton pointer (initially empty) and for the mutex that
// Create() and Destroy() lock.
HsaRsrcFactory* HsaRsrcFactory::instance_ = nullptr;
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
|
||||
|
||||
} // namespace util
|
||||
} // namespace roctracer
|
||||
@@ -0,0 +1,283 @@
|
||||
/**********************************************************************
|
||||
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this list of
|
||||
conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
conditions and the following disclaimer in the documentation and/or
|
||||
other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
********************************************************************/
|
||||
|
||||
#ifndef _HSA_RSRC_FACTORY_H_
|
||||
#define _HSA_RSRC_FACTORY_H_
|
||||
|
||||
#include <hsa.h>
|
||||
#include <hsa_ext_amd.h>
|
||||
#include <hsa_ext_finalize.h>
|
||||
#include <hsa_ven_amd_aqlprofile.h>
|
||||
#include <hsa_ven_amd_loader.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define HSA_ARGUMENT_ALIGN_BYTES 16
|
||||
#define HSA_QUEUE_ALIGN_BYTES 64
|
||||
#define HSA_PACKET_ALIGN_BYTES 64
|
||||
|
||||
// Terminate the process with a diagnostic unless 'status' is HSA_STATUS_SUCCESS.
// Prints "<msg>: <HSA status string>" to stdout and calls exit(1).
// Wrapped in do { } while (0) so the macro behaves as a single statement and
// can be used safely in unbraced if/else branches; invoke it with a trailing ';'.
#define CHECK_STATUS(msg, status)                                 \
  do {                                                            \
    if ((status) != HSA_STATUS_SUCCESS) {                         \
      const char* emsg = 0;                                       \
      hsa_status_string(status, &emsg);                           \
      printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>");   \
      exit(1);                                                    \
    }                                                             \
  } while (0)
|
||||
|
||||
// Terminate the process with a diagnostic unless 'status' is HSA_STATUS_INFO_BREAK,
// the value an iteration is expected to end with when its callback stops it.
// Prints "<msg>: <HSA status string>" to stdout and calls exit(1).
// Wrapped in do { } while (0) so the macro behaves as a single statement and
// can be used safely in unbraced if/else branches; invoke it with a trailing ';'.
#define CHECK_ITER_STATUS(msg, status)                            \
  do {                                                            \
    if ((status) != HSA_STATUS_INFO_BREAK) {                      \
      const char* emsg = 0;                                       \
      hsa_status_string(status, &emsg);                           \
      printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>");   \
      exit(1);                                                    \
    }                                                             \
  } while (0)
|
||||
|
||||
namespace roctracer {
|
||||
namespace util {
|
||||
// Allocation granularity in bytes and the matching low-bit mask used to round
// sizes up to a page multiple: (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK.
// Both are size_t on purpose: with a 32-bit unsigned mask, ~MEM_PAGE_MASK is
// zero-extended when combined with a 64-bit size, which clears the upper
// 32 bits of any size of 4 GiB and above.
static const size_t MEM_PAGE_BYTES = 0x1000;
static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
|
||||
// Integer type of the 'handle' field of hsa_agent_t
using hsa_agent_handle_t = decltype(hsa_agent_t::handle);
|
||||
|
||||
// Encapsulates information about a Hsa Agent such as its
|
||||
// handle, name, max queue size, max wavefront size, etc.
|
||||
struct AgentInfo {
|
||||
// Handle of Agent
|
||||
hsa_agent_t dev_id;
|
||||
|
||||
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
|
||||
uint32_t dev_type;
|
||||
|
||||
// APU flag
|
||||
bool is_apu;
|
||||
|
||||
// Agent system index
|
||||
uint32_t dev_index;
|
||||
|
||||
// GFXIP name
|
||||
char gfxip[64];
|
||||
|
||||
// Name of Agent whose length is less than 64
|
||||
char name[64];
|
||||
|
||||
// Max size of Wavefront size
|
||||
uint32_t max_wave_size;
|
||||
|
||||
// Max size of Queue buffer
|
||||
uint32_t max_queue_size;
|
||||
|
||||
// Hsail profile supported by agent
|
||||
hsa_profile_t profile;
|
||||
#if 0
|
||||
// Memory region supporting kernel parameters
|
||||
hsa_region_t coarse_region;
|
||||
|
||||
// Memory region supporting kernel arguments
|
||||
hsa_region_t kernarg_region;
|
||||
#endif
|
||||
// CPU/GPU/kern-arg memory pools
|
||||
hsa_amd_memory_pool_t cpu_pool;
|
||||
hsa_amd_memory_pool_t gpu_pool;
|
||||
hsa_amd_memory_pool_t kern_arg_pool;
|
||||
|
||||
// The number of compute unit available in the agent.
|
||||
uint32_t cu_num;
|
||||
|
||||
// Maximum number of waves possible in a Compute Unit.
|
||||
uint32_t waves_per_cu;
|
||||
|
||||
// Number of SIMD's per compute unit CU
|
||||
uint32_t simds_per_cu;
|
||||
|
||||
// Number of Shader Engines (SE) in Gpu
|
||||
uint32_t se_num;
|
||||
|
||||
// Number of Shader Arrays Per Shader Engines in Gpu
|
||||
uint32_t shader_arrays_per_se;
|
||||
};
|
||||
|
||||
class HsaRsrcFactory {
|
||||
public:
|
||||
typedef std::recursive_mutex mutex_t;
|
||||
|
||||
static HsaRsrcFactory* Create(bool initialize_hsa = true) {
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
if (instance_ == NULL) {
|
||||
instance_ = new HsaRsrcFactory(initialize_hsa);
|
||||
}
|
||||
return instance_;
|
||||
}
|
||||
|
||||
static HsaRsrcFactory& Instance() {
|
||||
if (instance_ == NULL) instance_ = Create(false);
|
||||
hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
|
||||
CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
|
||||
return *instance_;
|
||||
}
|
||||
|
||||
static void Destroy() {
|
||||
std::lock_guard<mutex_t> lck(mutex_);
|
||||
if (instance_) delete instance_;
|
||||
instance_ = NULL;
|
||||
}
|
||||
|
||||
// Return system agent info
|
||||
const AgentInfo* GetAgentInfo(const hsa_agent_t agent);
|
||||
|
||||
// Get the count of Hsa Gpu Agents available on the platform
|
||||
// @return uint32_t Number of Gpu agents on platform
|
||||
uint32_t GetCountOfGpuAgents();
|
||||
|
||||
// Get the count of Hsa Cpu Agents available on the platform
|
||||
// @return uint32_t Number of Cpu agents on platform
|
||||
uint32_t GetCountOfCpuAgents();
|
||||
|
||||
// Get the AgentInfo handle of a Gpu device
|
||||
// @param idx Gpu Agent at specified index
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
// @return bool true if successful, false otherwise
|
||||
bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
|
||||
|
||||
// Get the AgentInfo handle of a Cpu device
|
||||
// @param idx Cpu Agent at specified index
|
||||
// @param agent_info Output parameter updated with AgentInfo
|
||||
// @return bool true if successful, false otherwise
|
||||
bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
|
||||
|
||||
// Create a Queue object and return its handle. The queue object is expected
|
||||
// to support user requested number of Aql dispatch packets.
|
||||
// @param agent_info Gpu Agent on which to create a queue object
|
||||
// @param num_Pkts Number of packets to be held by queue
|
||||
// @param queue Output parameter updated with handle of queue object
|
||||
// @return bool true if successful, false otherwise
|
||||
bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
|
||||
|
||||
// Create a Signal object and return its handle.
|
||||
// @param value Initial value of signal object
|
||||
// @param signal Output parameter updated with handle of signal object
|
||||
// @return bool true if successful, false otherwise
|
||||
bool CreateSignal(uint32_t value, hsa_signal_t* signal);
|
||||
|
||||
// Allocate local GPU memory
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Allocate memory tp pass kernel parameters
|
||||
// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Allocate system memory accessible from both CPU and GPU
|
||||
// Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored.
|
||||
// @param agent_info Agent from whose memory region to allocate
|
||||
// @param size Size of memory in terms of bytes
|
||||
// @return uint8_t* Pointer to buffer, null if allocation fails.
|
||||
uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size);
|
||||
|
||||
// Copy data from GPU to host memory
|
||||
bool CopyToHost(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
|
||||
bool CopyToHost(const AgentInfo* agent_info, void* dst, const void* src, size_t size);
|
||||
|
||||
// Loads an Assembled Brig file and Finalizes it into Device Isa
|
||||
// @param agent_info Gpu device for which to finalize
|
||||
// @param brig_path File path of the Assembled Brig file
|
||||
// @param kernel_name Name of the kernel to finalize
|
||||
// @param code_desc Handle of finalized Code Descriptor that could
|
||||
// be used to submit for execution
|
||||
// @return true if successful, false otherwise
|
||||
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
|
||||
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
|
||||
|
||||
// Print the various fields of Hsa Gpu Agents
|
||||
bool PrintGpuAgents(const std::string& header);
|
||||
|
||||
// Submit AQL packet to given queue
|
||||
static uint64_t Submit(hsa_queue_t* queue, void* packet);
|
||||
|
||||
// Return AqlProfile API table
|
||||
typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
|
||||
const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
|
||||
|
||||
// Return Loader API table
|
||||
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
|
||||
|
||||
private:
|
||||
// System agents iterating callback
|
||||
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
|
||||
|
||||
// Callback function to find and bind kernarg region of an agent
|
||||
static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
|
||||
|
||||
// Load AQL profile HSA extension library directly
|
||||
static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
|
||||
|
||||
// Constructor of the class. Will initialize the Hsa Runtime and
|
||||
// query the system topology to get the list of Cpu and Gpu devices
|
||||
HsaRsrcFactory(bool initialize_hsa);
|
||||
|
||||
// Destructor of the class
|
||||
~HsaRsrcFactory();
|
||||
|
||||
// HSA was initialized
|
||||
const bool initialize_hsa_;
|
||||
|
||||
// Add an instance of AgentInfo representing a Hsa Gpu agent
|
||||
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
|
||||
|
||||
static HsaRsrcFactory* instance_;
|
||||
static mutex_t mutex_;
|
||||
|
||||
// Used to maintain a list of Hsa Gpu Agent Info
|
||||
std::vector<const AgentInfo*> gpu_list_;
|
||||
std::vector<hsa_agent_t> gpu_agents_;
|
||||
|
||||
// Used to maintain a list of Hsa Cpu Agent Info
|
||||
std::vector<const AgentInfo*> cpu_list_;
|
||||
std::vector<hsa_agent_t> cpu_agents_;
|
||||
|
||||
// System agents map
|
||||
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
|
||||
|
||||
// AqlProfile API table
|
||||
aqlprofile_pfn_t aqlprofile_api_;
|
||||
|
||||
// Loader API table
|
||||
hsa_ven_amd_loader_1_00_pfn_t loader_api_;
|
||||
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace roctracer
|
||||
|
||||
#endif // _HSA_RSRC_FACTORY_H_
|
||||
@@ -0,0 +1,169 @@
|
||||
#ifndef SRC_UTIL_LOGGER_H_
|
||||
#define SRC_UTIL_LOGGER_H_
|
||||
|
||||
#include <time.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/file.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <exception>
|
||||
#include <mutex>
|
||||
#include <map>
|
||||
|
||||
namespace roctracer {
|
||||
namespace util {
|
||||
|
||||
// Singleton logger with stream-style insertion.
// The first value inserted while not streaming is written with a
// "<timestamp pid tid>" prefix; following values are appended unprefixed until
// streaming is reset. Text inserted between begm and endl is additionally
// collected per thread id and can be read back with LastMessage().
class Logger {
 public:
  typedef std::recursive_mutex mutex_t;

  // Format a value and emit it: prefixed through Log() when no message is
  // being streamed yet, appended through Put() otherwise.
  template <typename T> Logger& operator<<(const T& m) {
    std::ostringstream formatter;
    formatter << m;
    const std::string text = formatter.str();
    if (streaming_) {
      Put(text);
    } else {
      Log(text);
    }
    streaming_ = true;
    return *this;
  }

  // Manipulator support: calls the given function (e.g. begm, endl)
  typedef void (*manip_t)();
  Logger& operator<<(manip_t f) {
    f();
    return *this;
  }

  // Start collecting a message for the calling thread
  static void begm() { Instance().ResetStreaming(true); }
  // Terminate the current line and stop collecting
  static void endl() { Instance().ResetStreaming(false); }

  // Return the message most recently collected for the calling thread.
  // The lock is held only while the map entry is looked up.
  static const std::string& LastMessage() {
    Logger& self = Instance();
    std::lock_guard<mutex_t> guard(mutex_);
    return self.message_[GetTid()];
  }

  // Return the singleton, constructing it under the class mutex on first use
  static Logger* Create() {
    std::lock_guard<mutex_t> guard(mutex_);
    if (!instance_) {
      instance_ = new Logger();
    }
    return instance_;
  }

  // Delete the singleton (if any) under the class mutex and reset the pointer
  static void Destroy() {
    std::lock_guard<mutex_t> guard(mutex_);
    delete instance_;
    instance_ = NULL;
  }

  // Return a reference to the singleton, creating it when necessary
  static Logger& Instance() { return *Create(); }

 private:
  // Process and thread ids obtained through raw syscalls
  static uint32_t GetPid() { return syscall(__NR_getpid); }
  static uint32_t GetTid() { return syscall(__NR_gettid); }

  // File logging is enabled only when ROCTRACER_LOG is set in the environment.
  // NOTE(review): the variable's value is not used as the file name; the log is
  // always appended to /tmp/roctracer_log.txt - confirm this is intended.
  Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) {
    if (getenv("ROCTRACER_LOG") != NULL) {
      file_ = fopen("/tmp/roctracer_log.txt", "a");
    }
    ResetStreaming(false);
  }

  // Terminate a pending line, then close the log file
  ~Logger() {
    if (file_ == NULL) return;
    if (dirty_) Put("\n");
    fclose(file_);
  }

  // Switch between collecting (messaging == true) and idle state.
  // Entering collecting state clears the calling thread's stored message;
  // leaving a streamed message emits the terminating newline.
  void ResetStreaming(const bool messaging) {
    std::lock_guard<mutex_t> guard(mutex_);
    if (messaging) {
      message_[GetTid()] = "";
    } else if (streaming_) {
      Put("\n");
      dirty_ = false;
    }
    messaging_ = messaging;
    streaming_ = messaging;
  }

  // Append text to the calling thread's message (while collecting) and to the
  // log file (when open). The file write is bracketed by flock() calls.
  void Put(const std::string& m) {
    std::lock_guard<mutex_t> guard(mutex_);
    if (messaging_) {
      message_[GetTid()] += m;
    }
    if (file_ == NULL) return;
    dirty_ = true;
    const int fd = fileno(file_);
    flock(fd, LOCK_EX);
    fprintf(file_, "%s", m.c_str());
    fflush(file_);
    flock(fd, LOCK_UN);
  }

  // Emit text prefixed with "<YYYY-MM-DD HH:MM:SS pidN tidM> "
  void Log(const std::string& m) {
    const time_t now = time(NULL);
    tm local_tm;
    localtime_r(&now, &local_tm);
    char stamp[26];
    strftime(stamp, sizeof(stamp), "%Y-%m-%d %H:%M:%S", &local_tm);

    std::ostringstream line;
    line << "<" << stamp << std::dec << " pid" << GetPid() << " tid" << GetTid() << "> " << m;
    Put(line.str());
  }

  // Log file, NULL when file logging is disabled
  FILE* file_;
  // True when text was written to the file since the last line termination
  bool dirty_;
  // True while a message is being streamed (no prefix on further insertions)
  bool streaming_;
  // True while inserted text is collected into message_
  bool messaging_;

  static mutex_t mutex_;
  static Logger* instance_;
  // Collected message per thread id
  std::map<uint32_t, std::string> message_;
};
|
||||
|
||||
} // namespace util
|
||||
} // namespace roctracer
|
||||
|
||||
// Log 'stream' as an error. The "error: " prefix is written before begm, so it
// is not part of the text returned by Logger::LastMessage().
#define ERR_LOGGING(stream)                                              \
  {                                                                      \
    roctracer::util::Logger::Instance()                                  \
        << "error: " << roctracer::util::Logger::begm << stream          \
        << roctracer::util::Logger::endl;                                \
  }

// Log 'stream' as an informational message.
#define INFO_LOGGING(stream)                                             \
  {                                                                      \
    roctracer::util::Logger::Instance()                                  \
        << "info: " << roctracer::util::Logger::begm << stream           \
        << roctracer::util::Logger::endl;                                \
  }

// Print 'stream' to std::cerr and log it as a warning. Note that 'stream' is
// expanded, and therefore evaluated, twice.
// NOTE(review): the std::cerr prefix reads "ROCProfiler: " although this is the
// roctracer logger - confirm whether that is intended.
#define WARN_LOGGING(stream)                                             \
  {                                                                      \
    std::cerr << "ROCProfiler: " << stream << std::endl;                 \
    roctracer::util::Logger::Instance()                                  \
        << "warning: " << roctracer::util::Logger::begm << stream        \
        << roctracer::util::Logger::endl;                                \
  }
|
||||
|
||||
#ifdef DEBUG
// Log 'stream' as a quoted debug message followed by the function, file and
// line of the call site. Only defined when DEBUG is defined.
// The insertion chain uses a single '<<' between the closing quote and the
// " in " literal; the previous token sequence there did not compile.
#define DBG_LOGGING(stream)                                                              \
  {                                                                                      \
    roctracer::util::Logger::Instance() << roctracer::util::Logger::begm << "debug: \"" \
                                        << stream << "\""                                \
                                        << " in " << __FUNCTION__ << " at " << __FILE__  \
                                        << " line " << __LINE__                          \
                                        << roctracer::util::Logger::endl;                \
  }
#endif
|
||||
|
||||
#endif // SRC_UTIL_LOGGER_H_
|
||||
@@ -0,0 +1,36 @@
|
||||
# Build the MatrixTranspose HIP sample, link it against the roctracer library
# and run it.
ROOT_PATH = ../..
LIB_PATH = $(ROOT_PATH)/b
LIB_NAME = roctracer64

# Toolchain locations. The defaults are the original developer paths; override
# them from the environment or the command line, e.g.
#   make HCC_HOME=/opt/rocm/hcc HIP_PATH=/opt/rocm/hip
HCC_HOME ?= /home/evgeny/git/compute/out/ubuntu-16.04/16.04/hcc
export HCC_HOME
HIP_PATH ?= /home/evgeny/git/compute/external/hip/hip
HIPCC = $(HIP_PATH)/bin/hipcc

SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)

EXECUTABLE = ./MatrixTranspose
# The test run must find the roctracer library built under LIB_PATH
export LD_LIBRARY_PATH=$(LIB_PATH)

.PHONY: all test clean

all: $(EXECUTABLE) test

# Objects are built by the implicit %.o: %.cpp rule using CXX and CXXFLAGS
CXXFLAGS = -g -DCOMPILE_HIP_ATP_MARKER=1 -I$(ROOT_PATH)
CXX = $(HIPCC)

$(EXECUTABLE): $(OBJECTS)
	$(HIPCC) $(OBJECTS) -o $@ -L$(HCC_HOME)/lib -lmcwamp_hsa -L$(LIB_PATH) -l$(LIB_NAME)

test: $(EXECUTABLE)
	HCC_PROFILE=1 $(EXECUTABLE)

clean:
	rm -f $(EXECUTABLE)
	rm -f $(OBJECTS)
	rm -f $(HIP_PATH)/src/*.o
|
||||
|
||||
Executable
BIN
Plik binarny nie jest wyświetlany.
@@ -0,0 +1,269 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

// hip header file
#include "hip/hip_runtime.h"
#include "hip/hip_cbstr.h"
#include "inc/roctracer.h"
|
||||
|
||||
#define WIDTH 1024
|
||||
|
||||
|
||||
#define NUM (WIDTH * WIDTH)
|
||||
|
||||
#define THREADS_PER_BLOCK_X 4
|
||||
#define THREADS_PER_BLOCK_Y 4
|
||||
#define THREADS_PER_BLOCK_Z 1
|
||||
|
||||
// Device (Kernel) function, it must be void
|
||||
// hipLaunchParm provides the execution configuration
|
||||
__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
|
||||
out[y * width + x] = in[x * width + y];
|
||||
}
|
||||
|
||||
// Host-side reference transpose used to validate the GPU result.
//
//   output - destination buffer, at least width * width floats
//   input  - source buffer, at least width * width floats
//   width  - side length of the square matrix
//
// Element input[row * width + col] is written to output[col * width + row].
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
  for (unsigned int row = 0; row < width; ++row) {
    const float* src_row = input + row * width;
    for (unsigned int col = 0; col < width; ++col) {
      output[col * width + row] = src_row[col];
    }
  }
}
|
||||
|
||||
void init_tracing();
|
||||
void finish_tracing();
|
||||
|
||||
int main() {
|
||||
init_tracing();
|
||||
|
||||
float* Matrix;
|
||||
float* TransposeMatrix;
|
||||
float* cpuTransposeMatrix;
|
||||
|
||||
float* gpuMatrix;
|
||||
float* gpuTransposeMatrix;
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
hipGetDeviceProperties(&devProp, 0);
|
||||
|
||||
std::cout << "Device name " << devProp.name << std::endl;
|
||||
|
||||
int i;
|
||||
int errors;
|
||||
|
||||
int iterations = 10;
|
||||
begin:
|
||||
|
||||
Matrix = (float*)malloc(NUM * sizeof(float));
|
||||
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
|
||||
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
Matrix[i] = (float)i * 10.0f;
|
||||
}
|
||||
|
||||
// allocate the memory on the device side
|
||||
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
|
||||
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
|
||||
|
||||
// Memory transfer from host to device
|
||||
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
|
||||
|
||||
// Lauching kernel from host
|
||||
hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
|
||||
gpuMatrix, WIDTH);
|
||||
|
||||
// Memory transfer from device to host
|
||||
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
|
||||
|
||||
// CPU MatrixTranspose computation
|
||||
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
|
||||
|
||||
// verify the results
|
||||
errors = 0;
|
||||
double eps = 1.0E-6;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
printf("FAILED: %d errors\n", errors);
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
||||
// free the resources on device side
|
||||
hipFree(gpuMatrix);
|
||||
hipFree(gpuTransposeMatrix);
|
||||
|
||||
// free the resources on host side
|
||||
free(Matrix);
|
||||
free(TransposeMatrix);
|
||||
free(cpuTransposeMatrix);
|
||||
|
||||
if ((errors == 0) && (--iterations != 0)) goto begin;
|
||||
|
||||
finish_tracing();
|
||||
return errors;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// HIP Callbacks/Activity tracing
|
||||
//
|
||||
|
||||
// Macro to check ROC-tracer calls status.
// Evaluates `call` exactly once; on a non-zero status it prints
// roctracer_error_string() to std::cerr and aborts the process.
// The argument is parenthesized and the status variable carries a
// macro-specific name so that it cannot shadow a variable used in `call`.
#define ROCTRACER_CALL(call)                                       \
  do {                                                             \
    const int roctracer_call_status_ = (call);                     \
    if (roctracer_call_status_ != 0) {                             \
      std::cerr << roctracer_error_string() << std::endl;          \
      abort();                                                     \
    }                                                              \
  } while (0)
|
||||
|
||||
// Local definition of the kernel-symbol type that hip_api_callback()
// dereferences (hipModuleLaunchKernel.f->_name) to print the kernel name.
// NOTE(review): presumably this has to mirror the HIP runtime's own definition
// field for field -- confirm against the HIP version in use before changing it.
struct ihipModuleSymbol_t {
  // The kernel object.
  uint64_t _object;
  uint32_t _groupSegmentSize;
  uint32_t _privateSegmentSize;
  // TODO - review for performance cost. Name is just used for debug.
  std::string _name;
};
|
||||
|
||||
// HIP API callback function
|
||||
extern "C" void hip_api_callback(
|
||||
uint32_t domain,
|
||||
uint32_t cid,
|
||||
const void* callback_data,
|
||||
void* arg)
|
||||
{
|
||||
(void)arg;
|
||||
const hip_cb_data_t* data = reinterpret_cast<const hip_cb_data_t*>(callback_data);
|
||||
fprintf(stdout, "<%s id(%u)\tcorrelation_id(%u) %s> ",
|
||||
data->name,
|
||||
cid,
|
||||
data->correlation_id,
|
||||
(data->phase == ROCTRACER_API_PHASE_ENTER) ? "on-enter" : "on-exit");
|
||||
if (data->phase == ROCTRACER_API_PHASE_ENTER) {
|
||||
switch (cid) {
|
||||
case HIP_API_ID_hipMemcpy:
|
||||
fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)",
|
||||
data->args.hipMemcpy.dst,
|
||||
data->args.hipMemcpy.src,
|
||||
(uint32_t)(data->args.hipMemcpy.sizeBytes),
|
||||
(uint32_t)(data->args.hipMemcpy.kind));
|
||||
break;
|
||||
case HIP_API_ID_hipMalloc:
|
||||
fprintf(stdout, "ptr(%p) size(0x%x)",
|
||||
data->args.hipMalloc.ptr,
|
||||
(uint32_t)(data->args.hipMalloc.sizeBytes));
|
||||
break;
|
||||
case HIP_API_ID_hipFree:
|
||||
fprintf(stdout, "ptr(%p)",
|
||||
data->args.hipFree.ptr);
|
||||
break;
|
||||
case HIP_API_ID_hipModuleLaunchKernel:
|
||||
fprintf(stdout, "kernel(%s) straem(%p)",
|
||||
data->args.hipModuleLaunchKernel.f->_name.c_str(),
|
||||
data->args.hipModuleLaunchKernel.stream);
|
||||
break;
|
||||
case HIP_API_ID_hipLaunchKernel:
|
||||
fprintf(stdout, "kernel(%p) straem(%p)",
|
||||
data->args.hipLaunchKernel.kernel,
|
||||
data->args.hipLaunchKernel.stream);
|
||||
break;
|
||||
case HIP_API_ID_hipKernel:
|
||||
fprintf(stdout, "kernel(\"%s\") start(%lu) end(%lu)",
|
||||
data->args.hipKernel.name,
|
||||
data->args.hipKernel.start,
|
||||
data->args.hipKernel.end);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (cid) {
|
||||
case HIP_API_ID_hipMalloc:
|
||||
fprintf(stdout, "*ptr(0x%p)",
|
||||
*(data->args.hipMalloc.ptr));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "\n"); fflush(stdout);
|
||||
}
|
||||
|
||||
// Activity tracing callback
|
||||
// hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067)
|
||||
void activity_callback(const char* begin, const char* end, void* arg) {
|
||||
const roctracer_record_t* record = reinterpret_cast<const roctracer_record_t*>(begin);
|
||||
const roctracer_record_t* next = NULL;
|
||||
ROCTRACER_CALL(roctracer_next_record(record, &next));
|
||||
fprintf(stdout, "\tActivity records:\n"); fflush(stdout);
|
||||
while (reinterpret_cast<const char*>(next) <= end) {
|
||||
fprintf(stdout, "\t%s id(%u)\tcorrelation_id(%lu): begin_ns(%lu) end_ns(%lu)\n",
|
||||
record->name,
|
||||
record->activity_kind,
|
||||
record->correlation_id,
|
||||
record->begin_ns,
|
||||
record->end_ns); fflush(stdout);
|
||||
record = next;
|
||||
ROCTRACER_CALL(roctracer_next_record(record, &next));
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize function
|
||||
void init_tracing() {
|
||||
// Enable HIP API callbacks
|
||||
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMemcpy, hip_api_callback, NULL));
|
||||
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMalloc, hip_api_callback, NULL));
|
||||
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipFree, hip_api_callback, NULL));
|
||||
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipLaunchKernel, hip_api_callback, NULL));
|
||||
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipModuleLaunchKernel, hip_api_callback, NULL));
|
||||
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipKernel, hip_api_callback, NULL));
|
||||
|
||||
// Enable HIP activity tracing
|
||||
roctracer_properties_t properties{};
|
||||
properties.buffer_size = 0x100;
|
||||
properties.buffer_callback_fun = activity_callback;
|
||||
ROCTRACER_CALL(roctracer_open_pool(&properties));
|
||||
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMemcpy));
|
||||
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMalloc));
|
||||
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipFree));
|
||||
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipLaunchKernel));
|
||||
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipModuleLaunchKernel));
|
||||
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipKernel));
|
||||
}
|
||||
|
||||
// Counterpart of init_tracing(): closes the roctracer pool.
// Aborts the process if roctracer reports an error (see ROCTRACER_CALL).
void finish_tracing()
{
  ROCTRACER_CALL(roctracer_close_pool());
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
Plik binarny nie jest wyświetlany.
@@ -0,0 +1,100 @@
|
||||
## Writing your first HIP program
|
||||
|
||||
This tutorial shows how to write a simple HIP application. We will write the simplest Matrix Transpose program.
|
||||
|
||||
## HIP Introduction:
|
||||
|
||||
HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPUs. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your application’s performance on GPU hardware.
|
||||
|
||||
## Requirement:
|
||||
For hardware requirements and software installation, see [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md)
|
||||
|
||||
## Prerequisite knowledge:
|
||||
|
||||
Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. If you are not, don't worry: you have chosen a good place to start. We'll explain everything assuming you are completely new to GPGPU programming.
|
||||
|
||||
## Simple Matrix Transpose
|
||||
|
||||
Here is a simple example showing how to write your first program in HIP.
|
||||
In order to use the HIP framework, we need to add the "hip_runtime.h" header file. Since it is a C++ API, you can add any header file you have been using earlier while writing your C/C++ programs. For GPGPU programming, we have the host (microprocessor) and the device (GPU).
|
||||
|
||||
## Device-side code
|
||||
We will work on the device-side code first. Here is a simple example showing a snippet of HIP device-side code:
|
||||
|
||||
`__global__ void matrixTranspose(hipLaunchParm lp, `
|
||||
` float *out, `
|
||||
` float *in, `
|
||||
` const int width, `
|
||||
` const int height) `
|
||||
`{ `
|
||||
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; `
|
||||
` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; `
|
||||
` `
|
||||
` out[y * width + x] = in[x * height + y]; `
|
||||
`} `
|
||||
|
||||
The `__global__` keyword is a function-type qualifier; it is used with functions that are executed on the device and are called/launched from the host.
|
||||
Other function-type qualifiers are:
|
||||
`__device__` functions are Executed on the device and Called from the device only
|
||||
`__host__` functions are Executed on the host and Called from the host
|
||||
|
||||
`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", which we will talk about later). A possible workaround is to pass the necessary coordinate info as an argument to the function.
|
||||
`__host__` cannot combine with `__global__`.
|
||||
|
||||
`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*.
|
||||
|
||||
Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
|
||||
|
||||
After `hipLaunchParm`, the kernel arguments follow (i.e., `float *out, float *in, const int width, const int height`).
|
||||
|
||||
The kernel function begins with
|
||||
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
|
||||
` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;`
|
||||
Here hipThreadIdx_x, hipThreadIdx_y and hipThreadIdx_z (not used here) are the built-ins that identify a thread within its block; hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z (not used here) identify the block within the grid; and hipBlockDim_x, hipBlockDim_y and hipBlockDim_z (not used here) give the dimensions of the block.
|
||||
|
||||
The rest of the device-side code is ordinary C++ that should already be familiar.
|
||||
|
||||
## Host-side code
|
||||
|
||||
Now, we'll see how to call the kernel from the host. Inside the main() function, we first define the pointers (for both the host side and the device). The declaration of a device pointer is similar to that of a host pointer. Next, we have `hipDeviceProp_t`, the pre-defined struct for HIP device properties. This is followed by `hipGetDeviceProperties(&devProp, 0)`, which is used to extract the device information. The first parameter is the struct; the second parameter is the number of the device to get properties for. The next line prints the name of the device.
|
||||
|
||||
We allocate memory for the Matrix on the host side using malloc and initialize it. To allocate memory on the device side we use `hipMalloc`, which is quite similar to malloc. After this, we copy the data to the allocated memory on the device side using `hipMemcpy`.
|
||||
` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
|
||||
Here the first parameter is the destination pointer, the second is the source pointer, the third is the size of the memory copy, and the last specifies the direction of the memory copy (in this case from host to device). To transfer memory from device to host, use `hipMemcpyDeviceToHost`, and for a device-to-device memory copy use `hipMemcpyDeviceToDevice`.
|
||||
|
||||
Now, we'll see how to launch the kernel.
|
||||
` hipLaunchKernel(matrixTranspose, `
|
||||
` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), `
|
||||
` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), `
|
||||
` 0, 0, `
|
||||
` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); `
|
||||
|
||||
HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
|
||||
- Kernels launch with the `"hipLaunchKernel"` function
|
||||
- The first five parameters to hipLaunchKernel are the following:
|
||||
- **symbol kernelName**: the name of the kernel to launch. To support template kernels which contain "," use the HIP_KERNEL_NAME macro. In the current application it's "matrixTranspose".
|
||||
- **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)".
|
||||
- **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block. In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)".
|
||||
- **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel. In MatrixTranspose sample, it's '0'.
|
||||
- **hipStream_t**: stream where the kernel should execute. A value of 0 corresponds to the NULL stream. In MatrixTranspose sample, it's '0'.
|
||||
- Kernel arguments follow these first five parameters. Here, these are "gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT".
|
||||
|
||||
Next, we'll copy the computed values/data back to the host using `hipMemcpy`. Here the last parameter will be `hipMemcpyDeviceToHost`.
|
||||
|
||||
After copying the data from the device to host memory, we verify it against the result computed by the CPU reference function.
|
||||
|
||||
Finally, we will free the memory allocated earlier by using free() for host while for devices we will use `hipFree`.
|
||||
|
||||
## How to build and run:
|
||||
Use the make command to build the sample, then run the resulting executable (./MatrixTranspose).
|
||||
Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
|
||||
|
||||
## More Info:
|
||||
- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md)
|
||||
- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md)
|
||||
- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP)
|
||||
- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md)
|
||||
- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL)
|
||||
- [hipify-clang](https://github.com/ROCm-Developer-Tools/HIP/hipify-clang/README.md)
|
||||
- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md)
|
||||
- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md)
|
||||
@@ -0,0 +1,31 @@
|
||||
HCC_PROFILE=2 ./MatrixTranspose
|
||||
Device name Device 67df
|
||||
<hipMalloc id(3) correlation_id(1) on-enter> ptr(0x7ffd20812ac8) size(0x400000)
|
||||
<hipMalloc id(3) correlation_id(1) on-exit> *ptr(0x0x901800000)
|
||||
<hipMalloc id(3) correlation_id(2) on-enter> ptr(0x7ffd20812ac0) size(0x400000)
|
||||
<hipMalloc id(3) correlation_id(2) on-exit> *ptr(0x0x901e00000)
|
||||
<hipMemcpy id(101) correlation_id(3) on-enter> dst(0x901800000) src(0x7f8d7de53010) size(0x400000) kind(1)
|
||||
<hipMemcpy id(101) correlation_id(3) on-exit>
|
||||
<hipLaunchKernel id(38) correlation_id(4) on-enter> kernel(0x457010) straem((nil))
|
||||
<hipModuleLaunchKernel id(118) correlation_id(5) on-enter> kernel(_Z15matrixTransposeN8hip_impl17Empty_launch_parmEPfS1_i) straem((nil))
|
||||
<hipModuleLaunchKernel id(118) correlation_id(5) on-exit>
|
||||
<hipLaunchKernel id(38) correlation_id(4) on-exit>
|
||||
<hipMemcpy id(101) correlation_id(6) on-enter> dst(0x7f8d7d567010) src(0x901e00000) size(0x400000) kind(2)
|
||||
<hipKernel id(133) correlation_id(7) on-enter> kernel("_Z15matrixTransposeN8hip_impl17Empty_launch_parmEPfS1_i") start(83839566647773) end(83839568478493)
|
||||
<hipKernel id(133) correlation_id(7) on-exit>
|
||||
<hipMemcpy id(101) correlation_id(6) on-exit>
|
||||
PASSED!
|
||||
<hipFree id(80) correlation_id(8) on-enter> ptr(0x901800000)
|
||||
<hipFree id(80) correlation_id(8) on-exit>
|
||||
<hipFree id(80) correlation_id(9) on-enter> ptr(0x901e00000)
|
||||
<hipFree id(80) correlation_id(9) on-exit>
|
||||
Activity records:
|
||||
hipMalloc id(3) correlation_id(1): begin_ns(1525888625904709559) end_ns(1525888625904912781)
|
||||
hipMalloc id(3) correlation_id(2): begin_ns(1525888625904916451) end_ns(1525888625905101986)
|
||||
hipMemcpy id(101) correlation_id(3): begin_ns(1525888625905109198) end_ns(1525888625906165995)
|
||||
hipModuleLaunchKernel id(118) correlation_id(5): begin_ns(1525888625906184286) end_ns(1525888625906210781)
|
||||
hipLaunchKernel id(38) correlation_id(4): begin_ns(1525888625906171593) end_ns(1525888625906212767)
|
||||
hipKernel id(133) correlation_id(7): begin_ns(1525888625908107416) end_ns(1525888625908114094)
|
||||
hipMemcpy id(101) correlation_id(6): begin_ns(1525888625906214261) end_ns(1525888625908785157)
|
||||
hipFree id(80) correlation_id(8): begin_ns(1525888625924823273) end_ns(1525888625924893910)
|
||||
hipFree id(80) correlation_id(9): begin_ns(1525888625924896385) end_ns(1525888625924922452)
|
||||
Reference in New Issue
Block a user