This commit is contained in:
Evgeny
2018-05-10 13:19:10 -05:00
commit 0a934cff68
26 zmienionych plików z 2812 dodań i 0 usunięć
+103
Wyświetl plik
@@ -0,0 +1,103 @@
################################################################################
##
## Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
##
## MIT LICENSE:
## Permission is hereby granted, free of charge, to any person obtaining a copy of
## this software and associated documentation files (the "Software"), to deal in
## the Software without restriction, including without limitation the rights to
## use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
## of the Software, and to permit persons to whom the Software is furnished to do
## so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in all
## copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
## SOFTWARE.
##
################################################################################
cmake_minimum_required ( VERSION 3.5.0 )
## Verbose output.
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )
## Set module name and project name.
set ( ROCTRACER_NAME "roctracer" )
set ( ROCTRACER_TARGET "${ROCTRACER_NAME}64" )
set ( ROCTRACER_LIBRARY "lib${ROCTRACER_TARGET}" )
project ( ${ROCTRACER_TARGET} )
## Adding default path cmake modules
list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" )
## Include common cmake modules
include ( utils )
## Set build environment
include ( env )
## Setup the package version.
get_version ( "1.0.0" )
message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" )
set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} )
set ( BUILD_VERSION_MINOR ${VERSION_MINOR} )
set ( BUILD_VERSION_PATCH ${VERSION_PATCH} )
set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" )
message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" )
set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" )
endif ()
set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" )
## Set target and root/lib/test directory
set ( TARGET_NAME "${ROCTRACER_TARGET}" )
set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" )
set ( LIB_DIR "${ROOT_DIR}/src" )
#set ( TEST_DIR "${ROOT_DIR}/test" )
## Build library
include ( ${LIB_DIR}/CMakeLists.txt )
## Set the VERSION and SOVERSION values
set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION "${LIB_VERSION_STRING}" )
set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" )
## If the library is a release, strip the target library
if ( "${CMAKE_BUILD_TYPE}" STREQUAL release )
add_custom_command ( TARGET ${ROCTRACER_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} *.so )
endif ()
## Build tests
#add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test )
## Install information
install ( TARGETS ${ROCTRACER_TARGET} LIBRARY DESTINATION ${ROCTRACER_NAME}/lib )
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/roctracer.h DESTINATION ${ROCTRACER_NAME}/include )
## Packaging directives
set ( CPACK_PACKAGE_NAME "${ROCTRACER_NAME}-dev" )
set ( CPACK_PACKAGE_VENDOR "AMD" )
set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} )
set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} )
set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} )
set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." )
set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCTRACER library for AMD HSA runtime API extension support" )
set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" )
## Debian package specific variables
set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" )
set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" )
set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" )
## RPM package specific variables
set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" )
set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" )
set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" )
include ( CPack )
+19
Wyświetl plik
@@ -0,0 +1,19 @@
#/bin/bash
set -e
do_ldconfig() {
echo /opt/rocm/libroctracer/lib > /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
}
case "$1" in
configure)
do_ldconfig
;;
abort-upgrade|abort-remove|abort-deconfigure)
echo "$1"
;;
*)
exit 0
;;
esac
+18
Wyświetl plik
@@ -0,0 +1,18 @@
#!/bin/bash
set -e
rm_ldconfig() {
rm -f /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
}
case "$1" in
remove)
rm_ldconfig
;;
purge)
;;
*)
exit 0
;;
esac
+21
Wyświetl plik
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 ROCm Core Technology
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+1
Wyświetl plik
@@ -0,0 +1 @@
echo /opt/rocm/libroctracer/lib > /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
+1
Wyświetl plik
@@ -0,0 +1 @@
rm -f /etc/ld.so.conf.d/libhsa-roctracer64.conf && ldconfig
+9
Wyświetl plik
@@ -0,0 +1,9 @@
ROC Profiler/Traces library.
Callback and Activity APIs
The library source tree:
- doc - Documentation
- inc/roctracer.h - Library public API
- src - Library sources
- core - Library API sources
- util - Library utils sources
+60
Wyświetl plik
@@ -0,0 +1,60 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
ConstructorInitializerIndentWidth: 4
AlignEscapedNewlinesLeft: false
AlignTrailingComments: true
AlignConsecutiveAssignments: false
AlignOperands: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AlwaysBreakAfterDefinitionReturnType: false
AlwaysBreakTemplateDeclarations: false
AlwaysBreakBeforeMultilineStrings: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BinPackParameters: true
ColumnLimit: 100
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: true
IndentWrappedFunctionNames: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
KeepEmptyLinesAtTheStartOfBlocks: false
NamespaceIndentation: None
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
DerivePointerAlignment: false
PointerAlignment: Left
SpacesBeforeTrailingComments: 2
Cpp11BracedListStyle: true
Standard: Auto
IndentWidth: 2
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Attach
SpacesInParentheses: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpacesInContainerLiterals: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 4
CommentPragmas: '^ IWYU pragma:'
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
SpaceBeforeParens: ControlStatements
DisableFormat: false
SortIncludes: false
...
+107
Wyświetl plik
@@ -0,0 +1,107 @@
## Build is not supported on Windows plaform
if ( WIN32 )
message ( FATAL_ERROR "Windows build is not supported." )
endif ()
## Compiler Preprocessor definitions.
add_definitions ( -D__linux__ )
add_definitions ( -DUNIX_OS )
add_definitions ( -DLINUX )
add_definitions ( -D__AMD64__ )
add_definitions ( -D__x86_64__ )
add_definitions ( -DAMD_INTERNAL_BUILD )
add_definitions ( -DLITTLEENDIAN_CPU=1 )
add_definitions ( -DHSA_LARGE_MODEL= )
add_definitions ( -DHSA_DEPRECATED= )
add_definitions ( -D__HIP_PLATFORM_HCC__ )
## Linux Compiler options
set ( CMAKE_CXX_FLAGS "-std=c++11")
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" )
set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" )
set ( CMAKE_SKIP_BUILD_RPATH TRUE )
## CLANG options
if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" )
endif()
## Enable debug trace
if ( DEFINED ENV{CMAKE_DEBUG_TRACE} )
add_definitions ( -DDEBUG_TRACE=1 )
endif()
## Enable direct loading of AQL-profile HSA extension
if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} )
add_definitions ( -DROCP_LD_AQLPROFILE=1 )
endif()
## Make env vars
if ( NOT DEFINED CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "" )
if ( DEFINED ENV{CMAKE_BUILD_TYPE} )
set ( CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE} )
endif()
endif()
if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} )
set ( CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH} )
endif()
set ( HIP_INC_DIR $ENV{HIP_INC_DIR} )
## Extend Compiler flags based on build type
string ( TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE )
if ( "${CMAKE_BUILD_TYPE}" STREQUAL debug )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" )
set ( CMAKE_BUILD_TYPE "debug" )
else ()
set ( CMAKE_BUILD_TYPE "release" )
endif ()
## Extend Compiler flags based on Processor architecture
if ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" )
set ( NBIT 64 )
set ( NBITSTR "64" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" )
elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" )
set ( NBIT 32 )
set ( NBITSTR "" )
set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" )
endif ()
## Find hsa-runtime headers/lib
find_file ( HSA_RUNTIME_INC "hsa.h" )
if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" )
find_file ( HSA_RUNTIME_INC "hsa/hsa.h" )
endif()
find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" )
get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY )
get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY )
find_library ( HSA_KMT_LIB "libhsakmt.so" )
get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY )
set ( API_PATH ${HSA_RUNTIME_INC_PATH} )
## Basic Tool Chain Information
message ( "----------------NBIT: ${NBIT}" )
message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" )
message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" )
message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" )
message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" )
message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" )
message ( "------------API-path: ${API_PATH}" )
message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" )
message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" )
message ( "---------HIP_INC_DIR: ${HIP_INC_DIR}" )
+116
Wyświetl plik
@@ -0,0 +1,116 @@
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2014-2017, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and#or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and#or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
## Parses the VERSION_STRING variable and places
## the first, second and third number values in
## the major, minor and patch variables.
function( parse_version VERSION_STRING )
string ( FIND ${VERSION_STRING} "-" STRING_INDEX )
if ( ${STRING_INDEX} GREATER -1 )
math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" )
string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 VERSION_BUILD )
endif ()
string ( REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING} )
list ( LENGTH VERSIONS VERSION_COUNT )
if ( ${VERSION_COUNT} GREATER 0)
list ( GET VERSIONS 0 MAJOR )
set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${MAJOR}" )
endif ()
if ( ${VERSION_COUNT} GREATER 1 )
list ( GET VERSIONS 1 MINOR )
set ( VERSION_MINOR ${MINOR} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" )
endif ()
if ( ${VERSION_COUNT} GREATER 2 )
list ( GET VERSIONS 2 PATCH )
set ( VERSION_PATCH ${PATCH} PARENT_SCOPE )
set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" )
endif ()
if ( DEFINED VERSION_BUILD )
set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
endif ()
set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE )
endfunction ()
## Gets the current version of the repository
## using versioning tags and git describe.
## Passes back a packaging version string
## and a library version string.
function ( get_version DEFAULT_VERSION_STRING )
parse_version ( ${DEFAULT_VERSION_STRING} )
find_program ( GIT NAMES git )
if ( GIT )
execute_process ( COMMAND "git describe --dirty --long --match [0-9]* 2>/dev/null"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_STRING
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT )
if ( ${RESULT} EQUAL 0 )
parse_version ( ${GIT_TAG_STRING} )
endif ()
endif ()
set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE )
set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE )
set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE )
set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE )
set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE )
endfunction()
Plik binarny nie jest wyświetlany.
Plik binarny nie jest wyświetlany.
+230
Wyświetl plik
@@ -0,0 +1,230 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//
// ROC Profiler API
//
// The goal of the implementation is to provide a HW specific low-level
// performance analysis interface for profiling of GPU compute applications.
// The profiling includes HW performance counters (PMC) with complex
// performance metrics and thread traces (SQTT). The profiling is supported
// by the SQTT, PMC and Callback APIs.
//
// The library can be used by a tool library loaded by HSA runtime or by
// higher level HW independent performance analysis API like PAPI.
//
// The library is written on C and will be based on AQLprofile AMD specific
// HSA extension. The library implementation requires HSA API intercepting and
// a profiling queue supporting a submit callback interface.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef INC_ROCTRACER_H_
#define INC_ROCTRACER_H_
#include <stdint.h>
#include <hip/hip_runtime.h>
#include <hip/hip_cbstr.h>
#define ROCTRACER_VERSION_MAJOR 1
#define ROCTRACER_VERSION_MINOR 0
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
////////////////////////////////////////////////////////////////////////////////
// Returning library version
uint32_t roctracer_version_major();
uint32_t roctracer_version_minor();
////////////////////////////////////////////////////////////////////////////////
// Library errors enumaration
typedef enum {
ROCTRACER_STATUS_SUCCESS = 0,
ROCTRACER_STATUS_ERROR = 1,
ROCTRACER_STATUS_UNINIT = 2,
ROCTRACER_STATUS_BREAK = 3,
ROCTRACER_STATUS_BAD_DOMAIN = 4,
ROCTRACER_STATUS_HIP_API_ERR = 5,
} roctracer_status_t;
////////////////////////////////////////////////////////////////////////////////
// Returning the last error
const char* roctracer_error_string();
////////////////////////////////////////////////////////////////////////////////
// Traced runtime API domains
// Traced API domains
typedef enum {
ROCTRACER_API_DOMAIN_ANY = 0, // HIP API domain
ROCTRACER_API_DOMAIN_HIP = 1, // HIP API domain
} roctracer_api_domain_t;
// Traced calls ID enumeration
typedef hip_cb_id_t roctracer_hip_api_cid_t;
// Correlation ID type
typedef uint64_t roctracer_correletion_id_t;
// Return method name by given API domain and call ID
// NULL returned on the error and the library errno is set
const char* roctracer_get_api_name(
roctracer_api_domain_t domain, // API domain
roctracer_hip_api_cid_t cid); // API call ID
////////////////////////////////////////////////////////////////////////////////
// Callback API
//
// ROC profiler frontend provides support for runtime API callbacks and activity
// records logging. The API callbacks provide the API calls arguments and are
// called on different phases, on enter, on exit, on kernel completion.
// Methods return non-zero on error and library errno is set.
// API callback phase
typedef enum {
ROCTRACER_API_PHASE_ENTER = 0,
ROCTRACER_API_PHASE_EXIT = 1,
ROCTRACER_API_PHASE_COMPLETE = 2,
} roctracer_feature_kind_t;
// API calback data
typedef hip_cb_fun_t roctracer_api_callback_t;
// Enable runtime API callbacks
int roctracer_enable_api_callback(
roctracer_api_domain_t domain, // runtime API domain
uint32_t cid, // API call ID
roctracer_api_callback_t callback, // callback function pointer
void* arg); // [in/out] callback arg
// Disable runtime API callbacks
int roctracer_disable_api_callback(
roctracer_api_domain_t domain, // runtime API domain
uint32_t cid); // API call ID
////////////////////////////////////////////////////////////////////////////////
// Activity API
//
// The activity records are asynchronously logged to the pool and can be associated
// with the respective API callbacks using the correlation ID. Activity API can
// be used to enable collecting of the records with timestamping data for API
// calls and the kernel submits.
// Methods return non zero on error and library errno is set.
// Roctracer pool type
typedef void roctracer_pool_t;
// Activity record
typedef hip_act_record_t roctracer_record_t;
// Return next record
static inline int roctracer_next_record(
const roctracer_record_t* record, // [in] record ptr
const roctracer_record_t** next) // [out] next record ptr
{
*next = (record + 1);
return ROCTRACER_STATUS_SUCCESS;
}
// Tracer allocator type
typedef void (*roctracer_allocator_t)(
char** ptr, // memory pointer
size_t size, // memory size
void* arg); // allocator arg
// Pool callback type
typedef void (*roctracer_buffer_callback_t)(
const char* begin, // [in] available buffered trace records
const char* end, // [in] end of buffered trace records
void* arg); // [in/out] callback arg
// Tracer properties
typedef struct {
uint32_t mode; // roctracer mode
size_t buffer_size; // buffer size
roctracer_allocator_t alloc_fun; // memory alocator function pointer
void* alloc_arg; // memory alocator function pointer
roctracer_buffer_callback_t buffer_callback_fun; // tracer record callback function
void* buffer_callback_arg; // tracer record callback arg
} roctracer_properties_t;
// Create tracer memory pool
// The first invocation sets the default pool
int roctracer_open_pool(
const roctracer_properties_t* properties, // tracer pool properties
roctracer_pool_t** pool = NULL); // [out] returns tracer pool if not NULL,
// otherwise sets the default one if it is not set yet
// otherwise the error is generated
// Close tracer memory pool
int roctracer_close_pool(
roctracer_pool_t* pool = NULL); // [in] memory pool, NULL is a default one
// Return current default pool
// Set new default pool if the argument is not NULL
roctracer_pool_t* roctracer_default_pool(
roctracer_pool_t* pool = NULL); // [in] new default pool if not NULL
// Enable activity records logging
int roctracer_enable_api_activity(
roctracer_api_domain_t domain, // runtime API domain
uint32_t activity_kind, // activity kind
roctracer_pool_t* pool = NULL); // memory pool, NULL is a default one
// Disable activity records logging
int roctracer_disable_api_activity(
roctracer_api_domain_t domain, // runtime API domain
uint32_t activity_kind); // activity kind
// Flush available activity records
int roctracer_flush_api_activity(
roctracer_pool_t* pool = NULL); // memory pool, NULL is a default one
#ifdef __cplusplus
} // extern "C" block
#endif // __cplusplus
#endif // INC_ROCTRACER_H_
+189
Wyświetl plik
@@ -0,0 +1,189 @@
#!/usr/bin/python
import os, sys, re
HEADER = "hip_cbstr.h"
REC_MAX_LEN = 1024
def fill_api_map(out, api_name, args_str):
args_list = []
args_str = re.sub(r'^\s*', r'', args_str);
args_str = re.sub(r'\s*$', r'', args_str);
args_str = re.sub(r'\s*,\s*', r',', args_str);
args_str = re.sub(r'\s+', r' ', args_str);
for arg_pair in args_str.split(','):
arg_pair = re.sub(r'\s+=\s+\S+$', '', arg_pair);
m = re.match("^(.*)\s(\S+)$", arg_pair);
if m: args_list.append((m.group(1), m.group(2)))
out[api_name] = args_list;
# hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
def parse_api(inp, out):
end_pattern = re.compile("Texture");
beg_pattern = re.compile("^hipError_t");
api_pattern = re.compile("^hipError_t\s+([^\(]+)\(([^\)]*)\)");
found = 0
record = ""
line_num = -1
for line in inp.readlines():
record += re.sub(r'^\s+', r' ', line[:-1])
line_num += 1
if len(record) > REC_MAX_LEN:
print "Error: bad record \"" + record + "\"\nfile '" + hfile + ", line (" + str(line_num) + ")"
break;
if beg_pattern.match(record): found = 1
if found:
m = api_pattern.match(record)
if m:
found = 0
if end_pattern.search(record): break
fill_api_map(out, m.group(1), m.group(2))
else: continue
record = ""
#############################################################
if (len(sys.argv) != 2):
print >>sys.stderr, "Usage:", sys.argv[0], " <input HIP API .h file>"
sys.exit(1)
hfile = sys.argv[1]
if not os.path.isfile(hfile):
print >>sys.stderr, "Error: input file '" + hfile + "' not found"
sys.exit(1)
inp = open(hfile, 'r')
api_map = {}
parse_api(inp, api_map)
api_map['hipLaunchKernel'] = [('void*', 'kernel'), ('hipStream_t', 'stream')]
f = open(HEADER, 'w')
f.write('// automatically generated sources\n')
f.write('#ifndef HIP__CBSTR_H__\n');
f.write('#define HIP__CBSTR_H__\n');
# Generating the callbacks function type
f.write('\n// HIP API callbacks function type\n\
struct hip_cb_data_t;\n\
typedef void (*hip_cb_fun_t)(const hip_cb_data_t* data, void* arg);\n\
')
# Generating the callbacks ID enumaration
f.write('\n// HIP API callbacks ID enumaration\n')
f.write('enum hip_cb_id_t {\n')
cb_id = 0
for name in api_map.keys():
f.write(' HIP_API_ID_' + name + ' = ' + str(cb_id) + ',\n')
cb_id += 1
f.write(' HIP_API_ID_NUMBER = ' + str(cb_id) + ',\n')
f.write('};\n')
# Generating the callbacks data structure
f.write('\n// HIP API callbacks data structure\n')
f.write(
'struct hip_cb_data_t {\n' +
' const char* name;\n' +
' hip_cb_id_t id;\n' +
' uint32_t correlation_id;\n' +
' bool on_enter;\n' +
' union {\n'
)
for name, args in api_map.items():
if len(args) != 0:
f.write(' struct {\n')
for arg_tuple in args:
f.write(' ' + arg_tuple[0] + ' ' + arg_tuple[1] + ';\n')
f.write(' } ' + name + ';\n')
f.write(
' } args;\n' +
'};\n'
)
# Generating the callbacks args data filling macros
f.write('\n// HIP API callbacks args data filling macros\n')
for name, args in api_map.items():
f.write('#define INIT_' + name + '_CB_ARGS_DATA(cb_data) { \\\n')
for arg_tuple in args:
arg_type = arg_tuple[0];
arg_name = arg_tuple[1];
f.write(' cb_data.args.' + name + '.' + arg_name + ' = (' + arg_type + ')' + arg_name + '; \\\n');
f.write('};\n')
f.write('#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)\n')
# Generating the callbacks table
f.write('\n// HIP API callbacks table\n')
f.write('\
struct hip_cb_table_t {\n\
struct { hip_cb_fun_t act; hip_cb_fun_t fun; void* arg; } arr[HIP_API_ID_NUMBER];\n\
};\n\
#define HIP_CALLBACKS_TABLE hip_cb_table_t HIP_API_callbacks_table{};\n\
')
f.write('\
inline bool HIP_SET_ACTIVITY(uint32_t id, hip_cb_fun_t fun, void* arg = NULL) {\n\
(void)arg;\n\
extern hip_cb_table_t HIP_API_callbacks_table;\n\
if (id < HIP_API_ID_NUMBER) {\n\
HIP_API_callbacks_table.arr[id].act = fun;\n\
return true;\n\
}\n\
return false;\n\
}\n')
f.write('\
inline bool HIP_SET_CALLBACK(uint32_t id, hip_cb_fun_t fun, void* arg) {\n\
extern hip_cb_table_t HIP_API_callbacks_table; \n\
if (id < HIP_API_ID_NUMBER) {\n\
HIP_API_callbacks_table.arr[id].fun = fun;\n\
HIP_API_callbacks_table.arr[id].arg = arg;\n\
return true;\n\
}\n\
return false;\n\
}\n')
# Generating the callback spawning class
f.write('\n// HIP API callbacks spawning class macro\n\
#define CB_SPAWNER_OBJECT(cb_id) \\\n\
class api_callbacks_spawner_t { \\\n\
public: \\\n\
api_callbacks_spawner_t(hip_cb_data_t& cb_data) : cb_data_(cb_data) { \\\n\
hip_cb_id_t id = HIP_API_ID_##cb_id; \\\n\
cb_data_.id = id; \\\n\
cb_data_.correlation_id = UINT_MAX; \\\n\
cb_data_.name = #cb_id; \\\n\
extern const hip_cb_table_t* getApiCallbackTabel(); \\\n\
const hip_cb_table_t* cb_table = getApiCallbackTabel(); \\\n\
cb_act_ = cb_table->arr[id].act; \\\n\
cb_fun_ = cb_table->arr[id].fun; \\\n\
cb_arg_ = cb_table->arr[id].arg; \\\n\
cb_data_.on_enter = true; \\\n\
if (cb_act_ != NULL) cb_act_(&cb_data_, NULL); \\\n\
if (cb_fun_ != NULL) cb_fun_(&cb_data_, cb_arg_); \\\n\
} \\\n\
~api_callbacks_spawner_t() { \\\n\
cb_data_.on_enter = false; \\\n\
if (cb_act_ != NULL) cb_act_(&cb_data_, NULL); \\\n\
if (cb_fun_ != NULL) cb_fun_(&cb_data_, cb_arg_); \\\n\
} \\\n\
private: \\\n\
hip_cb_data_t& cb_data_; \\\n\
hip_cb_fun_t cb_act_; \\\n\
hip_cb_fun_t cb_fun_; \\\n\
void* cb_arg_; \\\n\
}; \\\n\
hip_cb_data_t cb_data{}; \\\n\
INIT_CB_ARGS_DATA(cb_id, cb_data); \\\n\
api_callbacks_spawner_t api_callbacks_spawner(cb_data); \n\
')
f.write('#endif // HIP__CBSTR__');
print "Header '" + HEADER + "' is generated"
#############################################################
+11
Wyświetl plik
@@ -0,0 +1,11 @@
#
# Build dynamic Library object
#
set ( TARGET_LIB "${TARGET_NAME}" )
set ( LIB_SRC
${LIB_DIR}/core/roctracer.cpp
${LIB_DIR}/util/hsa_rsrc_factory.cpp
)
add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} )
target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} )
target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++)
+420
Wyświetl plik
@@ -0,0 +1,420 @@
#include "inc/roctracer.h"
#include <atomic>
#include <hip/hip_runtime.h>
#include <mutex>
#include <string.h>
#include <pthread.h>
#include "util/exception.h"
#include "util/hsa_rsrc_factory.h"
#include "util/logger.h"
#define PUBLIC_API __attribute__((visibility("default")))
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
#define PTHREAD_CALL(call) \
do { \
int err = call; \
if (err != 0) { \
errno = err; \
perror(#call); \
abort(); \
} \
} while (0)
#define HSART_CALL(call) \
do { \
hsa_status_t status = call; \
if (status != HSA_STATUS_SUCCESS) { \
std::cerr << "HSA-rt call '" << #call << "' error(" << std::hex << status << ")" \
<< std::dec << std::endl << std::flush; \
abort(); \
} \
} while (0)
#define API_METHOD_PREFIX \
int err = 0; \
try {
#define API_METHOD_SUFFIX \
} \
catch (std::exception & e) { \
ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \
err = roctracer::GetExcStatus(e); \
} \
return err;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Internal library methods
//
namespace roctracer {
int GetExcStatus(const std::exception& e) {
const util::exception* roctracer_exc_ptr = dynamic_cast<const util::exception*>(&e);
return (roctracer_exc_ptr) ? static_cast<int>(roctracer_exc_ptr->status()) : 1;
}
class GlobalCounter {
public:
typedef std::mutex mutex_t;
typedef uint64_t counter_t;
static counter_t Increment() {
std::lock_guard<mutex_t> lock(mutex_);
return ++counter_;
}
private:
static mutex_t mutex_;
static counter_t counter_;
};
GlobalCounter::mutex_t GlobalCounter::mutex_;
GlobalCounter::counter_t GlobalCounter::counter_ = 0;
class MemoryPool {
public:
typedef std::mutex mutex_t;
static void allocator_default(char** ptr, size_t size, void* arg) {
(void)arg;
if (*ptr == NULL) {
*ptr = reinterpret_cast<char*>(malloc(size));
} else if (size != 0) {
*ptr = reinterpret_cast<char*>(realloc(ptr, size));
} else {
free(*ptr);
*ptr = NULL;
}
}
MemoryPool(const roctracer_properties_t& properties) {
// Assigning pool allocator
alloc_fun_ = allocator_default;
alloc_arg_ = NULL;
if (properties.alloc_fun != NULL) {
alloc_fun_ = properties.alloc_fun;
alloc_arg_ = properties.alloc_arg;
}
// Pool definition
buffer_size_ = properties.buffer_size;
const size_t pool_size = 2 * buffer_size_;
pool_begin_ = NULL;
alloc_fun_(&pool_begin_, pool_size, alloc_arg_);
if (pool_begin_ == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "pool allocator failed");
pool_end_ = pool_begin_ + pool_size;
buffer_begin_ = pool_begin_;
buffer_end_ = buffer_begin_ + buffer_size_;
write_ptr_ = buffer_begin_;
// Consuming read thread
read_callback_fun_ = properties.buffer_callback_fun;
read_callback_arg_ = properties.buffer_callback_arg;
consumer_arg_ = consumer_arg_t{this, true, NULL, NULL};
PTHREAD_CALL(pthread_mutex_init(&read_mutex_, NULL));
PTHREAD_CALL(pthread_cond_init(&read_cond_, NULL));
PTHREAD_CALL(pthread_create(&consumer_thread_, NULL, reader_fun, &consumer_arg_));
}
~MemoryPool() {
Flush();
allocator_default(&pool_begin_, 0, alloc_arg_);
}
template <typename Record>
void* Write(const Record& record) {
std::lock_guard<mutex_t> lock(write_mutex_);
char* next = write_ptr_ + sizeof(Record);
if (next > buffer_end_) {
if (write_ptr_ == buffer_begin_) EXC_ABORT(ROCTRACER_STATUS_ERROR, "buffer size(" << buffer_size_ << ") is less then the record(" << sizeof(Record) << ")");
spawn_reader(buffer_begin_, buffer_end_);
buffer_begin_ = (buffer_end_ == pool_end_) ? pool_begin_ : buffer_end_;
buffer_end_ = buffer_begin_ + buffer_size_;
write_ptr_ = buffer_begin_;
next = write_ptr_ + sizeof(Record);
}
Record* ptr = reinterpret_cast<Record*>(write_ptr_);
*ptr = record;
write_ptr_ = next;
return reinterpret_cast<void*>(ptr);
}
void Flush() {
if (write_ptr_ > buffer_begin_) {
spawn_reader(buffer_begin_, write_ptr_);
sync_reader(&consumer_arg_);
buffer_begin_ = write_ptr_;
}
}
private:
struct consumer_arg_t {
MemoryPool* obj;
bool valid;
const char* begin;
const char* end;
};
static void reset_reader(consumer_arg_t* arg) {
reinterpret_cast<std::atomic<bool>*>(&(arg->valid))->store(false, std::memory_order_release);
}
static void sync_reader(const consumer_arg_t* arg) {
while(arg->valid) PTHREAD_CALL(pthread_yield());
}
static void* reader_fun(void* consumer_arg) {
consumer_arg_t* arg = reinterpret_cast<consumer_arg_t*>(consumer_arg);
roctracer::MemoryPool* obj = arg->obj;
reset_reader(arg);
while (1) {
PTHREAD_CALL(pthread_mutex_lock(&(obj->read_mutex_)));
while (arg->valid == false) {
PTHREAD_CALL(pthread_cond_wait(&(obj->read_cond_), &(obj->read_mutex_)));
}
obj->read_callback_fun_(arg->begin, arg->end, obj->read_callback_arg_);
reset_reader(arg);
PTHREAD_CALL(pthread_mutex_unlock(&(obj->read_mutex_)));
}
return NULL;
}
void spawn_reader(const char* data_begin, const char* data_end) {
sync_reader(&consumer_arg_);
PTHREAD_CALL(pthread_mutex_lock(&read_mutex_));
consumer_arg_ = consumer_arg_t{this, true, data_begin, data_end};
PTHREAD_CALL(pthread_cond_signal(&read_cond_));
PTHREAD_CALL(pthread_mutex_unlock(&read_mutex_));
}
// pool allocator
roctracer_allocator_t alloc_fun_;
void* alloc_arg_;
// Pool definition
size_t buffer_size_;
char* pool_begin_;
char* pool_end_;
char* buffer_begin_;
char* buffer_end_;
char* write_ptr_;
mutex_t write_mutex_;
// Consuming read thread
roctracer_buffer_callback_t read_callback_fun_;
void* read_callback_arg_;
consumer_arg_t consumer_arg_;
pthread_t consumer_thread_;
pthread_mutex_t read_mutex_;
pthread_cond_t read_cond_;
};
class Timer {
public:
typedef uint64_t timestamp_t;
typedef long double freq_t;
Timer() {
timestamp_t timestamp_hz = 0;
HSART_CALL(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &timestamp_hz));
timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz;
}
// Return timestamp in 'ns'
timestamp_t timestamp_ns() {
timestamp_t timestamp;
HSART_CALL(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &timestamp));
return timestamp_t((freq_t)timestamp * timestamp_factor_);
}
private:
// Timestamp frequency factor
freq_t timestamp_factor_;
};
CONSTRUCTOR_API void constructor() {
util::Logger::Create();
}
DESTRUCTOR_API void destructor() {
util::HsaRsrcFactory::Destroy();
util::Logger::Destroy();
}
// Activity callback to generate an activity record
void ActivityCallback(
roctracer_record_t* record,
uint32_t activity_kind,
const void* callback_data,
void* arg)
{
static Timer timer;
const hip_cb_data_t* data = reinterpret_cast<const hip_cb_data_t*>(callback_data);
MemoryPool* pool = reinterpret_cast<MemoryPool*>(arg);
if (pool == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback pool is NULL");
if (data->phase == ROCTRACER_API_PHASE_ENTER) {
*record = {};
record->name = data->name;
record->activity_kind = activity_kind;
record->begin_ns = timer.timestamp_ns();
// Correlation ID generating
const auto correlation_id = GlobalCounter::Increment();
record->correlation_id = correlation_id;
const_cast<hip_cb_data_t*>(data)->correlation_id = correlation_id;
} else {
record->end_ns = timer.timestamp_ns();
pool->Write<roctracer_record_t>(*record);
}
}
util::Logger::mutex_t util::Logger::mutex_;
util::Logger* util::Logger::instance_ = NULL;
MemoryPool* memory_pool = NULL;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Public library methods
//
extern "C" {
// Returns library vesrion
PUBLIC_API uint32_t roctracer_version_major() { return ROCTRACER_VERSION_MAJOR; }
PUBLIC_API uint32_t roctracer_version_minor() { return ROCTRACER_VERSION_MINOR; }
// Returns the last error
PUBLIC_API const char* roctracer_error_string() {
return strdup(roctracer::util::Logger::LastMessage().c_str());
}
// Return method name by given API domain and call ID
// NULL returned on the error and the library errno is set
PUBLIC_API const char* roctracer_get_api_name(roctracer_api_domain_t domain, roctracer_hip_api_cid_t cid) {
return NULL;
}
// Enable runtime API callbacks
PUBLIC_API int roctracer_enable_api_callback(
roctracer_api_domain_t domain,
uint32_t cid,
roctracer_api_callback_t callback,
void* user_data)
{
API_METHOD_PREFIX
switch (domain) {
case ROCTRACER_API_DOMAIN_HIP: {
hipError_t hip_err = hipRegisterApiCallback(cid, callback, user_data);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRegisterApiCallback error(" << hip_err << ")");
break;
}
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
API_METHOD_SUFFIX
}
// Enable runtime API callbacks
PUBLIC_API int roctracer_disable_api_callback(
roctracer_api_domain_t domain,
uint32_t cid)
{
API_METHOD_PREFIX
switch (domain) {
case ROCTRACER_API_DOMAIN_HIP: {
hipError_t hip_err = hipRemoveApiCallback(cid);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRemoveApiCallback error(" << hip_err << ")");
break;
}
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
API_METHOD_SUFFIX
}
// Return default pool and set new one if parameter pool is not NULL.
roctracer_pool_t* roctracer_default_pool(roctracer_pool_t* pool) {
roctracer_pool_t* p = reinterpret_cast<roctracer_pool_t*>(roctracer::memory_pool);
if (pool != NULL) roctracer::memory_pool = reinterpret_cast<roctracer::MemoryPool*>(pool);
if (p == NULL) EXC_RAISING(ROCTRACER_STATUS_UNINIT, "default pool is not initialized");
return p;
}
// Open memory pool
PUBLIC_API int roctracer_open_pool(
const roctracer_properties_t* properties,
roctracer_pool_t** pool)
{
API_METHOD_PREFIX
if ((pool == NULL) && (roctracer::memory_pool != NULL)) {
EXC_RAISING(ROCTRACER_STATUS_ERROR, "default pool already set");
}
roctracer::MemoryPool* p = new roctracer::MemoryPool(*properties);
if (p == NULL) EXC_RAISING(ROCTRACER_STATUS_ERROR, "MemoryPool() error");
if (pool != NULL) *pool = p;
else roctracer::memory_pool = p;
API_METHOD_SUFFIX
}
// Close memory pool
PUBLIC_API int roctracer_close_pool(roctracer_pool_t* pool) {
API_METHOD_PREFIX
roctracer_pool_t* ptr = (pool == NULL) ? roctracer_default_pool() : pool;
roctracer::MemoryPool* memory_pool = reinterpret_cast<roctracer::MemoryPool*>(ptr);
delete(memory_pool);
if (pool == NULL) roctracer::memory_pool = NULL;
API_METHOD_SUFFIX
}
// Enable activity records logging
PUBLIC_API int roctracer_enable_api_activity(
roctracer_api_domain_t domain,
uint32_t activity_kind,
roctracer_pool_t* pool)
{
API_METHOD_PREFIX
if (pool == NULL) pool = roctracer_default_pool();
switch (domain) {
case ROCTRACER_API_DOMAIN_HIP: {
const hipError_t hip_err = hipRegisterActivityCallback(activity_kind, roctracer::ActivityCallback, pool);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRegisterActivityCallback error(" << hip_err << ")");
break;
}
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
API_METHOD_SUFFIX
}
// Disable activity records logging
PUBLIC_API int roctracer_disable_api_activity(
roctracer_api_domain_t domain,
uint32_t activity_kind)
{
API_METHOD_PREFIX
switch (domain) {
case ROCTRACER_API_DOMAIN_HIP: {
const hipError_t hip_err = hipRemoveActivityCallback(activity_kind);
if (hip_err != hipSuccess) HIP_EXC_RAISING(ROCTRACER_STATUS_HIP_API_ERR, "hipRemoveActivityCallback error(" << hip_err << ")");
break;
}
default:
EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")");
}
API_METHOD_SUFFIX
}
// Flush available activity records
PUBLIC_API int roctracer_flush_api_activity(roctracer_pool_t* pool) {
API_METHOD_PREFIX
if (pool == NULL) pool = roctracer_default_pool();
roctracer::MemoryPool* memory_pool = reinterpret_cast<roctracer::MemoryPool*>(pool);
memory_pool->Flush();
API_METHOD_SUFFIX
}
} // extern "C"
+47
Wyświetl plik
@@ -0,0 +1,47 @@
#ifndef SRC_UTIL_EXCEPTION_H_
#define SRC_UTIL_EXCEPTION_H_
#include <hsa_ven_amd_aqlprofile.h>
#include <exception>
#include <sstream>
#include <string>
#define EXC_ABORT(error, stream) \
{ \
std::ostringstream oss; \
oss << __FUNCTION__ << "(), " << stream; \
std::cout << oss.str() << std::endl; \
abort(); \
}
#define EXC_RAISING(error, stream) \
{ \
std::ostringstream oss; \
oss << __FUNCTION__ << "(), " << stream; \
throw roctracer::util::exception(error, oss.str()); \
}
#define HIP_EXC_RAISING(error, stream) \
{ \
EXC_RAISING(error, "HIP error: " << stream); \
}
namespace roctracer {
namespace util {
class exception : public std::exception {
public:
explicit exception(const uint32_t& status, const std::string& msg) : status_(status), str_(msg) {}
const char* what() const throw() { return str_.c_str(); }
uint32_t status() const throw() { return status_; }
protected:
const uint32_t status_;
const std::string str_;
};
} // namespace util
} // namespace roctracer
#endif // SRC_UTIL_EXCEPTION_H_
+572
Wyświetl plik
@@ -0,0 +1,572 @@
/**********************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
<95> Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
<95> Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#include "util/hsa_rsrc_factory.h"
#include <dlfcn.h>
#include <fcntl.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <atomic>
#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#ifndef AQL_PROFILE_READ_API_ENABLE
#define AQL_PROFILE_READ_API_ENABLE 0
#endif
namespace roctracer {
namespace util {
// Callback function to get available in the system agents
hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
hsa_status_t status = HSA_STATUS_ERROR;
HsaRsrcFactory* hsa_rsrc = reinterpret_cast<HsaRsrcFactory*>(data);
const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent);
if (agent_info != NULL) status = HSA_STATUS_SUCCESS;
return status;
}
// This function checks to see if the provided
// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
// the function adds an additional requirement that the pool have the
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false,
// pools must NOT have this property.
// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
// no pool was found meeting the requirements. If an error is encountered, we
// return that error.
static hsa_status_t
FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
hsa_status_t err;
hsa_amd_segment_t segment;
uint32_t flag;
if (nullptr == data) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
return HSA_STATUS_SUCCESS;
}
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
if ((karg_st == 0 && kern_arg) ||
(karg_st != 0 && !kern_arg)) {
return HSA_STATUS_SUCCESS;
}
*(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
return HSA_STATUS_INFO_BREAK;
}
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
return FindGlobalPool(pool, data, false);
}
// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS
// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
return FindGlobalPool(pool, data, true);
}
#if 0
// Callback function to find and bind kernarg region of an agent
hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) {
hsa_region_global_flag_t flags;
hsa_region_segment_t segment_id;
hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
if (segment_id != HSA_REGION_SEGMENT_GLOBAL) {
return HSA_STATUS_SUCCESS;
}
AgentInfo* agent_info = (AgentInfo*)data;
hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
agent_info->coarse_region = region;
}
if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
agent_info->kernarg_region = region;
}
return HSA_STATUS_SUCCESS;
}
#endif
// Constructor of the class
HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
hsa_status_t status;
// Initialize the Hsa Runtime
if (initialize_hsa_) {
status = hsa_init();
CHECK_STATUS("Error in hsa_init", status);
}
// Discover the set of Gpu devices available on the platform
status = hsa_iterate_agents(GetHsaAgentsCallback, this);
CHECK_STATUS("Error Calling hsa_iterate_agents", status);
// Get AqlProfile API table
aqlprofile_api_ = {0};
#ifdef ROCP_LD_AQLPROFILE
status = LoadAqlProfileLib(&aqlprofile_api_);
#else
status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
#endif
CHECK_STATUS("aqlprofile API table load failed", status);
// Get Loader API table
loader_api_ = {0};
status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
CHECK_STATUS("loader API table query failed", status);
}
// Destructor of the class
HsaRsrcFactory::~HsaRsrcFactory() {
for (auto p : cpu_list_) delete p;
for (auto p : gpu_list_) delete p;
if (initialize_hsa_) {
hsa_status_t status = hsa_shut_down();
CHECK_STATUS("Error in hsa_shut_down", status);
}
}
hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
void* handle = dlopen(kAqlProfileLib, RTLD_NOW);
if (handle == NULL) {
fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror());
return HSA_STATUS_ERROR;
}
dlerror(); /* Clear any existing error */
api->hsa_ven_amd_aqlprofile_error_string =
(decltype(::hsa_ven_amd_aqlprofile_error_string)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_error_string");
api->hsa_ven_amd_aqlprofile_validate_event =
(decltype(::hsa_ven_amd_aqlprofile_validate_event)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_validate_event");
api->hsa_ven_amd_aqlprofile_start =
(decltype(::hsa_ven_amd_aqlprofile_start)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_start");
api->hsa_ven_amd_aqlprofile_stop =
(decltype(::hsa_ven_amd_aqlprofile_stop)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_stop");
#if AQL_PROFILE_READ_API_ENABLE
api->hsa_ven_amd_aqlprofile_read =
(decltype(::hsa_ven_amd_aqlprofile_read)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_read");
#endif // AQL_PROFILE_READ_API_ENABLE
api->hsa_ven_amd_aqlprofile_legacy_get_pm4 =
(decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
api->hsa_ven_amd_aqlprofile_get_info =
(decltype(::hsa_ven_amd_aqlprofile_get_info)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_get_info");
api->hsa_ven_amd_aqlprofile_iterate_data =
(decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)
dlsym(handle, "hsa_ven_amd_aqlprofile_iterate_data");
return HSA_STATUS_SUCCESS;
}
// Add system agent info
const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
// Determine if device is a Gpu agent
hsa_status_t status;
AgentInfo* agent_info = NULL;
hsa_device_type_t type;
status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
CHECK_STATUS("Error Calling hsa_agent_get_info", status);
if (type == HSA_DEVICE_TYPE_CPU) {
agent_info = new AgentInfo{};
agent_info->dev_id = agent;
agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
agent_info->dev_index = cpu_list_.size();
status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status);
status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status);
agent_info->gpu_pool = {};
cpu_list_.push_back(agent_info);
cpu_agents_.push_back(agent);
}
if (type == HSA_DEVICE_TYPE_GPU) {
agent_info = new AgentInfo{};
agent_info->dev_id = agent;
agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
strncpy(agent_info->gfxip, agent_info->name, 4);
agent_info->gfxip[4] = '\0';
hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num);
hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se);
agent_info->cpu_pool = {};
agent_info->kern_arg_pool = {};
status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
#if 0
// Initialize memory regions to zero
agent_info->kernarg_region.handle = 0;
agent_info->coarse_region.handle = 0;
// Find and Bind Memory regions of the Gpu agent
hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info);
#endif
// Set GPU index
agent_info->dev_index = gpu_list_.size();
gpu_list_.push_back(agent_info);
gpu_agents_.push_back(agent);
}
if (agent_info) agent_map_[agent.handle] = agent_info;
return agent_info;
}
// Return systen agent info
const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
const AgentInfo* agent_info = NULL;
auto it = agent_map_.find(agent.handle);
if (it != agent_map_.end()) {
agent_info = it->second;
}
return agent_info;
}
// Get the count of Hsa Gpu Agents available on the platform
//
// @return uint32_t Number of Gpu agents on platform
//
uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); }
// Get the count of Hsa Cpu Agents available on the platform
//
// @return uint32_t Number of Cpu agents on platform
//
uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); }
// Get the AgentInfo handle of a Gpu device
//
// @param idx Gpu Agent at specified index
//
// @param agent_info Output parameter updated with AgentInfo
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
// Determine if request is valid
uint32_t size = uint32_t(gpu_list_.size());
if (idx >= size) {
return false;
}
// Copy AgentInfo from specified index
*agent_info = gpu_list_[idx];
return true;
}
// Get the AgentInfo handle of a Cpu device
//
// @param idx Cpu Agent at specified index
//
// @param agent_info Output parameter updated with AgentInfo
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
// Determine if request is valid
uint32_t size = uint32_t(cpu_list_.size());
if (idx >= size) {
return false;
}
// Copy AgentInfo from specified index
*agent_info = cpu_list_[idx];
return true;
}
// Create a Queue object and return its handle. The queue object is expected
// to support user requested number of Aql dispatch packets.
//
// @param agent_info Gpu Agent on which to create a queue object
//
// @param num_Pkts Number of packets to be held by queue
//
// @param queue Output parameter updated with handle of queue object
//
// @return bool true if successful, false otherwise
//
bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
hsa_queue_t** queue) {
hsa_status_t status;
status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, queue);
return (status == HSA_STATUS_SUCCESS);
}
// Create a Signal object and return its handle.
// @param value Initial value of signal object
// @param signal Output parameter updated with handle of signal object
// @return bool true if successful, false otherwise
bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
hsa_status_t status;
status = hsa_signal_create(value, 0, NULL, signal);
return (status == HSA_STATUS_SUCCESS);
}
// Allocate memory for use by a kernel of specified size in specified
// agent's memory region.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
uint8_t* buffer = NULL;
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, (void**)&buffer);
// Only GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
hsa_agent_t agents_list[1] = {agent_info->dev_id};
status = hsa_amd_agents_allow_access(1, agents_list, NULL, buffer);
}
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
printf("AllocateLocalMemory %p\n", ptr);
return ptr;
}
// Allocate memory to pass kernel parameters.
// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
uint8_t* buffer = NULL;
if (!cpu_agents_.empty()) {
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, (void**)&buffer);
// Both the CPU and GPU can access the kernel arguments
if (status == HSA_STATUS_SUCCESS) {
auto agents_vec = cpu_agents_;
agents_vec.push_back(agent_info->dev_id);
status = hsa_amd_agents_allow_access(agents_vec.size(), &agents_vec[0], NULL, buffer);
}
}
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
printf("AllocateKernargMemory %p\n", ptr);
return ptr;
}
// Allocate system memory accessible by both CPU and GPU
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
uint8_t* buffer = NULL;
if (!cpu_agents_.empty()) {
size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, (void**)&buffer);
// Both the CPU and GPU can access the memory
if (status == HSA_STATUS_SUCCESS) {
auto agents_vec = cpu_agents_;
agents_vec.push_back(agent_info->dev_id);
status = hsa_amd_agents_allow_access(agents_vec.size(), &agents_vec[0], NULL, buffer);
}
}
uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
printf("AllocateSysMemory %p\n", ptr);
return ptr;
}
// Copy data from GPU to host memory
bool HsaRsrcFactory::CopyToHost(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
if (!cpu_agents_.empty()) {
hsa_signal_t s = {};
hsa_status_t status = hsa_signal_create(1, 0, NULL, &s);
if (status == HSA_STATUS_SUCCESS) {
status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
if (status == HSA_STATUS_SUCCESS) {
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
status = HSA_STATUS_ERROR;
}
}
status = hsa_signal_destroy(s);
}
}
return (status == HSA_STATUS_SUCCESS);
}
bool HsaRsrcFactory::CopyToHost(const AgentInfo* agent_info, void* dst, const void* src, size_t size) {
return CopyToHost(agent_info->dev_id, dst, src, size);
}
// Loads an Assembled Brig file and Finalizes it into Device Isa
// @param agent_info Gpu device for which to finalize
// @param brig_path File path of the Assembled Brig file
// @param kernel_name Name of the kernel to finalize
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
// @return bool true if successful, false otherwise
bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) {
hsa_status_t status = HSA_STATUS_ERROR;
// Build the code object filename
std::string filename(brig_path);
std::clog << "Code object filename: " << filename << std::endl;
// Open the file containing code object
hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
if (file_handle == -1) {
std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
assert(false);
return false;
}
// Create code object reader
hsa_code_object_reader_t code_obj_rdr = {0};
status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
if (status != HSA_STATUS_SUCCESS) {
std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
return false;
}
// Create executable.
status = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
CHECK_STATUS("Error in creating executable object", status);
// Load code object.
status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
code_obj_rdr, NULL, NULL);
CHECK_STATUS("Error in loading executable object", status);
// Freeze executable.
status = hsa_executable_freeze(*executable, "");
CHECK_STATUS("Error in freezing executable object", status);
// Get symbol handle.
hsa_executable_symbol_t kernelSymbol;
status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
&kernelSymbol);
CHECK_STATUS("Error in looking up kernel symbol", status);
// Update output parameter
*code_desc = kernelSymbol;
return true;
}
// Print the various fields of Hsa Gpu Agents
bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
std::clog << header << " :" << std::endl;
const AgentInfo* agent_info;
int size = uint32_t(gpu_list_.size());
for (int idx = 0; idx < size; idx++) {
agent_info = gpu_list_[idx];
std::clog << "> agent[" << idx << "] :" << std::endl;
std::clog << ">> Name : " << agent_info->name << std::endl;
std::clog << ">> APU : " << agent_info->is_apu << std::endl;
std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
// std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
std::clog << ">> SE number : " << agent_info->se_num << std::endl;
std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
}
return true;
}
uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, void* packet) {
const uint32_t slot_size_b = 0x40;
// adevance command queue
const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue);
hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
sched_yield();
}
uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
uint32_t* queue_slot = (uint32_t*)((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
uint32_t* slot_data = (uint32_t*)packet;
// Copy buffered commands into the queue slot.
// Overwrite the AQL invalid header (first dword) last.
// This prevents the slot from being read until it's fully written.
memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
std::atomic<uint32_t>* header_atomic_ptr = reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
header_atomic_ptr->store(slot_data[0], std::memory_order_release);
// ringdoor bell
hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);
return write_idx;
}
HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL;
HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
} // namespace util
} // namespace roctracer
+283
Wyświetl plik
@@ -0,0 +1,283 @@
/**********************************************************************
Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
<95> Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
<95> Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
********************************************************************/
#ifndef _HSA_RSRC_FACTORY_H_
#define _HSA_RSRC_FACTORY_H_
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <hsa_ext_finalize.h>
#include <hsa_ven_amd_aqlprofile.h>
#include <hsa_ven_amd_loader.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <mutex>
#include <map>
#include <string>
#include <vector>
#define HSA_ARGUMENT_ALIGN_BYTES 16
#define HSA_QUEUE_ALIGN_BYTES 64
#define HSA_PACKET_ALIGN_BYTES 64
#define CHECK_STATUS(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
exit(1); \
}
#define CHECK_ITER_STATUS(msg, status) \
if (status != HSA_STATUS_INFO_BREAK) { \
const char* emsg = 0; \
hsa_status_string(status, &emsg); \
printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>"); \
exit(1); \
}
namespace roctracer {
namespace util {
static const unsigned MEM_PAGE_BYTES = 0x1000;
static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
// Encapsulates information about a Hsa Agent such as its
// handle, name, max queue size, max wavefront size, etc.
struct AgentInfo {
// Handle of Agent
hsa_agent_t dev_id;
// Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
uint32_t dev_type;
// APU flag
bool is_apu;
// Agent system index
uint32_t dev_index;
// GFXIP name
char gfxip[64];
// Name of Agent whose length is less than 64
char name[64];
// Max size of Wavefront size
uint32_t max_wave_size;
// Max size of Queue buffer
uint32_t max_queue_size;
// Hsail profile supported by agent
hsa_profile_t profile;
#if 0
// Memory region supporting kernel parameters
hsa_region_t coarse_region;
// Memory region supporting kernel arguments
hsa_region_t kernarg_region;
#endif
// CPU/GPU/kern-arg memory pools
hsa_amd_memory_pool_t cpu_pool;
hsa_amd_memory_pool_t gpu_pool;
hsa_amd_memory_pool_t kern_arg_pool;
// The number of compute unit available in the agent.
uint32_t cu_num;
// Maximum number of waves possible in a Compute Unit.
uint32_t waves_per_cu;
// Number of SIMD's per compute unit CU
uint32_t simds_per_cu;
// Number of Shader Engines (SE) in Gpu
uint32_t se_num;
// Number of Shader Arrays Per Shader Engines in Gpu
uint32_t shader_arrays_per_se;
};
class HsaRsrcFactory {
public:
typedef std::recursive_mutex mutex_t;
static HsaRsrcFactory* Create(bool initialize_hsa = true) {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_ == NULL) {
instance_ = new HsaRsrcFactory(initialize_hsa);
}
return instance_;
}
static HsaRsrcFactory& Instance() {
if (instance_ == NULL) instance_ = Create(false);
hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
return *instance_;
}
static void Destroy() {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_) delete instance_;
instance_ = NULL;
}
// Return system agent info
const AgentInfo* GetAgentInfo(const hsa_agent_t agent);
// Get the count of Hsa Gpu Agents available on the platform
// @return uint32_t Number of Gpu agents on platform
uint32_t GetCountOfGpuAgents();
// Get the count of Hsa Cpu Agents available on the platform
// @return uint32_t Number of Cpu agents on platform
uint32_t GetCountOfCpuAgents();
// Get the AgentInfo handle of a Gpu device
// @param idx Gpu Agent at specified index
// @param agent_info Output parameter updated with AgentInfo
// @return bool true if successful, false otherwise
bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
// Get the AgentInfo handle of a Cpu device
// @param idx Cpu Agent at specified index
// @param agent_info Output parameter updated with AgentInfo
// @return bool true if successful, false otherwise
bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
// Create a Queue object and return its handle. The queue object is expected
// to support user requested number of Aql dispatch packets.
// @param agent_info Gpu Agent on which to create a queue object
// @param num_Pkts Number of packets to be held by queue
// @param queue Output parameter updated with handle of queue object
// @return bool true if successful, false otherwise
bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
// Create a Signal object and return its handle.
// @param value Initial value of signal object
// @param signal Output parameter updated with handle of signal object
// @return bool true if successful, false otherwise
bool CreateSignal(uint32_t value, hsa_signal_t* signal);
// Allocate local GPU memory
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size);
// Allocate memory tp pass kernel parameters
// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size);
// Allocate system memory accessible from both CPU and GPU
// Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored.
// @param agent_info Agent from whose memory region to allocate
// @param size Size of memory in terms of bytes
// @return uint8_t* Pointer to buffer, null if allocation fails.
uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size);
// Copy data from GPU to host memory
bool CopyToHost(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
bool CopyToHost(const AgentInfo* agent_info, void* dst, const void* src, size_t size);
// Loads an Assembled Brig file and Finalizes it into Device Isa
// @param agent_info Gpu device for which to finalize
// @param brig_path File path of the Assembled Brig file
// @param kernel_name Name of the kernel to finalize
// @param code_desc Handle of finalized Code Descriptor that could
// be used to submit for execution
// @return true if successful, false otherwise
bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
// Print the various fields of Hsa Gpu Agents
bool PrintGpuAgents(const std::string& header);
// Submit AQL packet to given queue
static uint64_t Submit(hsa_queue_t* queue, void* packet);
// Return AqlProfile API table
typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
// Return Loader API table
const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
private:
// System agents iterating callback
static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
// Callback function to find and bind kernarg region of an agent
static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
// Load AQL profile HSA extension library directly
static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
// Constructor of the class. Will initialize the Hsa Runtime and
// query the system topology to get the list of Cpu and Gpu devices
HsaRsrcFactory(bool initialize_hsa);
// Destructor of the class
~HsaRsrcFactory();
// HSA was initialized
const bool initialize_hsa_;
// Add an instance of AgentInfo representing a Hsa Gpu agent
const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
static HsaRsrcFactory* instance_;
static mutex_t mutex_;
// Used to maintain a list of Hsa Gpu Agent Info
std::vector<const AgentInfo*> gpu_list_;
std::vector<hsa_agent_t> gpu_agents_;
// Used to maintain a list of Hsa Cpu Agent Info
std::vector<const AgentInfo*> cpu_list_;
std::vector<hsa_agent_t> cpu_agents_;
// System agents map
std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
// AqlProfile API table
aqlprofile_pfn_t aqlprofile_api_;
// Loader API table
hsa_ven_amd_loader_1_00_pfn_t loader_api_;
};
} // namespace util
} // namespace roctracer
#endif // _HSA_RSRC_FACTORY_H_
+169
Wyświetl plik
@@ -0,0 +1,169 @@
#ifndef SRC_UTIL_LOGGER_H_
#define SRC_UTIL_LOGGER_H_
#include <time.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/file.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <sstream>
#include <fstream>
#include <exception>
#include <mutex>
#include <map>
namespace roctracer {
namespace util {
class Logger {
public:
typedef std::recursive_mutex mutex_t;
template <typename T> Logger& operator<<(const T& m) {
std::ostringstream oss;
oss << m;
if (!streaming_)
Log(oss.str());
else
Put(oss.str());
streaming_ = true;
return *this;
}
typedef void (*manip_t)();
Logger& operator<<(manip_t f) {
f();
return *this;
}
static void begm() { Instance().ResetStreaming(true); }
static void endl() { Instance().ResetStreaming(false); }
static const std::string& LastMessage() {
Logger& logger = Instance();
std::lock_guard<mutex_t> lck(mutex_);
return logger.message_[GetTid()];
}
static Logger* Create() {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_ == NULL) instance_ = new Logger();
return instance_;
}
static void Destroy() {
std::lock_guard<mutex_t> lck(mutex_);
if (instance_ != NULL) delete instance_;
instance_ = NULL;
}
static Logger& Instance() {
Create();
return *instance_;
}
private:
static uint32_t GetPid() { return syscall(__NR_getpid); }
static uint32_t GetTid() { return syscall(__NR_gettid); }
Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) {
const char* path = getenv("ROCTRACER_LOG");
if (path != NULL) {
file_ = fopen("/tmp/roctracer_log.txt", "a");
}
ResetStreaming(false);
}
~Logger() {
if (file_ != NULL) {
if (dirty_) Put("\n");
fclose(file_);
}
}
void ResetStreaming(const bool messaging) {
std::lock_guard<mutex_t> lck(mutex_);
if (messaging) {
message_[GetTid()] = "";
} else if (streaming_) {
Put("\n");
dirty_ = false;
}
messaging_ = messaging;
streaming_ = messaging;
}
void Put(const std::string& m) {
std::lock_guard<mutex_t> lck(mutex_);
if (messaging_) {
message_[GetTid()] += m;
}
if (file_ != NULL) {
dirty_ = true;
flock(fileno(file_), LOCK_EX);
fprintf(file_, "%s", m.c_str());
fflush(file_);
flock(fileno(file_), LOCK_UN);
}
}
void Log(const std::string& m) {
const time_t rawtime = time(NULL);
tm tm_info;
localtime_r(&rawtime, &tm_info);
char tm_str[26];
strftime(tm_str, 26, "%Y-%m-%d %H:%M:%S", &tm_info);
std::ostringstream oss;
oss << "<" << tm_str << std::dec << " pid" << GetPid() << " tid" << GetTid() << "> " << m;
Put(oss.str());
}
FILE* file_;
bool dirty_;
bool streaming_;
bool messaging_;
static mutex_t mutex_;
static Logger* instance_;
std::map<uint32_t, std::string> message_;
};
} // namespace util
} // namespace roctracer
#define ERR_LOGGING(stream) \
{ \
roctracer::util::Logger::Instance() << "error: " << roctracer::util::Logger::begm \
<< stream << roctracer::util::Logger::endl; \
}
#define INFO_LOGGING(stream) \
{ \
roctracer::util::Logger::Instance() << "info: " << roctracer::util::Logger::begm << stream \
<< roctracer::util::Logger::endl; \
}
#define WARN_LOGGING(stream) \
{ \
std::cerr << "ROCProfiler: " << stream << std::endl; \
roctracer::util::Logger::Instance() << "warning: " << roctracer::util::Logger::begm << stream \
<< roctracer::util::Logger::endl; \
}
#ifdef DEBUG
#define DBG_LOGGING(stream) \
{ \
roctracer::util::Logger::Instance() << roctracer::util::Logger::begm << "debug: \"" \
<< stream << "\"" < < < < \
" in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \
<< roctracer::util::Logger::endl; \
}
#endif
#endif // SRC_UTIL_LOGGER_H_
+36
Wyświetl plik
@@ -0,0 +1,36 @@
ROOT_PATH=../..
LIB_PATH=$(ROOT_PATH)/b
LIB_NAME=roctracer64
export HCC_HOME=/home/evgeny/git/compute/out/ubuntu-16.04/16.04/hcc
HIP_PATH=/home/evgeny/git/compute/external/hip/hip
HIPCC=$(HIP_PATH)/bin/hipcc
SOURCES = MatrixTranspose.cpp
OBJECTS = $(SOURCES:.cpp=.o)
EXECUTABLE=./MatrixTranspose
export LD_LIBRARY_PATH=$(LIB_PATH)
.PHONY: test
all: $(EXECUTABLE) test
CXXFLAGS =-g -DCOMPILE_HIP_ATP_MARKER=1 -I$(ROOT_PATH)
CXX=$(HIPCC)
$(EXECUTABLE): $(OBJECTS)
$(HIPCC) $(OBJECTS) -o $@ -L/home/evgeny/git/compute/out/ubuntu-16.04/16.04/hcc/lib -lmcwamp_hsa -L$(LIB_PATH) -l$(LIB_NAME)
test: $(EXECUTABLE)
HCC_PROFILE=1 $(EXECUTABLE)
clean:
rm -f $(EXECUTABLE)
rm -f $(OBJECTS)
rm -f $(HIP_PATH)/src/*.o
Plik binarny nie jest wyświetlany.
@@ -0,0 +1,269 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <iostream>
#include <string>
// hip header file
#include "hip/hip_runtime.h"
#include "hip/hip_cbstr.h"
#include "inc/roctracer.h"
#define WIDTH 1024
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1
// Device (Kernel) function, it must be void
// hipLaunchParm provides the execution configuration
__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
out[y * width + x] = in[x * width + y];
}
// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
}
}
}
void init_tracing();
void finish_tracing();
int main() {
init_tracing();
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
float* gpuMatrix;
float* gpuTransposeMatrix;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
std::cout << "Device name " << devProp.name << std::endl;
int i;
int errors;
int iterations = 10;
begin:
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
// Lauching kernel from host
hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
gpuMatrix, WIDTH);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
} else {
printf("PASSED!\n");
}
// free the resources on device side
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
// free the resources on host side
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
if ((errors == 0) && (--iterations != 0)) goto begin;
finish_tracing();
return errors;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// HIP Callbacks/Activity tracing
//
// Macro to check ROC-tracer calls status
#define ROCTRACER_CALL(call) \
do { \
int err = call; \
if (err != 0) { \
std::cerr << roctracer_error_string() << std::endl << std::flush; \
abort(); \
} \
} while (0)
struct ihipModuleSymbol_t {
uint64_t _object; // The kernel object.
uint32_t _groupSegmentSize;
uint32_t _privateSegmentSize;
std::string _name; // TODO - review for performance cost. Name is just used for debug.
};
// HIP API callback function
extern "C" void hip_api_callback(
uint32_t domain,
uint32_t cid,
const void* callback_data,
void* arg)
{
(void)arg;
const hip_cb_data_t* data = reinterpret_cast<const hip_cb_data_t*>(callback_data);
fprintf(stdout, "<%s id(%u)\tcorrelation_id(%u) %s> ",
data->name,
cid,
data->correlation_id,
(data->phase == ROCTRACER_API_PHASE_ENTER) ? "on-enter" : "on-exit");
if (data->phase == ROCTRACER_API_PHASE_ENTER) {
switch (cid) {
case HIP_API_ID_hipMemcpy:
fprintf(stdout, "dst(%p) src(%p) size(0x%x) kind(%u)",
data->args.hipMemcpy.dst,
data->args.hipMemcpy.src,
(uint32_t)(data->args.hipMemcpy.sizeBytes),
(uint32_t)(data->args.hipMemcpy.kind));
break;
case HIP_API_ID_hipMalloc:
fprintf(stdout, "ptr(%p) size(0x%x)",
data->args.hipMalloc.ptr,
(uint32_t)(data->args.hipMalloc.sizeBytes));
break;
case HIP_API_ID_hipFree:
fprintf(stdout, "ptr(%p)",
data->args.hipFree.ptr);
break;
case HIP_API_ID_hipModuleLaunchKernel:
fprintf(stdout, "kernel(%s) straem(%p)",
data->args.hipModuleLaunchKernel.f->_name.c_str(),
data->args.hipModuleLaunchKernel.stream);
break;
case HIP_API_ID_hipLaunchKernel:
fprintf(stdout, "kernel(%p) straem(%p)",
data->args.hipLaunchKernel.kernel,
data->args.hipLaunchKernel.stream);
break;
case HIP_API_ID_hipKernel:
fprintf(stdout, "kernel(\"%s\") start(%lu) end(%lu)",
data->args.hipKernel.name,
data->args.hipKernel.start,
data->args.hipKernel.end);
break;
default:
break;
}
} else {
switch (cid) {
case HIP_API_ID_hipMalloc:
fprintf(stdout, "*ptr(0x%p)",
*(data->args.hipMalloc.ptr));
break;
default:
break;
}
}
fprintf(stdout, "\n"); fflush(stdout);
}
// Activity tracing callback
// hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067)
void activity_callback(const char* begin, const char* end, void* arg) {
const roctracer_record_t* record = reinterpret_cast<const roctracer_record_t*>(begin);
const roctracer_record_t* next = NULL;
ROCTRACER_CALL(roctracer_next_record(record, &next));
fprintf(stdout, "\tActivity records:\n"); fflush(stdout);
while (reinterpret_cast<const char*>(next) <= end) {
fprintf(stdout, "\t%s id(%u)\tcorrelation_id(%lu): begin_ns(%lu) end_ns(%lu)\n",
record->name,
record->activity_kind,
record->correlation_id,
record->begin_ns,
record->end_ns); fflush(stdout);
record = next;
ROCTRACER_CALL(roctracer_next_record(record, &next));
}
}
// Initialize function
void init_tracing() {
// Enable HIP API callbacks
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMemcpy, hip_api_callback, NULL));
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMalloc, hip_api_callback, NULL));
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipFree, hip_api_callback, NULL));
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipLaunchKernel, hip_api_callback, NULL));
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipModuleLaunchKernel, hip_api_callback, NULL));
ROCTRACER_CALL(roctracer_enable_api_callback(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipKernel, hip_api_callback, NULL));
// Enable HIP activity tracing
roctracer_properties_t properties{};
properties.buffer_size = 0x100;
properties.buffer_callback_fun = activity_callback;
ROCTRACER_CALL(roctracer_open_pool(&properties));
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMemcpy));
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipMalloc));
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipFree));
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipLaunchKernel));
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipModuleLaunchKernel));
ROCTRACER_CALL(roctracer_enable_api_activity(ROCTRACER_API_DOMAIN_HIP, HIP_API_ID_hipKernel));
}
void finish_tracing() {
ROCTRACER_CALL(roctracer_close_pool());
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Plik binarny nie jest wyświetlany.
+100
Wyświetl plik
@@ -0,0 +1,100 @@
## Writing first HIP program ###
This tutorial shows how to get write simple HIP application. We will write the simplest Matrix Transpose program.
## HIP Introduction:
HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPUs. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your applications performance on GPU hardware.
## Requirement:
For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md)
## prerequiste knowledge:
Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
## Simple Matrix Transpose
Here is simple example showing how to write your first program in HIP.
In order to use the HIP framework, we need to add the "hip_runtime.h" header file. SInce its c++ api you can add any header file you have been using earlier while writing your c/c++ program. For gpgpu programming, we have host(microprocessor) and the device(gpu).
## Device-side code
We will work on device side code first, Here is simple example showing a snippet of HIP device side code:
`__global__ void matrixTranspose(hipLaunchParm lp, `
` float *out, `
` float *in, `
` const int width, `
` const int height) `
`{ `
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; `
` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; `
` `
` out[y * width + x] = in[x * height + y]; `
`} `
`__global__` keyword is the Function-Type Qualifiers, it is used with functions that are executed on device and are called/launched from the hosts.
other function-type qualifiers are:
`__device__` functions are Executed on the device and Called from the device only
`__host__` functions are Executed on the host and Called from the host
`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
`__host__` cannot combine with `__global__`.
`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*.
Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
After `hipLaunchParm`, Kernel arguments follows next(i.e., `float *out, float *in, const int width, const int height`).
The kernel function begins with
` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;`
here the keyword hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z(not used here) are the built-in functions to identify the threads in a block. The keyword hipBlockDim_x, hipBlockDim_y and hipBlockDim_z(not used here) are to identify the dimensions of the block.
We are familiar with rest of the code on device-side.
## Host-side code
Now, we'll see how to call the kernel from the host. Inside the main() function, we first defined the pointers(for both, the host-side as well as device). The declaration of device pointer is similar to that of the host. Next, we have `hipDeviceProp_t`, it is the pre-defined struct for hip device properties. This is followed by `hipGetDeviceProperties(&devProp, 0)` It is used to extract the device information. The first parameter is the struct, second parameter is the device number to get properties for. Next line print the name of the device.
We allocated memory to the Matrix on host side by using malloc and initiallized it. While in order to allocate memory on device side we will be using `hipMalloc`, it's quiet similar to that of malloc instruction. After this, we will copy the data to the allocated memory on device-side using `hipMemcpy`.
` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.
Now, we'll see how to launch the kernel.
` hipLaunchKernel(matrixTranspose, `
` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), `
` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), `
` 0, 0, `
` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); `
HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
- Kernels launch with the `"hipLaunchKernel"` function
- The first five parameters to hipLaunchKernel are the following:
- **symbol kernelName**: the name of the kernel to launch. To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
- **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)".
- **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)".
- **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel. In MatrixTranspose sample, it's '0'.
- **hipStream_t**: stream where the kernel should execute. A value of 0 corresponds to the NULL stream.In MatrixTranspose sample, it's '0'.
- Kernel arguments follow these first five parameters. Here, these are "gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT".
Next, we'll copy the computed values/data back to the device using the `hipMemcpy`. Here the last parameter will be `hipMemcpyDeviceToHost`
After, copying the data from device to memory, we will verify it with the one we computed with the cpu reference funtion.
Finally, we will free the memory allocated earlier by using free() for host while for devices we will use `hipFree`.
## How to build and run:
Use the make command and execute it using ./exe
Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
## More Info:
- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md)
- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md)
- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP)
- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md)
- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
- [hipify-clang](https://github.com/ROCm-Developer-Tools/HIP/hipify-clang/README.md)
- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md)
- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md)
+31
Wyświetl plik
@@ -0,0 +1,31 @@
HCC_PROFILE=2 ./MatrixTranspose
Device name Device 67df
<hipMalloc id(3) correlation_id(1) on-enter> ptr(0x7ffd20812ac8) size(0x400000)
<hipMalloc id(3) correlation_id(1) on-exit> *ptr(0x0x901800000)
<hipMalloc id(3) correlation_id(2) on-enter> ptr(0x7ffd20812ac0) size(0x400000)
<hipMalloc id(3) correlation_id(2) on-exit> *ptr(0x0x901e00000)
<hipMemcpy id(101) correlation_id(3) on-enter> dst(0x901800000) src(0x7f8d7de53010) size(0x400000) kind(1)
<hipMemcpy id(101) correlation_id(3) on-exit>
<hipLaunchKernel id(38) correlation_id(4) on-enter> kernel(0x457010) straem((nil))
<hipModuleLaunchKernel id(118) correlation_id(5) on-enter> kernel(_Z15matrixTransposeN8hip_impl17Empty_launch_parmEPfS1_i) straem((nil))
<hipModuleLaunchKernel id(118) correlation_id(5) on-exit>
<hipLaunchKernel id(38) correlation_id(4) on-exit>
<hipMemcpy id(101) correlation_id(6) on-enter> dst(0x7f8d7d567010) src(0x901e00000) size(0x400000) kind(2)
<hipKernel id(133) correlation_id(7) on-enter> kernel("_Z15matrixTransposeN8hip_impl17Empty_launch_parmEPfS1_i") start(83839566647773) end(83839568478493)
<hipKernel id(133) correlation_id(7) on-exit>
<hipMemcpy id(101) correlation_id(6) on-exit>
PASSED!
<hipFree id(80) correlation_id(8) on-enter> ptr(0x901800000)
<hipFree id(80) correlation_id(8) on-exit>
<hipFree id(80) correlation_id(9) on-enter> ptr(0x901e00000)
<hipFree id(80) correlation_id(9) on-exit>
Activity records:
hipMalloc id(3) correlation_id(1): begin_ns(1525888625904709559) end_ns(1525888625904912781)
hipMalloc id(3) correlation_id(2): begin_ns(1525888625904916451) end_ns(1525888625905101986)
hipMemcpy id(101) correlation_id(3): begin_ns(1525888625905109198) end_ns(1525888625906165995)
hipModuleLaunchKernel id(118) correlation_id(5): begin_ns(1525888625906184286) end_ns(1525888625906210781)
hipLaunchKernel id(38) correlation_id(4): begin_ns(1525888625906171593) end_ns(1525888625906212767)
hipKernel id(133) correlation_id(7): begin_ns(1525888625908107416) end_ns(1525888625908114094)
hipMemcpy id(101) correlation_id(6): begin_ns(1525888625906214261) end_ns(1525888625908785157)
hipFree id(80) correlation_id(8): begin_ns(1525888625924823273) end_ns(1525888625924893910)
hipFree id(80) correlation_id(9): begin_ns(1525888625924896385) end_ns(1525888625924922452)