Documentation + Miscellaneous Fixes (#36)
* Added documentation markdown source
* Replaced AARInternal with AMDResearch in URLs
* Renamed cpack artifact names
* Fix to testing and lulesh submodule checkout
* Docker updates
* CMake and CPack
- force CMAKE_INSTALL_LIBDIR to lib
- CPACK_DEBIAN_PACKAGE_RELEASE uses OMNITRACE_CPACK_SYSTEM_NAME
- CPACK_RPM_PACKAGE_RELEASE uses OMNITRACE_CPACK_SYSTEM_NAME
- Tweak LIBOMP_LIBRARY find in examples/openmp
- Tweak setup-env.sh.in
* Partial update of README
- status badges
- docs link
- removed install info (covered by docs)
* OMNITRACE_SAMPLING_CPUS setting
- enables control over which CPUs are sampled for frequency
* omnitrace exe updates
- exclude transaction clone, virtual thunk, non-virtual thunk
- module_function::start_address
- module_function::instructions
- verbosity > 0 encodes instructions into JSON
* Miscellaneous fixes
- relocate setup-env.sh.in
- add modulefile.in
- Updated README.md and source/docs/about.md
- cmake fix for libomp
- fix license in miscellaneous places
- dl.hpp and dl.cpp
* Update timemory and dyninst submodules
- timemory signals updates
- dyninst Movement-adhoc updates
* cmake format
[ROCm/rocprofiler-systems commit: 945f541965]
Bu işleme şunda yer alıyor:
işlemeyi yapan:
GitHub
ebeveyn
4ddb8405ac
işleme
127e30a4d7
+2
-2
@@ -136,7 +136,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: stgz-installers
|
||||
name: ubuntu-bionic-rocm-stgz-installers
|
||||
path: |
|
||||
build-release/omnitrace-*.sh
|
||||
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: deb-installers
|
||||
name: ubuntu-bionic-rocm-deb-installers
|
||||
path: |
|
||||
build-release/omnitrace_*.deb
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: stgz-installers
|
||||
name: ubuntu-bionic-stgz-installers
|
||||
path: |
|
||||
build-release/omnitrace-*.sh
|
||||
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: deb-installers
|
||||
name: ubuntu-bionic-deb-installers
|
||||
path: |
|
||||
build-release/omnitrace_*.deb
|
||||
|
||||
|
||||
+2
-2
@@ -136,7 +136,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: stgz-installers
|
||||
name: ubuntu-focal-rocm-stgz-installers
|
||||
path: |
|
||||
build-release/omnitrace-*.sh
|
||||
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: deb-installers
|
||||
name: ubuntu-focal-rocm-deb-installers
|
||||
path: |
|
||||
build-release/omnitrace_*.deb
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: stgz-installers
|
||||
name: ubuntu-focal-stgz-installers
|
||||
path: |
|
||||
build-release/omnitrace-*.sh
|
||||
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: deb-installers
|
||||
name: ubuntu-focal-deb-installers
|
||||
path: |
|
||||
build-release/omnitrace_*.deb
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ project(
|
||||
LANGUAGES C CXX
|
||||
VERSION ${OMNITRACE_VERSION}
|
||||
DESCRIPTION "CPU/GPU Application tracing with static/dynamic binary instrumentation"
|
||||
HOMEPAGE_URL "https://github.com/AARInternal/omnitrace")
|
||||
HOMEPAGE_URL "https://github.com/AMDResearch/omnitrace")
|
||||
|
||||
message(
|
||||
STATUS
|
||||
@@ -54,11 +54,17 @@ include(MacroUtilities) # various functions and macros
|
||||
include(Compilers) # compiler identification
|
||||
include(BuildSettings) # compiler flags
|
||||
|
||||
# force this because dyninst always installs to lib
|
||||
set(CMAKE_INSTALL_LIBDIR
|
||||
"lib"
|
||||
CACHE STRING "Object code libraries (lib)" FORCE)
|
||||
set(CMAKE_CXX_STANDARD
|
||||
17
|
||||
CACHE STRING "CXX language standard")
|
||||
omnitrace_add_feature(CMAKE_CXX_STANDARD "CXX language standard")
|
||||
omnitrace_add_feature(CMAKE_BUILD_TYPE "Build optimization level")
|
||||
omnitrace_add_feature(CMAKE_INSTALL_PREFIX "Installation prefix")
|
||||
omnitrace_add_feature(CMAKE_CXX_COMPILER "C++ compiler")
|
||||
omnitrace_add_feature(CMAKE_CXX_STANDARD "CXX language standard")
|
||||
omnitrace_add_option(CMAKE_CXX_STANDARD_REQUIRED "Require C++ language standard" ON)
|
||||
omnitrace_add_option(CMAKE_CXX_EXTENSIONS "Compiler specific language extensions" OFF)
|
||||
omnitrace_add_option(CMAKE_INSTALL_RPATH_USE_LINK_PATH "Enable rpath to linked libraries"
|
||||
@@ -170,8 +176,13 @@ add_subdirectory(source)
|
||||
#
|
||||
# ------------------------------------------------------------------------------#
|
||||
|
||||
configure_file(${PROJECT_SOURCE_DIR}/scripts/setup-env.sh.in
|
||||
${PROJECT_BINARY_DIR}/scripts/setup-env.sh @ONLY)
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/Templates/setup-env.sh.in
|
||||
${PROJECT_BINARY_DIR}/install-tree/setup-env.sh @ONLY)
|
||||
|
||||
configure_file(
|
||||
${PROJECT_SOURCE_DIR}/cmake/Templates/modulefile.in
|
||||
${PROJECT_BINARY_DIR}/install-tree/modulefiles/${PROJECT_NAME}/${OMNITRACE_VERSION}
|
||||
@ONLY)
|
||||
|
||||
install(
|
||||
PROGRAMS ${PROJECT_SOURCE_DIR}/scripts/omnitrace-merge.jl
|
||||
@@ -184,20 +195,22 @@ install(
|
||||
OPTIONAL)
|
||||
|
||||
install(
|
||||
FILES ${PROJECT_BINARY_DIR}/scripts/setup-env.sh
|
||||
FILES ${PROJECT_BINARY_DIR}/install-tree/setup-env.sh
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}
|
||||
OPTIONAL)
|
||||
|
||||
install(
|
||||
FILES
|
||||
${PROJECT_BINARY_DIR}/install-tree/modulefiles/${PROJECT_NAME}/${OMNITRACE_VERSION}
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/modulefiles/${PROJECT_NAME}
|
||||
OPTIONAL)
|
||||
|
||||
# ------------------------------------------------------------------------------#
|
||||
#
|
||||
# examples
|
||||
#
|
||||
# ------------------------------------------------------------------------------#
|
||||
|
||||
if(OMNITRACE_BUILD_LTO)
|
||||
omnitrace_restore_variables(LTO VARIABLES CMAKE_INTERPROCEDURAL_OPTIMIZATION)
|
||||
endif()
|
||||
|
||||
if(OMNITRACE_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
endif()
|
||||
|
||||
@@ -1,71 +1,13 @@
|
||||
# omnitrace: application tracing with static/dynamic binary instrumentation
|
||||
|
||||
It is highly recommended to use the pre-built binary installers for omnitrace, which are provided in the "Assets" section of each release.
|
||||
[](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-bionic.yml)
|
||||
[](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-external.yml)
|
||||
[](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-dyninst-package.yml)
|
||||
[](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal.yml)
|
||||
[](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-external-rocm.yml)
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Ubuntu 18.04 or Ubuntu 20.04
|
||||
- Other OS distributions may be supported but are not tested
|
||||
- GCC compiler v7+
|
||||
- Older GCC compilers may be supported but are not tested
|
||||
- Clang compilers are generally supported for Omnitrace but not Dyninst
|
||||
- [CMake](https://cmake.org/) v3.15+
|
||||
- [DynInst](https://github.com/dyninst/dyninst) for dynamic or static instrumentation
|
||||
- [TBB](https://github.com/oneapi-src/oneTBB) required by Dyninst
|
||||
- [ElfUtils](https://sourceware.org/elfutils/) required by Dyninst
|
||||
- [LibIberty](https://github.com/gcc-mirror/gcc/tree/master/libiberty) required by Dyninst
|
||||
- [Boost](https://www.boost.org/) required by Dyninst
|
||||
- [OpenMP](https://www.openmp.org/) optional by Dyninst
|
||||
- [ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu) (optional)
|
||||
- HIP
|
||||
- Roctracer for HIP API and kernel tracing
|
||||
- [PAPI](https://icl.utk.edu/papi/)
|
||||
- [libunwind](https://www.nongnu.org/libunwind/) for call-stack sampling
|
||||
- Several optional third-party profiling tools supported by timemory (e.g. TAU, Caliper, CrayPAT, etc.)
|
||||
|
||||
## Installing CMake
|
||||
|
||||
If using Ubuntu 20.04, `apt-get install cmake` will install cmake v3.16.3. If using Ubuntu 18.04, the cmake version via apt is too old (v3.10.2). In this case, run:
|
||||
|
||||
```console
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
export PATH=${HOME}/.local/bin:${PATH}
|
||||
```
|
||||
|
||||
## Installing DynInst
|
||||
|
||||
The easiest way to install Dyninst is to configure omnitrace with `-DOMNITRACE_BUILD_DYNINST` and have Dyninst install its dependencies:
|
||||
`-DDyninst_BUILD_TBB=ON -DDyninst_BUILD_ELFUTILS=ON -DDyninst_BUILD_BOOST=ON -DDyninst_BUILD_LIBIBERTY=ON`.
|
||||
|
||||
```shell
|
||||
git clone https://github.com/spack/spack.git
|
||||
source ./spack/share/spack/setup-env.sh
|
||||
spack compiler find
|
||||
spack external find
|
||||
spack install dyninst
|
||||
spack load -r dyninst
|
||||
```
|
||||
|
||||
## Installing omnitrace
|
||||
|
||||
Omnitrace can have full MPI support (`-DOMNITRACE_USE_MPI=ON`) or partial MPI support (`-DOMNITRACE_USE_MPI_HEADERS=ON`). The only difference between these two modes
|
||||
is whether or not the results collected via timemory can be aggregated into one output file. If full MPI support is selected, make sure your target application
|
||||
is built against the same MPI distribution as omnitrace, i.e. do not build omnitrace with MPICH and use it on a target application built against OpenMPI.
|
||||
If partial support is selected, build omnitrace against OpenMPI -- the reason this is recommended is because the `MPI_COMM_WORLD` in OpenMPI is a pointer to
|
||||
`ompi_communicator_t` (8 bytes) whereas `MPI_COMM_WORLD` in MPICH is an `int` (4 bytes). Building omnitrace with partial MPI support and the MPICH header and using it
|
||||
on an application using OpenMPI will thus implicitly cast `MPI_COMM_WORLD` to 4 bytes in the MPI function wrappers before calling the underlying OpenMPI function
|
||||
resulting in an incorrect address for `ompi_communicator_t` whereas partial MPI support with the OpenMPI headers does not cast `MPI_COMM_WORLD` into a smaller datatype
|
||||
when used with MPICH.
|
||||
|
||||
```shell
|
||||
OMNITRACE_ROOT=${HOME}/sw/omnitrace
|
||||
git clone https://github.com/AARInternal/omnitrace.git
|
||||
cmake -B build-omnitrace -DOMNITRACE_USE_MPI=ON -DCMAKE_INSTALL_PREFIX=${OMNITRACE_ROOT} omnitrace
|
||||
cmake --build build-omnitrace --target all --parallel 8
|
||||
cmake --build build-omnitrace --target install
|
||||
export PATH=${OMNITRACE_ROOT}/bin:${PATH}
|
||||
export LD_LIBRARY_PATH=${OMNITRACE_ROOT}/lib64:${OMNITRACE_ROOT}/lib:${LD_LIBRARY_PATH}
|
||||
```
|
||||
Omnitrace is an AMD research project and should not be treated as an official part of the ROCm software stack.
|
||||
The documentation for omnitrace is available at [amdresearch.github.io/omnitrace](https://amdresearch.github.io/omnitrace/).
|
||||
|
||||
## Using Omnitrace Executable
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
if(DYNINST_BUILD_ELFUTILS AND DYNINST_ELFUTILS_DOWNLOAD_VERSION)
|
||||
omnitrace_add_feature(DYNINST_ELFUTILS_DOWNLOAD_VERSION "ElfUtils download version")
|
||||
foreach(_LIB dw elf)
|
||||
install(
|
||||
FILES
|
||||
PROGRAMS
|
||||
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib/lib${_LIB}${CMAKE_SHARED_LIBRARY_SUFFIX}
|
||||
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib/lib${_LIB}${CMAKE_SHARED_LIBRARY_SUFFIX}.1
|
||||
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib/lib${_LIB}-${DYNINST_ELFUTILS_DOWNLOAD_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}
|
||||
@@ -32,6 +33,7 @@ set(CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
|
||||
set(CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
|
||||
set(CPACK_PACKAGE_CONTACT "jonathan.madsen@amd.com")
|
||||
set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE")
|
||||
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
|
||||
set(OMNITRACE_CPACK_SYSTEM_NAME
|
||||
"${_SYSTEM_NAME}"
|
||||
CACHE STRING "System name, e.g. Linux or Ubuntu-18.04")
|
||||
@@ -101,8 +103,9 @@ omnitrace_add_feature(OMNITRACE_PACKAGE_FILE_NAME "CPack filename")
|
||||
#
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/AARInternal/omnitrace")
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE "${CMAKE_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
|
||||
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/AMDResearch/omnitrace")
|
||||
set(CPACK_DEBIAN_PACKAGE_RELEASE
|
||||
"${OMNITRACE_CPACK_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
|
||||
string(REGEX REPLACE "([a-zA-Z])-([0-9])" "\\1\\2" CPACK_DEBIAN_PACKAGE_RELEASE
|
||||
"${CPACK_DEBIAN_PACKAGE_RELEASE}")
|
||||
string(REPLACE "-" "~" CPACK_DEBIAN_PACKAGE_RELEASE "${CPACK_DEBIAN_PACKAGE_RELEASE}")
|
||||
@@ -166,7 +169,8 @@ if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)
|
||||
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_PACKAGING_INSTALL_PREFIX}")
|
||||
endif()
|
||||
|
||||
set(CPACK_RPM_PACKAGE_RELEASE "${CMAKE_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
|
||||
set(CPACK_RPM_PACKAGE_RELEASE
|
||||
"${OMNITRACE_CPACK_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
|
||||
string(REGEX REPLACE "([a-zA-Z])-([0-9])" "\\1\\2" CPACK_RPM_PACKAGE_RELEASE
|
||||
"${CPACK_RPM_PACKAGE_RELEASE}")
|
||||
string(REPLACE "-" "~" CPACK_RPM_PACKAGE_RELEASE "${CPACK_RPM_PACKAGE_RELEASE}")
|
||||
|
||||
@@ -190,10 +190,7 @@ function(ROCM_VERSION_PARSE_VERSION_FILES)
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
# search for HIP to set ROCM_PATH
|
||||
if(NOT hip_FOUND)
|
||||
find_package(hip)
|
||||
endif()
|
||||
# search for HIP to set ROCM_PATH if(NOT hip_FOUND) find_package(hip) endif()
|
||||
|
||||
function(COMPUTE_ROCM_VERSION_DIR)
|
||||
if(EXISTS "${ROCmVersion_VERSION_FILE}" AND IS_ABSOLUTE
|
||||
@@ -231,7 +228,7 @@ function(ROCM_VERSION_PARSE_VERSION_FILES)
|
||||
set(_PATHS ${ROCmVersion_DIR})
|
||||
else()
|
||||
set(_PATHS ${ROCmVersion_DIR} ${ROCmVersion_ROOT} ${ROCmVersion_ROOT_DIR}
|
||||
${ROCM_PATH} $ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} /opt/rocm)
|
||||
$ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} ${ROCM_PATH} /opt/rocm)
|
||||
rocm_version_message(STATUS "ROCmVersion search paths: ${_PATHS}")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -191,7 +191,7 @@ if(OMNITRACE_BUILD_DYNINST)
|
||||
omnitrace_target_compile_definitions(
|
||||
omnitrace-dyninst
|
||||
INTERFACE
|
||||
DYNINST_API_RT="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}:$<TARGET_FILE_DIR:Dyninst::dyninstAPI_RT>:${CMAKE_INSTALL_PREFIX}/lib/$<TARGET_FILE_NAME:Dyninst::dyninstAPI_RT>:$<TARGET_FILE:Dyninst::dyninstAPI_RT>"
|
||||
DYNINST_API_RT="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}:$<TARGET_FILE_DIR:Dyninst::dyninstAPI_RT>:${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/$<TARGET_FILE_NAME:Dyninst::dyninstAPI_RT>:$<TARGET_FILE:Dyninst::dyninstAPI_RT>"
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -466,6 +466,9 @@ if(NOT TARGET PTL::ptl-shared)
|
||||
set(PTL_USE_GPU OFF)
|
||||
set(PTL_DEVELOPER_INSTALL OFF)
|
||||
|
||||
if(NOT DEFINED BUILD_OBJECT_LIBS)
|
||||
set(BUILD_OBJECT_LIBS OFF)
|
||||
endif()
|
||||
omnitrace_save_variables(
|
||||
BUILD_CONFIG
|
||||
VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS BUILD_OBJECT_LIBS
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
#%Module1.0
|
||||
|
||||
module-whatis "omnitrace (version @OMNITRACE_VERSION@)"
|
||||
|
||||
proc ModulesHelp { } {
|
||||
puts stderr "Loads omnitrace v@OMNITRACE_VERSION@"
|
||||
}
|
||||
|
||||
set ROOT [file normalize [file dirname [file normalize ${ModulesCurrentModulefile}]]/../../..]
|
||||
|
||||
prepend-path CMAKE_PREFIX_PATH "${ROOT}"
|
||||
prepend-path PATH "${ROOT}/bin"
|
||||
prepend-path LD_LIBRARY_PATH "${ROOT}/@CMAKE_INSTALL_LIBDIR@"
|
||||
prepend-path PYTHONPATH "${ROOT}/@CMAKE_INSTALL_PYTHONDIR@"
|
||||
setenv @PROJECT_NAME@_DIR "${ROOT}/share/cmake/omnitrace"
|
||||
+4
-3
@@ -8,7 +8,8 @@ if [ ! -d "${BASEDIR}" ]; then
|
||||
return 1
|
||||
fi
|
||||
|
||||
export PATH=${BASEDIR}/bin:${PATH}
|
||||
export LD_LIBRARY_PATH=${BASEDIR}/@CMAKE_INSTALL_LIBDIR@:${LD_LIBRARY_PATH}
|
||||
PATH=${BASEDIR}/bin:${PATH}
|
||||
LD_LIBRARY_PATH=${BASEDIR}/@CMAKE_INSTALL_LIBDIR@:${LD_LIBRARY_PATH}
|
||||
|
||||
return 0
|
||||
export PATH
|
||||
export LD_LIBRARY_PATH
|
||||
@@ -0,0 +1,57 @@
|
||||
ARG DISTRO=centos
|
||||
ARG VERSION=7
|
||||
FROM ${DISTRO}:${VERSION}
|
||||
|
||||
ENV HOME /root
|
||||
ENV SHELL /bin/bash
|
||||
ENV BASH_ENV /etc/bash.bashrc
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
|
||||
WORKDIR /tmp
|
||||
SHELL [ "/bin/bash", "-c" ]
|
||||
|
||||
ENV PATH /usr/local/bin:${PATH}
|
||||
|
||||
RUN yum update -y && \
|
||||
yum groupinstall -y "Development Tools" && \
|
||||
yum install -y centos-release-scl && \
|
||||
yum install -y epel-release && \
|
||||
yum install -y devtoolset-9 python3-pip openmpi3-devel zlib-devel numactl-devel papi-devel dpkg-devel dpkg-dev && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
|
||||
ARG AMDGPU_RPM=21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm
|
||||
# ARG AMDGPU_RPM=latest/rhel/7.9/amdgpu-install-21.50.50000-1.el7.noarch.rpm
|
||||
|
||||
RUN yum install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \
|
||||
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms --skip-broken -y && \
|
||||
yum install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
|
||||
yum update -y && \
|
||||
yum clean all
|
||||
|
||||
RUN ln -s /opt/rocm-* /opt/rocm
|
||||
|
||||
WORKDIR /home
|
||||
SHELL [ "/bin/bash", "--login", "-c" ]
|
||||
COPY ./entrypoint-centos.sh /docker-entrypoint.sh
|
||||
ENTRYPOINT [ "/docker-entrypoint.sh" ]
|
||||
|
||||
#1 yum update
|
||||
#2 yum groupinstall "Development Tools"
|
||||
#3 yum install devtoolset-9-toolchain
|
||||
#4 yum install devtoolset-9
|
||||
#5 yum install devtoolset-7-toolchain
|
||||
#6 yum search devtoolset
|
||||
#7 yum search -a devtoolset
|
||||
#8 yum search --help
|
||||
#9 yum repolist
|
||||
#10 yum list available
|
||||
#11 yum list available devtoolset*
|
||||
#12 yum list available devtoolset\*
|
||||
#13 subscription-manager list --available
|
||||
#14 yum install subscription-manager
|
||||
#15 subscription-manager list --available
|
||||
#16 yum install centos-release-scl
|
||||
#17 yum-config-manager --enable rhel-server-rhscl-7-rpms
|
||||
#18 yum install devtoolset-7
|
||||
#19 yum install devtoolset-9
|
||||
#20 scl enable devtoolset-9 bash
|
||||
@@ -0,0 +1,32 @@
|
||||
ARG DISTRO=opensuse/leap
|
||||
ARG VERSION=15.3
|
||||
FROM ${DISTRO}:${VERSION}
|
||||
|
||||
ENV HOME /root
|
||||
ENV SHELL /bin/bash
|
||||
ENV BASH_ENV /etc/bash.bashrc
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
|
||||
WORKDIR /tmp
|
||||
SHELL [ "/bin/bash", "-c" ]
|
||||
|
||||
ENV PATH /usr/local/bin:${PATH}
|
||||
|
||||
RUN zypper update -y && \
|
||||
zypper dist-upgrade -y && \
|
||||
zypper install -y -t pattern devel_basis && \
|
||||
zypper install -y python3-pip openmpi3-devel gcc-c++ git libnuma-devel dpkg-devel rpm-build && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
|
||||
# ARG AMDGPU_RPM=21.40.2/sle/15/amdgpu-install-21.40.2.40502-1.noarch.rpm
|
||||
ARG AMDGPU_RPM=latest/sle/15/amdgpu-install-21.50.50000-1.noarch.rpm
|
||||
|
||||
RUN zypper --no-gpg-checks install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \
|
||||
zypper addrepo https://download.opensuse.org/repositories/devel:languages:perl/SLE_15/devel:languages:perl.repo && \
|
||||
zypper --non-interactive --gpg-auto-import-keys refresh && \
|
||||
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms -y && \
|
||||
zypper install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
|
||||
zypper clean --all
|
||||
|
||||
WORKDIR /home
|
||||
SHELL [ "/bin/bash", "--login", "-c" ]
|
||||
@@ -0,0 +1,36 @@
|
||||
ARG DISTRO=opensuse/leap
|
||||
ARG VERSION=15.3
|
||||
FROM ${DISTRO}:${VERSION}
|
||||
|
||||
ENV HOME /root
|
||||
ENV SHELL /bin/bash
|
||||
ENV BASH_ENV /etc/bash.bashrc
|
||||
ENV DEBIAN_FRONTEND noninteractive
|
||||
|
||||
WORKDIR /tmp
|
||||
SHELL [ "/bin/bash", "-c" ]
|
||||
|
||||
ENV PATH /usr/local/bin:${PATH}
|
||||
|
||||
ARG EXTRA_PACKAGES=""
|
||||
ARG ELFUTILS_DOWNLOAD_VERSION="0.183"
|
||||
ARG NJOBS="12"
|
||||
|
||||
RUN zypper update -y && \
|
||||
zypper dist-upgrade -y && \
|
||||
zypper install -y -t pattern devel_basis && \
|
||||
zypper install -y python3-pip openmpi3-devel gcc-c++ git libnuma-devel dpkg-devel rpm-build papi-devel && \
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
|
||||
COPY ./dyninst-source /tmp/dyninst
|
||||
|
||||
RUN cd /tmp/dyninst && \
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_BOOST=ON -DBUILD_TBB=ON -DBUILD_ELFUTILS=ON -DBUILD_LIBIBERTY=ON && \
|
||||
cmake --build build --target all --parallel ${NJOBS} && \
|
||||
cmake --build build --target install --parallel ${NJOBS} && \
|
||||
cd /tmp && \
|
||||
shopt -s dotglob extglob && \
|
||||
rm -rf *
|
||||
|
||||
WORKDIR /home
|
||||
SHELL [ "/bin/bash", "--login", "-c" ]
|
||||
@@ -2,33 +2,79 @@
|
||||
|
||||
set -e
|
||||
|
||||
if [ ! -f Dockerfile.ci ]; then cd docker; fi
|
||||
|
||||
if [ ! -f Dockerfile.ci ]; then
|
||||
echo "Error! Execute script from source directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rm -rf ./dyninst-source
|
||||
cp -r ../external/dyninst ./dyninst-source
|
||||
rm -rf ./dyninst-source/{build,install}*
|
||||
|
||||
: ${DISTRO:=ubuntu}
|
||||
: ${VERSIONS:=20.04 18.04}
|
||||
: ${NJOBS=$(nproc)}
|
||||
: ${ELFUTILS_VERSION:=0.183}
|
||||
|
||||
send-error()
|
||||
{
|
||||
echo -e "\nError: ${@}"
|
||||
exit 1
|
||||
}
|
||||
|
||||
verbose-run()
|
||||
{
|
||||
echo -e "\n\n### Executing \"${@}\"... ###\n"
|
||||
eval $@
|
||||
}
|
||||
|
||||
n=0
|
||||
while [[ $# -gt 0 ]]
|
||||
do
|
||||
case "${1}" in
|
||||
"--distro")
|
||||
shift
|
||||
DISTRO=${1}
|
||||
;;
|
||||
"--versions")
|
||||
shift
|
||||
VERSIONS=${1}
|
||||
;;
|
||||
"-j")
|
||||
shift
|
||||
NJOBS=${1}
|
||||
;;
|
||||
"--elfutils-version")
|
||||
shift
|
||||
ELFUTILS_VERSION=${1}
|
||||
;;
|
||||
*)
|
||||
send-error "Unsupported argument at position $((${n} + 1)) :: ${1}"
|
||||
;;
|
||||
esac
|
||||
n=$((${n} + 1))
|
||||
shift
|
||||
done
|
||||
|
||||
DOCKER_FILE=Dockerfile.${DISTRO}.ci
|
||||
|
||||
if [ ! -f ${DOCKER_FILE} ]; then cd docker; fi
|
||||
|
||||
if [ ! -f ${DOCKER_FILE} ]; then
|
||||
echo "Error! Execute script from source directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
verbose-run rm -rf ./dyninst-source
|
||||
verbose-run cp -r ../external/dyninst ./dyninst-source
|
||||
verbose-run rm -rf ./dyninst-source/{build,install}*
|
||||
|
||||
set -e
|
||||
|
||||
DISTRO_IMAGE=${DISTRO}
|
||||
|
||||
if [ "${DISTRO}" = "opensuse" ]; then DISTRO_IMAGE="opensuse/leap"; fi
|
||||
|
||||
for VERSION in ${VERSIONS}
|
||||
do
|
||||
docker build . \
|
||||
-f Dockerfile.ci \
|
||||
verbose-run docker build . \
|
||||
-f ${DOCKER_FILE} \
|
||||
--tag jrmadsen/omnitrace-ci:${DISTRO}-${VERSION} \
|
||||
--build-arg DISTRO=${DISTRO} \
|
||||
--build-arg DISTRO=${DISTRO_IMAGE} \
|
||||
--build-arg VERSION=${VERSION} \
|
||||
--build-arg NJOBS=${NJOBS} \
|
||||
--build-arg ELFUTILS_DOWNLOAD_VERSION=${ELFUTILS_VERSION}
|
||||
done
|
||||
|
||||
rm -rf ./dyninst-source
|
||||
verbose-run rm -rf ./dyninst-source
|
||||
|
||||
@@ -10,10 +10,11 @@ set -e
|
||||
build-release()
|
||||
{
|
||||
CONTAINER=$1
|
||||
ROCM_VERSION=$2
|
||||
CODE_VERSION=$3
|
||||
OS=$2
|
||||
ROCM_VERSION=$3
|
||||
CODE_VERSION=$4
|
||||
MPI=$4
|
||||
docker run -it --rm -v ${PWD}:/home/omnitrace --env ROCM_VERSION=${ROCM_VERSION} --env VERSION=${CODE_VERSION} --env MPI=${MPI} ${CONTAINER} /home/omnitrace/scripts/build-release.sh
|
||||
docker run -it --rm -v ${PWD}:/home/omnitrace --env DISTRO=${OS} --env ROCM_VERSION=${ROCM_VERSION} --env VERSION=${CODE_VERSION} --env MPI=${MPI} ${CONTAINER} /home/omnitrace/scripts/build-release.sh
|
||||
}
|
||||
|
||||
: ${DISTRO:=ubuntu}
|
||||
@@ -21,6 +22,50 @@ build-release()
|
||||
: ${ROCM_VERSIONS:=5.0 4.5 4.3}
|
||||
: ${MPI:=0}
|
||||
|
||||
send-error()
|
||||
{
|
||||
echo -e "\nError: ${@}"
|
||||
exit 1
|
||||
}
|
||||
|
||||
verbose-run()
|
||||
{
|
||||
echo -e "\n\n### Executing \"${@}\"... ###\n"
|
||||
eval $@
|
||||
}
|
||||
|
||||
n=0
|
||||
while [[ $# -gt 0 ]]
|
||||
do
|
||||
case "${1}" in
|
||||
"--distro")
|
||||
shift
|
||||
DISTRO=${1}
|
||||
;;
|
||||
"--versions")
|
||||
shift
|
||||
VERSIONS=${1}
|
||||
;;
|
||||
"--rocm-versions")
|
||||
shift
|
||||
ROCM_VERSIONS=${1}
|
||||
;;
|
||||
*)
|
||||
if [ "${n}" -eq 0 ]; then
|
||||
DISTRO=${1}
|
||||
elif [ "${n}" -eq 1 ]; then
|
||||
VERSIONS=${1}
|
||||
elif [ "${n}" -eq 2 ]; then
|
||||
ROCM_VERSIONS=${1}
|
||||
else
|
||||
send-error "Unsupported argument at position $((${n} + 1)) :: ${1}"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
n=$((${n} + 1))
|
||||
shift
|
||||
done
|
||||
|
||||
CODE_VERSION=$(cat VERSION)
|
||||
|
||||
for VERSION in ${VERSIONS}
|
||||
@@ -28,6 +73,6 @@ do
|
||||
TAG=${DISTRO}-${VERSION}
|
||||
for ROCM_VERSION in ${ROCM_VERSIONS}
|
||||
do
|
||||
build-release jrmadsen/omnitrace-${TAG}-rocm-${ROCM_VERSION} ${ROCM_VERSION} ${CODE_VERSION} ${MPI}
|
||||
build-release jrmadsen/omnitrace-${TAG}-rocm-${ROCM_VERSION} ${DISTRO}-${VERSION} ${ROCM_VERSION} ${CODE_VERSION} ${MPI}
|
||||
done
|
||||
done
|
||||
|
||||
@@ -3,19 +3,138 @@
|
||||
: ${ROCM_VERSIONS:="5.0 4.5 4.3"}
|
||||
: ${DISTRO:=ubuntu}
|
||||
: ${VERSIONS:=20.04 18.04}
|
||||
: ${CI:=""}
|
||||
|
||||
set -e
|
||||
|
||||
if [ ! -f Dockerfile ]; then cd docker; fi
|
||||
send-error()
|
||||
{
|
||||
echo -e "\nError: ${@}"
|
||||
exit 1
|
||||
}
|
||||
|
||||
verbose-run()
|
||||
{
|
||||
echo -e "\n\n### Executing \"${@}\"... ###\n"
|
||||
eval $@
|
||||
}
|
||||
|
||||
n=0
|
||||
while [[ $# -gt 0 ]]
|
||||
do
|
||||
case "${1}" in
|
||||
"--distro")
|
||||
shift
|
||||
DISTRO=${1}
|
||||
;;
|
||||
"--versions")
|
||||
shift
|
||||
VERSIONS=${1}
|
||||
;;
|
||||
"--rocm-versions")
|
||||
shift
|
||||
ROCM_VERSIONS=${1}
|
||||
;;
|
||||
*)
|
||||
if [ "${n}" -eq 0 ]; then
|
||||
DISTRO=${1}
|
||||
elif [ "${n}" -eq 1 ]; then
|
||||
VERSIONS=${1}
|
||||
elif [ "${n}" -eq 2 ]; then
|
||||
ROCM_VERSIONS=${1}
|
||||
else
|
||||
send-error "Unsupported argument at position $((${n} + 1)) :: ${1}"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
n=$((${n} + 1))
|
||||
shift
|
||||
done
|
||||
|
||||
DOCKER_FILE="Dockerfile.${DISTRO}"
|
||||
|
||||
if [ -n "${CI}" ]; then DOCKER_FILE="${DOCKER_FILE}.ci"; fi
|
||||
if [ ! -f ${DOCKER_FILE} ]; then cd docker; fi
|
||||
if [ ! -f ${DOCKER_FILE} ]; then send-error "File \"${DOCKER_FILE}\" not found"; fi
|
||||
|
||||
for VERSION in ${VERSIONS}
|
||||
do
|
||||
for i in ${ROCM_VERSIONS}
|
||||
do
|
||||
ROCM_REPO_VERSION=${i}
|
||||
if [ "${i}" = "5.0" ]; then ROCM_REPO_VERSION=debian; fi
|
||||
if [ "${i}" = "4.1" ]; then ROCM_REPO_DIST="xenial"; fi
|
||||
if [ "${i}" = "4.0" ]; then ROCM_REPO_DIST="xenial"; fi
|
||||
docker build . --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO} --build-arg VERSION=${VERSION} --build-arg ROCM_REPO_VERSION=${ROCM_REPO_VERSION} --build-arg ROCM_REPO_DIST=${ROCM_REPO_DIST}
|
||||
if [ "${DISTRO}" = "ubuntu" ]; then
|
||||
ROCM_REPO_DIST="ubuntu"
|
||||
ROCM_REPO_VERSION=${i}
|
||||
case "${i}" in
|
||||
5.0*)
|
||||
ROCM_REPO_VERSION="debian"
|
||||
;;
|
||||
4.1* | 4.0*)
|
||||
ROCM_REPO_DIST="xenial"
|
||||
;;
|
||||
*)
|
||||
send-error "Unsupported combination :: ${DISTRO}-${VERSION} + ROCm ${i}"
|
||||
;;
|
||||
esac
|
||||
verbose-run docker build . -f ${DOCKER_FILE} --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO} --build-arg VERSION=${VERSION} --build-arg ROCM_REPO_VERSION=${ROCM_REPO_VERSION} --build-arg ROCM_REPO_DIST=${ROCM_REPO_DIST}
|
||||
elif [ "${DISTRO}" = "centos" ]; then
|
||||
case "${VERSION}" in
|
||||
7)
|
||||
RPM_PATH=7.9
|
||||
RPM_TAG=".el7"
|
||||
;;
|
||||
8)
|
||||
RPM_PATH=8.5
|
||||
RPM_TAG=".el7"
|
||||
;;
|
||||
*)
|
||||
send-error "Invalid centos version ${VERSION}. Supported: 7, 8"
|
||||
esac
|
||||
case "${i}" in
|
||||
5.0*)
|
||||
ROCM_RPM=latest/rhel/${RPM_PATH}/amdgpu-install-21.50.50000-1${RPM_TAG}.noarch.rpm
|
||||
;;
|
||||
4.5 | 4.5.2)
|
||||
ROCM_RPM=21.40.2/rhel/${RPM_PATH}/amdgpu-install-21.40.2.40502-1${RPM_TAG}.noarch.rpm
|
||||
;;
|
||||
4.5.1)
|
||||
ROCM_RPM=21.40.1/rhel/${RPM_PATH}/amdgpu-install-21.40.1.40501-1${RPM_TAG}.noarch.rpm
|
||||
;;
|
||||
4.5.0)
|
||||
ROCM_RPM=21.40/rhel/${RPM_PATH}/amdgpu-install-21.40.1.40501-1${RPM_TAG}.noarch.rpm
|
||||
;;
|
||||
*)
|
||||
send-error "Unsupported combination :: ${DISTRO}-${VERSION} + ROCm ${i}"
|
||||
;;
|
||||
esac
|
||||
verbose-run docker build . -f ${DOCKER_FILE} --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO} --build-arg VERSION=${VERSION} --build-arg AMDGPU_RPM=${ROCM_RPM}
|
||||
elif [ "${DISTRO}" = "opensuse" ]; then
|
||||
case "${VERSION}" in
|
||||
15.*)
|
||||
DISTRO_IMAGE="opensuse/leap"
|
||||
echo "DISTRO_IMAGE: ${DISTRO_IMAGE}"
|
||||
;;
|
||||
*)
|
||||
send-error "Invalid opensuse version ${VERSION}. Supported: 15.x"
|
||||
;;
|
||||
esac
|
||||
case "${i}" in
|
||||
5.0*)
|
||||
ROCM_RPM=latest/sle/15/amdgpu-install-21.50.50000-1.noarch.rpm
|
||||
;;
|
||||
4.5 | 4.5.2)
|
||||
ROCM_RPM=21.40.2/sle/15/amdgpu-install-21.40.2.40502-1.noarch.rpm
|
||||
;;
|
||||
4.5.1)
|
||||
ROCM_RPM=21.40.1/sle/15/amdgpu-install-21.40.1.40501-1.noarch.rpm
|
||||
;;
|
||||
4.5.0)
|
||||
ROCM_RPM=21.40/sle/15/amdgpu-install-21.40.1.40501-1.noarch.rpm
|
||||
;;
|
||||
*)
|
||||
send-error "Unsupported combination :: ${DISTRO}-${VERSION} + ROCm ${i}"
|
||||
;;
|
||||
esac
|
||||
verbose-run docker build . -f ${DOCKER_FILE} --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO_IMAGE} --build-arg VERSION=${VERSION} --build-arg AMDGPU_RPM=${ROCM_RPM}
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
Çalıştırılabilir dosya
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
source scl_source enable devtoolset-9
|
||||
source /etc/profile.d/modules.sh
|
||||
module load mpi
|
||||
|
||||
export LC_ALL=en_US.UTF-8
|
||||
|
||||
if [ -z "${1}" ]; then
|
||||
exec bash
|
||||
else
|
||||
set -e
|
||||
eval $@
|
||||
fi
|
||||
@@ -81,6 +81,11 @@ function(CHECKOUT_GIT_SUBMODULE)
|
||||
set(_SUBMODULE_EXISTS OFF)
|
||||
if(EXISTS "${_SUBMODULE}" AND NOT IS_DIRECTORY "${_SUBMODULE}")
|
||||
set(_SUBMODULE_EXISTS ON)
|
||||
else()
|
||||
set(_SUBMODULE "${CMAKE_SOURCE_DIR}/.gitmodules")
|
||||
if(EXISTS "${_SUBMODULE}" AND NOT IS_DIRECTORY "${_SUBMODULE}")
|
||||
set(_SUBMODULE_EXISTS ON)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(_HAS_REPO_URL OFF)
|
||||
|
||||
@@ -57,7 +57,6 @@ Authors of the OpenMP code:
|
||||
|
||||
#include "../common/npb-CPP.hpp"
|
||||
#include "npbparams.hpp"
|
||||
#include "omp.h"
|
||||
|
||||
/*
|
||||
* ---------------------------------------------------------------------
|
||||
|
||||
@@ -12,12 +12,14 @@ add_executable(openmp-lu ${CMAKE_CURRENT_SOURCE_DIR}/LU/lu.cpp
|
||||
$<TARGET_OBJECTS:openmp-common>)
|
||||
|
||||
find_program(CLANGXX_EXECUTABLE NAMES clang++)
|
||||
if(CLANGXX_EXECUTABLE)
|
||||
find_library(LIBOMP_LIBRARY
|
||||
NAMES omp ${CMAKE_SHARED_LIBRARY_PREFIX}omp${CMAKE_SHARED_LIBRARY_SUFFIX}.5)
|
||||
if(CLANGXX_EXECUTABLE AND LIBOMP_LIBRARY)
|
||||
target_compile_options(openmp-common PUBLIC -W -Wall -fopenmp=libomp)
|
||||
target_compile_options(openmp-cg PRIVATE -W -Wall -fopenmp=libomp)
|
||||
target_link_libraries(openmp-cg PRIVATE omp)
|
||||
target_link_libraries(openmp-cg PRIVATE ${LIBOMP_LIBRARY})
|
||||
target_compile_options(openmp-lu PRIVATE -W -Wall -fopenmp=libomp)
|
||||
target_link_libraries(openmp-lu PRIVATE omp)
|
||||
target_link_libraries(openmp-lu PRIVATE ${LIBOMP_LIBRARY})
|
||||
omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-common)
|
||||
omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-cg)
|
||||
omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-lu)
|
||||
|
||||
@@ -59,7 +59,6 @@ Authors of the OpenMP code:
|
||||
|
||||
#include "../common/npb-CPP.hpp"
|
||||
#include "npbparams.hpp"
|
||||
#include "omp.h"
|
||||
|
||||
/*
|
||||
* ---------------------------------------------------------------------
|
||||
@@ -2095,8 +2094,8 @@ read_input()
|
||||
* ---------------------------------------------------------------------
|
||||
*/
|
||||
FILE* fp;
|
||||
int avoid_warning;
|
||||
if((fp = fopen("inputlu.data", "r")) != NULL)
|
||||
int avoid_warning = 0;
|
||||
if((fp = fopen("inputlu.data", "r")) != nullptr)
|
||||
{
|
||||
printf("Reading from input file inputlu.data\n");
|
||||
while(fgetc(fp) != '\n')
|
||||
@@ -2156,6 +2155,7 @@ read_input()
|
||||
ny0 = ISIZ2;
|
||||
nz0 = ISIZ3;
|
||||
}
|
||||
(void) avoid_warning;
|
||||
/*
|
||||
* ---------------------------------------------------------------------
|
||||
* check problem size
|
||||
|
||||
+1
-1
projects/rocprofiler-systems/external/dyninst alt modülü güncellendi: 1cb91f1eea...bd17049666
+1
-1
projects/rocprofiler-systems/external/timemory alt modülü güncellendi: de1266606c...14fd2323bd
@@ -2,13 +2,18 @@
|
||||
|
||||
: ${EXTRA_ARGS:=""}
|
||||
: ${EXTRA_TAGS:=""}
|
||||
: ${BUILD_DIR:=build-release}
|
||||
: ${VERSION:=0.0.4}
|
||||
: ${ROCM_VERSION:=4.5.0}
|
||||
: ${NJOBS:=8}
|
||||
: ${DISTRO:=""}
|
||||
: ${LTO:="ON"}
|
||||
|
||||
DISTRO=$(lsb_release -i | awk '{print $NF}')-$(lsb_release -r | awk '{print $NF}')
|
||||
if [ -z "${DISTRO}" ]; then
|
||||
DISTRO=$(lsb_release -i | awk '{print $NF}')-$(lsb_release -r | awk '{print $NF}')
|
||||
fi
|
||||
|
||||
STANDARD_ARGS="-DCPACK_GENERATOR=STGZ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 -DOMNITRACE_BUILD_TESTING=OFF -DTIMEMORY_USE_LIBUNWIND=ON -DTIMEMORY_BUILD_LIBUNWIND=ON -DTIMEMORY_BUILD_PORTABLE=ON"
|
||||
STANDARD_ARGS="-DCPACK_GENERATOR=STGZ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 -DOMNITRACE_BUILD_TESTING=OFF -DOMNITRACE_BUILD_EXAMPLES=OFF -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_OMPT=ON -DOMNITRACE_CPACK_SYSTEM_NAME=${DISTRO} -DOMNITRACE_ROCM_VERSION=${ROCM_VERSION} -DOMNITRACE_BUILD_LTO=${LTO} -DTIMEMORY_USE_LIBUNWIND=ON -DTIMEMORY_BUILD_LIBUNWIND=ON -DTIMEMORY_BUILD_PORTABLE=ON"
|
||||
STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_BUILD_DYNINST=ON $(echo -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON)"
|
||||
if [ -n "${EXTRA_ARGS}" ]; then
|
||||
STANDARD_ARGS="${STANDARD_ARGS} ${EXTRA_ARGS}"
|
||||
@@ -25,45 +30,25 @@ echo -e "Working directory: $(pwd)"
|
||||
|
||||
umask 0000
|
||||
|
||||
if [ ! -f build-release/${PACKAGE_BASE_TAG}.sh ]; then
|
||||
cmake -B build-release/${DISTRO}-core ${STANDARD_ARGS} -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-core/install-release -DDYNINST_USE_OpenMP=OFF -DOMNITRACE_USE_MPI_HEADERS=OFF -DOMNITRACE_USE_HIP=OFF .
|
||||
cmake --build build-release/${DISTRO}-core --target package --parallel ${NJOBS}
|
||||
cp build-release/${DISTRO}-core/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}.sh
|
||||
fi
|
||||
build-and-package()
|
||||
{
|
||||
local DIR=${1}
|
||||
shift
|
||||
cmake -B ${BUILD_DIR}/${DIR} -DCMAKE_INSTALL_PREFIX=${BUILD_DIR}/${DIR}/install-release ${STANDARD_ARGS} $@ .
|
||||
cmake --build ${BUILD_DIR}/${DIR} --target all --parallel ${NJOBS}
|
||||
pushd ${BUILD_DIR}/${DIR}
|
||||
rm -f *.sh *.deb *.rpm
|
||||
cpack -G STGZ
|
||||
cpack -G DEB -D CPACK_PACKAGING_INSTALL_PREFIX=/opt/omnitrace
|
||||
cpack -G RPM -D CPACK_PACKAGING_INSTALL_PREFIX=/opt/omnitrace
|
||||
popd
|
||||
cp ${BUILD_DIR}/${DIR}/omnitrace-${VERSION}-*.sh ${BUILD_DIR}/
|
||||
cp ${BUILD_DIR}/${DIR}/omnitrace_${VERSION}-*.deb ${BUILD_DIR}/
|
||||
cp ${BUILD_DIR}/${DIR}/omnitrace-${VERSION}-*.rpm ${BUILD_DIR}/
|
||||
}
|
||||
|
||||
apt-get install -y libopenmpi-dev openmpi-bin libudev-dev
|
||||
build-and-package ${DISTRO}-core -DDYNINST_USE_OpenMP=OFF -DOMNITRACE_USE_HIP=OFF
|
||||
# build-and-package ${DISTRO}-rocm-${ROCM_VERSION} -DOMNITRACE_USE_HIP=ON -DDYNINST_USE_OpenMP=ON
|
||||
# build-and-package ${DISTRO}-rocm-${ROCM_VERSION}-papi -DOMNITRACE_USE_HIP=ON -DDYNINST_USE_OpenMP=ON -DOMNITRACE_USE_PAPI=ON
|
||||
|
||||
STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_USE_HIP=ON -DOMNITRACE_USE_MPI_HEADERS=ON -DDYNINST_USE_OpenMP=ON"
|
||||
|
||||
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}.sh ]; then
|
||||
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION} -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}/install-release ${STANDARD_ARGS} .
|
||||
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION} --target package --parallel ${NJOBS}
|
||||
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}.sh
|
||||
fi
|
||||
|
||||
STANDARD_ARGS="${STANDARD_ARGS} -DTIMEMORY_USE_PAPI=ON"
|
||||
|
||||
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI.sh ]; then
|
||||
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi/install-release ${STANDARD_ARGS} .
|
||||
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi --target package --parallel ${NJOBS}
|
||||
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI.sh
|
||||
fi
|
||||
|
||||
if [ "${MPI}" -lt 1 ]; then exit 0; fi
|
||||
|
||||
STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_USE_MPI=ON"
|
||||
|
||||
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-OpenMPI.sh ]; then
|
||||
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi/install-release ${STANDARD_ARGS} .
|
||||
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi --target package --parallel ${NJOBS}
|
||||
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-OpenMPI.sh
|
||||
fi
|
||||
|
||||
apt-get purge -y libopenmpi-dev openmpi-bin
|
||||
apt-get install -y libmpich-dev mpich
|
||||
|
||||
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-MPICH.sh ]; then
|
||||
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich/install-release ${STANDARD_ARGS} .
|
||||
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich --target package --parallel ${NJOBS}
|
||||
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-MPICH.sh
|
||||
fi
|
||||
# build-and-package ${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi -DOMNITRACE_USE_HIP=ON -DDYNINST_USE_OpenMP=ON -DOMNITRACE_USE_PAPI=ON -DOMNITRACE_USE_MPI=ON
|
||||
|
||||
@@ -1,26 +1,24 @@
|
||||
// MIT License
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal in the Software without restriction, including without limitation the
|
||||
// rights to use, copy, modify, merge, publish, distribute, sublicense, and
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
// IN THE SOFTWARE.
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "avail.hpp"
|
||||
#include "library/api.hpp"
|
||||
|
||||
@@ -1,32 +1,24 @@
|
||||
// MIT License
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal in the Software without restriction, including without limitation the
|
||||
// rights to use, copy, modify, merge, publish, distribute, sublicense, and
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
// IN THE SOFTWARE.
|
||||
|
||||
/** \file timemory/tools/available.hpp
|
||||
* \headerfile tools/available.hpp "tools/available.hpp"
|
||||
* Handles serializing the settings
|
||||
*
|
||||
*/
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
|
||||
@@ -62,9 +62,12 @@ module_function::module_function(module_t* mod, procedure_t* proc)
|
||||
|
||||
for(const auto& itr : basic_blocks)
|
||||
{
|
||||
std::vector<instruction_t> instructions{};
|
||||
itr->getInstructions(instructions);
|
||||
num_instructions += instructions.size();
|
||||
std::vector<instruction_t> _instructions{};
|
||||
itr->getInstructions(_instructions);
|
||||
num_instructions += _instructions.size();
|
||||
instructions.reserve(instructions.size() + _instructions.size());
|
||||
for(auto&& iitr : _instructions)
|
||||
instructions.emplace_back(iitr);
|
||||
}
|
||||
|
||||
char modname[FUNCNAMELEN];
|
||||
@@ -84,7 +87,10 @@ module_function::module_function(module_t* mod, procedure_t* proc)
|
||||
}
|
||||
std::pair<address_t, address_t> _range{};
|
||||
if(function->getAddressRange(_range.first, _range.second))
|
||||
{
|
||||
start_address = _range.first;
|
||||
address_range = _range.second - _range.first;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -95,7 +101,8 @@ module_function::write_header(std::ostream& os)
|
||||
auto w2 = std::min<size_t>(get_width()[2], absolute_max_width);
|
||||
|
||||
std::stringstream ss;
|
||||
ss << std::setw(14) << "AddressRange"
|
||||
ss << std::setw(14) << "StartAddress"
|
||||
<< " " << std::setw(14) << "AddressRange"
|
||||
<< " " << std::setw(14) << "#Instructions"
|
||||
<< " " << std::setw(6) << "Ratio"
|
||||
<< " " << std::setw(w0 + 8) << std::left << "Module"
|
||||
|
||||
@@ -28,6 +28,9 @@
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/tpls/cereal/cereal/cereal.hpp>
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
struct module_function
|
||||
{
|
||||
using width_t = std::array<size_t, 4>;
|
||||
@@ -74,16 +77,18 @@ struct module_function
|
||||
bool is_address_range_constrained() const; // checks address range constraint
|
||||
bool is_num_instructions_constrained() const; // check # instructions constraint
|
||||
|
||||
uint64_t address_range = 0;
|
||||
uint64_t num_instructions = 0;
|
||||
module_t* module = nullptr;
|
||||
procedure_t* function = nullptr;
|
||||
flow_graph_t* flow_graph = nullptr;
|
||||
string_t module_name = {};
|
||||
string_t function_name = {};
|
||||
function_signature signature = {};
|
||||
basic_block_set_t basic_blocks = {};
|
||||
basic_loop_vec_t loop_blocks = {};
|
||||
size_t start_address = 0;
|
||||
uint64_t address_range = 0;
|
||||
uint64_t num_instructions = 0;
|
||||
module_t* module = nullptr;
|
||||
procedure_t* function = nullptr;
|
||||
flow_graph_t* flow_graph = nullptr;
|
||||
string_t module_name = {};
|
||||
string_t function_name = {};
|
||||
function_signature signature = {};
|
||||
basic_block_set_t basic_blocks = {};
|
||||
basic_loop_vec_t loop_blocks = {};
|
||||
std::vector<instruction_t> instructions = {};
|
||||
|
||||
using str_msg_t = std::tuple<int, string_t, string_t, string_t>;
|
||||
using str_msg_vec_t = std::vector<str_msg_t>;
|
||||
@@ -131,8 +136,11 @@ public:
|
||||
return _inc;
|
||||
};
|
||||
|
||||
std::stringstream _addr{};
|
||||
_addr << "0x" << std::hex << rhs.start_address;
|
||||
// clang-format off
|
||||
ss << std::setw(14) << rhs.address_range << " "
|
||||
ss << std::setw(14) << _addr.str() << " "
|
||||
<< std::setw(14) << rhs.address_range << " "
|
||||
<< std::setw(14) << rhs.num_instructions << " "
|
||||
<< std::setw(6) << std::setprecision(2) << std::fixed << (rhs.address_range / static_cast<double>(rhs.num_instructions)) << " "
|
||||
<< std::setw(w0 + 8) << std::left << _get_str(rhs.module_name) << " "
|
||||
@@ -150,6 +158,13 @@ void
|
||||
module_function::serialize(ArchiveT& ar, const unsigned)
|
||||
{
|
||||
namespace cereal = tim::cereal;
|
||||
if constexpr(tim::concepts::is_output_archive<ArchiveT>::value)
|
||||
{
|
||||
std::stringstream _addr{};
|
||||
_addr << "0x" << std::hex << start_address;
|
||||
ar(cereal::make_nvp("start_address", _addr.str()));
|
||||
}
|
||||
|
||||
ar(cereal::make_nvp("address_range", address_range),
|
||||
cereal::make_nvp("instructions", num_instructions),
|
||||
cereal::make_nvp("module", module_name),
|
||||
@@ -181,5 +196,17 @@ module_function::serialize(ArchiveT& ar, const unsigned)
|
||||
cereal::make_nvp("is_num_instructions_constrained",
|
||||
is_num_instructions_constrained()));
|
||||
ar.finishNode();
|
||||
// instructions can inflate JSON size so only output when verbosity is increased
|
||||
// above default
|
||||
if(verbose_level > 0)
|
||||
{
|
||||
std::vector<std::string> _instructions{};
|
||||
_instructions.reserve(instructions.size());
|
||||
for(auto&& itr : instructions)
|
||||
{
|
||||
_instructions.emplace_back(itr.format());
|
||||
}
|
||||
ar(cereal::make_nvp("instructions", _instructions));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1954,7 +1954,8 @@ instrument_entity(const string_t& function_name)
|
||||
"(std::_Sp_counted_base|std::(use|has)_facet|std::locale|::sentry|^std::_|::_(M|"
|
||||
"S)_|::basic_string[a-zA-Z,<>: ]+::_M_create)",
|
||||
regex_opts);
|
||||
static std::regex leading("^(_|\\.|frame_dummy|\\(|targ|kmp_threadprivate_)",
|
||||
static std::regex leading("^(_|\\.|frame_dummy|transaction clone|virtual "
|
||||
"thunk|non-virtual thunk|\\(|targ|kmp_threadprivate_)",
|
||||
regex_opts);
|
||||
static std::regex trailing(
|
||||
"(_|\\.part\\.[0-9]+|\\.constprop\\.[0-9]+|\\.|\\.[0-9]+)$", regex_opts);
|
||||
|
||||
@@ -45,6 +45,8 @@ function(OMNITRACE_ADD_BIN_TEST)
|
||||
"${TEST_ENVIRONMENT}"
|
||||
TIMEOUT
|
||||
${TEST_TIMEOUT}
|
||||
DEPENDS
|
||||
"${TEST_DEPENDS}"
|
||||
LABELS
|
||||
"omnitrace-bin;${TEST_LABELS}"
|
||||
PASS_REGULAR_EXPRESSION
|
||||
@@ -66,6 +68,8 @@ function(OMNITRACE_ADD_BIN_TEST)
|
||||
"${TEST_ENVIRONMENT}"
|
||||
TIMEOUT
|
||||
${TEST_TIMEOUT}
|
||||
DEPENDS
|
||||
"${TEST_DEPENDS}"
|
||||
LABELS
|
||||
"omnitrace-bin;${TEST_LABELS}"
|
||||
PASS_REGULAR_EXPRESSION
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
/build*
|
||||
/_build
|
||||
/_doxygen
|
||||
/.gitinfo
|
||||
/omnitrace.dox
|
||||
@@ -0,0 +1,20 @@
|
||||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = .
|
||||
BUILDDIR = _build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
@@ -0,0 +1,31 @@
|
||||
# About
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
```
|
||||
|
||||
[Browse Omnitrace source code on Github](https://github.com/AMDResearch/omnitrace)
|
||||
|
||||
> [Omnitrace](https://github.com/AMDResearch/omnitrace) is an AMD research project and should
|
||||
> not be treated as an official part of the ROCm software stack.
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) is designed for both high-level and
|
||||
comprehensive application tracing and profiling on both the CPU and GPU.
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) supports both binary instrumentation
|
||||
and sampling as a means of collecting various metrics.
|
||||
|
||||
The comprehensive omnitrace results can be visualized in any modern web browser by visiting [ui.perfetto.dev](https://ui.perfetto.dev/)
|
||||
and loading the perfetto output (`.proto` files) produced by omnitrace.
|
||||
|
||||
Aggregated high-level results are available in text files for human consumption and JSON files for programmatic analysis.
|
||||
The JSON output files are compatible with the python package [hatchet](https://github.com/hatchet/hatchet) which converts
|
||||
the performance data into pandas dataframes and facilitates multi-run comparisons, filtering, visualization in Jupyter notebooks, and much more.
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) has two distinct configuration steps:
|
||||
|
||||
1. Configuring which functions and modules are instrumented in the target binaries (i.e. executable and/or libraries)
|
||||
- [Instrumenting with Omnitrace](instrumenting.md)
|
||||
2. Configuring what the instrumentation does when the instrumented binaries are executed
|
||||
- [Customizing Omnitrace Runtime](runtime.md)
|
||||
@@ -0,0 +1,164 @@
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# This file only contains a selection of the most common options. For a full
|
||||
# list see the documentation:
|
||||
# http://www.sphinx-doc.org/en/master/config
|
||||
|
||||
# -- Path setup --------------------------------------------------------------
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#
|
||||
# import os
|
||||
# sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
import os
|
||||
import sys
|
||||
import subprocess as sp
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
sys.path.insert(0, os.path.abspath(".."))
|
||||
|
||||
|
||||
def install(package):
|
||||
sp.call([sys.executable, "-m", "pip", "install", package])
|
||||
|
||||
|
||||
# Check if we're running on Read the Docs' servers
|
||||
read_the_docs_build = os.environ.get("READTHEDOCS", None) == "True"
|
||||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
project = "omnitrace"
|
||||
copyright = "2022, Advanced Micro Devices, Inc."
|
||||
author = "Audacious Software Group"
|
||||
|
||||
version = open(os.path.join("..", "VERSION")).read().strip()
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = version
|
||||
|
||||
_docdir = os.path.realpath(os.getcwd())
|
||||
_srcdir = os.path.realpath(os.path.join(os.getcwd(), ".."))
|
||||
_sitedir = os.path.realpath(os.path.join(os.getcwd(), "..", "site"))
|
||||
_staticdir = os.path.realpath(os.path.join(_docdir, "_static"))
|
||||
_templatedir = os.path.realpath(os.path.join(_docdir, "_templates"))
|
||||
|
||||
if not os.path.exists(_staticdir):
|
||||
os.makedirs(_staticdir)
|
||||
|
||||
if not os.path.exists(_templatedir):
|
||||
os.makedirs(_templatedir)
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
install("sphinx_rtd_theme")
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.doctest",
|
||||
"sphinx.ext.todo",
|
||||
"sphinx.ext.viewcode",
|
||||
"sphinx.ext.githubpages",
|
||||
"sphinx.ext.mathjax",
|
||||
"sphinx.ext.autosummary",
|
||||
"sphinx.ext.napoleon",
|
||||
"sphinx_markdown_tables",
|
||||
"recommonmark",
|
||||
"breathe",
|
||||
]
|
||||
|
||||
source_suffix = {
|
||||
".rst": "restructuredtext",
|
||||
".md": "markdown",
|
||||
}
|
||||
|
||||
from recommonmark.parser import CommonMarkParser
|
||||
|
||||
source_parsers = {".md": CommonMarkParser}
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ["_templates"]
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = "index"
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path.
|
||||
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
||||
|
||||
default_role = None
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ["_static"]
|
||||
|
||||
html_theme_options = {
|
||||
'analytics_id': 'G-1HLBBRSTT9', # Provided by Google in your dashboard
|
||||
'analytics_anonymize_ip': False,
|
||||
'logo_only': False,
|
||||
'display_version': True,
|
||||
'prev_next_buttons_location': 'bottom',
|
||||
'style_external_links': False,
|
||||
'vcs_pageview_mode': '',
|
||||
# 'style_nav_header_background': 'white',
|
||||
# Toc options
|
||||
'collapse_navigation': True,
|
||||
'sticky_navigation': True,
|
||||
'navigation_depth': 4,
|
||||
'includehidden': True,
|
||||
'titles_only': False
|
||||
}
|
||||
|
||||
# Breathe Configuration
|
||||
breathe_projects = {"omnitrace": "_doxygen/xml"}
|
||||
breathe_default_project = "omnitrace"
|
||||
breathe_default_members = ('members', )
|
||||
breathe_projects_source = {
|
||||
"auto": (
|
||||
"../source",
|
||||
[
|
||||
"lib/omnitrace-user/omnitrace/user.h",
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
from pygments.styles import get_all_styles
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
styles = list(get_all_styles())
|
||||
preferences = ("emacs", "pastie", "colorful")
|
||||
for pref in preferences:
|
||||
if pref in styles:
|
||||
pygments_style = pref
|
||||
break
|
||||
|
||||
from recommonmark.transform import AutoStructify
|
||||
|
||||
# app setup hook
|
||||
def setup(app):
|
||||
app.add_config_value(
|
||||
"recommonmark_config",
|
||||
{
|
||||
"auto_toc_tree_section": "Contents",
|
||||
"enable_eval_rst": True,
|
||||
"enable_auto_doc_ref": False,
|
||||
},
|
||||
True,
|
||||
)
|
||||
app.add_transform(AutoStructify)
|
||||
@@ -0,0 +1,29 @@
|
||||
# Generating a Critical Trace
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
```
|
||||
|
||||
## Overview
|
||||
|
||||
A critical trace is defined in omnitrace as the most time-consuming path through a parallelized code.
|
||||
The steps for generating a critical trace are:
|
||||
|
||||
1. Enable the `OMNITRACE_CRITICAL_TRACE` setting
|
||||
2. Configure any other relevant critical-trace settings, as needed
|
||||
- `omnitrace-avail --categories settings::critical-trace`
|
||||
3. Execute application
|
||||
4. Locate the JSON files with `call-chain` in their name
|
||||
5. Provide these files to the `omnitrace-critical-trace` executable
|
||||
6. Open generated perfetto file in [ui.perfetto.dev](https://ui.perfetto.dev/)
|
||||
|
||||
## omnitrace-critical-trace Executable
|
||||
|
||||
The `omnitrace-critical-trace` executable post-processes one or more `call-chain` JSON files and generates a perfetto output
|
||||
for visualizing the critical trace.
|
||||
|
||||
**INCOMPLETE**
|
||||
|
||||
This executable is still under development.
|
||||
@@ -0,0 +1,22 @@
|
||||
name: omnitrace-docs
|
||||
channels:
|
||||
- conda-forge
|
||||
- defaults
|
||||
dependencies:
|
||||
- python=3.9
|
||||
- cmake
|
||||
- curl
|
||||
- doxygen
|
||||
- git
|
||||
- graphviz
|
||||
- matplotlib
|
||||
- mkdocs
|
||||
- numpy
|
||||
- openssl
|
||||
- pillow
|
||||
- pip
|
||||
- setuptools
|
||||
- breathe <4.30.0
|
||||
- sphinx <4.0.0
|
||||
- sphinx-markdown-tables
|
||||
- docutils
|
||||
@@ -0,0 +1,76 @@
|
||||
# Features
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
```
|
||||
|
||||
## Overview
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) is designed to be highly extensible. Internally, it leverages the
|
||||
[timemory performance analysis toolkit](https://github.com/NERSC/timemory) to
|
||||
manage extensions, resources, data, etc.
|
||||
|
||||
### Data Collection Modes
|
||||
|
||||
- Dynamic instrumentation
|
||||
- Runtime instrumentation
|
||||
- Instrument executable and shared libraries at runtime
|
||||
- Binary rewriting
|
||||
- Generate a new executable and/or library with instrumentation built-in
|
||||
- Statistical sampling
|
||||
- Periodic software interrupts per-thread
|
||||
- Background thread sampling
|
||||
- Record process and system-level values while an application executes
|
||||
- Critical trace generation
|
||||
|
||||
### Data Analysis
|
||||
|
||||
- Critical trace generation (beta)
|
||||
- Support for
|
||||
|
||||
### Parallelism API Support
|
||||
|
||||
- Built-in MPI support
|
||||
- Kokkos-Tools support
|
||||
|
||||
### GPU Metrics
|
||||
|
||||
- HIP API tracing
|
||||
- ROCM HSA API tracing
|
||||
- Kernel runtime tracing
|
||||
- System-level sampling (via rocm-smi)
|
||||
- Memory usage
|
||||
- Power usage
|
||||
- Temperature
|
||||
- Utilization
|
||||
|
||||
### CPU Metrics
|
||||
|
||||
- CPU hardware counters sampling and profiles
|
||||
- CPU frequency sampling
|
||||
- Various timing metrics
|
||||
- Wall time
|
||||
- CPU time (process and/or thread)
|
||||
- CPU utilization (process and/or thread)
|
||||
- User CPU time
|
||||
- Kernel CPU time
|
||||
- Various memory metrics
|
||||
- High-water mark (sampling and profiles)
|
||||
- Memory page allocation
|
||||
- Virtual memory usage
|
||||
- Network statistics
|
||||
- I/O metrics
|
||||
- ... many more
|
||||
|
||||
### Third-party API support
|
||||
|
||||
- OpenMP-Tools (OMPT)
|
||||
- TAU
|
||||
- LIKWID
|
||||
- Caliper
|
||||
- CrayPAT
|
||||
- VTune
|
||||
- NVTX
|
||||
- ROCTX
|
||||
@@ -0,0 +1,19 @@
|
||||
# Script that generates docs-source/omnitrace.dox from its .dox.in template.
#
# Inputs:
#   SOURCE_DIR      (required) root of the source tree; must contain the
#                   VERSION file and the docs-source/ directory.
#   DOT_EXECUTABLE  (optional) path to Graphviz "dot"; searched for when unset.
#
# Variables visible to the template (substituted via @VAR@ only):
#   SOURCE_DIR, DOT_EXECUTABLE, FULL_VERSION_STRING, OMNITRACE_VERSION
# NOTE(review): which of these the template actually references is not
# visible from this script -- confirm against omnitrace.dox.in.
if(NOT DEFINED SOURCE_DIR)
    message(FATAL_ERROR "Please define SOURCE_DIR")
endif()

# Normalize to an absolute path so the file operations below do not depend on
# the directory the script is invoked from.
get_filename_component(SOURCE_DIR "${SOURCE_DIR}" ABSOLUTE)

find_program(DOT_EXECUTABLE NAMES dot)

if(NOT DOT_EXECUTABLE)
    message(FATAL_ERROR "Please install dot and/or specify DOT_EXECUTABLE")
endif()

# Read only the first line of the VERSION file. LIMIT_COUNT is an option of
# file(STRINGS), not of file(READ), so STRINGS is required for the limit to
# take effect.
file(STRINGS "${SOURCE_DIR}/VERSION" FULL_VERSION_STRING LIMIT_COUNT 1)
string(STRIP "${FULL_VERSION_STRING}" FULL_VERSION_STRING)

# Reduce "<major>.<minor>.<patch><anything>" to "<major>.<minor>.<patch>".
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)" "\\1.\\2.\\3" OMNITRACE_VERSION
                     "${FULL_VERSION_STRING}")

# Paths are quoted so a SOURCE_DIR containing spaces or semicolons is passed as
# a single argument. @ONLY leaves any ${...} text in the template untouched.
configure_file("${SOURCE_DIR}/docs-source/omnitrace.dox.in"
               "${SOURCE_DIR}/docs-source/omnitrace.dox" @ONLY)
|
||||
@@ -0,0 +1,11 @@
|
||||
# Getting Started
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 3
|
||||
|
||||
instrumenting
|
||||
runtime
|
||||
critical_trace
|
||||
```
|
||||
@@ -0,0 +1,15 @@
|
||||
# Welcome to the [Omnitrace](https://github.com/AMDResearch/omnitrace) Documentation!
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
:caption: Table of Contents
|
||||
|
||||
about
|
||||
features
|
||||
installation
|
||||
getting_started
|
||||
output
|
||||
user_api
|
||||
```
|
||||
@@ -0,0 +1,162 @@
|
||||
# Installation
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
```
|
||||
|
||||
- Ubuntu 18.04 or Ubuntu 20.04
|
||||
- Other OS distributions may be supported but are not tested
|
||||
- GCC compiler v7+
|
||||
- Older GCC compilers may be supported but are not tested
|
||||
- Clang compilers are generally supported for [Omnitrace](https://github.com/AMDResearch/omnitrace) but not Dyninst
|
||||
- [CMake](https://cmake.org/) v3.15+
|
||||
- [DynInst](https://github.com/dyninst/dyninst) for dynamic or static instrumentation
|
||||
- [TBB](https://github.com/oneapi-src/oneTBB) required by Dyninst
|
||||
- [ElfUtils](https://sourceware.org/elfutils/) required by Dyninst
|
||||
- [LibIberty](https://github.com/gcc-mirror/gcc/tree/master/libiberty) required by Dyninst
|
||||
- [Boost](https://www.boost.org/) required by Dyninst
|
||||
- [OpenMP](https://www.openmp.org/) optionally used by Dyninst
|
||||
- [ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu) (optional)
|
||||
- HIP
|
||||
- Roctracer for HIP API and kernel tracing
|
||||
- ROCM-SMI for GPU monitoring
|
||||
- [PAPI](https://icl.utk.edu/papi/)
|
||||
- [libunwind](https://www.nongnu.org/libunwind/) for call-stack sampling
|
||||
- Several optional third-party profiling tools supported by timemory (e.g. TAU, Caliper, CrayPAT, etc.)
|
||||
|
||||
## Installing omnitrace from binary distributions
|
||||
|
||||
Every omnitrace release provides binary installer scripts of the form:
|
||||
|
||||
```shell
|
||||
omnitrace-{VERSION}-{OS_DISTRIB}-{OS_VERSION}[-ROCm-{ROCM_VERSION}[-{EXTRA}]].sh
|
||||
```
|
||||
|
||||
E.g.:
|
||||
|
||||
```shell
|
||||
omnitrace-0.0.5-Ubuntu-18.04.sh
|
||||
omnitrace-0.0.5-Ubuntu-18.04-ROCm-4.3.0.sh
|
||||
omnitrace-0.0.5-Ubuntu-18.04-ROCm-4.5.0.sh
|
||||
...
|
||||
omnitrace-0.0.5-Ubuntu-20.04-ROCm-4.5.0-PAPI.sh
|
||||
omnitrace-0.0.5-Ubuntu-20.04-ROCm-4.5.0-PAPI-MPICH.sh
|
||||
omnitrace-0.0.5-Ubuntu-20.04-ROCm-4.5.0-PAPI-OpenMPI.sh
|
||||
```
|
||||
|
||||
The EXTRA fields such as PAPI, MPICH, and OpenMPI are built against the libraries provided by the
|
||||
OS package manager, e.g. `apt-get install libpapi-dev` for Ubuntu.
|
||||
|
||||
### Download the appropriate binary distribution
|
||||
|
||||
```shell
|
||||
wget https://github.com/AMDResearch/omnitrace/releases/download/v<VERSION>/<SCRIPT>
|
||||
```
|
||||
|
||||
### Create the target installation directory
|
||||
|
||||
```shell
|
||||
mkdir /opt/omnitrace
|
||||
```
|
||||
|
||||
### Run the installer script
|
||||
|
||||
```shell
|
||||
./omnitrace-0.0.5-Ubuntu-18.04-ROCm-4.3.0-PAPI-MPICH.sh --prefix=/opt/omnitrace
|
||||
```
|
||||
|
||||
### Configure the environment
|
||||
|
||||
```shell
|
||||
source /opt/omnitrace/share/omnitrace/setup-env.sh
|
||||
```
|
||||
|
||||
### Test the executables
|
||||
|
||||
```shell
|
||||
omnitrace --help
|
||||
omnitrace-avail --help
|
||||
```
|
||||
|
||||
## Installing Omnitrace from source
|
||||
|
||||
### Installing CMake
|
||||
|
||||
If using Ubuntu 20.04, `apt-get install cmake` will install cmake v3.16.3. If using Ubuntu 18.04, the cmake version via apt is too old (v3.10.2). In this case,
|
||||
follow the instructions [here](https://apt.kitware.com/) to add the CMake apt package repository; or alternatively (if root access is not available),
|
||||
specific versions of CMake can be easily installed via the Python pip package manager:
|
||||
|
||||
```shell
|
||||
python3 -m pip install 'cmake==3.18.4'
|
||||
export PATH=${HOME}/.local/bin:${PATH}
|
||||
```
|
||||
|
||||
> NOTE: be wary of using `python3 -m pip install cmake`. If pip installs a cmake version with a `.post<N>` suffix, it will be necessary to
|
||||
> specify the root path when cmake is invoked.
|
||||
|
||||
### Installing DynInst
|
||||
|
||||
#### Building Dyninst alongside Omnitrace
|
||||
|
||||
The easiest way to install Dyninst is to configure omnitrace with `OMNITRACE_BUILD_DYNINST=ON`. Depending on the version of Ubuntu, the apt package manager may have current enough
|
||||
versions of Dyninst's Boost, TBB, and LibIberty dependencies (i.e. `apt-get install libtbb-dev libiberty-dev libboost-dev`); however, it is possible to request Dyninst to install
|
||||
its dependencies via `Dyninst_BUILD_<DEP>=ON`, e.g.:
|
||||
|
||||
```shell
|
||||
git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source
|
||||
cmake -B omnitrace-build -DOMNITRACE_BUILD_DYNINST=ON -DDyninst_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON omnitrace-source
|
||||
```
|
||||
|
||||
where `-DDyninst_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON` is expanded by the shell to `-DDyninst_BUILD_TBB=ON -DDyninst_BUILD_BOOST=ON ...`
|
||||
|
||||
#### Installing Dyninst via Spack
|
||||
|
||||
[Spack](https://github.com/spack/spack) is another option to install Dyninst and its dependencies:
|
||||
|
||||
```shell
|
||||
git clone https://github.com/spack/spack.git
|
||||
source ./spack/share/spack/setup-env.sh
|
||||
spack compiler find
|
||||
spack external find
|
||||
spack install dyninst
|
||||
spack load -r dyninst
|
||||
```
|
||||
|
||||
### Installing omnitrace
|
||||
|
||||
Omnitrace has cmake configuration options for supporting MPI (`OMNITRACE_USE_MPI` or `OMNITRACE_USE_MPI_HEADERS`), HIP kernel tracing (`OMNITRACE_USE_ROCTRACER`),
|
||||
sampling ROCm devices (`OMNITRACE_USE_ROCM_SMI`), OpenMP-Tools (`OMNITRACE_USE_OMPT`), hardware counters via PAPI (`OMNITRACE_USE_PAPI`), among others.
|
||||
Various additional features can be enabled via the [`TIMEMORY_USE_*` CMake options](https://timemory.readthedocs.io/en/develop/installation.html#cmake-options).
|
||||
Any `OMNITRACE_USE_<VAL>` option which has a corresponding `TIMEMORY_USE_<VAL>` option means that the support within timemory for this feature has been integrated
|
||||
into omnitrace's perfetto support, e.g. `OMNITRACE_USE_PAPI=<VAL>` forces `TIMEMORY_USE_PAPI=<VAL>` and the data that timemory is able to collect via this package
|
||||
is passed along to perfetto and will be displayed when the `.proto` file is visualized in [ui.perfetto.dev](https://ui.perfetto.dev).
|
||||
|
||||
```shell
|
||||
OMNITRACE_ROOT=${HOME}/sw/omnitrace
|
||||
git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source
|
||||
cmake \
|
||||
-B omnitrace-build \
|
||||
-DOMNITRACE_USE_MPI_HEADERS=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=${OMNITRACE_ROOT} \
|
||||
omnitrace-source
|
||||
cmake --build omnitrace-build --target all --parallel 8
|
||||
cmake --build omnitrace-build --target install
|
||||
source ${OMNITRACE_ROOT}/share/omnitrace/setup-env.sh
|
||||
```
|
||||
|
||||
#### MPI Support within Omnitrace
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) can have full (`OMNITRACE_USE_MPI=ON`) or partial (`OMNITRACE_USE_MPI_HEADERS=ON`) MPI support.
|
||||
The only difference between these two modes is whether or not the results collected via timemory can be aggregated into one output file. The primary
|
||||
benefits of partial or full MPI support are the automatic wrapping of MPI functions and the ability to label output with suffixes which correspond to the
|
||||
`MPI_COMM_WORLD` rank ID instead of using the system process identifier (i.e. PID).
|
||||
In general, it is recommended to use partial MPI support with the OpenMPI headers as this is the most portable configuration.
|
||||
If full MPI support is selected, make sure your target application is built against the same MPI distribution as omnitrace,
|
||||
i.e. do not build omnitrace with MPICH and use it on a target application built against OpenMPI.
|
||||
If partial support is selected, the reason the OpenMPI headers are recommended instead of the MPICH headers is
|
||||
because the `MPI_COMM_WORLD` in OpenMPI is a pointer to `ompi_communicator_t` (8 bytes), whereas in MPICH,
|
||||
it is an `int` (4 bytes). Building omnitrace with partial MPI support and the MPICH headers and then using
|
||||
omnitrace on an application built against OpenMPI will cause a segmentation fault due to the value of the `MPI_COMM_WORLD` being narrowed
|
||||
during the function wrapping before being passed along to the underlying MPI function.
|
||||
@@ -0,0 +1,682 @@
|
||||
# Instrumenting with Omnitrace
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
```
|
||||
|
||||
## omnitrace Executable
|
||||
|
||||
Instrumentation is performed with the `omnitrace` executable. View the help menu with the `-h` / `--help` option:
|
||||
|
||||
```shell
|
||||
$ omnitrace --help
|
||||
[omnitrace] Usage: omnitrace [ --help (count: 0, dtype: bool)
|
||||
--debug (max: 1, dtype: bool)
|
||||
--verbose (max: 1, dtype: bool)
|
||||
--error (max: 1, dtype: boolean)
|
||||
--simulate (max: 1, dtype: bool)
|
||||
--print-format (min: 1, dtype: string)
|
||||
--print-dir (count: 1, dtype: string)
|
||||
--print-available (count: 1)
|
||||
--print-instrumented (count: 1)
|
||||
--print-excluded (count: 1)
|
||||
--print-overlapping (count: 1)
|
||||
--output (count: 1)
|
||||
--pid (count: 1, dtype: int)
|
||||
--mode (count: 1)
|
||||
--command (count: 1)
|
||||
--prefer (count: 1)
|
||||
--library (count: unlimited)
|
||||
--main-function (count: 1)
|
||||
--driver (max: 1, dtype: boolean)
|
||||
--load (count: unlimited, dtype: string)
|
||||
--load-instr (count: unlimited, dtype: filepath)
|
||||
--init-functions (count: unlimited, dtype: string)
|
||||
--fini-functions (count: unlimited, dtype: string)
|
||||
--function-include (count: unlimited)
|
||||
--function-exclude (count: unlimited)
|
||||
--module-include (count: unlimited)
|
||||
--module-exclude (count: unlimited)
|
||||
--label (count: unlimited, dtype: string)
|
||||
--default-components (count: unlimited, dtype: string)
|
||||
--env (count: unlimited)
|
||||
--mpi (max: 1, dtype: bool)
|
||||
--instrument-loops (max: 1, dtype: boolean)
|
||||
--min-address-range (count: 1, dtype: int)
|
||||
--min-address-range-loop (count: 1, dtype: int)
|
||||
--dynamic-callsites (max: 1, dtype: boolean)
|
||||
--traps (max: 1, dtype: bool)
|
||||
--loop-traps (max: 1, dtype: bool)
|
||||
--allow-overlapping (count: 0, dtype: bool)
|
||||
--batch-size (count: 1, dtype: int)
|
||||
--dyninst-options (count: unlimited)
|
||||
] -- <CMD> <ARGS>
|
||||
|
||||
Options:
|
||||
-h, -?, --help Shows this page
|
||||
|
||||
[DEBUG OPTIONS]
|
||||
|
||||
--debug Debug output
|
||||
-v, --verbose Verbose output
|
||||
-e, --error All warnings produce runtime errors
|
||||
--simulate Exit after outputting diagnostic {available,instrumented,excluded,overlapping} module
|
||||
function lists, e.g. available-instr.txt
|
||||
--print-format [ json | txt | xml ]
|
||||
Output format for diagnostic {available,instrumented,excluded,overlapping} module
|
||||
function lists, e.g. {print-dir}/available-instr.txt
|
||||
--print-dir Output directory for diagnostic {available,instrumented,excluded,overlapping} module
|
||||
function lists, e.g. {print-dir}/available-instr.txt
|
||||
--print-available [ functions | functions+ | modules | pair | pair+ ]
|
||||
Print the available entities for instrumentation (functions, modules, or module-function
|
||||
pair) to stdout applying regular expressions and exit
|
||||
--print-instrumented [ functions | functions+ | modules | pair | pair+ ]
|
||||
Print the instrumented entities (functions, modules, or module-function pair) to stdout
|
||||
after applying regular expressions and exit
|
||||
--print-excluded [ functions | functions+ | modules | pair | pair+ ]
|
||||
Print the entities for instrumentation (functions, modules, or module-function pair)
|
||||
which are excluded from the instrumentation to stdout after applying regular expressions
|
||||
and exit
|
||||
--print-overlapping [ functions | functions+ | modules | pair | pair+ ]
|
||||
Print the entities for instrumentation (functions, modules, or module-function pair)
|
||||
which overlap other function calls or have multiple entry points to stdout applying
|
||||
regular expressions and exit
|
||||
|
||||
[MODE OPTIONS]
|
||||
|
||||
-o, --output Enable generation of a new executable (binary-rewrite)
|
||||
-p, --pid Connect to running process
|
||||
-M, --mode [ sampling | trace ]
|
||||
Instrumentation mode. 'trace' mode instruments the selected functions, 'sampling' mode
|
||||
only instruments the main function to start and stop the sampler.
|
||||
-c, --command Input executable and arguments (if '-- <CMD>' not provided)
|
||||
|
||||
[LIBRARY OPTIONS]
|
||||
|
||||
--prefer [ shared | static ] Prefer this library types when available
|
||||
-L, --library Libraries with instrumentation routines (default: "libomnitrace")
|
||||
-m, --main-function The primary function to instrument around, e.g. 'main'
|
||||
--driver Force main or _init/_fini instrumentation
|
||||
--load Supplemental instrumentation library names w/o extension (e.g. 'libinstr' for
|
||||
'libinstr.so' or 'libinstr.a')
|
||||
--load-instr Load {available,instrumented,excluded,overlapping}-instr JSON or XML file(s) and override
|
||||
what is read from the binary
|
||||
--init-functions Initialization function(s) for supplemental instrumentation libraries (see '--load'
|
||||
option)
|
||||
--fini-functions Finalization function(s) for supplemental instrumentation libraries (see '--load' option)
|
||||
|
||||
[SYMBOL SELECTION OPTIONS]
|
||||
|
||||
-I, -R, --function-include Regex for selecting functions
|
||||
-E, --function-exclude Regex for excluding functions
|
||||
-MI, -MR, --module-include Regex for selecting modules/files/libraries
|
||||
-ME, --module-exclude Regex for excluding modules/files/libraries
|
||||
|
||||
[RUNTIME OPTIONS]
|
||||
|
||||
--label [ args | file | line | return ]
|
||||
Labeling info for functions. By default, just the function name is recorded. Use these
|
||||
options to gain more information about the function signature or location of the
|
||||
functions
|
||||
-d, --default-components Default components to instrument (only useful when timemory is enabled in omnitrace
|
||||
library)
|
||||
--env Environment variables to add to the runtime in form VARIABLE=VALUE. E.g. use '--env
|
||||
OMNITRACE_USE_TIMEMORY=ON' to default to using timemory instead of perfetto
|
||||
--mpi Enable MPI support (requires omnitrace built w/ MPI and GOTCHA support). NOTE: this will
|
||||
automatically be activated if MPI_Init/MPI_Init_thread and MPI_Finalize are found in the
|
||||
symbol table of target
|
||||
|
||||
[GRANULARITY OPTIONS]
|
||||
|
||||
-l, --instrument-loops Instrument at the loop level
|
||||
-r, --min-address-range If the address range of a function is less than this value, exclude it from
|
||||
instrumentation
|
||||
--min-address-range-loop If the address range of a function containing a loop is less than this value, exclude it
|
||||
from instrumentation
|
||||
--dynamic-callsites Force instrumentation if a function has dynamic callsites (e.g. function pointers)
|
||||
--traps Instrument points which require using a trap. On the x86 architecture, because
|
||||
instructions are of variable size, the instruction at a point may be too small for
|
||||
Dyninst to replace it with the normal code sequence used to call instrumentation. Also,
|
||||
when instrumentation is placed at points other than subroutine entry, exit, or call
|
||||
points, traps may be used to ensure the instrumentation fits. In this case, Dyninst
|
||||
replaces the instruction with a single-byte instruction that generates a trap.
|
||||
--loop-traps Instrument points within a loop which require using a trap (only relevant when
|
||||
--instrument-loops is enabled).
|
||||
--allow-overlapping Allow dyninst to instrument either multiple functions which overlap (share part of same
|
||||
function body) or single functions with multiple entry points. For more info, see Section
|
||||
2 of the DyninstAPI documentation.
|
||||
|
||||
[DYNINST OPTIONS]
|
||||
|
||||
-b, --batch-size Dyninst supports batch insertion of multiple points during runtime instrumentation. If
|
||||
one large batch insertion fails, this value will be used to create smaller batches.
|
||||
Larger batches generally decrease the instrumentation time
|
||||
--dyninst-options [ BaseTrampDeletion | DebugParsing | DelayedParsing | InstrStackFrames | MergeTramp | SaveFPR | TrampRecursive | TypeChecking ]
|
||||
Advanced dyninst options: BPatch::set<OPTION>(bool), e.g. bpatch->setTrampRecursive(true)
|
||||
```
|
||||
|
||||
There are three ways to perform instrumentation:
|
||||
|
||||
1. Running the application via the omnitrace executable (analogous to `gdb --args <program> <args>`)
|
||||
- This mode is the default if neither the `-p` nor `-o` command-line options are used
|
||||
- Runtime instrumentation supports instrumenting not only the target executable but also
|
||||
the shared libraries loaded by the target executable. Consequently, this mode consumes more memory,
|
||||
takes longer to perform the instrumentation, and tends to have a more significant overhead on the
|
||||
runtime of the application
|
||||
- This mode is recommended if you want to analyze not only the performance of your executable and/or
|
||||
libraries but also the performance of the library dependencies
|
||||
2. Attaching to a process that is currently running (analogous to `gdb -p <PID>`)
|
||||
- This mode is activated via `-p <PID>`
|
||||
- Same caveats as 1. with respect to memory and overhead
|
||||
3. Generating a new executable or library with the instrumentation built-in (binary rewrite)
|
||||
- This mode is activated via the `-o <output-file>` option
|
||||
- Binary rewriting is limited to the text section of the target executable or library: it will not instrument
|
||||
the dynamically-linked libraries. Consequently, this mode performs the instrumentation significantly faster
|
||||
and has a much lower overhead when running the instrumented executable and/or libraries
|
||||
- Binary rewriting is the recommended mode when the target executable uses process-level parallelism (e.g. MPI)
|
||||
- If your target executable has a minimal main and the bulk of your application is in one specific dynamic library,
|
||||
see [Binary Rewriting a Library](#binary-rewriting-a-library) for help
|
||||
|
||||
|
||||
> NOTE: Attaching to a running process is an alpha feature and support for detaching from the target process
|
||||
> without ending the target process is not currently supported.
|
||||
|
||||
The general syntax for separating omnitrace command line arguments from the application arguments
|
||||
is consistent with the LLVM style of using a standalone double-hyphen (`--`). All arguments preceding the double-hyphen
|
||||
are interpreted as belonging to omnitrace and all arguments following the double-hyphen are interpreted as the
|
||||
application and its arguments. In binary rewrite mode, all application arguments after the first argument
|
||||
are ignored, i.e. `./omnitrace -o ls.inst -- ls -l` interprets `ls` as the target to instrument (ignores the `-l` argument)
|
||||
and generates a `ls.inst` executable that you can subsequently run `ls.inst -l` with.
|
||||
|
||||
## Runtime Instrumentation
|
||||
|
||||
```shell
|
||||
omnitrace <omnitrace-options> -- <exe> [<exe-options>...]
|
||||
```
|
||||
|
||||
## Attaching to Running Process
|
||||
|
||||
```shell
|
||||
omnitrace <omnitrace-options> -p <PID> -- <exe-name>
|
||||
```
|
||||
|
||||
## Binary Rewrite
|
||||
|
||||
```shell
|
||||
omnitrace <omnitrace-options> -o <name-of-new-exe-or-library> -- <exe-or-library>
|
||||
```
|
||||
|
||||
### Binary Rewriting a Library
|
||||
|
||||
Many applications bundle the bulk of their functionality into one or more dynamic libraries and have a relatively simple main
|
||||
which links to these libraries and simply serves as the "driver" for setting up the workflow. If you binary rewrite your
|
||||
executable and find there is insufficient info because of this, you can either switch to runtime instrumentation or
|
||||
binary rewrite the libraries of interest.
|
||||
|
||||
Support for standalone binary rewriting of a dynamic library without binary rewriting the executable is a beta feature.
|
||||
In general, it is supported as long as the library contains the `_init` and `_fini` symbols but these symbols are not
|
||||
standardized to the extent of `main` in an executable.
|
||||
The recommended workflow is as follows:
|
||||
|
||||
1. Determine the names of the dynamically linked libraries of interest via `ldd`
|
||||
2. Generate a binary rewrite of the executable
|
||||
3. Generate a binary rewrite of the desired libraries with the same base name as the original library, e.g. `libfoo.so.2` instead of `libfoo.so`
|
||||
- Output the instrumented library into a different folder than the original library
|
||||
4. Prefix the `LD_LIBRARY_PATH` environment variable with the output folder from 3
|
||||
5. Verify via `ldd` that the instrumented executable resolves the location of the instrumented library
|
||||
|
||||
### Binary Rewriting a Library Example
|
||||
|
||||
`foo` executable is dynamically linked to `libfoo.so.2`:
|
||||
|
||||
```shell
|
||||
$ pwd
|
||||
/home/user
|
||||
$ which foo
|
||||
/usr/local/bin/foo
|
||||
$ ldd /usr/local/bin/foo
|
||||
...
|
||||
libfoo.so.2 => /usr/local/lib/libfoo.so.2 (...)
|
||||
...
|
||||
```
|
||||
|
||||
Generate binary rewrites of `foo` and `libfoo.so.2`:
|
||||
|
||||
```shell
|
||||
omnitrace -o ./foo.inst -- foo
|
||||
omnitrace -o ./libfoo.so.2 -- /usr/local/lib/libfoo.so.2
|
||||
```
|
||||
|
||||
At this point, the instrumented `foo.inst` executable will still dynamically load the original `libfoo.so.2` in `/usr/local/lib`:
|
||||
|
||||
```shell
|
||||
$ ldd ./foo.inst
|
||||
...
|
||||
libfoo.so.2 => /usr/local/lib/libfoo.so.2 (...)
|
||||
...
|
||||
```
|
||||
|
||||
Prefix the `LD_LIBRARY_PATH` environment variable with the folder containing the instrumented `libfoo.so.2`:
|
||||
|
||||
```shell
|
||||
export LD_LIBRARY_PATH=/home/user:${LD_LIBRARY_PATH}
|
||||
```
|
||||
|
||||
When `foo.inst` is executed, it will now load the instrumented library:
|
||||
|
||||
```shell
|
||||
$ ldd ./foo.inst
|
||||
...
|
||||
libfoo.so.2 => /home/user/libfoo.so.2 (...)
|
||||
...
|
||||
```
|
||||
|
||||
## Selective Instrumentation
|
||||
|
||||
The default behavior of omnitrace does not instrument every symbol in the binary. These default rules are:
|
||||
|
||||
- Skip instrumenting dynamic call-sites (i.e. function pointers)
|
||||
- Option `--dynamic-callsites` will force instrumentation for all dynamic call-sites
|
||||
- The cost of a function can be loosely approximated by the size of the function in the binary so by default, omnitrace only instruments functions which span an address range of at least 256 bytes.
|
||||
- Option `--min-address-range` will modify this heuristic for all functions which do not contain loops
|
||||
- Option `--min-address-range-loop` will modify this heuristic for functions which contain loops
|
||||
- This separate loop option is provided because functions with loops can be compact in the binary while also being costly
|
||||
- Skip instrumentation points which require using a trap
|
||||
- See the description for the `--traps` and `--loop-traps` options for more information
|
||||
- Skip instrumenting loops within the body of a function
|
||||
- Option `--instrument-loops` will enable this behavior
|
||||
- Skip instrumenting functions with overlapping function bodies and single functions with multiple entry points
|
||||
- These arise from various optimizations and instrumenting these functions can be enabled via the `--allow-overlapping` option
|
||||
|
||||
### Viewing the Available, Instrumented, Excluded, and Overlapping Functions
|
||||
|
||||
Whenever omnitrace is executed with a verbosity of zero or higher, it emits files which detail which functions (and which module they were defined in)
|
||||
were available for instrumentation, which functions were instrumented, which functions were excluded, and which functions contained overlapping function bodies.
|
||||
The default output path of these files will be in a `omnitrace-<NAME>-output` folder where `<NAME>` is the basename of the targeted binary or
|
||||
(in the case of binary rewrite) the basename of the resulting executable, e.g.
|
||||
`omnitrace -- ls` will output its files to `omnitrace-ls-output` whereas `omnitrace -o ls.inst -- ls` will output to `omnitrace-ls.inst-output`.
|
||||
|
||||
If you would like to generate these files without executing or generating an executable, use the `--simulate` option:
|
||||
|
||||
```shell
|
||||
omnitrace --simulate -- foo
|
||||
omnitrace --simulate -o foo.inst -- foo
|
||||
```
|
||||
|
||||
### Excluding and Including Modules and Functions
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) has a set of 6 command-line options which each accept one or more regular expressions for customizing the scope of which modules and/or functions are
|
||||
instrumented. Multiple regexes per option are treated as an OR operation, e.g. `--module-include libfoo libbar` is effectively the same as `--module-include 'libfoo|libbar'`.
|
||||
|
||||
If you would like to force the inclusion of certain modules and/or functions without changing any of the heuristics, use the `--module-include` and/or `--function-include` options.
|
||||
Note that these options will not exclude modules and/or functions which do not satisfy their regular expression.
|
||||
|
||||
If you would like to narrow the scope of the instrumentation to a specific set of libraries and/or functions, use the `--module-restrict` and `--function-restrict` options.
|
||||
Applying these options allows you to exclusively select the union of one or more regular expressions, regardless of whether or not the functions satisfy the
|
||||
aforementioned default heuristics. Any function or module that is not within the union of these regular expressions will be excluded from instrumentation.
|
||||
|
||||
If you would like to avoid instrumenting a set of modules and/or functions, use the `--module-exclude` and `--function-exclude` options.
|
||||
These options are always applied regardless of whether the module or function satisfied the "restrict" or "include" regular expression.
|
||||
|
||||
#### Example Available Module and Function Info Output
|
||||
|
||||
> `omnitrace -o lulesh.inst --label file line args --simulate -- lulesh`
|
||||
|
||||
```console
|
||||
AddressRange Module Function FunctionSignature
|
||||
9165 ../examples/lulesh/lulesh-comm.cc CommMonoQ CommMonoQ(domain) [lulesh-comm.cc:1891]
|
||||
3396 ../examples/lulesh/lulesh-comm.cc CommRecv CommRecv(domain, int, Index_t, Index_t, Index_t, Index_t, bool, bool) [lulesh...
|
||||
8666 ../examples/lulesh/lulesh-comm.cc CommSBN CommSBN(domain, int, Domain_member *) [lulesh-comm.cc:926]
|
||||
10212 ../examples/lulesh/lulesh-comm.cc CommSend CommSend(domain, int, Index_t, Domain_member *, Index_t, Index_t, Index_t, bo...
|
||||
6823 ../examples/lulesh/lulesh-comm.cc CommSyncPosVel CommSyncPosVel(domain) [lulesh-comm.cc:1404]
|
||||
126 ../examples/lulesh/lulesh-comm.cc _GLOBAL__sub_I_lulesh_comm.cc _GLOBAL__sub_I_lulesh_comm.cc() [lulesh-comm.cc]
|
||||
308 ../examples/lulesh/lulesh-init.cc .omp_outlined..26 .omp_outlined..26(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
628 ../examples/lulesh/lulesh-init.cc .omp_outlined..34 .omp_outlined..34(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
656 ../examples/lulesh/lulesh-init.cc .omp_outlined..41 .omp_outlined..41(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
662 ../examples/lulesh/lulesh-init.cc .omp_outlined..45 .omp_outlined..45(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
550 ../examples/lulesh/lulesh-init.cc .omp_outlined..55 .omp_outlined..55(const , const , const ParallelFor<Kokkos::Impl::ViewFill<Ko...
|
||||
556 ../examples/lulesh/lulesh-init.cc .omp_outlined..57 .omp_outlined..57(const , const , const ParallelFor<Kokkos::Impl::ViewFill<Ko...
|
||||
550 ../examples/lulesh/lulesh-init.cc .omp_outlined..78 .omp_outlined..78(const , const , const ParallelFor<Kokkos::Impl::ViewFill<Ko...
|
||||
640 ../examples/lulesh/lulesh-init.cc .omp_outlined..84 .omp_outlined..84(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
646 ../examples/lulesh/lulesh-init.cc .omp_outlined..88 .omp_outlined..88(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
1840 ../examples/lulesh/lulesh-init.cc Domain::AllocateElemPersistent Domain::AllocateElemPersistent(Domain *, Int_t) [lulesh-init.cc:94]
|
||||
1384 ../examples/lulesh/lulesh-init.cc Domain::AllocateNodePersistent Domain::AllocateNodePersistent(Domain *, Int_t) [lulesh-init.cc:94]
|
||||
1264 ../examples/lulesh/lulesh-init.cc Domain::BuildMesh Domain::BuildMesh(Domain *, Int_t, Int_t, Int_t) [lulesh-init.cc:308]
|
||||
2312 ../examples/lulesh/lulesh-init.cc Domain::CreateRegionIndexSets Domain::CreateRegionIndexSets(Domain *, Int_t, Int_t) [lulesh-init.cc:409]
|
||||
7109 ../examples/lulesh/lulesh-init.cc Domain::Domain Domain::Domain(Domain *, Int_t, Index_t, Index_t, Index_t, Index_t, int, int,...
|
||||
2458 ../examples/lulesh/lulesh-init.cc Domain::SetupBoundaryConditions Domain::SetupBoundaryConditions(Domain *, Int_t) [lulesh-init.cc:409]
|
||||
956 ../examples/lulesh/lulesh-init.cc Domain::SetupCommBuffers Domain::SetupCommBuffers(Domain *, Int_t) [lulesh-init.cc]
|
||||
1456 ../examples/lulesh/lulesh-init.cc Domain::SetupElementConnectivities Domain::SetupElementConnectivities(Domain *, Int_t) [lulesh-init.cc:409]
|
||||
721 ../examples/lulesh/lulesh-init.cc Domain::SetupSymmetryPlanes Domain::SetupSymmetryPlanes(Domain *, Int_t) [lulesh-init.cc:409]
|
||||
1591 ../examples/lulesh/lulesh-init.cc Domain::SetupThreadSupportStructures Domain::SetupThreadSupportStructures(Domain *) [lulesh-init.cc:376]
|
||||
1644 ../examples/lulesh/lulesh-init.cc Domain::~Domain Domain::~Domain(Domain *) [lulesh-init.cc:286]
|
||||
218 ../examples/lulesh/lulesh-init.cc InitMeshDecomp InitMeshDecomp(Int_t, Int_t, Int_t *, Int_t *, Int_t *, Int_t *) [lulesh-init...
|
||||
260 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::CommonSubview<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokk... Kokkos::Impl::CommonSubview<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokk...
|
||||
1786 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::HostIterateTile<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::R... Kokkos::Impl::HostIterateTile<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::R...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl...
|
||||
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl...
|
||||
522 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::... Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::...
|
||||
232 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::... Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::...
|
||||
49 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal... Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal...
|
||||
1476 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::Tile_Loop_Type<2, false, int, void, void>::apply<Kokkos::Impl::... Kokkos::Impl::Tile_Loop_Type<2, false, int, void, void>::apply<Kokkos::Impl::...
|
||||
555 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic... Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic...
|
||||
613 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic... Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic...
|
||||
603 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<... Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<...
|
||||
604 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<... Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<...
|
||||
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
524 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev... Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev...
|
||||
525 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev... Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev...
|
||||
524 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev... Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev...
|
||||
583 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<int* [8], Kokkos::LayoutRight>, ... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
|
||||
529 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<int*, Kokkos::HostSpace>, void>:... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
|
||||
529 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<int*>, void>::allocate_shared<st... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
|
||||
203 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewRemap<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::... Kokkos::Impl::ViewRemap<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::...
|
||||
331 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewRemap<Kokkos::View<int*>, Kokkos::View<int*>, Kokkos::OpenM... Kokkos::Impl::ViewRemap<Kokkos::View<int*>, Kokkos::View<int*>, Kokkos::OpenM...
|
||||
461 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewValueFunctor<Kokkos::Device<Kokkos::OpenMP, Kokkos::HostSpa... enable_if_t<std::is_trivial<int>::value && std::is_trivially_copy_assignable<...
|
||||
353 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double*> Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double*>(exec_space, dst, value...
|
||||
139 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double, Kokkos::LayoutRight, Ko... Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double, Kokkos::LayoutRight, Ko...
|
||||
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D...
|
||||
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D...
|
||||
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::...
|
||||
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::...
|
||||
697 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int*, Kokkos::LayoutRight, Kokkos::Devic... Kokkos::Impl::view_copy<Kokkos::View<int*, Kokkos::LayoutRight, Kokkos::Devic...
|
||||
697 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int*>, Kokkos::View<int*> > Kokkos::Impl::view_copy<Kokkos::View<int*>, Kokkos::View<int*> >(dst, src) [l...
|
||||
2036 ../examples/lulesh/lulesh-init.cc Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, int>::R... Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, int>::R...
|
||||
2506 ../examples/lulesh/lulesh-init.cc Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, long>::... Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, long>::...
|
||||
271 ../examples/lulesh/lulesh-init.cc Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor... Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor...
|
||||
470 ../examples/lulesh/lulesh-init.cc Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<... Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<...
|
||||
323 ../examples/lulesh/lulesh-init.cc Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<... Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]>(View<int *, Kokkos::Ho...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]>(View<int *, Kokkos::Ho...
|
||||
462 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<std::__cxx11::basic_string<char, ... Kokkos::View<int*, Kokkos::HostSpace>::View<std::__cxx11::basic_string<char, ...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [16]> Kokkos::View<int*>::View<char [16]>(View<int *> *, arg_label, type, const siz...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [19]> Kokkos::View<int*>::View<char [19]>(View<int *> *, arg_label, type, const siz...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [21]> Kokkos::View<int*>::View<char [21]>(View<int *> *, arg_label, type, const siz...
|
||||
462 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch... Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch...
|
||||
323 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch... Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok... Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok...
|
||||
1052 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*> Kokkos::deep_copy<double*>(dst, value) [lulesh-init.cc]
|
||||
1050 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,... Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,...
|
||||
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM...
|
||||
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O... Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko... Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K... Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K...
|
||||
863 ../examples/lulesh/lulesh-init.cc Kokkos::impl_resize<, int* [8], Kokkos::LayoutRight> type Kokkos::impl_resize<, int* [8], Kokkos::LayoutRight>(v, const size_t, co...
|
||||
854 ../examples/lulesh/lulesh-init.cc Kokkos::impl_resize<, int*> type Kokkos::impl_resize<, int*>(v, const size_t, const size_t, const size_t,...
|
||||
697 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
|
||||
706 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
|
||||
912 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
944 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
|
||||
839 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
|
||||
126 ../examples/lulesh/lulesh-init.cc _GLOBAL__sub_I_lulesh_init.cc _GLOBAL__sub_I_lulesh_init.cc() [lulesh-init.cc]
|
||||
6589 ../examples/lulesh/lulesh-util.cc Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP... Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP...
|
||||
1345 ../examples/lulesh/lulesh-util.cc ParseCommandLineOptions ParseCommandLineOptions(int, char * *, int, cmdLineOpts *) [lulesh-util.cc:67]
|
||||
171 ../examples/lulesh/lulesh-util.cc PrintCommandLineOptions PrintCommandLineOptions(char *, int) [lulesh-util.cc:31]
|
||||
67 ../examples/lulesh/lulesh-util.cc StrToInt int StrToInt(const char *, int *) [lulesh-util.cc:13]
|
||||
706 ../examples/lulesh/lulesh-util.cc VerifyAndWriteFinalOutput VerifyAndWriteFinalOutput(Real_t, locDom, Int_t, Int_t) [lulesh-util.cc:222]
|
||||
126 ../examples/lulesh/lulesh-util.cc _GLOBAL__sub_I_lulesh_util.cc _GLOBAL__sub_I_lulesh_util.cc() [lulesh-util.cc]
|
||||
17 ../examples/lulesh/lulesh-viz.cc DumpToVisit DumpToVisit(domain, int, int, int) [lulesh-viz.cc:415]
|
||||
126 ../examples/lulesh/lulesh-viz.cc _GLOBAL__sub_I_lulesh_viz.cc _GLOBAL__sub_I_lulesh_viz.cc() [lulesh-viz.cc]
|
||||
451 ../examples/lulesh/lulesh.cc .omp_outlined..103 .omp_outlined..103(const , const , const ParallelReduce<(lambda at ../example...
|
||||
796 ../examples/lulesh/lulesh.cc .omp_outlined..109 .omp_outlined..109(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
394 ../examples/lulesh/lulesh.cc .omp_outlined..111 .omp_outlined..111(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
402 ../examples/lulesh/lulesh.cc .omp_outlined..113 .omp_outlined..113(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
427 ../examples/lulesh/lulesh.cc .omp_outlined..115 .omp_outlined..115(const , const , const ParallelReduce<(lambda at ../example...
|
||||
859 ../examples/lulesh/lulesh.cc .omp_outlined..119 .omp_outlined..119(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
243 ../examples/lulesh/lulesh.cc .omp_outlined..122 .omp_outlined..122(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
426 ../examples/lulesh/lulesh.cc .omp_outlined..124 .omp_outlined..124(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
529 ../examples/lulesh/lulesh.cc .omp_outlined..127 .omp_outlined..127(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
865 ../examples/lulesh/lulesh.cc .omp_outlined..130 .omp_outlined..130(const , const , const ParallelFor<(lambda at ../examples/l...
|
||||
539 ../examples/lulesh/lulesh.cc .omp_outlined..132 .omp_outlined..132(const , const , const ParallelReduce<(lambda at ../example...
|
||||
456 ../examples/lulesh/lulesh.cc .omp_outlined..134 .omp_outlined..134(const , const , const ParallelReduce<(lambda at ../example...
|
||||
252 ../examples/lulesh/lulesh.cc .omp_outlined..20 .omp_outlined..20(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
870 ../examples/lulesh/lulesh.cc .omp_outlined..35 .omp_outlined..35(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
473 ../examples/lulesh/lulesh.cc .omp_outlined..42 .omp_outlined..42(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
252 ../examples/lulesh/lulesh.cc .omp_outlined..46 .omp_outlined..46(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
1101 ../examples/lulesh/lulesh.cc .omp_outlined..48 .omp_outlined..48(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
427 ../examples/lulesh/lulesh.cc .omp_outlined..55 .omp_outlined..55(const , const , const ParallelReduce<(lambda at ../examples...
|
||||
1326 ../examples/lulesh/lulesh.cc .omp_outlined..57 .omp_outlined..57(const , const , const ParallelReduce<(lambda at ../examples...
|
||||
243 ../examples/lulesh/lulesh.cc .omp_outlined..61 .omp_outlined..61(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
1101 ../examples/lulesh/lulesh.cc .omp_outlined..63 .omp_outlined..63(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
372 ../examples/lulesh/lulesh.cc .omp_outlined..66 .omp_outlined..66(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
499 ../examples/lulesh/lulesh.cc .omp_outlined..71 .omp_outlined..71(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
499 ../examples/lulesh/lulesh.cc .omp_outlined..73 .omp_outlined..73(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
499 ../examples/lulesh/lulesh.cc .omp_outlined..75 .omp_outlined..75(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
465 ../examples/lulesh/lulesh.cc .omp_outlined..78 .omp_outlined..78(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
396 ../examples/lulesh/lulesh.cc .omp_outlined..81 .omp_outlined..81(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
656 ../examples/lulesh/lulesh.cc .omp_outlined..85 .omp_outlined..85(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
662 ../examples/lulesh/lulesh.cc .omp_outlined..89 .omp_outlined..89(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
|
||||
443 ../examples/lulesh/lulesh.cc .omp_outlined..93 .omp_outlined..93(const , const , const ParallelReduce<(lambda at ../examples...
|
||||
243 ../examples/lulesh/lulesh.cc .omp_outlined..96 .omp_outlined..96(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
243 ../examples/lulesh/lulesh.cc .omp_outlined..99 .omp_outlined..99(const , const , const ParallelFor<(lambda at ../examples/lu...
|
||||
13367 ../examples/lulesh/lulesh.cc ApplyMaterialPropertiesForElems ApplyMaterialPropertiesForElems(domain) [lulesh.cc:409]
|
||||
1530 ../examples/lulesh/lulesh.cc CalcElemCharacteristicLength Real_t CalcElemCharacteristicLength(const Real_t *, const Real_t *, const Rea...
|
||||
982 ../examples/lulesh/lulesh.cc CalcElemFBHourglassForce CalcElemFBHourglassForce(const Real_t *, const Real_t[] *, coefficient, Real_...
|
||||
2428 ../examples/lulesh/lulesh.cc CalcElemNodeNormals CalcElemNodeNormals(Real_t *, Real_t *, Real_t *, const Real_t *, const Real_...
|
||||
853 ../examples/lulesh/lulesh.cc CalcElemShapeFunctionDerivatives CalcElemShapeFunctionDerivatives(const Real_t *, const Real_t *, const Real_t...
|
||||
1097 ../examples/lulesh/lulesh.cc CalcElemVolumeDerivative CalcElemVolumeDerivative(i, dvdx, dvdy, dvdz, const Real_t *, const Real_t *,...
|
||||
1054 ../examples/lulesh/lulesh.cc CalcKinematicsForElems CalcKinematicsForElems(domain, Real_t, Index_t) [lulesh.cc]
|
||||
14160 ../examples/lulesh/lulesh.cc CalcVolumeForceForElems CalcVolumeForceForElems(domain) [lulesh.cc:409]
|
||||
366 ../examples/lulesh/lulesh.cc Domain::AllocateGradients Domain::AllocateGradients(Domain *, Int_t, Int_t) [lulesh.cc:214]
|
||||
475 ../examples/lulesh/lulesh.cc Domain::DeallocateGradients Domain::DeallocateGradients(Domain *) [lulesh.cc:105]
|
||||
250 ../examples/lulesh/lulesh.cc Domain::DeallocateStrains Domain::DeallocateStrains(Domain *) [lulesh.cc:105]
|
||||
4356 ../examples/lulesh/lulesh.cc Domain::Domain Domain::Domain(Domain *) [lulesh.cc:78]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::delv_eta Domain::delv_eta(const Domain *, const Index_t) [lulesh.cc:371]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::delv_xi Domain::delv_xi(const Domain *, const Index_t) [lulesh.cc:368]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::delv_zeta Domain::delv_zeta(const Domain *, const Index_t) [lulesh.cc:374]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::fx Domain::fx(const Domain *, const Index_t) [lulesh.cc:303]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::fy Domain::fy(const Domain *, const Index_t) [lulesh.cc:306]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::fz Domain::fz(const Domain *, const Index_t) [lulesh.cc:309]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::nodalMass Domain::nodalMass(const Domain *, const Index_t) [lulesh.cc:314]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::x Domain::x(const Domain *, const Index_t) [lulesh.cc:257]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::xd Domain::xd(const Domain *, const Index_t) [lulesh.cc:272]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::y Domain::y(const Domain *, const Index_t) [lulesh.cc:258]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::yd Domain::yd(const Domain *, const Index_t) [lulesh.cc:275]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::z Domain::z(const Domain *, const Index_t) [lulesh.cc:259]
|
||||
15 ../examples/lulesh/lulesh.cc Domain::zd Domain::zd(const Domain *, const Index_t) [lulesh.cc:278]
|
||||
330 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl...
|
||||
330 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl...
|
||||
1508 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcEnergyForElems(double*, double*, double*, doubl... type Kokkos::Impl::ParallelFor<CalcEnergyForElems(double*, double*, double*, ...
|
||||
3606 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcFBHourglassForceForElems(Domain&, double*, Kokk... type Kokkos::Impl::ParallelFor<CalcFBHourglassForceForElems(Domain&, double*,...
|
||||
2917 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcKinematicsForElems(Domain&, double, int)::$_0, ... type Kokkos::Impl::ParallelFor<CalcKinematicsForElems(Domain&, double, int)::...
|
||||
3119 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcMonotonicQGradientsForElems(Domain&)::{lambda(i... type Kokkos::Impl::ParallelFor<CalcMonotonicQGradientsForElems(Domain&)::{lam...
|
||||
1969 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcMonotonicQRegionForElems(Domain&, int, double):... type Kokkos::Impl::ParallelFor<CalcMonotonicQRegionForElems(Domain&, int, dou...
|
||||
1265 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<IntegrateStressForElems(Domain&, double*, double*, ... type Kokkos::Impl::ParallelFor<IntegrateStressForElems(Domain&, double*, doub...
|
||||
49 ../examples/lulesh/lulesh.cc Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal... Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal...
|
||||
1497 ../examples/lulesh/lulesh.cc Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP>::TeamPolicyInternal Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP>::TeamPolicyInternal(TeamPoli...
|
||||
603 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi... Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi...
|
||||
604 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi... Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi...
|
||||
281 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
281 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
|
||||
521 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<double*>, void>::allocate_shared... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
|
||||
331 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewRemap<Kokkos::View<double*>, Kokkos::View<double*>, Kokkos:... Kokkos::Impl::ViewRemap<Kokkos::View<double*>, Kokkos::View<double*>, Kokkos:...
|
||||
461 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewValueFunctor<Kokkos::Device<Kokkos::OpenMP, Kokkos::HostSpa... enable_if_t<std::is_trivial<double>::value && std::is_trivially_copy_assignab...
|
||||
1609 ../examples/lulesh/lulesh.cc Kokkos::Impl::runtime_check_rank_host Kokkos::Impl::runtime_check_rank_host(const size_t, const bool, const size_t,...
|
||||
697 ../examples/lulesh/lulesh.cc Kokkos::Impl::view_copy<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::De... Kokkos::Impl::view_copy<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::De...
|
||||
697 ../examples/lulesh/lulesh.cc Kokkos::Impl::view_copy<Kokkos::View<double*>, Kokkos::View<double*> > Kokkos::Impl::view_copy<Kokkos::View<double*>, Kokkos::View<double*> >(dst, s...
|
||||
2250 ../examples/lulesh/lulesh.cc Kokkos::RangePolicy<Kokkos::OpenMP>::RangePolicy Kokkos::RangePolicy<Kokkos::OpenMP>::RangePolicy(RangePolicy<Kokkos::OpenMP> ...
|
||||
213 ../examples/lulesh/lulesh.cc Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor... Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor...
|
||||
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [6]> Kokkos::View<double*>::View<char [6]>(View<double *> *, arg_label, type, cons...
|
||||
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [7]> Kokkos::View<double*>::View<char [7]>(View<double *> *, arg_label, type, cons...
|
||||
462 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits... Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits...
|
||||
323 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits... Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits...
|
||||
25 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::~View Kokkos::View<double*>::~View(View<double *> *) [lulesh.cc:409]
|
||||
840 ../examples/lulesh/lulesh.cc Kokkos::abort Kokkos::abort(const const char *, const const char *) [lulesh.cc:202]
|
||||
854 ../examples/lulesh/lulesh.cc Kokkos::impl_resize<, double*> type Kokkos::impl_resize<, double*>(v, const size_t, const size_t, const size...
|
||||
928 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
960 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
|
||||
21470 ../examples/lulesh/lulesh.cc LagrangeLeapFrog LagrangeLeapFrog(domain) [lulesh.cc]
|
||||
226 ../examples/lulesh/lulesh.cc ResizeBuffer ResizeBuffer(const size_t) [lulesh.cc:23]
|
||||
169 ../examples/lulesh/lulesh.cc _GLOBAL__sub_I_lulesh.cc _GLOBAL__sub_I_lulesh.cc() [lulesh.cc]
|
||||
1836 ../examples/lulesh/lulesh.cc main int main(int, char * *) [lulesh.cc]
|
||||
63 ../examples/lulesh/lulesh.cc std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::a... std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::a...
|
||||
20 ../examples/lulesh/lulesh.cc std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloca... std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloca...
|
||||
160 ../examples/lulesh/lulesh.cc std::operator+<char, std::char_traits<char>, std::allocator<char> > basic_string<char, std::char_traits<char>, std::allocator<char> > std::operat...
|
||||
187 ../examples/lulesh/lulesh.cc std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloc... std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloc...
|
||||
11 lulesh __clang_call_terminate __clang_call_terminate() [lulesh]
|
||||
33 lulesh __do_global_dtors_aux __do_global_dtors_aux() [lulesh]
|
||||
5 lulesh __libc_csu_fini __libc_csu_fini() [lulesh]
|
||||
101 lulesh __libc_csu_init __libc_csu_init() [lulesh]
|
||||
5 lulesh _dl_relocate_static_pie _dl_relocate_static_pie() [lulesh]
|
||||
13 lulesh _fini _fini() [lulesh]
|
||||
27 lulesh _init _init() [lulesh]
|
||||
47 lulesh _start _start() [lulesh]
|
||||
6 lulesh frame_dummy frame_dummy() [lulesh]
|
||||
```
|
||||
|
||||
#### Example Instrumented Module and Function Info Output
|
||||
|
||||
> `omnitrace -o lulesh.inst --label file line args --simulate -- lulesh`
|
||||
|
||||
After the heuristics are applied in [Example Available Module and Function Info Output](#example-available-module-and-function-info-output),
|
||||
the selected module/functions are:
|
||||
|
||||
```console
|
||||
AddressRange Module Function FunctionSignature
|
||||
9165 ../examples/lulesh/lulesh-comm.cc CommMonoQ CommMonoQ(domain) [lulesh-comm.cc:1891]
|
||||
3396 ../examples/lulesh/lulesh-comm.cc CommRecv CommRecv(domain, int, Index_t, Index_t, Index_t, Index_t, bool, bool) [lulesh...
|
||||
8666 ../examples/lulesh/lulesh-comm.cc CommSBN CommSBN(domain, int, Domain_member *) [lulesh-comm.cc:926]
|
||||
10212 ../examples/lulesh/lulesh-comm.cc CommSend CommSend(domain, int, Index_t, Domain_member *, Index_t, Index_t, Index_t, bo...
|
||||
6823 ../examples/lulesh/lulesh-comm.cc CommSyncPosVel CommSyncPosVel(domain) [lulesh-comm.cc:1404]
|
||||
1840 ../examples/lulesh/lulesh-init.cc Domain::AllocateElemPersistent Domain::AllocateElemPersistent(Domain *, Int_t) [lulesh-init.cc:94]
|
||||
1384 ../examples/lulesh/lulesh-init.cc Domain::AllocateNodePersistent Domain::AllocateNodePersistent(Domain *, Int_t) [lulesh-init.cc:94]
|
||||
1264 ../examples/lulesh/lulesh-init.cc Domain::BuildMesh Domain::BuildMesh(Domain *, Int_t, Int_t, Int_t) [lulesh-init.cc:308]
|
||||
2312 ../examples/lulesh/lulesh-init.cc Domain::CreateRegionIndexSets Domain::CreateRegionIndexSets(Domain *, Int_t, Int_t) [lulesh-init.cc:409]
|
||||
7109 ../examples/lulesh/lulesh-init.cc Domain::Domain Domain::Domain(Domain *, Int_t, Index_t, Index_t, Index_t, Index_t, int, int,...
|
||||
2458 ../examples/lulesh/lulesh-init.cc Domain::SetupBoundaryConditions Domain::SetupBoundaryConditions(Domain *, Int_t) [lulesh-init.cc:409]
|
||||
956 ../examples/lulesh/lulesh-init.cc Domain::SetupCommBuffers Domain::SetupCommBuffers(Domain *, Int_t) [lulesh-init.cc]
|
||||
1456 ../examples/lulesh/lulesh-init.cc Domain::SetupElementConnectivities Domain::SetupElementConnectivities(Domain *, Int_t) [lulesh-init.cc:409]
|
||||
721 ../examples/lulesh/lulesh-init.cc Domain::SetupSymmetryPlanes Domain::SetupSymmetryPlanes(Domain *, Int_t) [lulesh-init.cc:409]
|
||||
1591 ../examples/lulesh/lulesh-init.cc Domain::SetupThreadSupportStructures Domain::SetupThreadSupportStructures(Domain *) [lulesh-init.cc:376]
|
||||
1644 ../examples/lulesh/lulesh-init.cc Domain::~Domain Domain::~Domain(Domain *) [lulesh-init.cc:286]
|
||||
271 ../examples/lulesh/lulesh-init.cc Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor... Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]>(View<int *, Kokkos::Ho...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]>(View<int *, Kokkos::Ho...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [16]> Kokkos::View<int*>::View<char [16]>(View<int *> *, arg_label, type, const siz...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [19]> Kokkos::View<int*>::View<char [19]>(View<int *> *, arg_label, type, const siz...
|
||||
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [21]> Kokkos::View<int*>::View<char [21]>(View<int *> *, arg_label, type, const siz...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok... Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok...
|
||||
1052 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*> Kokkos::deep_copy<double*>(dst, value) [lulesh-init.cc]
|
||||
1050 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,... Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,...
|
||||
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM...
|
||||
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O... Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko... Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko...
|
||||
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K... Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K...
|
||||
697 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
|
||||
706 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
|
||||
912 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
944 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
|
||||
839 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
|
||||
6589 ../examples/lulesh/lulesh-util.cc Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP... Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP...
|
||||
1345 ../examples/lulesh/lulesh-util.cc ParseCommandLineOptions ParseCommandLineOptions(int, char * *, int, cmdLineOpts *) [lulesh-util.cc:67]
|
||||
706 ../examples/lulesh/lulesh-util.cc VerifyAndWriteFinalOutput VerifyAndWriteFinalOutput(Real_t, locDom, Int_t, Int_t) [lulesh-util.cc:222]
|
||||
13367 ../examples/lulesh/lulesh.cc ApplyMaterialPropertiesForElems ApplyMaterialPropertiesForElems(domain) [lulesh.cc:409]
|
||||
982 ../examples/lulesh/lulesh.cc CalcElemFBHourglassForce CalcElemFBHourglassForce(const Real_t *, const Real_t[] *, coefficient, Real_...
|
||||
2428 ../examples/lulesh/lulesh.cc CalcElemNodeNormals CalcElemNodeNormals(Real_t *, Real_t *, Real_t *, const Real_t *, const Real_...
|
||||
853 ../examples/lulesh/lulesh.cc CalcElemShapeFunctionDerivatives CalcElemShapeFunctionDerivatives(const Real_t *, const Real_t *, const Real_t...
|
||||
1054 ../examples/lulesh/lulesh.cc CalcKinematicsForElems CalcKinematicsForElems(domain, Real_t, Index_t) [lulesh.cc]
|
||||
14160 ../examples/lulesh/lulesh.cc CalcVolumeForceForElems CalcVolumeForceForElems(domain) [lulesh.cc:409]
|
||||
366 ../examples/lulesh/lulesh.cc Domain::AllocateGradients Domain::AllocateGradients(Domain *, Int_t, Int_t) [lulesh.cc:214]
|
||||
475 ../examples/lulesh/lulesh.cc Domain::DeallocateGradients Domain::DeallocateGradients(Domain *) [lulesh.cc:105]
|
||||
4356 ../examples/lulesh/lulesh.cc Domain::Domain Domain::Domain(Domain *) [lulesh.cc:78]
|
||||
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [6]> Kokkos::View<double*>::View<char [6]>(View<double *> *, arg_label, type, cons...
|
||||
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [7]> Kokkos::View<double*>::View<char [7]>(View<double *> *, arg_label, type, cons...
|
||||
928 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
|
||||
960 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
|
||||
21470 ../examples/lulesh/lulesh.cc LagrangeLeapFrog LagrangeLeapFrog(domain) [lulesh.cc]
|
||||
1836 ../examples/lulesh/lulesh.cc main int main(int, char * *) [lulesh.cc]
|
||||
```
|
||||
|
||||
## Sampling
|
||||
|
||||
By default, omnitrace uses `--mode trace` for instrumentation. The `--mode sampling` option
|
||||
will only instrument `main` in an executable and will activate both CPU call-stack sampling and
|
||||
background system-level thread sampling by default.
|
||||
Tracing capabilities which do not rely on instrumentation, such as the HIP API and kernel tracing
|
||||
(which is collected via roctracer), will still be available.
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace)'s sampling capabilities are always available, even in trace mode, but are deactivated by default.
|
||||
In order to activate sampling in trace mode, simply set `OMNITRACE_USE_SAMPLING=ON` in the environment
|
||||
or in an omnitrace configuration file.
|
||||
|
||||
## Embedding a Default Configuration
|
||||
|
||||
Using the `--env` option, a default configuration can be embedded into the target. Although this option
|
||||
works for runtime instrumentation, it is most useful when generating new binaries since the generated
|
||||
binary may be used later in a different login session when the environment may have changed.
|
||||
|
||||
For example, if the following sequence of commands is run:
|
||||
|
||||
```shell
|
||||
omnitrace -o ./foo.inst -- ./foo
|
||||
export OMNITRACE_USE_SAMPLING=ON
|
||||
export OMNITRACE_SAMPLING_FREQ=5
|
||||
./foo.inst
|
||||
```
|
||||
|
||||
These configuration settings will not be preserved in another session, whereas:
|
||||
|
||||
```shell
|
||||
omnitrace -o ./foo.samp --env OMNITRACE_USE_SAMPLING=ON OMNITRACE_SAMPLING_FREQ=5 -- ./foo
|
||||
```
|
||||
|
||||
will preserve those environment variables:
|
||||
|
||||
```shell
|
||||
# will sample 5x per second
|
||||
./foo.samp
|
||||
```
|
||||
|
||||
while still allowing the subsequent session to override those defaults:
|
||||
|
||||
```shell
|
||||
# will sample 100x per second
|
||||
export OMNITRACE_SAMPLING_FREQ=100
|
||||
./foo.samp
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
#### Checking for RPATH
|
||||
|
||||
If `ldd ./foo.inst` from the [Binary Rewriting a Library Example](#binary-rewriting-a-library-example) section still returned `/usr/local/lib/libfoo.so.2`, your executable may have an rpath encoded in the binary.
|
||||
This ELF entry will cause the dynamic linker to ignore `LD_LIBRARY_PATH` if it finds a `libfoo.so.2` in the rpath.
|
||||
You can use the `objdump` tool to perform this query:
|
||||
|
||||
```shell
|
||||
objdump -p <exe-or-library> | egrep 'RPATH|RUNPATH'
|
||||
```
|
||||
|
||||
If this produces output, e.g.:
|
||||
|
||||
```shell
|
||||
RUNPATH $ORIGIN:$ORIGIN/../lib
|
||||
```
|
||||
|
||||
You will have to remove or modify the rpath in order to get `foo.inst` to resolve to the instrumented `libfoo.so.2`.
|
||||
|
||||
#### Modifying RPATH
|
||||
|
||||
> Requires `patchelf` package
|
||||
|
||||
```shell
|
||||
patchelf --remove-rpath <exe-or-library>
|
||||
patchelf --set-rpath '/home/user' <exe-or-library>
|
||||
```
|
||||
@@ -0,0 +1,35 @@
|
||||
@ECHO OFF
|
||||
|
||||
pushd %~dp0
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=.
|
||||
set BUILDDIR=_build
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
:end
|
||||
popd
|
||||
Dosya farkı çok büyük olduğundan ihmal edildi
Fark Yükle
@@ -0,0 +1,528 @@
|
||||
# Omnitrace Output
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 3
|
||||
```
|
||||
|
||||
## Overview
|
||||
|
||||
The general output form of omnitrace is `<OUTPUT_PATH>[/<TIMESTAMP>]/[<PREFIX>]<DATA_NAME>[-<OUTPUT_SUFFIX>].<EXT>`.
|
||||
|
||||
E.g. with the base configuration:
|
||||
|
||||
```shell
|
||||
export OMNITRACE_OUTPUT_PATH=omnitrace-example-output
|
||||
export OMNITRACE_TIME_OUTPUT=OFF
|
||||
export OMNITRACE_USE_PID=OFF
|
||||
export OMNITRACE_USE_TIMEMORY=ON
|
||||
export OMNITRACE_USE_PERFETTO=ON
|
||||
```
|
||||
|
||||
```shell
|
||||
$ omnitrace -- ./foo
|
||||
...
|
||||
[omnitrace] Outputting 'omnitrace-example-output/perfetto-trace.proto'...
|
||||
|
||||
[omnitrace] Outputting 'omnitrace-example-output/wall-clock.txt'...
|
||||
[omnitrace] Outputting 'omnitrace-example-output/wall-clock.json'...
|
||||
```
|
||||
|
||||
If we enable the `OMNITRACE_USE_PID` option, then when our non-MPI executable is executed with a PID of 63453:
|
||||
|
||||
```shell
|
||||
$ export OMNITRACE_USE_PID=ON
|
||||
$ omnitrace -- ./foo
|
||||
...
|
||||
[omnitrace] Outputting 'omnitrace-example-output/perfetto-trace-63453.proto'...
|
||||
|
||||
[omnitrace] Outputting 'omnitrace-example-output/wall-clock-63453.txt'...
|
||||
[omnitrace] Outputting 'omnitrace-example-output/wall-clock-63453.json'...
|
||||
```
|
||||
|
||||
If we enable `OMNITRACE_TIME_OUTPUT`, then a job started on January 31, 2022 at 12:30 PM:
|
||||
|
||||
```shell
|
||||
$ export OMNITRACE_TIME_OUTPUT=ON
|
||||
$ omnitrace -- ./foo
|
||||
...
|
||||
[omnitrace] Outputting 'omnitrace-example-output/2022-01-31_12.30_PM/perfetto-trace-63453.proto'...
|
||||
|
||||
[omnitrace] Outputting 'omnitrace-example-output/2022-01-31_12.30_PM/wall-clock-63453.txt'...
|
||||
[omnitrace] Outputting 'omnitrace-example-output/2022-01-31_12.30_PM/wall-clock-63453.json'...
|
||||
```
|
||||
|
||||
## Metadata
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) will output a metadata.json file.
|
||||
|
||||
## Configuring Output
|
||||
|
||||
### Core Configuration Settings
|
||||
|
||||
> See also: [Customizing Omnitrace Runtime](runtime.md)
|
||||
|
||||
| Setting | Value | Description |
|
||||
|---------------------------|--------------------|---------------------------------------------------------------------------------------------------|
|
||||
| `OMNITRACE_OUTPUT_PATH` | Any valid path | Path to folder where output files should be placed |
|
||||
| `OMNITRACE_OUTPUT_PREFIX` | String | Useful for multiple runs with different arguments. See [Output Prefix Keys](#output-prefix-keys) |
|
||||
| `OMNITRACE_OUTPUT_FILE` | Any valid filepath | Specific location for perfetto output file. |
|
||||
| `OMNITRACE_TIME_OUTPUT` | Boolean | Place all output in a timestamped folder, timestamp format controlled via `OMNITRACE_TIME_FORMAT` |
|
||||
| `OMNITRACE_TIME_FORMAT` | String | See `strftime` man pages for valid identifiers |
|
||||
| `OMNITRACE_USE_PID` | Boolean | Append either the PID or the MPI rank to all output files (before the extension) |
|
||||
|
||||
#### Output Prefix Keys
|
||||
|
||||
Output prefix keys have many uses but are most useful when dealing with multiple profiling runs or large MPI jobs.
|
||||
Their inclusion in omnitrace stems from their introduction into timemory for [compile-time-perf](https://github.com/jrmadsen/compile-time-perf)
|
||||
which needed to be able to create different output files for a generic wrapper around compilation commands while still
|
||||
overwriting the output from the last time a file was compiled.
|
||||
|
||||
If you are ever doing scaling studies and specifying options via the command line, it is highly recommended to just
|
||||
use a common `OMNITRACE_OUTPUT_PATH`, disable `OMNITRACE_TIME_OUTPUT`,
|
||||
set `OMNITRACE_OUTPUT_PREFIX="%argt%-"` and let omnitrace cleanly organize the output.
|
||||
|
||||
| String | Encoding |
|
||||
|-----------------|-----------------------------------------------------------------------------------------------|
|
||||
| `%arg<N>%` | Command line argument at position `<N>` (zero indexed), e.g. `%arg0%` for first argument. |
|
||||
| `%arg<N>_hash%` | MD5 sum of `%arg<N>%` |
|
||||
| `%argv%` | Entire command-line condensed into a single string |
|
||||
| `%argv_hash%` | MD5 sum of `%argv%` |
|
||||
| `%argt%` | Similar to `%argv%` except basename of first command line argument |
|
||||
| `%argt_hash%` | MD5 sum of `%argt%` |
|
||||
| `%args%` | All command line arguments condensed into a single string |
|
||||
| `%args_hash%` | MD5 sum of `%args%` |
|
||||
| `%tag%` | Basename of first command line argument |
|
||||
| `%tag_hash%` | MD5 sum of `%tag%` |
|
||||
| `%pid%` | Process identifier (i.e. `getpid()`) |
|
||||
| `%job%` | Value of `SLURM_JOB_ID` environment variable if exists, else `0` |
|
||||
| `%rank%` | Value of `SLURM_PROCID` environment variable if exists, else `MPI_Comm_rank` (or `0` non-mpi) |
|
||||
| `%size%` | `MPI_Comm_size` or `1` if non-mpi |
|
||||
| `%m` | Shorthand for `%argt_hash%` |
|
||||
| `%p` | Shorthand for `%pid%` |
|
||||
| `%j` | Shorthand for `%job%` |
|
||||
| `%r` | Shorthand for `%rank%` |
|
||||
| `%s` | Shorthand for `%size%` |
|
||||
|
||||
> NOTE: any output prefix key which contains a `/` will have the `/` characters
|
||||
> replaced with `_` and any leading underscores will be stripped, e.g. if `%arg0%` is `/usr/bin/foo`, this
|
||||
> will translate to `usr_bin_foo`. Additionally, any `%arg<N>%` keys which do not have a command line argument
|
||||
> at position `<N>` will be ignored.
|
||||
|
||||
## Perfetto Output
|
||||
|
||||
Use the `OMNITRACE_OUTPUT_FILE` setting to specify a specific location. If this is an absolute path, then all `OMNITRACE_OUTPUT_PATH`, etc.
|
||||
settings will be ignored.
|
||||
|
||||
## Timemory Output
|
||||
|
||||
Use `omnitrace-avail --components --filename` to view the base filename for each component. E.g.
|
||||
|
||||
```shell
|
||||
$ ./omnitrace-avail wall_clock -C -f
|
||||
|---------------------------------|---------------|------------------------|
|
||||
| COMPONENT | AVAILABLE | FILENAME |
|
||||
|---------------------------------|---------------|------------------------|
|
||||
| wall_clock | true | wall_clock |
|
||||
| sampling_wall_clock | true | sampling_wall_clock |
|
||||
|---------------------------------|---------------|------------------------|
|
||||
```
|
||||
|
||||
With `OMNITRACE_COLLAPSE_THREADS=ON` and/or `OMNITRACE_COLLAPSE_PROCESSES=ON` (only valid with full MPI support), the timemory output
|
||||
will combine the per-thread and/or per-rank data which have identical call-stacks.
|
||||
|
||||
The `OMNITRACE_FLAT_PROFILE` setting will remove all call stack hierarchy. Using `OMNITRACE_FLAT_PROFILE=ON` in combination
|
||||
with `OMNITRACE_COLLAPSE_THREADS=ON` is a useful configuration for identifying min/max measurements regardless of calling context.
|
||||
The `OMNITRACE_TIMELINE_PROFILE` setting (with `OMNITRACE_FLAT_PROFILE=OFF`) will effectively generate data similar to what can be found
|
||||
in perfetto. Enabling timeline and flat profiling will effectively generate similar data to `strace`. However, while timemory in general
|
||||
requires significantly less memory than perfetto, this is not the case in timeline mode so activate this setting with caution.
|
||||
|
||||
### Timemory Text Output
|
||||
|
||||
> Hint: the generation of text output is configurable via `OMNITRACE_TEXT_OUTPUT`
|
||||
|
||||
Timemory text output files are meant for human-consumption (use JSON formats for analysis)
|
||||
and as such, some fields such as the `LABEL` fields may be truncated for readability.
|
||||
The truncation width can be changed via the `OMNITRACE_MAX_WIDTH` setting.
|
||||
|
||||
#### Timemory Text Output Example
|
||||
|
||||
In the below, the `NN` field in `|NN>>>` is the thread ID. If MPI support is enabled, this will be `|MM|NN>>>` and `MM` will be the rank.
|
||||
If `OMNITRACE_COLLAPSE_THREADS=ON` and `OMNITRACE_COLLAPSE_PROCESSES=ON`, neither the `MM` nor the `NN` will be present unless the
|
||||
component explicitly sets type-traits which specify that the data is only relevant per-thread or per-process, e.g. the `thread_cpu_clock` component.
|
||||
|
||||
```console
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| REAL-CLOCK TIMER (I.E. WALL-CLOCK TIMER) |
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| LABEL | COUNT | DEPTH | METRIC | UNITS | SUM | MEAN | MIN | MAX | VAR | STDDEV | % SELF |
|
||||
|--------------------------------------------------------------|--------|--------|------------|--------|-----------|-----------|-----------|-----------|----------|----------|--------|
|
||||
| |00>>> main | 1 | 0 | wall_clock | sec | 13.360265 | 13.360265 | 13.360265 | 13.360265 | 0.000000 | 0.000000 | 18.2 |
|
||||
| |00>>> |_ompt_thread_initial | 1 | 1 | wall_clock | sec | 10.924161 | 10.924161 | 10.924161 | 10.924161 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |00>>> |_ompt_implicit_task | 1 | 2 | wall_clock | sec | 10.923050 | 10.923050 | 10.923050 | 10.923050 | 0.000000 | 0.000000 | 0.1 |
|
||||
| |00>>> |_ompt_parallel [parallelism=12] | 1 | 3 | wall_clock | sec | 10.915026 | 10.915026 | 10.915026 | 10.915026 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |00>>> |_ompt_implicit_task | 1 | 4 | wall_clock | sec | 10.647951 | 10.647951 | 10.647951 | 10.647951 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |00>>> |_ompt_work_loop | 156 | 5 | wall_clock | sec | 0.000812 | 0.000005 | 0.000001 | 0.000212 | 0.000000 | 0.000018 | 100.0 |
|
||||
| |00>>> |_ompt_work_single_executor | 40 | 5 | wall_clock | sec | 0.000016 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_ompt_sync_region_barrier_implicit | 308 | 5 | wall_clock | sec | 0.000629 | 0.000002 | 0.000001 | 0.000017 | 0.000000 | 0.000002 | 100.0 |
|
||||
| |00>>> |_conj_grad | 76 | 5 | wall_clock | sec | 10.641165 | 0.140015 | 0.131894 | 0.155099 | 0.000017 | 0.004080 | 1.0 |
|
||||
| |00>>> |_ompt_work_single_executor | 803 | 6 | wall_clock | sec | 0.000292 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_ompt_work_loop | 7904 | 6 | wall_clock | sec | 7.420265 | 0.000939 | 0.000005 | 0.006974 | 0.000003 | 0.001613 | 100.0 |
|
||||
| |00>>> |_ompt_sync_region_barrier_implicit | 6004 | 6 | wall_clock | sec | 0.283160 | 0.000047 | 0.000001 | 0.004087 | 0.000000 | 0.000303 | 100.0 |
|
||||
| |00>>> |_ompt_sync_region_barrier_implementation | 3952 | 6 | wall_clock | sec | 2.829252 | 0.000716 | 0.000007 | 0.009005 | 0.000001 | 0.000985 | 99.7 |
|
||||
| |00>>> |_ompt_sync_region_reduction | 15808 | 7 | wall_clock | sec | 0.009142 | 0.000001 | 0.000000 | 0.000007 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_ompt_work_single_other | 1249 | 6 | wall_clock | sec | 0.000270 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_ompt_work_single_other | 114 | 5 | wall_clock | sec | 0.000024 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_ompt_sync_region_barrier_implementation | 76 | 5 | wall_clock | sec | 0.000876 | 0.000012 | 0.000008 | 0.000025 | 0.000000 | 0.000003 | 84.4 |
|
||||
| |00>>> |_ompt_sync_region_reduction | 304 | 6 | wall_clock | sec | 0.000136 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_ompt_master | 226 | 5 | wall_clock | sec | 0.001978 | 0.000009 | 0.000000 | 0.000038 | 0.000000 | 0.000012 | 100.0 |
|
||||
| |11>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.656145 | 10.656145 | 10.656145 | 10.656145 | 0.000000 | 0.000000 | 0.1 |
|
||||
| |11>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649183 | 10.649183 | 10.649183 | 10.649183 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |11>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000852 | 0.000005 | 0.000002 | 0.000230 | 0.000000 | 0.000019 | 100.0 |
|
||||
| |11>>> |_ompt_work_single_other | 149 | 6 | wall_clock | sec | 0.000035 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |11>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004135 | 0.000013 | 0.000001 | 0.001233 | 0.000000 | 0.000070 | 100.0 |
|
||||
| |11>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641302 | 0.140017 | 0.131896 | 0.155102 | 0.000017 | 0.004080 | 0.6 |
|
||||
| |11>>> |_ompt_work_single_other | 2023 | 7 | wall_clock | sec | 0.000458 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |11>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.253555 | 0.001044 | 0.000005 | 0.008021 | 0.000003 | 0.001790 | 100.0 |
|
||||
| |11>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.263840 | 0.000044 | 0.000001 | 0.004087 | 0.000000 | 0.000297 | 100.0 |
|
||||
| |11>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.059823 | 0.000521 | 0.000007 | 0.009508 | 0.000001 | 0.000863 | 100.0 |
|
||||
| |11>>> |_ompt_work_single_executor | 29 | 7 | wall_clock | sec | 0.000011 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |11>>> |_ompt_work_single_executor | 5 | 6 | wall_clock | sec | 0.000002 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |11>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000975 | 0.000013 | 0.000008 | 0.000024 | 0.000000 | 0.000003 | 100.0 |
|
||||
| |10>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.681664 | 10.681664 | 10.681664 | 10.681664 | 0.000000 | 0.000000 | 0.3 |
|
||||
| |10>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649158 | 10.649158 | 10.649158 | 10.649158 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |10>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000863 | 0.000006 | 0.000002 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
|
||||
| |10>>> |_ompt_work_single_other | 140 | 6 | wall_clock | sec | 0.000037 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |10>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004149 | 0.000013 | 0.000001 | 0.001221 | 0.000000 | 0.000070 | 100.0 |
|
||||
| |10>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641288 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
|
||||
| |10>>> |_ompt_work_single_other | 1883 | 7 | wall_clock | sec | 0.000487 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |10>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.174545 | 0.001034 | 0.000005 | 0.006899 | 0.000003 | 0.001766 | 100.0 |
|
||||
| |10>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.268808 | 0.000045 | 0.000001 | 0.004087 | 0.000000 | 0.000299 | 100.0 |
|
||||
| |10>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.126988 | 0.000538 | 0.000007 | 0.009843 | 0.000001 | 0.000872 | 99.9 |
|
||||
| |10>>> |_ompt_sync_region_reduction | 3952 | 8 | wall_clock | sec | 0.002574 | 0.000001 | 0.000000 | 0.000014 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |10>>> |_ompt_work_single_executor | 169 | 7 | wall_clock | sec | 0.000072 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |10>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000954 | 0.000013 | 0.000009 | 0.000023 | 0.000000 | 0.000003 | 95.9 |
|
||||
| |10>>> |_ompt_sync_region_reduction | 76 | 7 | wall_clock | sec | 0.000039 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |10>>> |_ompt_work_single_executor | 14 | 6 | wall_clock | sec | 0.000006 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |09>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.686552 | 10.686552 | 10.686552 | 10.686552 | 0.000000 | 0.000000 | 0.3 |
|
||||
| |09>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649151 | 10.649151 | 10.649151 | 10.649151 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |09>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000880 | 0.000006 | 0.000002 | 0.000258 | 0.000000 | 0.000021 | 100.0 |
|
||||
| |09>>> |_ompt_work_single_other | 148 | 6 | wall_clock | sec | 0.000034 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |09>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004129 | 0.000013 | 0.000001 | 0.001210 | 0.000000 | 0.000069 | 100.0 |
|
||||
| |09>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641308 | 0.140017 | 0.131895 | 0.155102 | 0.000017 | 0.004080 | 0.7 |
|
||||
| |09>>> |_ompt_work_single_other | 2043 | 7 | wall_clock | sec | 0.000473 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |09>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.977001 | 0.001009 | 0.000005 | 0.007325 | 0.000003 | 0.001732 | 100.0 |
|
||||
| |09>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.242996 | 0.000040 | 0.000001 | 0.004087 | 0.000000 | 0.000284 | 100.0 |
|
||||
| |09>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.350895 | 0.000595 | 0.000007 | 0.008689 | 0.000001 | 0.000926 | 100.0 |
|
||||
| |09>>> |_ompt_work_single_executor | 9 | 7 | wall_clock | sec | 0.000004 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |09>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000973 | 0.000013 | 0.000008 | 0.000025 | 0.000000 | 0.000003 | 100.0 |
|
||||
| |09>>> |_ompt_work_single_executor | 6 | 6 | wall_clock | sec | 0.000002 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |08>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.721622 | 10.721622 | 10.721622 | 10.721622 | 0.000000 | 0.000000 | 0.7 |
|
||||
| |08>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649135 | 10.649135 | 10.649135 | 10.649135 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |08>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000839 | 0.000005 | 0.000001 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
|
||||
| |08>>> |_ompt_work_single_other | 141 | 6 | wall_clock | sec | 0.000030 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |08>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004114 | 0.000013 | 0.000001 | 0.001198 | 0.000000 | 0.000069 | 100.0 |
|
||||
| |08>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641294 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.6 |
|
||||
| |08>>> |_ompt_work_single_other | 1742 | 7 | wall_clock | sec | 0.000392 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |08>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.306388 | 0.001051 | 0.000005 | 0.007886 | 0.000003 | 0.001795 | 100.0 |
|
||||
| |08>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.274358 | 0.000046 | 0.000001 | 0.004090 | 0.000000 | 0.000302 | 100.0 |
|
||||
| |08>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 1.991251 | 0.000504 | 0.000007 | 0.008694 | 0.000001 | 0.000844 | 99.8 |
|
||||
| |08>>> |_ompt_sync_region_reduction | 7904 | 8 | wall_clock | sec | 0.003816 | 0.000000 | 0.000000 | 0.000017 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |08>>> |_ompt_work_single_executor | 310 | 7 | wall_clock | sec | 0.000112 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |08>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000955 | 0.000013 | 0.000009 | 0.000026 | 0.000000 | 0.000003 | 93.7 |
|
||||
| |08>>> |_ompt_sync_region_reduction | 152 | 7 | wall_clock | sec | 0.000060 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |08>>> |_ompt_work_single_executor | 13 | 6 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |07>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.747282 | 10.747282 | 10.747282 | 10.747282 | 0.000000 | 0.000000 | 0.9 |
|
||||
| |07>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649093 | 10.649093 | 10.649093 | 10.649093 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |07>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000923 | 0.000006 | 0.000002 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
|
||||
| |07>>> |_ompt_work_single_other | 152 | 6 | wall_clock | sec | 0.000048 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |07>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.003981 | 0.000013 | 0.000001 | 0.001186 | 0.000000 | 0.000068 | 100.0 |
|
||||
| |07>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641295 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
|
||||
| |07>>> |_ompt_work_single_other | 2043 | 7 | wall_clock | sec | 0.000648 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |07>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.978811 | 0.001009 | 0.000005 | 0.006728 | 0.000003 | 0.001732 | 100.0 |
|
||||
| |07>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.199939 | 0.000033 | 0.000001 | 0.004086 | 0.000000 | 0.000255 | 100.0 |
|
||||
| |07>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.385843 | 0.000604 | 0.000009 | 0.009039 | 0.000001 | 0.000938 | 100.0 |
|
||||
| |07>>> |_ompt_work_single_executor | 9 | 7 | wall_clock | sec | 0.000004 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |07>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000905 | 0.000012 | 0.000010 | 0.000025 | 0.000000 | 0.000003 | 100.0 |
|
||||
| |07>>> |_ompt_work_single_executor | 2 | 6 | wall_clock | sec | 0.000001 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |06>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.772278 | 10.772278 | 10.772278 | 10.772278 | 0.000000 | 0.000000 | 1.1 |
|
||||
| |06>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649092 | 10.649092 | 10.649092 | 10.649092 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |06>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000888 | 0.000006 | 0.000002 | 0.000236 | 0.000000 | 0.000020 | 100.0 |
|
||||
| |06>>> |_ompt_work_single_other | 153 | 6 | wall_clock | sec | 0.000037 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |06>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004090 | 0.000013 | 0.000001 | 0.001175 | 0.000000 | 0.000067 | 100.0 |
|
||||
| |06>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641317 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.8 |
|
||||
| |06>>> |_ompt_work_single_other | 2041 | 7 | wall_clock | sec | 0.000476 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |06>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.467961 | 0.000945 | 0.000005 | 0.010712 | 0.000003 | 0.001627 | 100.0 |
|
||||
| |06>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.250883 | 0.000042 | 0.000001 | 0.004087 | 0.000000 | 0.000285 | 100.0 |
|
||||
| |06>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.838733 | 0.000718 | 0.000009 | 0.009015 | 0.000001 | 0.001015 | 99.9 |
|
||||
| |06>>> |_ompt_sync_region_reduction | 3952 | 8 | wall_clock | sec | 0.003334 | 0.000001 | 0.000000 | 0.000025 | 0.000000 | 0.000001 | 100.0 |
|
||||
| |06>>> |_ompt_work_single_executor | 11 | 7 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |06>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000940 | 0.000012 | 0.000009 | 0.000025 | 0.000000 | 0.000003 | 95.4 |
|
||||
| |06>>> |_ompt_sync_region_reduction | 76 | 7 | wall_clock | sec | 0.000044 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |06>>> |_ompt_work_single_executor | 1 | 6 | wall_clock | sec | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |05>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.797950 | 10.797950 | 10.797950 | 10.797950 | 0.000000 | 0.000000 | 1.4 |
|
||||
| |05>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649072 | 10.649072 | 10.649072 | 10.649072 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |05>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000879 | 0.000006 | 0.000001 | 0.000248 | 0.000000 | 0.000021 | 100.0 |
|
||||
| |05>>> |_ompt_work_single_other | 142 | 6 | wall_clock | sec | 0.000034 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |05>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004062 | 0.000013 | 0.000002 | 0.001163 | 0.000000 | 0.000067 | 100.0 |
|
||||
| |05>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641291 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
|
||||
| |05>>> |_ompt_work_single_other | 2038 | 7 | wall_clock | sec | 0.000500 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |05>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.279191 | 0.001047 | 0.000005 | 0.006596 | 0.000003 | 0.001792 | 100.0 |
|
||||
| |05>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.250939 | 0.000042 | 0.000001 | 0.004090 | 0.000000 | 0.000286 | 100.0 |
|
||||
| |05>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.039013 | 0.000516 | 0.000009 | 0.008689 | 0.000001 | 0.000855 | 100.0 |
|
||||
| |05>>> |_ompt_work_single_executor | 14 | 7 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |05>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000926 | 0.000012 | 0.000009 | 0.000023 | 0.000000 | 0.000003 | 100.0 |
|
||||
| |05>>> |_ompt_work_single_executor | 12 | 6 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |04>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.825935 | 10.825935 | 10.825935 | 10.825935 | 0.000000 | 0.000000 | 1.6 |
|
||||
| |04>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649068 | 10.649068 | 10.649068 | 10.649068 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |04>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000884 | 0.000006 | 0.000002 | 0.000245 | 0.000000 | 0.000020 | 100.0 |
|
||||
| |04>>> |_ompt_work_single_other | 150 | 6 | wall_clock | sec | 0.000034 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |04>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004069 | 0.000013 | 0.000001 | 0.001151 | 0.000000 | 0.000066 | 100.0 |
|
||||
| |04>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641300 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 1.1 |
|
||||
| |04>>> |_ompt_work_single_other | 2041 | 7 | wall_clock | sec | 0.000448 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |04>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.438393 | 0.000941 | 0.000005 | 0.007090 | 0.000003 | 0.001624 | 100.0 |
|
||||
| |04>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.270654 | 0.000045 | 0.000001 | 0.004090 | 0.000000 | 0.000295 | 100.0 |
|
||||
| |04>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.819165 | 0.000713 | 0.000009 | 0.008379 | 0.000001 | 0.001013 | 99.9 |
|
||||
| |04>>> |_ompt_sync_region_reduction | 7904 | 8 | wall_clock | sec | 0.003932 | 0.000000 | 0.000000 | 0.000015 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |04>>> |_ompt_work_single_executor | 11 | 7 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |04>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000936 | 0.000012 | 0.000009 | 0.000025 | 0.000000 | 0.000003 | 93.2 |
|
||||
| |04>>> |_ompt_sync_region_reduction | 152 | 7 | wall_clock | sec | 0.000064 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |04>>> |_ompt_work_single_executor | 4 | 6 | wall_clock | sec | 0.000001 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |03>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.849322 | 10.849322 | 10.849322 | 10.849322 | 0.000000 | 0.000000 | 1.8 |
|
||||
| |03>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649075 | 10.649075 | 10.649075 | 10.649075 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |03>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000861 | 0.000006 | 0.000002 | 0.000238 | 0.000000 | 0.000020 | 100.0 |
|
||||
| |03>>> |_ompt_work_single_other | 120 | 6 | wall_clock | sec | 0.000028 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |03>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.003993 | 0.000013 | 0.000001 | 0.001138 | 0.000000 | 0.000065 | 100.0 |
|
||||
| |03>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641302 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.8 |
|
||||
| |03>>> |_ompt_work_single_other | 1756 | 7 | wall_clock | sec | 0.000426 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |03>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.005617 | 0.001013 | 0.000005 | 0.011500 | 0.000003 | 0.001741 | 100.0 |
|
||||
| |03>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.231485 | 0.000039 | 0.000001 | 0.004086 | 0.000000 | 0.000277 | 100.0 |
|
||||
| |03>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.320428 | 0.000587 | 0.000009 | 0.010868 | 0.000001 | 0.000912 | 100.0 |
|
||||
| |03>>> |_ompt_work_single_executor | 296 | 7 | wall_clock | sec | 0.000120 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |03>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000967 | 0.000013 | 0.000010 | 0.000023 | 0.000000 | 0.000003 | 100.0 |
|
||||
| |03>>> |_ompt_work_single_executor | 34 | 6 | wall_clock | sec | 0.000013 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |02>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.876387 | 10.876387 | 10.876387 | 10.876387 | 0.000000 | 0.000000 | 2.1 |
|
||||
| |02>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649050 | 10.649050 | 10.649050 | 10.649050 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |02>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000924 | 0.000006 | 0.000001 | 0.000241 | 0.000000 | 0.000020 | 100.0 |
|
||||
| |02>>> |_ompt_work_single_other | 139 | 6 | wall_clock | sec | 0.000040 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |02>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.003972 | 0.000013 | 0.000001 | 0.001127 | 0.000000 | 0.000064 | 100.0 |
|
||||
| |02>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641287 | 0.140017 | 0.131895 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
|
||||
| |02>>> |_ompt_work_single_other | 1902 | 7 | wall_clock | sec | 0.000553 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |02>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.906688 | 0.001000 | 0.000005 | 0.007068 | 0.000003 | 0.001713 | 100.0 |
|
||||
| |02>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.261367 | 0.000044 | 0.000001 | 0.004088 | 0.000000 | 0.000295 | 100.0 |
|
||||
| |02>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.402362 | 0.000608 | 0.000009 | 0.010399 | 0.000001 | 0.000944 | 99.9 |
|
||||
| |02>>> |_ompt_sync_region_reduction | 3952 | 8 | wall_clock | sec | 0.002937 | 0.000001 | 0.000000 | 0.000021 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |02>>> |_ompt_work_single_executor | 150 | 7 | wall_clock | sec | 0.000073 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |02>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000895 | 0.000012 | 0.000009 | 0.000026 | 0.000000 | 0.000003 | 95.2 |
|
||||
| |02>>> |_ompt_sync_region_reduction | 76 | 7 | wall_clock | sec | 0.000043 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |02>>> |_ompt_work_single_executor | 15 | 6 | wall_clock | sec | 0.000007 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |01>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.901650 | 10.901650 | 10.901650 | 10.901650 | 0.000000 | 0.000000 | 2.3 |
|
||||
| |01>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649017 | 10.649017 | 10.649017 | 10.649017 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |01>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000863 | 0.000006 | 0.000001 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
|
||||
| |01>>> |_ompt_work_single_other | 146 | 6 | wall_clock | sec | 0.000033 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |01>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004012 | 0.000013 | 0.000001 | 0.001115 | 0.000000 | 0.000064 | 100.0 |
|
||||
| |01>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641316 | 0.140017 | 0.131895 | 0.155101 | 0.000017 | 0.004080 | 0.8 |
|
||||
| |01>>> |_ompt_work_single_other | 1811 | 7 | wall_clock | sec | 0.000403 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |01>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.410337 | 0.000938 | 0.000005 | 0.010556 | 0.000003 | 0.001610 | 100.0 |
|
||||
| |01>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.202494 | 0.000034 | 0.000001 | 0.003521 | 0.000000 | 0.000256 | 100.0 |
|
||||
| |01>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.943604 | 0.000745 | 0.000008 | 0.009033 | 0.000001 | 0.001024 | 100.0 |
|
||||
| |01>>> |_ompt_work_single_executor | 241 | 7 | wall_clock | sec | 0.000093 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |01>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000917 | 0.000012 | 0.000009 | 0.000026 | 0.000000 | 0.000003 | 100.0 |
|
||||
| |01>>> |_ompt_work_single_executor | 8 | 6 | wall_clock | sec | 0.000004 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |00>>> |_c_print_results | 1 | 2 | wall_clock | sec | 0.000049 | 0.000049 | 0.000049 | 0.000049 | 0.000000 | 0.000000 | 100.0 |
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
```
|
||||
|
||||
### Timemory Flat JSON Output
|
||||
|
||||
> Hint: the generation of flat JSON output is configurable via `OMNITRACE_JSON_OUTPUT`
|
||||
|
||||
Timemory provides two JSON output formats. The flat JSON output files are similar to the text files: the hierarchical information
|
||||
is represented by the indentation of the `"prefix"` field and the `"depth"` field. All the data entries are in a single JSON array,
|
||||
e.g. the `["timemory"]["wall_clock"]["ranks"][0]["graph"][<N>]["prefix"]` entry in the below:
|
||||
|
||||
```json
|
||||
{
|
||||
"timemory": {
|
||||
"wall_clock": {
|
||||
"description": "Real-clock timer (i.e. wall-clock timer)",
|
||||
"thread_count": 12,
|
||||
"process_count": 1,
|
||||
"properties": {
|
||||
"cereal_class_version": 0,
|
||||
"enum": "WALL_CLOCK",
|
||||
"id": "wall_clock",
|
||||
"value": 78,
|
||||
"ids": [
|
||||
"real_clock",
|
||||
"virtual_clock",
|
||||
"wall_clock"
|
||||
]
|
||||
},
|
||||
"mpi_size": 0,
|
||||
"num_ranks": 1,
|
||||
"concurrency": 12,
|
||||
"upcxx_size": 1,
|
||||
"unit_value": 1000000000,
|
||||
"thread_scope_only": false,
|
||||
"type": "wall_clock",
|
||||
"unit_repr": "sec",
|
||||
"ranks": [
|
||||
{
|
||||
"graph_size": 173,
|
||||
"rank": 0,
|
||||
"graph": [
|
||||
{
|
||||
"depth": 0,
|
||||
"stats": {
|
||||
"count": 1,
|
||||
"min": 13.360264917,
|
||||
"sqr": 178.49667865242102,
|
||||
"sum": 13.360264917,
|
||||
"stddev": 0.0,
|
||||
"max": 13.360264917,
|
||||
"cereal_class_version": 0,
|
||||
"mean": 13.360264917
|
||||
},
|
||||
"prefix": "|00>>> main",
|
||||
"rolling_hash": 17481650134347108265,
|
||||
"entry": {
|
||||
"repr_display": 13.360264917,
|
||||
"value": 13360264917,
|
||||
"repr_data": 13.360264917,
|
||||
"cereal_class_version": 0,
|
||||
"accum": 13360264917,
|
||||
"laps": 1
|
||||
},
|
||||
"hash": 17481650134347108265
|
||||
},
|
||||
{
|
||||
"depth": 1,
|
||||
"stats": {
|
||||
"count": 1,
|
||||
"min": 10.924160502,
|
||||
"max": 10.924160502,
|
||||
"sum": 10.924160502,
|
||||
"stddev": 0.0,
|
||||
"sqr": 119.33728267345688,
|
||||
"mean": 10.924160502
|
||||
},
|
||||
"prefix": "|00>>> |_ompt_thread_initial",
|
||||
"rolling_hash": 5142782188440775656,
|
||||
"entry": {
|
||||
"repr_display": 10.924160502,
|
||||
"laps": 1,
|
||||
"accum": 10924160502,
|
||||
"repr_data": 10.924160502,
|
||||
"value": 10924160502
|
||||
},
|
||||
"hash": 6107876127803219007
|
||||
},
|
||||
{
|
||||
"depth": 2,
|
||||
"stats": {
|
||||
"count": 1,
|
||||
"min": 10.923050237,
|
||||
"max": 10.923050237,
|
||||
"sum": 10.923050237,
|
||||
"stddev": 0.0,
|
||||
"sqr": 119.31302648002575,
|
||||
"mean": 10.923050237
|
||||
},
|
||||
"prefix": "|00>>> |_ompt_implicit_task",
|
||||
"rolling_hash": 2098840206724841601,
|
||||
"entry": {
|
||||
"repr_display": 10.923050237,
|
||||
"laps": 1,
|
||||
"accum": 10923050237,
|
||||
"repr_data": 10.923050237,
|
||||
"value": 10923050237
|
||||
},
|
||||
"hash": 15402802091993617561
|
||||
},
|
||||
{
|
||||
"..." : "... etc. ..."
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
With this format, it is easier than with the hierarchical format to write a simple Python script for post-processing, e.g.:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import json
|
||||
|
||||
|
||||
def read_json(inp):
|
||||
with open(inp, "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def find_max(data):
|
||||
"""Find the max for any function called multiple times"""
|
||||
max_entry = None
|
||||
for itr in data:
|
||||
if itr["entry"]["laps"] == 1:
|
||||
continue
|
||||
if max_entry is None:
|
||||
max_entry = itr
|
||||
else:
|
||||
if itr["stats"]["mean"] > max_entry["stats"]["mean"]:
|
||||
max_entry = itr
|
||||
return max_entry
|
||||
|
||||
|
||||
def strip_name(name):
|
||||
"""Return everything after |_ if it exists"""
|
||||
idx = name.index("|_")
|
||||
return name if idx is None else name[(idx + 2) :]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
input_data = [[x, read_json(x)] for x in sys.argv[1:]]
|
||||
|
||||
for file, data in input_data:
|
||||
for metric, metric_data in data["timemory"].items():
|
||||
|
||||
print(f"[{file}] Found metric: {metric}")
|
||||
|
||||
for n, itr in enumerate(metric_data["ranks"]):
|
||||
|
||||
max_entry = find_max(itr["graph"])
|
||||
print(
|
||||
"[{}] Maximum value: '{}' at depth {} was called {}x :: {:.3f} {} (mean = {:.3e} {})".format(
|
||||
file,
|
||||
strip_name(max_entry["prefix"]),
|
||||
max_entry["depth"],
|
||||
max_entry["entry"]["laps"],
|
||||
max_entry["entry"]["repr_data"],
|
||||
metric_data["unit_repr"],
|
||||
max_entry["stats"]["mean"],
|
||||
metric_data["unit_repr"],
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
Applying this script to the corresponding JSON output from [Text Output Example](#timemory-text-output-example) produces:
|
||||
|
||||
```console
|
||||
[openmp-cg.inst-wall_clock.json] Found metric: wall_clock
|
||||
[openmp-cg.inst-wall_clock.json] Maximum value: 'conj_grad' at depth 6 was called 76x :: 10.641 sec (mean = 1.400e-01 sec)
|
||||
```
|
||||
|
||||
### Timemory Hierarchical JSON Output
|
||||
|
||||
> Hint: the generation of hierarchical JSON output is configurable via `OMNITRACE_TREE_OUTPUT`
|
||||
|
||||
The hierarchical JSON output (extension: `.tree.json`) contains very similar data to the flat JSON output; however,
|
||||
its structure requires processing through recursion. The main use of these files is their analysis support
|
||||
by [hatchet](https://github.com/hatchet/hatchet).
|
||||
@@ -0,0 +1,506 @@
|
||||
# Customizing Omnitrace Runtime
|
||||
|
||||
```eval_rst
|
||||
.. toctree::
|
||||
:glob:
|
||||
:maxdepth: 4
|
||||
```
|
||||
|
||||
## omnitrace-avail Executable
|
||||
|
||||
The `omnitrace-avail` executable provides information about the runtime settings, data collection capabilities, and
|
||||
available hardware counters (when built with PAPI support). In contrast to this documentation, it is effectively
|
||||
self-updating: when new capabilities and settings are added to the omnitrace source code, they are effectively
|
||||
propagated to `omnitrace-avail`, thus it should be viewed as the single source of truth if any conflicting
|
||||
information or missing feature is found in this documentation.
|
||||
|
||||
### Exploring Runtime Settings
|
||||
|
||||
In order to view the list of the available runtime settings, their current value, and descriptions for each setting:
|
||||
|
||||
```shell
|
||||
omnitrace-avail --description
|
||||
```
|
||||
|
||||
> HINT: use `--brief` to suppress printing current value and/or `-c 0` to suppress truncation of the descriptions
|
||||
|
||||
Any setting which is boolean (`omnitrace-avail --settings --value --brief --filter bool`) accepts a case-insensitive
|
||||
match to nearly all common expressions for boolean logic: ON, OFF, YES, NO, TRUE, FALSE, 0, 1, etc.
|
||||
|
||||
### Exploring Components
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) uses [timemory](https://github.com/NERSC/timemory) extensively to provide various capabilities and manage
|
||||
data and resources. By default, when `OMNITRACE_USE_TIMEMORY=ON`, omnitrace will only collect wall-clock
|
||||
timing values; however, by modifying the `OMNITRACE_TIMEMORY_COMPONENTS` setting, omnitrace can be configured to
|
||||
collect hardware counters, CPU-clock timers, memory usage, context-switches, page-faults, network statistics,
|
||||
and many more. In fact, omnitrace can actually be used as a dynamic instrumentation vehicle for other 3rd-party profiling
|
||||
APIs such as [Caliper](https://github.com/LLNL/Caliper) and [LIKWID](https://github.com/RRZE-HPC/likwid) by building omnitrace
|
||||
from source with the CMake option(s) `TIMEMORY_USE_CALIPER=ON` and/or `TIMEMORY_USE_LIKWID=ON` and then adding
|
||||
`caliper_marker` and/or `likwid_marker` to `OMNITRACE_TIMEMORY_COMPONENTS`.
|
||||
|
||||
View all possible components and their descriptions:
|
||||
|
||||
```shell
|
||||
omnitrace-avail --components --description
|
||||
```
|
||||
|
||||
Restrict to available components and view the string identifiers for `OMNITRACE_TIMEMORY_COMPONENTS`:
|
||||
|
||||
```shell
|
||||
omnitrace-avail --components --available --string --brief
|
||||
```
|
||||
|
||||
### Exploring Hardware Counters
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) supports collecting hardware counters via PAPI.
|
||||
|
||||
View all possible hardware counters and their descriptions:
|
||||
|
||||
```shell
|
||||
omnitrace-avail --hw-counters --description
|
||||
```
|
||||
|
||||
### omnitrace-avail Examples
|
||||
|
||||
#### Settings
|
||||
|
||||
```console
|
||||
$ omnitrace-avail -S -bd
|
||||
|-----------------------------------------|-----------------------------------------|
|
||||
| ENVIRONMENT VARIABLE | DESCRIPTION |
|
||||
|-----------------------------------------|-----------------------------------------|
|
||||
| OMNITRACE_ADD_SECONDARY | Enable/disable components adding sec... |
|
||||
| OMNITRACE_BACKEND | Specify the perfetto backend to acti... |
|
||||
| OMNITRACE_BUFFER_SIZE_KB | Size of perfetto buffer (in KB) |
|
||||
| OMNITRACE_COLLAPSE_PROCESSES | Enable/disable combining process-spe... |
|
||||
| OMNITRACE_COLLAPSE_THREADS | Enable/disable combining thread-spec... |
|
||||
| OMNITRACE_CONFIG_FILE | Configuration file for omnitrace |
|
||||
| OMNITRACE_COUT_OUTPUT | Write output to stdout |
|
||||
| OMNITRACE_CRITICAL_TRACE | Enable generation of the critical trace |
|
||||
| OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT | Number of critical trace records to ... |
|
||||
| OMNITRACE_CRITICAL_TRACE_COUNT | Number of critical trace to export (... |
|
||||
| OMNITRACE_CRITICAL_TRACE_DEBUG | Enable debugging for critical trace |
|
||||
| OMNITRACE_CRITICAL_TRACE_NUM_THREADS | Number of threads to use when genera... |
|
||||
| OMNITRACE_CRITICAL_TRACE_PER_ROW | How many critical traces per row in ... |
|
||||
| OMNITRACE_CRITICAL_TRACE_SERIALIZE_N... | Include names in serialization of cr... |
|
||||
| OMNITRACE_DEBUG | Enable debug output |
|
||||
| OMNITRACE_DIFF_OUTPUT | Generate a difference output vs. a p... |
|
||||
| OMNITRACE_ENABLED | Activation state of timemory |
|
||||
| OMNITRACE_ENABLE_SIGNAL_HANDLER | Enable signals in timemory_init |
|
||||
| OMNITRACE_FILE_OUTPUT | Write output to files |
|
||||
| OMNITRACE_FLAT_PROFILE | Set the label hierarchy mode to defa... |
|
||||
| OMNITRACE_FLAT_SAMPLING | Ignore hierarchy in all statistical ... |
|
||||
| OMNITRACE_INPUT_EXTENSIONS | File extensions used when searching ... |
|
||||
| OMNITRACE_INPUT_PATH | Explicitly specify the input folder ... |
|
||||
| OMNITRACE_INPUT_PREFIX | Explicitly specify the prefix for in... |
|
||||
| OMNITRACE_INSTRUMENTATION_INTERVAL | Instrumentation only takes measureme... |
|
||||
| OMNITRACE_JSON_OUTPUT | Write json output files |
|
||||
| OMNITRACE_MAX_DEPTH | Set the maximum depth of label hiera... |
|
||||
| OMNITRACE_MAX_THREAD_BOOKMARKS | Maximum number of times a worker thr... |
|
||||
| OMNITRACE_MAX_WIDTH | Set the maximum width for component ... |
|
||||
| OMNITRACE_MEMORY_PRECISION | Set the precision for components wit... |
|
||||
| OMNITRACE_MEMORY_SCIENTIFIC | Set the numerical reporting format f... |
|
||||
| OMNITRACE_MEMORY_UNITS | Set the units for components with 'u... |
|
||||
| OMNITRACE_MEMORY_WIDTH | Set the output width for components ... |
|
||||
| OMNITRACE_NETWORK_INTERFACE | Default network interface |
|
||||
| OMNITRACE_NODE_COUNT | Total number of nodes used in applic... |
|
||||
| OMNITRACE_OUTPUT_FILE | Perfetto filename |
|
||||
| OMNITRACE_OUTPUT_PATH | Explicitly specify the output folder... |
|
||||
| OMNITRACE_OUTPUT_PREFIX | Explicitly specify a prefix for all ... |
|
||||
| OMNITRACE_PAPI_EVENTS | PAPI presets and events to collect (... |
|
||||
| OMNITRACE_PAPI_FAIL_ON_ERROR | Configure PAPI errors to trigger a r... |
|
||||
| OMNITRACE_PAPI_MULTIPLEXING | Enable multiplexing when using PAPI |
|
||||
| OMNITRACE_PAPI_OVERFLOW | Value at which PAPI hw counters trig... |
|
||||
| OMNITRACE_PAPI_QUIET | Configure suppression of reporting P... |
|
||||
| OMNITRACE_PAPI_THREADING | Enable multithreading support when u... |
|
||||
| OMNITRACE_PRECISION | Set the global output precision for ... |
|
||||
| OMNITRACE_ROCM_SMI_DEVICES | Devices to query when OMNITRACE_USE_... |
|
||||
| OMNITRACE_ROCTRACER_FLAT_PROFILE | Ignore hierarchy in all kernels entr... |
|
||||
| OMNITRACE_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support |
|
||||
| OMNITRACE_ROCTRACER_HSA_API | Enable HSA API tracing support |
|
||||
| OMNITRACE_ROCTRACER_HSA_API_TYPES | HSA API type to collect |
|
||||
| OMNITRACE_ROCTRACER_TIMELINE_PROFILE | Create unique entries for every kern... |
|
||||
| OMNITRACE_SAMPLING_DELAY | Number of seconds to delay activatin... |
|
||||
| OMNITRACE_SAMPLING_FREQ | Number of software interrupts per se... |
|
||||
| OMNITRACE_SCIENTIFIC | Set the global numerical reporting t... |
|
||||
| OMNITRACE_SETTINGS_DESC | Provide descriptions when printing s... |
|
||||
| OMNITRACE_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... |
|
||||
| OMNITRACE_SUPPRESS_CONFIG | Disable processing of setting config... |
|
||||
| OMNITRACE_SUPPRESS_PARSING | Disable parsing environment |
|
||||
| OMNITRACE_TEXT_OUTPUT | Write text output files |
|
||||
| OMNITRACE_TIMELINE_PROFILE | Set the label hierarchy mode to defa... |
|
||||
| OMNITRACE_TIMELINE_SAMPLING | Create unique entries for every samp... |
|
||||
| OMNITRACE_TIMEMORY_COMPONENTS | List of components to collect via ti... |
|
||||
| OMNITRACE_TIME_FORMAT | Customize the folder generation when... |
|
||||
| OMNITRACE_TIME_OUTPUT | Output data to subfolder w/ a timest... |
|
||||
| OMNITRACE_TIMING_PRECISION | Set the precision for components wit... |
|
||||
| OMNITRACE_TIMING_SCIENTIFIC | Set the numerical reporting format f... |
|
||||
| OMNITRACE_TIMING_UNITS | Set the units for components with 'u... |
|
||||
| OMNITRACE_TIMING_WIDTH | Set the output width for components ... |
|
||||
| OMNITRACE_TREE_OUTPUT | Write hierarchical json output files |
|
||||
| OMNITRACE_USE_KOKKOSP | Enable support for Kokkos Tools |
|
||||
| OMNITRACE_USE_PERFETTO | Enable perfetto backend |
|
||||
| OMNITRACE_USE_PID | Enable tagging filenames with proces... |
|
||||
| OMNITRACE_USE_ROCM_SMI | Enable sampling GPU power, temp, uti... |
|
||||
| OMNITRACE_USE_ROCTRACER | Enable ROCM tracing |
|
||||
| OMNITRACE_USE_SAMPLING | Enable statistical sampling of call-... |
|
||||
| OMNITRACE_USE_TIMEMORY | Enable timemory backend |
|
||||
| OMNITRACE_VERBOSE | Verbosity level |
|
||||
| OMNITRACE_WIDTH | Set the global output width for comp... |
|
||||
|-----------------------------------------|-----------------------------------------|
|
||||
```
|
||||
|
||||
#### Components
|
||||
|
||||
```console
|
||||
$ omnitrace-avail -C -bd
|
||||
|-----------------------------------|----------------------------------------------|
|
||||
| COMPONENT | DESCRIPTION |
|
||||
|-----------------------------------|----------------------------------------------|
|
||||
| allinea_map | Controls the AllineaMAP sampler. |
|
||||
| caliper_marker | Generic forwarding of markers to Caliper ... |
|
||||
| caliper_config | Caliper configuration manager. |
|
||||
| caliper_loop_marker | Variant of caliper_marker with support fo... |
|
||||
| cpu_clock | Total CPU time spent in both user- and ke... |
|
||||
| cpu_util | Percentage of CPU-clock time divided by w... |
|
||||
| craypat_counters | Names and value of any counter events tha... |
|
||||
| craypat_flush_buffer | Writes all the recorded contents in the d... |
|
||||
| craypat_heap_stats | Undocumented by 'pat_api.h'. |
|
||||
| craypat_record | Toggles CrayPAT recording on calling thread. |
|
||||
| craypat_region | Adds region labels to CrayPAT output. |
|
||||
| current_peak_rss | Absolute value of high-water mark of memo... |
|
||||
| gperftools_cpu_profiler | Control switch for gperftools CPU profiler. |
|
||||
| gperftools_heap_profiler | Control switch for the gperftools heap pr... |
|
||||
| hip_event | Records the time interval between two poi... |
|
||||
| kernel_mode_time | CPU time spent executing in kernel mode (... |
|
||||
| likwid_marker | LIKWID perfmon (CPU) marker forwarding. |
|
||||
| likwid_nvmarker | LIKWID nvmon (GPU) marker forwarding. |
|
||||
| malloc_gotcha | GOTCHA wrapper for memory allocation func... |
|
||||
| memory_allocations | Number of bytes allocated/freed instead o... |
|
||||
| monotonic_clock | Wall-clock timer which will continue to i... |
|
||||
| monotonic_raw_clock | Wall-clock timer unaffected by frequency ... |
|
||||
| network_stats | Reports network bytes, packets, errors, d... |
|
||||
| num_io_in | Number of times the filesystem had to per... |
|
||||
| num_io_out | Number of times the filesystem had to per... |
|
||||
| num_major_page_faults | Number of page faults serviced that requi... |
|
||||
| num_minor_page_faults | Number of page faults serviced without an... |
|
||||
| page_rss | Amount of memory allocated in pages of me... |
|
||||
| papi_array<8ul> | Fixed-size array of PAPI HW counters. |
|
||||
| papi_vector | Dynamically allocated array of PAPI HW co... |
|
||||
| peak_rss | Measures changes in the high-water mark f... |
|
||||
| perfetto_trace | Provides Perfetto Tracing SDK: system pro... |
|
||||
| priority_context_switch | Number of context switch due to higher pr... |
|
||||
| process_cpu_clock | CPU-clock timer for the calling process (... |
|
||||
| process_cpu_util | Percentage of CPU-clock time divided by w... |
|
||||
| read_bytes | Number of bytes which this process really... |
|
||||
| read_char | Number of bytes which this task has cause... |
|
||||
| roctx_marker | Generates high-level region markers for H... |
|
||||
| system_clock | CPU time spent in kernel-mode. |
|
||||
| tau_marker | Forwards markers to TAU instrumentation (... |
|
||||
| thread_cpu_clock | CPU-clock timer for the calling thread. |
|
||||
| thread_cpu_util | Percentage of CPU-clock time divided by w... |
|
||||
| timestamp | Provides a timestamp for every sample and... |
|
||||
| trip_count | Counts number of invocations. |
|
||||
| user_clock | CPU time spent in user-mode. |
|
||||
| user_mode_time | CPU time spent executing in user mode (vi... |
|
||||
| virtual_memory | Records the change in virtual memory. |
|
||||
| voluntary_context_switch | Number of context switches due to a proce... |
|
||||
| vtune_event | Creates events for Intel profiler running... |
|
||||
| vtune_frame | Creates frames for Intel profiler running... |
|
||||
| vtune_profiler | Control switch for Intel profiler running... |
|
||||
| wall_clock | Real-clock timer (i.e. wall-clock timer). |
|
||||
| written_bytes | Number of bytes sent to the storage layer. |
|
||||
| written_char | Number of bytes which this task has cause... |
|
||||
| omnitrace | Invokes instrumentation functions 'omnitr... |
|
||||
| roctracer | High-precision ROCm API and kernel tracing. |
|
||||
| sampling_wall_clock | Wall-clock timing. Derived from statistic... |
|
||||
| sampling_cpu_clock | CPU-clock timing. Derived from statistica... |
|
||||
| sampling_percent | Fraction of wall-clock time spent in func... |
|
||||
| sampling_gpu_power | GPU Power Usage via ROCm-SMI. Derived fro... |
|
||||
| sampling_gpu_temp | GPU Temperature via ROCm-SMI. Derived fro... |
|
||||
| sampling_gpu_busy | GPU Utilization (% busy) via ROCm-SMI. De... |
|
||||
| sampling_gpu_memory_usage | GPU Memory Usage via ROCm-SMI. Derived fr... |
|
||||
|-----------------------------------|----------------------------------------------|
|
||||
```
|
||||
|
||||
#### Hardware Counters
|
||||
|
||||
```console
|
||||
$ omnitrace-avail -H -bd
|
||||
|---------------------|-------------------------------------------------|
|
||||
| HARDWARE COUNTER | DESCRIPTION |
|
||||
|---------------------|-------------------------------------------------|
|
||||
| CPU | |
|
||||
|---------------------|-------------------------------------------------|
|
||||
| PAPI_L1_DCM | Level 1 data cache misses |
|
||||
| PAPI_L1_ICM | Level 1 instruction cache misses |
|
||||
| PAPI_L2_DCM | Level 2 data cache misses |
|
||||
| PAPI_L2_ICM | Level 2 instruction cache misses |
|
||||
| PAPI_L3_DCM | Level 3 data cache misses |
|
||||
| PAPI_L3_ICM | Level 3 instruction cache misses |
|
||||
| PAPI_L1_TCM | Level 1 cache misses |
|
||||
| PAPI_L2_TCM | Level 2 cache misses |
|
||||
| PAPI_L3_TCM | Level 3 cache misses |
|
||||
| PAPI_CA_SNP | Requests for a snoop |
|
||||
| PAPI_CA_SHR | Requests for exclusive access to shared cach... |
|
||||
| PAPI_CA_CLN | Requests for exclusive access to clean cache... |
|
||||
| PAPI_CA_INV | Requests for cache line invalidation |
|
||||
| PAPI_CA_ITV | Requests for cache line intervention |
|
||||
| PAPI_L3_LDM | Level 3 load misses |
|
||||
| PAPI_L3_STM | Level 3 store misses |
|
||||
| PAPI_BRU_IDL | Cycles branch units are idle |
|
||||
| PAPI_FXU_IDL | Cycles integer units are idle |
|
||||
| PAPI_FPU_IDL | Cycles floating point units are idle |
|
||||
| PAPI_LSU_IDL | Cycles load/store units are idle |
|
||||
| PAPI_TLB_DM | Data translation lookaside buffer misses |
|
||||
| PAPI_TLB_IM | Instruction translation lookaside buffer misses |
|
||||
| PAPI_TLB_TL | Total translation lookaside buffer misses |
|
||||
| PAPI_L1_LDM | Level 1 load misses |
|
||||
| PAPI_L1_STM | Level 1 store misses |
|
||||
| PAPI_L2_LDM | Level 2 load misses |
|
||||
| PAPI_L2_STM | Level 2 store misses |
|
||||
| PAPI_BTAC_M | Branch target address cache misses |
|
||||
| PAPI_PRF_DM | Data prefetch cache misses |
|
||||
| PAPI_L3_DCH | Level 3 data cache hits |
|
||||
| PAPI_TLB_SD | Translation lookaside buffer shootdowns |
|
||||
| PAPI_CSR_FAL | Failed store conditional instructions |
|
||||
| PAPI_CSR_SUC | Successful store conditional instructions |
|
||||
| PAPI_CSR_TOT | Total store conditional instructions |
|
||||
| PAPI_MEM_SCY | Cycles Stalled Waiting for memory accesses |
|
||||
| PAPI_MEM_RCY | Cycles Stalled Waiting for memory reads |
|
||||
| PAPI_MEM_WCY | Cycles Stalled Waiting for memory writes |
|
||||
| PAPI_STL_ICY | Cycles with no instruction issue |
|
||||
| PAPI_FUL_ICY | Cycles with maximum instruction issue |
|
||||
| PAPI_STL_CCY | Cycles with no instructions completed |
|
||||
| PAPI_FUL_CCY | Cycles with maximum instructions completed |
|
||||
| PAPI_HW_INT | Hardware interrupts |
|
||||
| PAPI_BR_UCN | Unconditional branch instructions |
|
||||
| PAPI_BR_CN | Conditional branch instructions |
|
||||
| PAPI_BR_TKN | Conditional branch instructions taken |
|
||||
| PAPI_BR_NTK | Conditional branch instructions not taken |
|
||||
| PAPI_BR_MSP | Conditional branch instructions mispredicted |
|
||||
| PAPI_BR_PRC | Conditional branch instructions correctly pr... |
|
||||
| PAPI_FMA_INS | FMA instructions completed |
|
||||
| PAPI_TOT_IIS | Instructions issued |
|
||||
| PAPI_TOT_INS | Instructions completed |
|
||||
| PAPI_INT_INS | Integer instructions |
|
||||
| PAPI_FP_INS | Floating point instructions |
|
||||
| PAPI_LD_INS | Load instructions |
|
||||
| PAPI_SR_INS | Store instructions |
|
||||
| PAPI_BR_INS | Branch instructions |
|
||||
| PAPI_VEC_INS | Vector/SIMD instructions (could include inte... |
|
||||
| PAPI_RES_STL | Cycles stalled on any resource |
|
||||
| PAPI_FP_STAL | Cycles the FP unit(s) are stalled |
|
||||
| PAPI_TOT_CYC | Total cycles |
|
||||
| PAPI_LST_INS | Load/store instructions completed |
|
||||
| PAPI_SYC_INS | Synchronization instructions completed |
|
||||
| PAPI_L1_DCH | Level 1 data cache hits |
|
||||
| PAPI_L2_DCH | Level 2 data cache hits |
|
||||
| PAPI_L1_DCA | Level 1 data cache accesses |
|
||||
| PAPI_L2_DCA | Level 2 data cache accesses |
|
||||
| PAPI_L3_DCA | Level 3 data cache accesses |
|
||||
| PAPI_L1_DCR | Level 1 data cache reads |
|
||||
| PAPI_L2_DCR | Level 2 data cache reads |
|
||||
| PAPI_L3_DCR | Level 3 data cache reads |
|
||||
| PAPI_L1_DCW | Level 1 data cache writes |
|
||||
| PAPI_L2_DCW | Level 2 data cache writes |
|
||||
| PAPI_L3_DCW | Level 3 data cache writes |
|
||||
| PAPI_L1_ICH | Level 1 instruction cache hits |
|
||||
| PAPI_L2_ICH | Level 2 instruction cache hits |
|
||||
| PAPI_L3_ICH | Level 3 instruction cache hits |
|
||||
| PAPI_L1_ICA | Level 1 instruction cache accesses |
|
||||
| PAPI_L2_ICA | Level 2 instruction cache accesses |
|
||||
| PAPI_L3_ICA | Level 3 instruction cache accesses |
|
||||
| PAPI_L1_ICR | Level 1 instruction cache reads |
|
||||
| PAPI_L2_ICR | Level 2 instruction cache reads |
|
||||
| PAPI_L3_ICR | Level 3 instruction cache reads |
|
||||
| PAPI_L1_ICW | Level 1 instruction cache writes |
|
||||
| PAPI_L2_ICW | Level 2 instruction cache writes |
|
||||
| PAPI_L3_ICW | Level 3 instruction cache writes |
|
||||
| PAPI_L1_TCH | Level 1 total cache hits |
|
||||
| PAPI_L2_TCH | Level 2 total cache hits |
|
||||
| PAPI_L3_TCH | Level 3 total cache hits |
|
||||
| PAPI_L1_TCA | Level 1 total cache accesses |
|
||||
| PAPI_L2_TCA | Level 2 total cache accesses |
|
||||
| PAPI_L3_TCA | Level 3 total cache accesses |
|
||||
| PAPI_L1_TCR | Level 1 total cache reads |
|
||||
| PAPI_L2_TCR | Level 2 total cache reads |
|
||||
| PAPI_L3_TCR | Level 3 total cache reads |
|
||||
| PAPI_L1_TCW | Level 1 total cache writes |
|
||||
| PAPI_L2_TCW | Level 2 total cache writes |
|
||||
| PAPI_L3_TCW | Level 3 total cache writes |
|
||||
| PAPI_FML_INS | Floating point multiply instructions |
|
||||
| PAPI_FAD_INS | Floating point add instructions |
|
||||
| PAPI_FDV_INS | Floating point divide instructions |
|
||||
| PAPI_FSQ_INS | Floating point square root instructions |
|
||||
| PAPI_FNV_INS | Floating point inverse instructions |
|
||||
| PAPI_FP_OPS | Floating point operations |
|
||||
| PAPI_SP_OPS | Floating point operations; optimized to coun... |
|
||||
| PAPI_DP_OPS | Floating point operations; optimized to coun... |
|
||||
| PAPI_VEC_SP | Single precision vector/SIMD instructions |
|
||||
| PAPI_VEC_DP | Double precision vector/SIMD instructions |
|
||||
| PAPI_REF_CYC | Reference clock cycles |
|
||||
|---------------------|-------------------------------------------------|
|
||||
```
|
||||
|
||||
## Creating a Configuration File
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) supports 3 configuration file formats: JSON, XML, and plain text.
|
||||
Configuration files are specified via the `OMNITRACE_CONFIG_FILE` environment variable
|
||||
and, by default, omnitrace will look for `${HOME}/omnitrace.cfg` and `${HOME}/omnitrace.json`.
|
||||
Multiple configuration files can be concatenated via `:`, e.g.:
|
||||
|
||||
```shell
|
||||
export OMNITRACE_CONFIG_FILE=~/.config/omnitrace.cfg:~/.config/omnitrace.json
|
||||
```
|
||||
|
||||
If a configuration variable is specified in both a configuration file and in the environment,
|
||||
the environment variable takes precedence.
|
||||
|
||||
### Sample Text Configuration File
|
||||
|
||||
Text files support very basic variables and are case-insensitive.
|
||||
Variables are created when an lvalue starts with a $ and are
|
||||
dereferenced when they appear as rvalues.
|
||||
|
||||
Entries in the text configuration file which do not match a known setting
|
||||
in `omnitrace-avail` but are prefixed with `OMNITRACE_` are interpreted as
|
||||
environment variables and are exported via `setenv`
|
||||
but do not override an existing value for the environment variable.
|
||||
|
||||
```shell
|
||||
# lvals starting with $ are variables
|
||||
$USE = ON
|
||||
|
||||
# use fields
|
||||
OMNITRACE_USE_PERFETTO = $USE
|
||||
OMNITRACE_USE_TIMEMORY = $USE
|
||||
OMNITRACE_USE_SAMPLING = $USE
|
||||
OMNITRACE_USE_PID = OFF
|
||||
OMNITRACE_CRITICAL_TRACE = OFF
|
||||
|
||||
# debug
|
||||
OMNITRACE_DEBUG = OFF
|
||||
OMNITRACE_VERBOSE = 1
|
||||
OMNITRACE_DL_VERBOSE = 1
|
||||
|
||||
# output fields
|
||||
OMNITRACE_OUTPUT_PREFIX = %tag%-
|
||||
OMNITRACE_OUTPUT_PATH = omnitrace-example-output
|
||||
OMNITRACE_TIME_OUTPUT = OFF
|
||||
|
||||
# timemory fields
|
||||
OMNITRACE_PAPI_EVENTS = PAPI_TOT_INS PAPI_FP_INS
|
||||
OMNITRACE_TIMEMORY_COMPONENTS = wall_clock trip_count
|
||||
|
||||
# sampling fields
|
||||
OMNITRACE_SAMPLING_FREQ = 10
|
||||
|
||||
# rocm-smi fields
|
||||
OMNITRACE_ROCM_SMI_DEVICES = 1
|
||||
|
||||
# misc env variables
|
||||
OMNITRACE_SAMPLING_KEEP_DYNINST_SUFFIX = OFF
|
||||
OMNITRACE_SAMPLING_KEEP_INTERNAL = OFF
|
||||
```
|
||||
|
||||
### Sample XML Configuration File
|
||||
|
||||
The full XML specification for a configuration value contains
|
||||
a lot of information:
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<timemory_xml>
|
||||
<omnitrace>
|
||||
<settings>
|
||||
<cereal_class_version>2</cereal_class_version>
|
||||
<!-- Full setting specification -->
|
||||
<OMNITRACE_ADD_SECONDARY>
|
||||
<cereal_class_version>1</cereal_class_version>
|
||||
<name>add_secondary</name>
|
||||
<environ>OMNITRACE_ADD_SECONDARY</environ>
|
||||
<description>...</description>
|
||||
<count>-1</count>
|
||||
<max_count>1</max_count>
|
||||
<cmdline>
|
||||
<value0>--timemory-add-secondary</value0>
|
||||
</cmdline>
|
||||
<categories>
|
||||
<value0>component</value0>
|
||||
<value1>data</value1>
|
||||
<value2>native</value2>
|
||||
</categories>
|
||||
<data_type>bool</data_type>
|
||||
<initial>true</initial>
|
||||
<value>true</value>
|
||||
</OMNITRACE_ADD_SECONDARY>
|
||||
<!-- etc. -->
|
||||
</settings>
|
||||
</omnitrace>
|
||||
</timemory_xml>
|
||||
```
|
||||
|
||||
However, when writing an XML configuration file, the following is perfectly acceptable
|
||||
to set `OMNITRACE_ADD_SECONDARY=false`:
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<timemory_xml>
|
||||
<omnitrace>
|
||||
<settings>
|
||||
<OMNITRACE_ADD_SECONDARY>
|
||||
<value>false</value>
|
||||
</OMNITRACE_ADD_SECONDARY>
|
||||
</settings>
|
||||
</omnitrace>
|
||||
</timemory_xml>
|
||||
```
|
||||
|
||||
### Sample JSON Configuration File
|
||||
|
||||
The full JSON specification for a configuration value contains the same information as the XML:
|
||||
|
||||
```json
|
||||
{
|
||||
"omnitrace": {
|
||||
"settings": {
|
||||
"OMNITRACE_ADD_SECONDARY": {
|
||||
"count": -1,
|
||||
"name": "add_secondary",
|
||||
"data_type": "bool",
|
||||
"initial": true,
|
||||
"value": true,
|
||||
"max_count": 1,
|
||||
"cmdline": [
|
||||
"--timemory-add-secondary"
|
||||
],
|
||||
"environ": "OMNITRACE_ADD_SECONDARY",
|
||||
"cereal_class_version": 1,
|
||||
"categories": [
|
||||
"component",
|
||||
"data",
|
||||
"native"
|
||||
],
|
||||
"description": "Enable/disable components adding secondary (child) entries when available. E.g. suppress individual CUDA kernels, etc. when using Cupti components"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Similarly, when writing a JSON configuration file, the following is perfectly acceptable
|
||||
to set `OMNITRACE_ADD_SECONDARY=true`:
|
||||
|
||||
```json
|
||||
{
|
||||
"omnitrace": {
|
||||
"settings": {
|
||||
"OMNITRACE_ADD_SECONDARY": {
|
||||
"value": true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Çalıştırılabilir dosya
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash -e
|
||||
|
||||
message()
|
||||
{
|
||||
echo -e "\n\n##### ${@}... #####\n"
|
||||
}
|
||||
|
||||
WORK_DIR=$(dirname ${BASH_SOURCE[0]})
|
||||
|
||||
message "Changing directory to ${WORK_DIR}"
|
||||
cd ${WORK_DIR}
|
||||
|
||||
SOURCE_DIR=$(cd ${WORK_DIR}/.. &> /dev/null && pwd)
|
||||
message "Source directory is ${SOURCE_DIR}"
|
||||
|
||||
message "Generating omnitrace.dox"
|
||||
cmake -DSOURCE_DIR=${SOURCE_DIR} -P ${WORK_DIR}/generate-doxyfile.cmake
|
||||
|
||||
message "Generating doxygen xml files"
|
||||
doxygen omnitrace.dox
|
||||
|
||||
message "Building html documentation"
|
||||
make html
|
||||
|
||||
message "Removing stale documentation in ${SOURCE_DIR}/docs/"
|
||||
rm -rf ${SOURCE_DIR}/docs/*
|
||||
|
||||
message "Copying docs-source/_build/html/* to docs/"
|
||||
cp -r ${WORK_DIR}/_build/html/* ${SOURCE_DIR}/docs/
|
||||
Çalıştırılabilir dosya
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash -e
|
||||
|
||||
WORK_DIR=$(dirname ${BASH_SOURCE[0]})
|
||||
|
||||
SOURCE_DIR=$(cd ${WORK_DIR}/.. &> /dev/null && pwd)
|
||||
|
||||
cmake -DSOURCE_DIR=${SOURCE_DIR} -P generate-doxyfile.cmake
|
||||
|
||||
doxygen omnitrace.dox
|
||||
@@ -0,0 +1,212 @@
|
||||
# User API
|
||||
|
||||
```eval_rst
|
||||
.. doxygenfile:: omnitrace/user.h
|
||||
```
|
||||
|
||||
By default, when omnitrace detects any `omnitrace_user_start_*` or `omnitrace_user_stop_*` function, instrumentation
|
||||
is disabled at start-up -- thus, `omnitrace_user_stop_trace()` is not required at the beginning of main. This behavior
|
||||
can be manually controlled via the `OMNITRACE_INIT_ENABLED` environment variable. User-defined regions are always
|
||||
recorded, regardless of whether `omnitrace_user_start_*` or `omnitrace_user_stop_*` has been called.
|
||||
|
||||
## Example
|
||||
|
||||
### User API Implementation
|
||||
|
||||
```cpp
|
||||
#include <omnitrace/user.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
std::atomic<long> total{ 0 };
|
||||
|
||||
long
|
||||
fib(long n) __attribute__((noinline));
|
||||
|
||||
void
|
||||
run(size_t nitr, long) __attribute__((noinline));
|
||||
|
||||
int
|
||||
custom_push_region(const char* name);
|
||||
|
||||
namespace
|
||||
{
|
||||
int (*omnitrace_push_region_f)(const char*) = nullptr;
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
// get the internal callback to start a user-defined region
|
||||
omnitrace_user_get_callbacks(OMNITRACE_USER_REGION, (void**) &omnitrace_push_region_f,
|
||||
nullptr);
|
||||
// assign the custom callback to start a user-defined region
|
||||
if(omnitrace_push_region_f)
|
||||
omnitrace_user_configure(OMNITRACE_USER_REGION, (void*) &custom_push_region,
|
||||
nullptr);
|
||||
|
||||
omnitrace_user_push_region(argv[0]);
|
||||
omnitrace_user_push_region("initialization");
|
||||
size_t nthread = std::min<size_t>(16, std::thread::hardware_concurrency());
|
||||
size_t nitr = 50000;
|
||||
long nfib = 10;
|
||||
if(argc > 1) nfib = atol(argv[1]);
|
||||
if(argc > 2) nthread = atol(argv[2]);
|
||||
if(argc > 3) nitr = atol(argv[3]);
|
||||
omnitrace_user_pop_region("initialization");
|
||||
|
||||
printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", argv[0],
|
||||
nthread, argv[0], nitr, argv[0], nfib);
|
||||
|
||||
omnitrace_user_push_region("thread_creation");
|
||||
std::vector<std::thread> threads{};
|
||||
threads.reserve(nthread);
|
||||
// disable instrumentation for child threads
|
||||
omnitrace_user_stop_thread_trace();
|
||||
for(size_t i = 0; i < nthread; ++i)
|
||||
{
|
||||
size_t _nitr = ((i % 2) == 1) ? (nitr - (0.1 * nitr)) : (nitr + (0.1 * nitr));
|
||||
long _nfib = ((i % 2) == 1) ? (nfib - (0.1 * nfib)) : (nfib + (0.1 * nfib));
|
||||
threads.emplace_back(&run, _nitr, _nfib);
|
||||
}
|
||||
// re-enable instrumentation
|
||||
omnitrace_user_start_thread_trace();
|
||||
omnitrace_user_pop_region("thread_creation");
|
||||
|
||||
omnitrace_user_push_region("thread_wait");
|
||||
for(auto& itr : threads)
|
||||
itr.join();
|
||||
omnitrace_user_pop_region("thread_wait");
|
||||
|
||||
run(nitr, nfib);
|
||||
|
||||
printf("[%s] fibonacci(%li) x %lu = %li\n", argv[0], nfib, nthread, total.load());
|
||||
omnitrace_user_pop_region(argv[0]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Naive doubly-recursive Fibonacci: returns n itself for n < 2, otherwise
// fib(n - 1) + fib(n - 2).
long
fib(long n)
{
    if(n < 2) return n;
    const long prev1 = fib(n - 1);
    const long prev2 = fib(n - 2);
    return prev1 + prev2;
}
|
||||
|
||||
#define RUN_LABEL \
|
||||
std::string{ std::string{ __FUNCTION__ } + "(" + std::to_string(n) + ") x " + \
|
||||
std::to_string(nitr) } \
|
||||
.c_str()
|
||||
|
||||
void
|
||||
run(size_t nitr, long n)
|
||||
{
|
||||
omnitrace_user_push_region(RUN_LABEL);
|
||||
long local = 0;
|
||||
for(size_t i = 0; i < nitr; ++i)
|
||||
local += fib(n);
|
||||
total += local;
|
||||
omnitrace_user_pop_region(RUN_LABEL);
|
||||
}
|
||||
|
||||
int
|
||||
custom_push_region(const char* name)
|
||||
{
|
||||
printf("Pushing custom region :: %s\n", name);
|
||||
return (*omnitrace_push_region_f)(name);
|
||||
}
|
||||
```
|
||||
|
||||
### User API Output
|
||||
|
||||
```console
|
||||
$ omnitrace -l --min-address-range=0 --min-address-range-loop=0 --min-instructions=8 -E custom_push_region -o -- ./user-api
|
||||
...
|
||||
$ export OMNITRACE_USE_TIMEMORY=ON
|
||||
$ export OMNITRACE_USE_PID=OFF
|
||||
$ export OMNITRACE_TIME_OUTPUT=OFF
|
||||
$ export OMNITRACE_OUTPUT_PATH=omnitrace-example-output
|
||||
$ ./user-api.inst 20 4 100
|
||||
Pushing custom region :: ./user-api.inst
|
||||
[omnitrace][omnitrace_init_tooling] Instrumentation mode: Trace
|
||||
|
||||
|
||||
______ .___ ___. .__ __. __ .___________..______ ___ ______ _______
|
||||
/ __ \ | \/ | | \ | | | | | || _ \ / \ / || ____|
|
||||
| | | | | \ / | | \| | | | `---| |----`| |_) | / ^ \ | ,----'| |__
|
||||
| | | | | |\/| | | . ` | | | | | | / / /_\ \ | | | __|
|
||||
| `--' | | | | | | |\ | | | | | | |\ \----./ _____ \ | `----.| |____
|
||||
\______/ |__| |__| |__| \__| |__| |__| | _| `._____/__/ \__\ \______||_______|
|
||||
|
||||
|
||||
|
||||
Pushing custom region :: initialization
|
||||
[./user-api.inst] Threads: 4
|
||||
[./user-api.inst] Iterations: 100
|
||||
[./user-api.inst] fibonacci(20)...
|
||||
Pushing custom region :: thread_creation
|
||||
Pushing custom region :: run(20) x 100
|
||||
Pushing custom region :: thread_wait
|
||||
Pushing custom region :: run(20) x 100
|
||||
Pushing custom region :: run(20) x 100
|
||||
Pushing custom region :: run(20) x 100
|
||||
Pushing custom region :: run(20) x 100
|
||||
[./user-api.inst] fibonacci(20) x 4 = 3382500
|
||||
|
||||
|
||||
[omnitrace][2637959][0] omnitrace : 2.716905 sec wall_clock, 1.216 mb peak_rss, 3.680000 sec cpu_clock, 135.4 % cpu_util [laps: 1]
|
||||
[omnitrace][2637959][0] user-api.inst/thread-0 : 2.715708 sec wall_clock, 2.354223 sec thread_cpu_clock, 86.7 % thread_cpu_util, 1.216 mb peak_rss [laps: 1]
|
||||
[omnitrace][2637959][0] user-api.inst/thread-1 : 0.329802 sec wall_clock, 0.329739 sec thread_cpu_clock, 100.0 % thread_cpu_util, 0.000 mb peak_rss [laps: 1]
|
||||
[omnitrace][2637959][0] user-api.inst/thread-2 : 0.355981 sec wall_clock, 0.335795 sec thread_cpu_clock, 94.3 % thread_cpu_util, 0.528 mb peak_rss [laps: 1]
|
||||
[omnitrace][2637959][0] user-api.inst/thread-3 : 0.341329 sec wall_clock, 0.331214 sec thread_cpu_clock, 97.0 % thread_cpu_util, 0.456 mb peak_rss [laps: 1]
|
||||
[omnitrace][2637959][0] user-api.inst/thread-4 : 0.360631 sec wall_clock, 0.330374 sec thread_cpu_clock, 91.6 % thread_cpu_util, 0.600 mb peak_rss [laps: 1]
|
||||
[wall_clock]|0> Outputting 'omnitrace-example-output/wall_clock.json'...
|
||||
[wall_clock]|0> Outputting 'omnitrace-example-output/wall_clock.tree.json'...
|
||||
[wall_clock]|0> Outputting 'omnitrace-example-output/wall_clock.txt'...
|
||||
|
||||
|
||||
[metadata::manager::finalize]> Outputting 'omnitrace-example-output/metadata.json' and 'omnitrace-example-output/functions.json'...
|
||||
$ cat omnitrace-example-output/wall_clock.txt
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| REAL-CLOCK TIMER (I.E. WALL-CLOCK TIMER) |
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| LABEL | COUNT | DEPTH | METRIC | UNITS | SUM | MEAN | MIN | MAX | VAR | STDDEV | % SELF |
|
||||
|-----------------------------------------------------------------------|---------|--------|------------|--------|----------|----------|----------|----------|----------|----------|--------|
|
||||
| |0>>> ./user-api.inst | 1 | 0 | wall_clock | sec | 2.715611 | 2.715611 | 2.715611 | 2.715611 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |0>>> |_initialization | 1 | 1 | wall_clock | sec | 0.000001 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |0>>> |_thread_creation | 1 | 1 | wall_clock | sec | 0.000170 | 0.000170 | 0.000170 | 0.000170 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |0>>> |_thread_wait | 1 | 1 | wall_clock | sec | 0.360751 | 0.360751 | 0.360751 | 0.360751 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |1>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.329472 | 0.329472 | 0.329472 | 0.329472 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |3>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.331028 | 0.331028 | 0.331028 | 0.331028 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |2>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.335554 | 0.335554 | 0.335554 | 0.335554 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |4>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.330220 | 0.330220 | 0.330220 | 0.330220 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |0>>> |_run | 1 | 1 | wall_clock | sec | 2.354618 | 2.354618 | 2.354618 | 2.354618 | 0.000000 | 0.000000 | 0.0 |
|
||||
| |0>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 2.354600 | 2.354600 | 2.354600 | 2.354600 | 0.000000 | 0.000000 | 48.3 |
|
||||
| |0>>> |_fib | 1094600 | 3 | wall_clock | sec | 1.217671 | 0.000001 | 0.000000 | 0.000055 | 0.000000 | 0.000002 | 41.3 |
|
||||
| |0>>> |_fib | 418100 | 4 | wall_clock | sec | 0.714197 | 0.000002 | 0.000000 | 0.000050 | 0.000000 | 0.000002 | 38.1 |
|
||||
| |0>>> |_fib | 258400 | 5 | wall_clock | sec | 0.441874 | 0.000002 | 0.000000 | 0.000047 | 0.000000 | 0.000002 | 37.9 |
|
||||
| |0>>> |_fib | 159700 | 6 | wall_clock | sec | 0.274224 | 0.000002 | 0.000000 | 0.000044 | 0.000000 | 0.000002 | 37.9 |
|
||||
| |0>>> |_fib | 98700 | 7 | wall_clock | sec | 0.170399 | 0.000002 | 0.000000 | 0.000042 | 0.000000 | 0.000002 | 37.7 |
|
||||
| |0>>> |_fib | 61000 | 8 | wall_clock | sec | 0.106093 | 0.000002 | 0.000000 | 0.000039 | 0.000000 | 0.000002 | 37.5 |
|
||||
| |0>>> |_fib | 37700 | 9 | wall_clock | sec | 0.066316 | 0.000002 | 0.000000 | 0.000036 | 0.000000 | 0.000002 | 40.2 |
|
||||
| |0>>> |_fib | 23300 | 10 | wall_clock | sec | 0.039640 | 0.000002 | 0.000000 | 0.000033 | 0.000000 | 0.000002 | 38.2 |
|
||||
| |0>>> |_fib | 14400 | 11 | wall_clock | sec | 0.024504 | 0.000002 | 0.000000 | 0.000030 | 0.000000 | 0.000002 | 37.9 |
|
||||
| |0>>> |_fib | 8900 | 12 | wall_clock | sec | 0.015219 | 0.000002 | 0.000000 | 0.000027 | 0.000000 | 0.000002 | 38.1 |
|
||||
| |0>>> |_fib | 5500 | 13 | wall_clock | sec | 0.009417 | 0.000002 | 0.000000 | 0.000024 | 0.000000 | 0.000002 | 38.3 |
|
||||
| |0>>> |_fib | 3400 | 14 | wall_clock | sec | 0.005806 | 0.000002 | 0.000000 | 0.000021 | 0.000000 | 0.000002 | 38.4 |
|
||||
| |0>>> |_fib | 2100 | 15 | wall_clock | sec | 0.003576 | 0.000002 | 0.000000 | 0.000019 | 0.000000 | 0.000002 | 38.4 |
|
||||
| |0>>> |_fib | 1300 | 16 | wall_clock | sec | 0.002201 | 0.000002 | 0.000000 | 0.000016 | 0.000000 | 0.000002 | 40.3 |
|
||||
| |0>>> |_fib | 800 | 17 | wall_clock | sec | 0.001315 | 0.000002 | 0.000000 | 0.000014 | 0.000000 | 0.000002 | 42.1 |
|
||||
| |0>>> |_fib | 500 | 18 | wall_clock | sec | 0.000762 | 0.000002 | 0.000000 | 0.000010 | 0.000000 | 0.000001 | 42.1 |
|
||||
| |0>>> |_fib | 300 | 19 | wall_clock | sec | 0.000441 | 0.000001 | 0.000000 | 0.000008 | 0.000000 | 0.000001 | 47.8 |
|
||||
| |0>>> |_fib | 200 | 20 | wall_clock | sec | 0.000230 | 0.000001 | 0.000000 | 0.000006 | 0.000000 | 0.000001 | 49.0 |
|
||||
| |0>>> |_fib | 100 | 21 | wall_clock | sec | 0.000117 | 0.000001 | 0.000001 | 0.000003 | 0.000000 | 0.000000 | 84.5 |
|
||||
| |0>>> |_fib | 100 | 22 | wall_clock | sec | 0.000018 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
|
||||
| |0>>> std::vector<std::thread, std::allocator<std::thread> >::~vector | 1 | 0 | wall_clock | sec | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|
||||
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
```
|
||||
@@ -1,8 +1,6 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
@@ -22,36 +20,19 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#if !defined(OMNITRACE_DL_SOURCE)
|
||||
# define OMNITRACE_DL_SOURCE 1
|
||||
#endif
|
||||
|
||||
#define OMNITRACE_COMMON_LIBRARY_NAME "dl"
|
||||
|
||||
#include "common/defines.h"
|
||||
#include "dl.hpp"
|
||||
#include "common/delimit.hpp"
|
||||
#include "common/environment.hpp"
|
||||
#include "common/invoke.hpp"
|
||||
#include "common/join.hpp"
|
||||
|
||||
#include "omnitrace/user.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <dlfcn.h>
|
||||
#include <functional>
|
||||
#include <gnu/libc-version.h>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#if !defined(OMNITRACE_USE_OMPT)
|
||||
# define OMNITRACE_USE_OMPT 0
|
||||
#endif
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
#define OMNITRACE_DLSYM(VARNAME, HANDLE, FUNCNAME) \
|
||||
if(HANDLE) \
|
||||
@@ -69,38 +50,6 @@
|
||||
} \
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
//
|
||||
// omnitrace symbols
|
||||
//
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
extern "C"
|
||||
{
|
||||
struct ompt_start_tool_result_t;
|
||||
|
||||
void omnitrace_init_library(void) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_init(const char*, bool, const char*) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_finalize(void) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_set_env(const char* env_name,
|
||||
const char* env_val) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_set_mpi(bool use, bool attached) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_push_trace(const char* name) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_pop_trace(const char* name) OMNITRACE_PUBLIC_API;
|
||||
|
||||
int omnitrace_user_start_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
int omnitrace_user_stop_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
|
||||
int omnitrace_user_start_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
int omnitrace_user_stop_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
|
||||
int omnitrace_user_push_region_dl(const char*) OMNITRACE_HIDDEN_API;
|
||||
int omnitrace_user_pop_region_dl(const char*) OMNITRACE_HIDDEN_API;
|
||||
|
||||
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
|
||||
const char*) OMNITRACE_PUBLIC_API;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
namespace omnitrace
|
||||
@@ -405,6 +354,32 @@ extern "C"
|
||||
}
|
||||
}
|
||||
|
||||
void omnitrace_push_region(const char* name)
|
||||
{
|
||||
if(!dl::get_active()) return;
|
||||
if(dl::get_thread_enabled())
|
||||
{
|
||||
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_push_region_f, name);
|
||||
}
|
||||
else
|
||||
{
|
||||
++dl::get_thread_count();
|
||||
}
|
||||
}
|
||||
|
||||
void omnitrace_pop_region(const char* name)
|
||||
{
|
||||
if(!dl::get_active()) return;
|
||||
if(dl::get_thread_enabled())
|
||||
{
|
||||
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_pop_region_f, name);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(dl::get_thread_count()-- == 0) omnitrace_user_start_thread_trace_dl();
|
||||
}
|
||||
}
|
||||
|
||||
void omnitrace_set_env(const char* a, const char* b)
|
||||
{
|
||||
setenv(a, b, 0);
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/defines.h"
|
||||
#include "omnitrace/user.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <dlfcn.h>
|
||||
#include <functional>
|
||||
#include <gnu/libc-version.h>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#if !defined(OMNITRACE_USE_OMPT)
|
||||
# define OMNITRACE_USE_OMPT 0
|
||||
#endif
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
//
|
||||
// omnitrace symbols
|
||||
//
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
extern "C"
|
||||
{
|
||||
void omnitrace_init_library(void) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_init(const char*, bool, const char*) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_finalize(void) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_set_env(const char* env_name,
|
||||
const char* env_val) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_set_mpi(bool use, bool attached) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_push_trace(const char* name) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_pop_trace(const char* name) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_push_region(const char*) OMNITRACE_PUBLIC_API;
|
||||
void omnitrace_pop_region(const char*) OMNITRACE_PUBLIC_API;
|
||||
|
||||
#if defined(OMNITRACE_DL_SOURCE) && (OMNITRACE_DL_SOURCE > 0)
|
||||
int omnitrace_user_start_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
int omnitrace_user_stop_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
|
||||
int omnitrace_user_start_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
int omnitrace_user_stop_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
|
||||
|
||||
int omnitrace_user_push_region_dl(const char*) OMNITRACE_HIDDEN_API;
|
||||
int omnitrace_user_pop_region_dl(const char*) OMNITRACE_HIDDEN_API;
|
||||
|
||||
struct ompt_start_tool_result_t;
|
||||
|
||||
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
|
||||
const char*) OMNITRACE_PUBLIC_API;
|
||||
#endif
|
||||
}
|
||||
@@ -232,6 +232,9 @@ get_sampling_freq();
|
||||
double&
|
||||
get_sampling_delay();
|
||||
|
||||
std::string
|
||||
get_sampling_cpus();
|
||||
|
||||
double&
|
||||
get_thread_sampling_freq();
|
||||
|
||||
|
||||
@@ -172,6 +172,13 @@ configure_settings()
|
||||
"increasing this value can fix deadlocks during init",
|
||||
0.5, "sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
std::string, "OMNITRACE_SAMPLING_CPUS",
|
||||
"CPUs to collect frequency information for. Values should be separated by commas "
|
||||
"and can be explicit or ranges, e.g. 0,1,5-8. An empty value implies 'all' and "
|
||||
"'none' suppresses all CPU frequency sampling",
|
||||
"", "sampling");
|
||||
|
||||
auto _backend = tim::get_env_choice<std::string>(
|
||||
"OMNITRACE_BACKEND",
|
||||
(_system_backend)
|
||||
@@ -522,9 +529,10 @@ print_settings()
|
||||
if(dmp::rank() > 0) return;
|
||||
|
||||
static std::set<tim::string_view_t> _sample_options = {
|
||||
"OMNITRACE_SAMPLING_FREQ", "OMNITRACE_SAMPLING_DELAY",
|
||||
"OMNITRACE_FLAT_SAMPLING", "OMNITRACE_TIMELINE_SAMPLING",
|
||||
"OMNITRACE_FLAT_SAMPLING", "OMNITRACE_TIMELINE_SAMPLING",
|
||||
"OMNITRACE_SAMPLING_FREQ", "OMNITRACE_SAMPLING_DELAY",
|
||||
"OMNITRACE_SAMPLING_CPUS", "OMNITRACE_FLAT_SAMPLING",
|
||||
"OMNITRACE_TIMELINE_SAMPLING", "OMNITRACE_FLAT_SAMPLING",
|
||||
"OMNITRACE_TIMELINE_SAMPLING",
|
||||
};
|
||||
static std::set<tim::string_view_t> _perfetto_options = {
|
||||
"OMNITRACE_OUTPUT_FILE",
|
||||
@@ -915,6 +923,13 @@ get_sampling_delay()
|
||||
return static_cast<tim::tsettings<double>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
std::string
|
||||
get_sampling_cpus()
|
||||
{
|
||||
static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUS");
|
||||
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
int64_t
|
||||
get_critical_trace_count()
|
||||
{
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@@ -40,8 +41,9 @@ namespace
|
||||
{
|
||||
struct cpu_freq
|
||||
{};
|
||||
using freq_pair_t = std::pair<size_t, double>;
|
||||
std::vector<std::deque<freq_pair_t>> cpu_frequencies = {};
|
||||
using freq_pair_t = std::pair<size_t, double>;
|
||||
std::vector<std::deque<freq_pair_t>> cpu_frequencies = {};
|
||||
std::set<size_t> enabled_cpu_frequencies = {};
|
||||
|
||||
struct cpu_mem
|
||||
{};
|
||||
@@ -107,6 +109,42 @@ config()
|
||||
|
||||
_ifs.close();
|
||||
|
||||
auto _enabled_val = get_sampling_cpus();
|
||||
if(_enabled_val != "none" && _enabled_val != "all")
|
||||
{
|
||||
auto _enabled = tim::delimit(_enabled_val, ",; \t");
|
||||
if(_enabled.empty())
|
||||
{
|
||||
for(size_t i = 0; i < _ncpu; ++i)
|
||||
enabled_cpu_frequencies.emplace(i);
|
||||
}
|
||||
for(auto&& _v : _enabled)
|
||||
{
|
||||
if(_v.find_first_not_of("0123456789-") != std::string::npos)
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(
|
||||
0,
|
||||
"Invalid CPU specification. Only numerical values (e.g., 0) or "
|
||||
"ranges (e.g., 0-7) are permitted. Ignoring %s...",
|
||||
_v.c_str());
|
||||
continue;
|
||||
}
|
||||
if(_v.find('-') != std::string::npos)
|
||||
{
|
||||
auto _vv = tim::delimit(_v, "-");
|
||||
OMNITRACE_CONDITIONAL_THROW(
|
||||
_vv.size() != 2,
|
||||
"Invalid CPU range specification: %s. Required format N-M, e.g. 0-4",
|
||||
_v.c_str());
|
||||
for(size_t i = std::stoull(_vv.at(0)); i < std::stoull(_vv.at(1)); ++i)
|
||||
enabled_cpu_frequencies.insert(i);
|
||||
}
|
||||
else
|
||||
{
|
||||
enabled_cpu_frequencies.insert(std::stoull(_v));
|
||||
}
|
||||
}
|
||||
}
|
||||
cpu_frequencies.resize(_ncpu);
|
||||
cpu_mhz_pos = _cpu_mhz_pos;
|
||||
ifs = std::make_unique<std::ifstream>("/proc/cpuinfo", std::ifstream::binary);
|
||||
@@ -129,7 +167,11 @@ sample()
|
||||
|
||||
auto _ts = tim::get_clock_real_now<size_t, std::nano>();
|
||||
for(int64_t i = 0; i < ncpu; ++i)
|
||||
{
|
||||
if(!enabled_cpu_frequencies.empty() && enabled_cpu_frequencies.count(i) == 0)
|
||||
continue;
|
||||
cpu_frequencies.at(i).emplace_back(_ts, _read_cpu_freq(i));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -206,7 +206,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
_TEST
|
||||
baseline binary-rewrite binary-rewrite-run binary-rewrite-sampling
|
||||
binary-rewrite-run-sampling runtime-instrument runtime-instrument-sampling)
|
||||
string(REPLACE "-run-" "-" _prefix "${TEST_NAME}-${_TEST}/")
|
||||
string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/")
|
||||
set(_environ "${TEST_ENVIRONMENT}")
|
||||
set(_labels "${_TEST}")
|
||||
set(_timeout ${TEST_REWRITE_TIMEOUT})
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle