Documentation + Miscellaneous Fixes (#36)

* Added documentation markdown source

* Replaced AARInternal with AMDResearch in URLs

* Renamed cpack artifact names

* Fix to testing and lulesh submodule checkout

* Docker updates

* CMake and CPack

- force CMAKE_INSTALL_LIBDIR to lib
- CPACK_DEBIAN_PACKAGE_RELEASE uses OMNITRACE_CPACK_SYSTEM_NAME
- CPACK_RPM_PACKAGE_RELEASE uses OMNITRACE_CPACK_SYSTEM_NAME
- Tweak LIBOMP_LIBRARY find in examples/openmp
- Tweak setup-env.sh.in

* Partial update of README

- status badges
- docs link
- removed install info (covered by docs)

* OMNITRACE_SAMPLING_CPUS setting

- enables control over which CPUs are sampled for frequency

* omnitrace exe updates

- exclude transaction clone, virtual thunk, non-virtual thunk
- module_function::start_address
- module_function::instructions
- verbosity > 0 encodes instructions into JSON

* Miscellaneous fixes

- relocate setup-env.sh.in
- add modulefile.in
- Updated README.md and source/docs/about.md
- cmake fix for libomp
- fix license in miscellaneous places
- dl.hpp and dl.cpp

* Update timemory and dyninst submodules

- timemory signals updates
- dyninst Movement-adhoc updates

* cmake format

[ROCm/rocprofiler-systems commit: 945f541965]
Bu işleme şunda yer alıyor:
Jonathan R. Madsen
2022-04-04 15:27:38 -05:00
işlemeyi yapan: GitHub
ebeveyn 4ddb8405ac
işleme 127e30a4d7
61 değiştirilmiş dosya ile 6277 ekleme ve 301 silme
+2 -2
Dosyayı Görüntüle
@@ -136,7 +136,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: stgz-installers
name: ubuntu-bionic-rocm-stgz-installers
path: |
build-release/omnitrace-*.sh
@@ -144,7 +144,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: deb-installers
name: ubuntu-bionic-rocm-deb-installers
path: |
build-release/omnitrace_*.deb
+2 -2
Dosyayı Görüntüle
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: stgz-installers
name: ubuntu-bionic-stgz-installers
path: |
build-release/omnitrace-*.sh
@@ -120,7 +120,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: deb-installers
name: ubuntu-bionic-deb-installers
path: |
build-release/omnitrace_*.deb
+2 -2
Dosyayı Görüntüle
@@ -136,7 +136,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: stgz-installers
name: ubuntu-focal-rocm-stgz-installers
path: |
build-release/omnitrace-*.sh
@@ -144,7 +144,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: deb-installers
name: ubuntu-focal-rocm-deb-installers
path: |
build-release/omnitrace_*.deb
+2 -2
Dosyayı Görüntüle
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: stgz-installers
name: ubuntu-focal-stgz-installers
path: |
build-release/omnitrace-*.sh
@@ -120,7 +120,7 @@ jobs:
timeout-minutes: 10
uses: actions/upload-artifact@v2
with:
name: deb-installers
name: ubuntu-focal-deb-installers
path: |
build-release/omnitrace_*.deb
+22 -9
Dosyayı Görüntüle
@@ -25,7 +25,7 @@ project(
LANGUAGES C CXX
VERSION ${OMNITRACE_VERSION}
DESCRIPTION "CPU/GPU Application tracing with static/dynamic binary instrumentation"
HOMEPAGE_URL "https://github.com/AARInternal/omnitrace")
HOMEPAGE_URL "https://github.com/AMDResearch/omnitrace")
message(
STATUS
@@ -54,11 +54,17 @@ include(MacroUtilities) # various functions and macros
include(Compilers) # compiler identification
include(BuildSettings) # compiler flags
# Force CMAKE_INSTALL_LIBDIR to "lib" because dyninst always installs to lib.
# FORCE overwrites whatever value is already stored in the cache (for example a
# "lib64" default or a user-supplied -DCMAKE_INSTALL_LIBDIR=...), so every
# install path derived from CMAKE_INSTALL_LIBDIR lines up with the directory
# dyninst uses.
set(CMAKE_INSTALL_LIBDIR
"lib"
CACHE STRING "Object code libraries (lib)" FORCE)
set(CMAKE_CXX_STANDARD
17
CACHE STRING "CXX language standard")
omnitrace_add_feature(CMAKE_CXX_STANDARD "CXX language standard")
omnitrace_add_feature(CMAKE_BUILD_TYPE "Build optimization level")
omnitrace_add_feature(CMAKE_INSTALL_PREFIX "Installation prefix")
omnitrace_add_feature(CMAKE_CXX_COMPILER "C++ compiler")
omnitrace_add_feature(CMAKE_CXX_STANDARD "CXX language standard")
omnitrace_add_option(CMAKE_CXX_STANDARD_REQUIRED "Require C++ language standard" ON)
omnitrace_add_option(CMAKE_CXX_EXTENSIONS "Compiler specific language extensions" OFF)
omnitrace_add_option(CMAKE_INSTALL_RPATH_USE_LINK_PATH "Enable rpath to linked libraries"
@@ -170,8 +176,13 @@ add_subdirectory(source)
#
# ------------------------------------------------------------------------------#
configure_file(${PROJECT_SOURCE_DIR}/scripts/setup-env.sh.in
${PROJECT_BINARY_DIR}/scripts/setup-env.sh @ONLY)
configure_file(${PROJECT_SOURCE_DIR}/cmake/Templates/setup-env.sh.in
${PROJECT_BINARY_DIR}/install-tree/setup-env.sh @ONLY)
# Generate the environment-modules modulefile from its template. The output file
# is named after the omnitrace version and is written into the build tree's
# install-tree staging directory. @ONLY limits substitution to at-sign-delimited
# references so that ${...} variables used by the modulefile itself are left
# untouched.
configure_file(
${PROJECT_SOURCE_DIR}/cmake/Templates/modulefile.in
${PROJECT_BINARY_DIR}/install-tree/modulefiles/${PROJECT_NAME}/${OMNITRACE_VERSION}
@ONLY)
install(
PROGRAMS ${PROJECT_SOURCE_DIR}/scripts/omnitrace-merge.jl
@@ -184,20 +195,22 @@ install(
OPTIONAL)
install(
FILES ${PROJECT_BINARY_DIR}/scripts/setup-env.sh
FILES ${PROJECT_BINARY_DIR}/install-tree/setup-env.sh
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}
OPTIONAL)
install(
FILES
${PROJECT_BINARY_DIR}/install-tree/modulefiles/${PROJECT_NAME}/${OMNITRACE_VERSION}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/modulefiles/${PROJECT_NAME}
OPTIONAL)
# ------------------------------------------------------------------------------#
#
# examples
#
# ------------------------------------------------------------------------------#
if(OMNITRACE_BUILD_LTO)
omnitrace_restore_variables(LTO VARIABLES CMAKE_INTERPROCEDURAL_OPTIMIZATION)
endif()
if(OMNITRACE_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
+7 -65
Dosyayı Görüntüle
@@ -1,71 +1,13 @@
# omnitrace: application tracing with static/dynamic binary instrumentation
It is highly recommended to use the pre-built binary installers for omnitrace which are provided in the "Assets" section of each release.
[![Ubuntu 18.04 (GCC 7, 8, MPICH)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-bionic.yml/badge.svg)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-bionic.yml)
[![Ubuntu 20.04 (GCC 7, 8, 9, 10)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-external.yml/badge.svg)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-external.yml)
[![Ubuntu 20.04 (GCC 9, external Dyninst)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-dyninst-package.yml/badge.svg)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-dyninst-package.yml)
[![Ubuntu 20.04 (GCC 9, MPICH, OpenMPI)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal.yml/badge.svg)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal.yml)
[![Ubuntu 20.04 (GCC 9, MPICH, OpenMPI, ROCm 4.3, 4.5, 5.0)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-external-rocm.yml/badge.svg)](https://github.com/AMDResearch/omnitrace/actions/workflows/ubuntu-focal-external-rocm.yml)
## Dependencies
- Ubuntu 18.04 or Ubuntu 20.04
- Other OS distributions may be supported but are not tested
- GCC compiler v7+
- Older GCC compilers may be supported but are not tested
- Clang compilers are generally supported for Omnitrace but not Dyninst
- [CMake](https://cmake.org/) v3.15+
- [DynInst](https://github.com/dyninst/dyninst) for dynamic or static instrumentation
- [TBB](https://github.com/oneapi-src/oneTBB) required by Dyninst
- [ElfUtils](https://sourceware.org/elfutils/) required by Dyninst
- [LibIberty](https://github.com/gcc-mirror/gcc/tree/master/libiberty) required by Dyninst
- [Boost](https://www.boost.org/) required by Dyninst
- [OpenMP](https://www.openmp.org/) optional by Dyninst
- [ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu) (optional)
- HIP
- Roctracer for HIP API and kernel tracing
- [PAPI](https://icl.utk.edu/papi/)
- [libunwind](https://www.nongnu.org/libunwind/) for call-stack sampling
- Several optional third-party profiling tools supported by timemory (e.g. TAU, Caliper, CrayPAT, etc.)
## Installing CMake
If using Ubuntu 20.04, `apt-get install cmake` will install cmake v3.16.3. If using Ubuntu 18.04, the cmake version via apt is too old (v3.10.2). In this case, run:
```console
python3 -m pip install 'cmake==3.18.4'
export PATH=${HOME}/.local/bin:${PATH}
```
## Installing DynInst
The easiest way to install Dyninst is to configure omnitrace with `-DOMNITRACE_BUILD_DYNINST=ON` and have Dyninst install its dependencies:
`-DDyninst_BUILD_TBB=ON -DDyninst_BUILD_ELFUTILS=ON -DDyninst_BUILD_BOOST=ON -DDyninst_BUILD_LIBIBERTY=ON`.
```shell
git clone https://github.com/spack/spack.git
source ./spack/share/spack/setup-env.sh
spack compiler find
spack external find
spack install dyninst
spack load -r dyninst
```
## Installing omnitrace
Omnitrace can have full MPI support (`-DOMNITRACE_USE_MPI=ON`) or partially (`-DOMNITRACE_USE_MPI_HEADERS=ON`). The only difference between these two modes
is whether or not the results collected via timemory can be aggregated into one output file. If full MPI support is selected, make sure your target application
is built against the same MPI distribution as omnitrace, i.e. do not build omnitrace with MPICH and use it on a target application built against OpenMPI.
If partial support is selected, build omnitrace against OpenMPI -- the reason this is recommended is because the `MPI_COMM_WORLD` in OpenMPI is a pointer to
`ompi_communicator_t` (8 bytes) whereas `MPI_COMM_WORLD` in MPICH is an `int` (4 bytes). Building omnitrace with partial MPI support and the MPICH header and using
on an application using OpenMPI will thus implicitly cast `MPI_COMM_WORLD` to 4 bytes in the MPI function wrappers before calling the underlying OpenMPI function
resulting in an incorrect address for `ompi_communicator_t` whereas partial MPI support with the OpenMPI headers does not cast `MPI_COMM_WORLD` into a smaller datatype
when used with MPICH.
```shell
OMNITRACE_ROOT=${HOME}/sw/omnitrace
git clone https://github.com/AARInternal/omnitrace.git
cmake -B build-omnitrace -DOMNITRACE_USE_MPI=ON -DCMAKE_INSTALL_PREFIX=${OMNITRACE_ROOT} omnitrace
cmake --build build-omnitrace --target all --parallel 8
cmake --build build-omnitrace --target install
export PATH=${OMNITRACE_ROOT}/bin:${PATH}
export LD_LIBRARY_PATH=${OMNITRACE_ROOT}/lib64:${OMNITRACE_ROOT}/lib:${LD_LIBRARY_PATH}
```
Omnitrace is an AMD research project and should not be treated as an official part of the ROCm software stack.
The documentation for omnitrace is available at [amdresearch.github.io/omnitrace](https://amdresearch.github.io/omnitrace/).
## Using Omnitrace Executable
+8 -4
Dosyayı Görüntüle
@@ -1,7 +1,8 @@
if(DYNINST_BUILD_ELFUTILS AND DYNINST_ELFUTILS_DOWNLOAD_VERSION)
omnitrace_add_feature(DYNINST_ELFUTILS_DOWNLOAD_VERSION "ElfUtils download version")
foreach(_LIB dw elf)
install(
FILES
PROGRAMS
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib/lib${_LIB}${CMAKE_SHARED_LIBRARY_SUFFIX}
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib/lib${_LIB}${CMAKE_SHARED_LIBRARY_SUFFIX}.1
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/dyninst-tpls/lib/lib${_LIB}-${DYNINST_ELFUTILS_DOWNLOAD_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}
@@ -32,6 +33,7 @@ set(CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
set(CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
set(CPACK_PACKAGE_CONTACT "jonathan.madsen@amd.com")
set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE")
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
set(OMNITRACE_CPACK_SYSTEM_NAME
"${_SYSTEM_NAME}"
CACHE STRING "System name, e.g. Linux or Ubuntu-18.04")
@@ -101,8 +103,9 @@ omnitrace_add_feature(OMNITRACE_PACKAGE_FILE_NAME "CPack filename")
#
# -------------------------------------------------------------------------------------- #
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/AARInternal/omnitrace")
set(CPACK_DEBIAN_PACKAGE_RELEASE "${CMAKE_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/AMDResearch/omnitrace")
set(CPACK_DEBIAN_PACKAGE_RELEASE
"${OMNITRACE_CPACK_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
string(REGEX REPLACE "([a-zA-Z])-([0-9])" "\\1\\2" CPACK_DEBIAN_PACKAGE_RELEASE
"${CPACK_DEBIAN_PACKAGE_RELEASE}")
string(REPLACE "-" "~" CPACK_DEBIAN_PACKAGE_RELEASE "${CPACK_DEBIAN_PACKAGE_RELEASE}")
@@ -166,7 +169,8 @@ if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX)
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "${CPACK_PACKAGING_INSTALL_PREFIX}")
endif()
set(CPACK_RPM_PACKAGE_RELEASE "${CMAKE_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
set(CPACK_RPM_PACKAGE_RELEASE
"${OMNITRACE_CPACK_SYSTEM_NAME}${OMNITRACE_CPACK_PACKAGE_SUFFIX}")
string(REGEX REPLACE "([a-zA-Z])-([0-9])" "\\1\\2" CPACK_RPM_PACKAGE_RELEASE
"${CPACK_RPM_PACKAGE_RELEASE}")
string(REPLACE "-" "~" CPACK_RPM_PACKAGE_RELEASE "${CPACK_RPM_PACKAGE_RELEASE}")
+2 -5
Dosyayı Görüntüle
@@ -190,10 +190,7 @@ function(ROCM_VERSION_PARSE_VERSION_FILES)
endforeach()
endfunction()
# search for HIP to set ROCM_PATH
if(NOT hip_FOUND)
find_package(hip)
endif()
# search for HIP to set ROCM_PATH if(NOT hip_FOUND) find_package(hip) endif()
function(COMPUTE_ROCM_VERSION_DIR)
if(EXISTS "${ROCmVersion_VERSION_FILE}" AND IS_ABSOLUTE
@@ -231,7 +228,7 @@ function(ROCM_VERSION_PARSE_VERSION_FILES)
set(_PATHS ${ROCmVersion_DIR})
else()
set(_PATHS ${ROCmVersion_DIR} ${ROCmVersion_ROOT} ${ROCmVersion_ROOT_DIR}
${ROCM_PATH} $ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} /opt/rocm)
$ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} ${ROCM_PATH} /opt/rocm)
rocm_version_message(STATUS "ROCmVersion search paths: ${_PATHS}")
endif()
+4 -1
Dosyayı Görüntüle
@@ -191,7 +191,7 @@ if(OMNITRACE_BUILD_DYNINST)
omnitrace_target_compile_definitions(
omnitrace-dyninst
INTERFACE
DYNINST_API_RT="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}:$<TARGET_FILE_DIR:Dyninst::dyninstAPI_RT>:${CMAKE_INSTALL_PREFIX}/lib/$<TARGET_FILE_NAME:Dyninst::dyninstAPI_RT>:$<TARGET_FILE:Dyninst::dyninstAPI_RT>"
DYNINST_API_RT="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}:$<TARGET_FILE_DIR:Dyninst::dyninstAPI_RT>:${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/$<TARGET_FILE_NAME:Dyninst::dyninstAPI_RT>:$<TARGET_FILE:Dyninst::dyninstAPI_RT>"
)
endif()
@@ -466,6 +466,9 @@ if(NOT TARGET PTL::ptl-shared)
set(PTL_USE_GPU OFF)
set(PTL_DEVELOPER_INSTALL OFF)
# Default BUILD_OBJECT_LIBS to OFF only when nothing has defined it yet; a value
# already provided by the user or an enclosing scope is left alone.
# NOTE(review): presumably needed so the omnitrace_save_variables() call that
# follows (which lists BUILD_OBJECT_LIBS) always has a defined value to save --
# confirm against that helper's implementation.
if(NOT DEFINED BUILD_OBJECT_LIBS)
set(BUILD_OBJECT_LIBS OFF)
endif()
omnitrace_save_variables(
BUILD_CONFIG
VARIABLES BUILD_SHARED_LIBS BUILD_STATIC_LIBS BUILD_OBJECT_LIBS
+15
Dosyayı Görüntüle
@@ -0,0 +1,15 @@
#%Module1.0
# Environment-modules modulefile template for omnitrace. The at-sign-delimited
# placeholders are substituted when the build system configures this file.
module-whatis "omnitrace (version @OMNITRACE_VERSION@)"
# Help text printed on stderr for "module help".
proc ModulesHelp { } {
puts stderr "Loads omnitrace v@OMNITRACE_VERSION@"
}
# ROOT is the normalized path three directories above the directory that holds
# this modulefile, so the environment follows the install tree if it is moved.
# NOTE(review): this reaches the install prefix only when the modulefile lives
# at <prefix>/<one-level-dir>/modulefiles/<name>/<version> -- confirm for a
# non-default data root directory.
set ROOT [file normalize [file dirname [file normalize ${ModulesCurrentModulefile}]]/../../..]
prepend-path CMAKE_PREFIX_PATH "${ROOT}"
prepend-path PATH "${ROOT}/bin"
prepend-path LD_LIBRARY_PATH "${ROOT}/@CMAKE_INSTALL_LIBDIR@"
prepend-path PYTHONPATH "${ROOT}/@CMAKE_INSTALL_PYTHONDIR@"
# The package directory is hard-coded to share/cmake/omnitrace under ROOT.
# NOTE(review): presumably the location of the installed CMake package
# configuration -- verify against the install rules.
setenv @PROJECT_NAME@_DIR "${ROOT}/share/cmake/omnitrace"
@@ -8,7 +8,8 @@ if [ ! -d "${BASEDIR}" ]; then
return 1
fi
export PATH=${BASEDIR}/bin:${PATH}
export LD_LIBRARY_PATH=${BASEDIR}/@CMAKE_INSTALL_LIBDIR@:${LD_LIBRARY_PATH}
PATH=${BASEDIR}/bin:${PATH}
LD_LIBRARY_PATH=${BASEDIR}/@CMAKE_INSTALL_LIBDIR@:${LD_LIBRARY_PATH}
return 0
export PATH
export LD_LIBRARY_PATH
+57
Dosyayı Görüntüle
@@ -0,0 +1,57 @@
ARG DISTRO=centos
ARG VERSION=7
FROM ${DISTRO}:${VERSION}
ENV HOME /root
ENV SHELL /bin/bash
ENV BASH_ENV /etc/bash.bashrc
ENV DEBIAN_FRONTEND noninteractive
WORKDIR /tmp
SHELL [ "/bin/bash", "-c" ]
ENV PATH /usr/local/bin:${PATH}
RUN yum update -y && \
yum groupinstall -y "Development Tools" && \
yum install -y centos-release-scl && \
yum install -y epel-release && \
yum install -y devtoolset-9 python3-pip openmpi3-devel zlib-devel numactl-devel papi-devel dpkg-devel dpkg-dev && \
python3 -m pip install 'cmake==3.18.4'
ARG AMDGPU_RPM=21.40.2/rhel/7.9/amdgpu-install-21.40.2.40502-1.el7.noarch.rpm
# ARG AMDGPU_RPM=latest/rhel/7.9/amdgpu-install-21.50.50000-1.el7.noarch.rpm
RUN yum install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms --skip-broken -y && \
yum install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
yum update -y && \
yum clean all
RUN ln -s /opt/rocm-* /opt/rocm
WORKDIR /home
SHELL [ "/bin/bash", "--login", "-c" ]
COPY ./entrypoint-centos.sh /docker-entrypoint.sh
ENTRYPOINT [ "/docker-entrypoint.sh" ]
#1 yum update
#2 yum groupinstall "Development Tools"
#3 yum install devtoolset-9-toolchain
#4 yum install devtoolset-9
#5 yum install devtoolset-7-toolchain
#6 yum search devtoolset
#7 yum search -a devtoolset
#8 yum search --help
#9 yum repolist
#10 yum list available
#11 yum list available devtoolset*
#12 yum list available devtoolset\*
#13 subscription-manager list --available
#14 yum install subscription-manager
#15 subscription-manager list --available
#16 yum install centos-release-scl
#17 yum-config-manager --enable rhel-server-rhscl-7-rpms
#18 yum install devtoolset-7
#19 yum install devtoolset-9
#20 scl enable devtoolset-9 bash
+32
Dosyayı Görüntüle
@@ -0,0 +1,32 @@
ARG DISTRO=opensuse/leap
ARG VERSION=15.3
FROM ${DISTRO}:${VERSION}
ENV HOME /root
ENV SHELL /bin/bash
ENV BASH_ENV /etc/bash.bashrc
ENV DEBIAN_FRONTEND noninteractive
WORKDIR /tmp
SHELL [ "/bin/bash", "-c" ]
ENV PATH /usr/local/bin:${PATH}
RUN zypper update -y && \
zypper dist-upgrade -y && \
zypper install -y -t pattern devel_basis && \
zypper install -y python3-pip openmpi3-devel gcc-c++ git libnuma-devel dpkg-devel rpm-build && \
python3 -m pip install 'cmake==3.18.4'
# ARG AMDGPU_RPM=21.40.2/sle/15/amdgpu-install-21.40.2.40502-1.noarch.rpm
ARG AMDGPU_RPM=latest/sle/15/amdgpu-install-21.50.50000-1.noarch.rpm
RUN zypper --no-gpg-checks install -y https://repo.radeon.com/amdgpu-install/${AMDGPU_RPM} && \
zypper addrepo https://download.opensuse.org/repositories/devel:languages:perl/SLE_15/devel:languages:perl.repo && \
zypper --non-interactive --gpg-auto-import-keys refresh && \
amdgpu-install --usecase=rocm,hip,hiplibsdk --no-dkms -y && \
zypper install -y rocm-hip-sdk roctracer-dev rocm-smi-lib rocprofiler-dev && \
zypper clean --all
WORKDIR /home
SHELL [ "/bin/bash", "--login", "-c" ]
+36
Dosyayı Görüntüle
@@ -0,0 +1,36 @@
ARG DISTRO=opensuse/leap
ARG VERSION=15.3
FROM ${DISTRO}:${VERSION}
ENV HOME /root
ENV SHELL /bin/bash
ENV BASH_ENV /etc/bash.bashrc
ENV DEBIAN_FRONTEND noninteractive
WORKDIR /tmp
SHELL [ "/bin/bash", "-c" ]
ENV PATH /usr/local/bin:${PATH}
ARG EXTRA_PACKAGES=""
ARG ELFUTILS_DOWNLOAD_VERSION="0.183"
ARG NJOBS="12"
RUN zypper update -y && \
zypper dist-upgrade -y && \
zypper install -y -t pattern devel_basis && \
zypper install -y python3-pip openmpi3-devel gcc-c++ git libnuma-devel dpkg-devel rpm-build papi-devel && \
python3 -m pip install 'cmake==3.18.4'
COPY ./dyninst-source /tmp/dyninst
RUN cd /tmp/dyninst && \
cmake -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_BOOST=ON -DBUILD_TBB=ON -DBUILD_ELFUTILS=ON -DBUILD_LIBIBERTY=ON && \
cmake --build build --target all --parallel ${NJOBS} && \
cmake --build build --target install --parallel ${NJOBS} && \
cd /tmp && \
shopt -s dotglob extglob && \
rm -rf *
WORKDIR /home
SHELL [ "/bin/bash", "--login", "-c" ]
+61 -15
Dosyayı Görüntüle
@@ -2,33 +2,79 @@
set -e
if [ ! -f Dockerfile.ci ]; then cd docker; fi
if [ ! -f Dockerfile.ci ]; then
echo "Error! Execute script from source directory"
exit 1
fi
rm -rf ./dyninst-source
cp -r ../external/dyninst ./dyninst-source
rm -rf ./dyninst-source/{build,install}*
: ${DISTRO:=ubuntu}
: ${VERSIONS:=20.04 18.04}
: ${NJOBS=$(nproc)}
: ${ELFUTILS_VERSION:=0.183}
# send-error <message...>
# Prints "Error: <message>" (preceded by a blank line) on stdout and terminates
# the script with exit status 1.
send-error()
{
echo -e "\nError: ${@}"
exit 1
}
# verbose-run <command> [args...]
# Echoes the command line inside a banner and then executes it through eval.
# $@ is intentionally unquoted and eval re-parses the joined text, so the
# arguments go through word splitting and expansion a second time; arguments
# containing whitespace or shell metacharacters are not preserved verbatim.
verbose-run()
{
echo -e "\n\n### Executing \"${@}\"... ###\n"
eval $@
}
n=0
while [[ $# -gt 0 ]]
do
case "${1}" in
"--distro")
shift
DISTRO=${1}
;;
"--versions")
shift
VERSIONS=${1}
;;
"-j")
shift
NJOBS=${1}
;;
"--elfutils-version")
shift
ELFUTILS_VERSION=${1}
;;
*)
send-error "Unsupported argument at position $((${n} + 1)) :: ${1}"
;;
esac
n=$((${n} + 1))
shift
done
DOCKER_FILE=Dockerfile.${DISTRO}.ci
if [ ! -f ${DOCKER_FILE} ]; then cd docker; fi
if [ ! -f ${DOCKER_FILE} ]; then
echo "Error! Execute script from source directory"
exit 1
fi
verbose-run rm -rf ./dyninst-source
verbose-run cp -r ../external/dyninst ./dyninst-source
verbose-run rm -rf ./dyninst-source/{build,install}*
set -e
DISTRO_IMAGE=${DISTRO}
if [ "${DISTRO}" = "opensuse" ]; then DISTRO_IMAGE="opensuse/leap"; fi
for VERSION in ${VERSIONS}
do
docker build . \
-f Dockerfile.ci \
verbose-run docker build . \
-f ${DOCKER_FILE} \
--tag jrmadsen/omnitrace-ci:${DISTRO}-${VERSION} \
--build-arg DISTRO=${DISTRO} \
--build-arg DISTRO=${DISTRO_IMAGE} \
--build-arg VERSION=${VERSION} \
--build-arg NJOBS=${NJOBS} \
--build-arg ELFUTILS_DOWNLOAD_VERSION=${ELFUTILS_VERSION}
done
rm -rf ./dyninst-source
verbose-run rm -rf ./dyninst-source
+49 -4
Dosyayı Görüntüle
@@ -10,10 +10,11 @@ set -e
# build-release <container> <os> <rocm-version> <code-version> <mpi>
#
# Runs scripts/build-release.sh inside the given docker image with the current
# directory mounted at /home/omnitrace. The remaining arguments are forwarded
# to the script as environment variables:
#   <os>            -> DISTRO        (e.g. ubuntu-20.04)
#   <rocm-version>  -> ROCM_VERSION
#   <code-version>  -> VERSION
#   <mpi>           -> MPI
#
# The variables are assigned without `local`, so they overwrite the globals of
# the same name, exactly as before.
build-release()
{
    CONTAINER=$1
    OS=$2
    ROCM_VERSION=$3
    CODE_VERSION=$4
    # MPI is the fifth positional argument; reading $4 here would hand the
    # code version to the container as the MPI flag.
    MPI=$5
    docker run -it --rm -v ${PWD}:/home/omnitrace --env DISTRO=${OS} --env ROCM_VERSION=${ROCM_VERSION} --env VERSION=${CODE_VERSION} --env MPI=${MPI} ${CONTAINER} /home/omnitrace/scripts/build-release.sh
}
: ${DISTRO:=ubuntu}
@@ -21,6 +22,50 @@ build-release()
: ${ROCM_VERSIONS:=5.0 4.5 4.3}
: ${MPI:=0}
send-error()
{
echo -e "\nError: ${@}"
exit 1
}
verbose-run()
{
echo -e "\n\n### Executing \"${@}\"... ###\n"
eval $@
}
n=0
while [[ $# -gt 0 ]]
do
case "${1}" in
"--distro")
shift
DISTRO=${1}
;;
"--versions")
shift
VERSIONS=${1}
;;
"--rocm-versions")
shift
ROCM_VERSIONS=${1}
;;
*)
if [ "${n}" -eq 0 ]; then
DISTRO=${1}
elif [ "${n}" -eq 1 ]; then
VERSIONS=${1}
elif [ "${n}" -eq 2 ]; then
ROCM_VERSIONS=${1}
else
send-error "Unsupported argument at position $((${n} + 1)) :: ${1}"
fi
;;
esac
n=$((${n} + 1))
shift
done
CODE_VERSION=$(cat VERSION)
for VERSION in ${VERSIONS}
@@ -28,6 +73,6 @@ do
TAG=${DISTRO}-${VERSION}
for ROCM_VERSION in ${ROCM_VERSIONS}
do
build-release jrmadsen/omnitrace-${TAG}-rocm-${ROCM_VERSION} ${ROCM_VERSION} ${CODE_VERSION} ${MPI}
build-release jrmadsen/omnitrace-${TAG}-rocm-${ROCM_VERSION} ${DISTRO}-${VERSION} ${ROCM_VERSION} ${CODE_VERSION} ${MPI}
done
done
+125 -6
Dosyayı Görüntüle
@@ -3,19 +3,138 @@
: ${ROCM_VERSIONS:="5.0 4.5 4.3"}
: ${DISTRO:=ubuntu}
: ${VERSIONS:=20.04 18.04}
: ${CI:=""}
set -e
if [ ! -f Dockerfile ]; then cd docker; fi
send-error()
{
echo -e "\nError: ${@}"
exit 1
}
verbose-run()
{
echo -e "\n\n### Executing \"${@}\"... ###\n"
eval $@
}
n=0
while [[ $# -gt 0 ]]
do
case "${1}" in
"--distro")
shift
DISTRO=${1}
;;
"--versions")
shift
VERSIONS=${1}
;;
"--rocm-versions")
shift
ROCM_VERSIONS=${1}
;;
*)
if [ "${n}" -eq 0 ]; then
DISTRO=${1}
elif [ "${n}" -eq 1 ]; then
VERSIONS=${1}
elif [ "${n}" -eq 2 ]; then
ROCM_VERSIONS=${1}
else
send-error "Unsupported argument at position $((${n} + 1)) :: ${1}"
fi
;;
esac
n=$((${n} + 1))
shift
done
DOCKER_FILE="Dockerfile.${DISTRO}"
if [ -n "${CI}" ]; then DOCKER_FILE="${DOCKER_FILE}.ci"; fi
if [ ! -f ${DOCKER_FILE} ]; then cd docker; fi
if [ ! -f ${DOCKER_FILE} ]; then send-error "File \"${DOCKER_FILE}\" not found"; fi
for VERSION in ${VERSIONS}
do
for i in ${ROCM_VERSIONS}
do
ROCM_REPO_VERSION=${i}
if [ "${i}" = "5.0" ]; then ROCM_REPO_VERSION=debian; fi
if [ "${i}" = "4.1" ]; then ROCM_REPO_DIST="xenial"; fi
if [ "${i}" = "4.0" ]; then ROCM_REPO_DIST="xenial"; fi
docker build . --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO} --build-arg VERSION=${VERSION} --build-arg ROCM_REPO_VERSION=${ROCM_REPO_VERSION} --build-arg ROCM_REPO_DIST=${ROCM_REPO_DIST}
if [ "${DISTRO}" = "ubuntu" ]; then
ROCM_REPO_DIST="ubuntu"
ROCM_REPO_VERSION=${i}
# Adjust the apt repository coordinates for ROCm releases that do not follow
# the default layout (dist "ubuntu", repo directory named after the version).
case "${i}" in
    5.0*)
        # ROCm 5.0 is published under the "debian" repository directory
        ROCM_REPO_VERSION="debian"
        ;;
    4.1* | 4.0*)
        # ROCm 4.0 / 4.1 packages are published for the "xenial" dist
        ROCM_REPO_DIST="xenial"
        ;;
    *)
        # Every other release (including the script defaults 4.5 and 4.3)
        # uses the default coordinates set above; rejecting them here would
        # abort the default invocation after the first image.
        ;;
esac
verbose-run docker build . -f ${DOCKER_FILE} --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO} --build-arg VERSION=${VERSION} --build-arg ROCM_REPO_VERSION=${ROCM_REPO_VERSION} --build-arg ROCM_REPO_DIST=${ROCM_REPO_DIST}
elif [ "${DISTRO}" = "centos" ]; then
case "${VERSION}" in
7)
RPM_PATH=7.9
RPM_TAG=".el7"
;;
8)
RPM_PATH=8.5
RPM_TAG=".el7"
;;
*)
send-error "Invalid centos version ${VERSION}. Supported: 7, 8"
esac
case "${i}" in
5.0*)
ROCM_RPM=latest/rhel/${RPM_PATH}/amdgpu-install-21.50.50000-1${RPM_TAG}.noarch.rpm
;;
4.5 | 4.5.2)
ROCM_RPM=21.40.2/rhel/${RPM_PATH}/amdgpu-install-21.40.2.40502-1${RPM_TAG}.noarch.rpm
;;
4.5.1)
ROCM_RPM=21.40.1/rhel/${RPM_PATH}/amdgpu-install-21.40.1.40501-1${RPM_TAG}.noarch.rpm
;;
4.5.0)
ROCM_RPM=21.40/rhel/${RPM_PATH}/amdgpu-install-21.40.1.40501-1${RPM_TAG}.noarch.rpm
;;
*)
send-error "Unsupported combination :: ${DISTRO}-${VERSION} + ROCm ${i}"
;;
esac
verbose-run docker build . -f ${DOCKER_FILE} --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO} --build-arg VERSION=${VERSION} --build-arg AMDGPU_RPM=${ROCM_RPM}
elif [ "${DISTRO}" = "opensuse" ]; then
case "${VERSION}" in
15.*)
DISTRO_IMAGE="opensuse/leap"
echo "DISTRO_IMAGE: ${DISTRO_IMAGE}"
;;
*)
send-error "Invalid opensuse version ${VERSION}. Supported: 15.x"
;;
esac
case "${i}" in
5.0*)
ROCM_RPM=latest/sle/15/amdgpu-install-21.50.50000-1.noarch.rpm
;;
4.5 | 4.5.2)
ROCM_RPM=21.40.2/sle/15/amdgpu-install-21.40.2.40502-1.noarch.rpm
;;
4.5.1)
ROCM_RPM=21.40.1/sle/15/amdgpu-install-21.40.1.40501-1.noarch.rpm
;;
4.5.0)
ROCM_RPM=21.40/sle/15/amdgpu-install-21.40.1.40501-1.noarch.rpm
;;
*)
send-error "Unsupported combination :: ${DISTRO}-${VERSION} + ROCm ${i}"
;;
esac
verbose-run docker build . -f ${DOCKER_FILE} --tag jrmadsen/omnitrace-${DISTRO}-${VERSION}-rocm-${i} --build-arg DISTRO=${DISTRO_IMAGE} --build-arg VERSION=${VERSION} --build-arg AMDGPU_RPM=${ROCM_RPM}
fi
done
done
+14
Dosyayı Görüntüle
@@ -0,0 +1,14 @@
#!/bin/bash
# Container entrypoint: prepares the toolchain environment, then either starts
# an interactive shell or runs the command given as arguments.
# Enable the devtoolset-9 software collection in this shell.
source scl_source enable devtoolset-9
# Make the `module` command available and load the MPI module.
source /etc/profile.d/modules.sh
module load mpi
export LC_ALL=en_US.UTF-8
if [ -z "${1}" ]; then
# No command supplied: replace this process with an interactive bash.
exec bash
else
# Command supplied: abort on the first failure and evaluate the arguments as
# a single command line ($@ is unquoted, so it is re-split before eval).
set -e
eval $@
fi
+5
Dosyayı Görüntüle
@@ -81,6 +81,11 @@ function(CHECKOUT_GIT_SUBMODULE)
set(_SUBMODULE_EXISTS OFF)
if(EXISTS "${_SUBMODULE}" AND NOT IS_DIRECTORY "${_SUBMODULE}")
set(_SUBMODULE_EXISTS ON)
else()
set(_SUBMODULE "${CMAKE_SOURCE_DIR}/.gitmodules")
if(EXISTS "${_SUBMODULE}" AND NOT IS_DIRECTORY "${_SUBMODULE}")
set(_SUBMODULE_EXISTS ON)
endif()
endif()
set(_HAS_REPO_URL OFF)
-1
Dosyayı Görüntüle
@@ -57,7 +57,6 @@ Authors of the OpenMP code:
#include "../common/npb-CPP.hpp"
#include "npbparams.hpp"
#include "omp.h"
/*
* ---------------------------------------------------------------------
+5 -3
Dosyayı Görüntüle
@@ -12,12 +12,14 @@ add_executable(openmp-lu ${CMAKE_CURRENT_SOURCE_DIR}/LU/lu.cpp
$<TARGET_OBJECTS:openmp-common>)
find_program(CLANGXX_EXECUTABLE NAMES clang++)
if(CLANGXX_EXECUTABLE)
find_library(LIBOMP_LIBRARY
NAMES omp ${CMAKE_SHARED_LIBRARY_PREFIX}omp${CMAKE_SHARED_LIBRARY_SUFFIX}.5)
if(CLANGXX_EXECUTABLE AND LIBOMP_LIBRARY)
target_compile_options(openmp-common PUBLIC -W -Wall -fopenmp=libomp)
target_compile_options(openmp-cg PRIVATE -W -Wall -fopenmp=libomp)
target_link_libraries(openmp-cg PRIVATE omp)
target_link_libraries(openmp-cg PRIVATE ${LIBOMP_LIBRARY})
target_compile_options(openmp-lu PRIVATE -W -Wall -fopenmp=libomp)
target_link_libraries(openmp-lu PRIVATE omp)
target_link_libraries(openmp-lu PRIVATE ${LIBOMP_LIBRARY})
omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-common)
omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-cg)
omnitrace_custom_compilation(COMPILER ${CLANGXX_EXECUTABLE} TARGET openmp-lu)
+3 -3
Dosyayı Görüntüle
@@ -59,7 +59,6 @@ Authors of the OpenMP code:
#include "../common/npb-CPP.hpp"
#include "npbparams.hpp"
#include "omp.h"
/*
* ---------------------------------------------------------------------
@@ -2095,8 +2094,8 @@ read_input()
* ---------------------------------------------------------------------
*/
FILE* fp;
int avoid_warning;
if((fp = fopen("inputlu.data", "r")) != NULL)
int avoid_warning = 0;
if((fp = fopen("inputlu.data", "r")) != nullptr)
{
printf("Reading from input file inputlu.data\n");
while(fgetc(fp) != '\n')
@@ -2156,6 +2155,7 @@ read_input()
ny0 = ISIZ2;
nz0 = ISIZ3;
}
(void) avoid_warning;
/*
* ---------------------------------------------------------------------
* check problem size
projects/rocprofiler-systems/external/dyninst alt modülü güncellendi: 1cb91f1eea...bd17049666
projects/rocprofiler-systems/external/timemory alt modülü güncellendi: de1266606c...14fd2323bd
+27 -42
Dosyayı Görüntüle
@@ -2,13 +2,18 @@
: ${EXTRA_ARGS:=""}
: ${EXTRA_TAGS:=""}
: ${BUILD_DIR:=build-release}
: ${VERSION:=0.0.4}
: ${ROCM_VERSION:=4.5.0}
: ${NJOBS:=8}
: ${DISTRO:=""}
: ${LTO:="ON"}
DISTRO=$(lsb_release -i | awk '{print $NF}')-$(lsb_release -r | awk '{print $NF}')
if [ -z "${DISTRO}" ]; then
DISTRO=$(lsb_release -i | awk '{print $NF}')-$(lsb_release -r | awk '{print $NF}')
fi
STANDARD_ARGS="-DCPACK_GENERATOR=STGZ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 -DOMNITRACE_BUILD_TESTING=OFF -DTIMEMORY_USE_LIBUNWIND=ON -DTIMEMORY_BUILD_LIBUNWIND=ON -DTIMEMORY_BUILD_PORTABLE=ON"
STANDARD_ARGS="-DCPACK_GENERATOR=STGZ -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF -DOMNITRACE_MAX_THREADS=2048 -DOMNITRACE_BUILD_TESTING=OFF -DOMNITRACE_BUILD_EXAMPLES=OFF -DOMNITRACE_USE_MPI_HEADERS=ON -DOMNITRACE_USE_OMPT=ON -DOMNITRACE_CPACK_SYSTEM_NAME=${DISTRO} -DOMNITRACE_ROCM_VERSION=${ROCM_VERSION} -DOMNITRACE_BUILD_LTO=${LTO} -DTIMEMORY_USE_LIBUNWIND=ON -DTIMEMORY_BUILD_LIBUNWIND=ON -DTIMEMORY_BUILD_PORTABLE=ON"
STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_BUILD_DYNINST=ON $(echo -DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON)"
if [ -n "${EXTRA_ARGS}" ]; then
STANDARD_ARGS="${STANDARD_ARGS} ${EXTRA_ARGS}"
@@ -25,45 +30,25 @@ echo -e "Working directory: $(pwd)"
umask 0000
if [ ! -f build-release/${PACKAGE_BASE_TAG}.sh ]; then
cmake -B build-release/${DISTRO}-core ${STANDARD_ARGS} -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-core/install-release -DDYNINST_USE_OpenMP=OFF -DOMNITRACE_USE_MPI_HEADERS=OFF -DOMNITRACE_USE_HIP=OFF .
cmake --build build-release/${DISTRO}-core --target package --parallel ${NJOBS}
cp build-release/${DISTRO}-core/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}.sh
fi
# build-and-package <dir> [cmake-args...]
#
# Configures, builds and packages one omnitrace variant.
#   <dir>         sub-directory of ${BUILD_DIR} used as the CMake build tree;
#                 the install prefix is <build tree>/install-release
#   [cmake-args]  extra arguments appended after ${STANDARD_ARGS} at configure
#
# STANDARD_ARGS and $@ are expanded unquoted on purpose so that each
# space-separated option becomes its own cmake argument.
build-and-package()
{
local DIR=${1}
shift
cmake -B ${BUILD_DIR}/${DIR} -DCMAKE_INSTALL_PREFIX=${BUILD_DIR}/${DIR}/install-release ${STANDARD_ARGS} $@ .
cmake --build ${BUILD_DIR}/${DIR} --target all --parallel ${NJOBS}
pushd ${BUILD_DIR}/${DIR}
# Remove packages left over from a previous run so the copies below only pick
# up freshly generated files.
rm -f *.sh *.deb *.rpm
cpack -G STGZ
# DEB and RPM packages are generated with /opt/omnitrace as the packaging
# install prefix.
cpack -G DEB -D CPACK_PACKAGING_INSTALL_PREFIX=/opt/omnitrace
cpack -G RPM -D CPACK_PACKAGING_INSTALL_PREFIX=/opt/omnitrace
popd
# Collect the generated installers in the top-level build directory.
cp ${BUILD_DIR}/${DIR}/omnitrace-${VERSION}-*.sh ${BUILD_DIR}/
cp ${BUILD_DIR}/${DIR}/omnitrace_${VERSION}-*.deb ${BUILD_DIR}/
cp ${BUILD_DIR}/${DIR}/omnitrace-${VERSION}-*.rpm ${BUILD_DIR}/
}
apt-get install -y libopenmpi-dev openmpi-bin libudev-dev
build-and-package ${DISTRO}-core -DDYNINST_USE_OpenMP=OFF -DOMNITRACE_USE_HIP=OFF
# build-and-package ${DISTRO}-rocm-${ROCM_VERSION} -DOMNITRACE_USE_HIP=ON -DDYNINST_USE_OpenMP=ON
# build-and-package ${DISTRO}-rocm-${ROCM_VERSION}-papi -DOMNITRACE_USE_HIP=ON -DDYNINST_USE_OpenMP=ON -DOMNITRACE_USE_PAPI=ON
STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_USE_HIP=ON -DOMNITRACE_USE_MPI_HEADERS=ON -DDYNINST_USE_OpenMP=ON"
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}.sh ]; then
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION} -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}/install-release ${STANDARD_ARGS} .
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION} --target package --parallel ${NJOBS}
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}.sh
fi
STANDARD_ARGS="${STANDARD_ARGS} -DTIMEMORY_USE_PAPI=ON"
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI.sh ]; then
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi/install-release ${STANDARD_ARGS} .
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi --target package --parallel ${NJOBS}
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI.sh
fi
if [ "${MPI}" -lt 1 ]; then exit 0; fi
STANDARD_ARGS="${STANDARD_ARGS} -DOMNITRACE_USE_MPI=ON"
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-OpenMPI.sh ]; then
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi/install-release ${STANDARD_ARGS} .
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi --target package --parallel ${NJOBS}
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-OpenMPI.sh
fi
apt-get purge -y libopenmpi-dev openmpi-bin
apt-get install -y libmpich-dev mpich
if [ ! -f build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-MPICH.sh ]; then
cmake -B build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich -DCMAKE_INSTALL_PREFIX=build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich/install-release ${STANDARD_ARGS} .
cmake --build build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich --target package --parallel ${NJOBS}
cp build-release/${DISTRO}-rocm-${ROCM_VERSION}-papi-mpich/omnitrace-${VERSION}-Linux.sh build-release/${PACKAGE_BASE_TAG}-ROCm-${ROCM_VERSION}-PAPI-MPICH.sh
fi
# build-and-package ${DISTRO}-rocm-${ROCM_VERSION}-papi-openmpi -DOMNITRACE_USE_HIP=ON -DDYNINST_USE_OpenMP=ON -DOMNITRACE_USE_PAPI=ON -DOMNITRACE_USE_MPI=ON
+17 -19
Dosyayı Görüntüle
@@ -1,26 +1,24 @@
// MIT License
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "avail.hpp"
#include "library/api.hpp"
+17 -25
Dosyayı Görüntüle
@@ -1,32 +1,24 @@
// MIT License
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
/** \file timemory/tools/available.hpp
* \headerfile tools/available.hpp "tools/available.hpp"
* Handles serializing the settings
*
*/
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
+11 -4
Dosyayı Görüntüle
@@ -62,9 +62,12 @@ module_function::module_function(module_t* mod, procedure_t* proc)
for(const auto& itr : basic_blocks)
{
std::vector<instruction_t> instructions{};
itr->getInstructions(instructions);
num_instructions += instructions.size();
std::vector<instruction_t> _instructions{};
itr->getInstructions(_instructions);
num_instructions += _instructions.size();
instructions.reserve(instructions.size() + _instructions.size());
for(auto&& iitr : _instructions)
instructions.emplace_back(iitr);
}
char modname[FUNCNAMELEN];
@@ -84,7 +87,10 @@ module_function::module_function(module_t* mod, procedure_t* proc)
}
std::pair<address_t, address_t> _range{};
if(function->getAddressRange(_range.first, _range.second))
{
start_address = _range.first;
address_range = _range.second - _range.first;
}
}
void
@@ -95,7 +101,8 @@ module_function::write_header(std::ostream& os)
auto w2 = std::min<size_t>(get_width()[2], absolute_max_width);
std::stringstream ss;
ss << std::setw(14) << "AddressRange"
ss << std::setw(14) << "StartAddress"
<< " " << std::setw(14) << "AddressRange"
<< " " << std::setw(14) << "#Instructions"
<< " " << std::setw(6) << "Ratio"
<< " " << std::setw(w0 + 8) << std::left << "Module"
+38 -11
Dosyayı Görüntüle
@@ -28,6 +28,9 @@
#include <timemory/mpl/concepts.hpp>
#include <timemory/tpls/cereal/cereal/cereal.hpp>
#include <sstream>
#include <string>
struct module_function
{
using width_t = std::array<size_t, 4>;
@@ -74,16 +77,18 @@ struct module_function
bool is_address_range_constrained() const; // checks address range constraint
bool is_num_instructions_constrained() const; // check # instructions constraint
uint64_t address_range = 0;
uint64_t num_instructions = 0;
module_t* module = nullptr;
procedure_t* function = nullptr;
flow_graph_t* flow_graph = nullptr;
string_t module_name = {};
string_t function_name = {};
function_signature signature = {};
basic_block_set_t basic_blocks = {};
basic_loop_vec_t loop_blocks = {};
size_t start_address = 0;
uint64_t address_range = 0;
uint64_t num_instructions = 0;
module_t* module = nullptr;
procedure_t* function = nullptr;
flow_graph_t* flow_graph = nullptr;
string_t module_name = {};
string_t function_name = {};
function_signature signature = {};
basic_block_set_t basic_blocks = {};
basic_loop_vec_t loop_blocks = {};
std::vector<instruction_t> instructions = {};
using str_msg_t = std::tuple<int, string_t, string_t, string_t>;
using str_msg_vec_t = std::vector<str_msg_t>;
@@ -131,8 +136,11 @@ public:
return _inc;
};
std::stringstream _addr{};
_addr << "0x" << std::hex << rhs.start_address;
// clang-format off
ss << std::setw(14) << rhs.address_range << " "
ss << std::setw(14) << _addr.str() << " "
<< std::setw(14) << rhs.address_range << " "
<< std::setw(14) << rhs.num_instructions << " "
<< std::setw(6) << std::setprecision(2) << std::fixed << (rhs.address_range / static_cast<double>(rhs.num_instructions)) << " "
<< std::setw(w0 + 8) << std::left << _get_str(rhs.module_name) << " "
@@ -150,6 +158,13 @@ void
module_function::serialize(ArchiveT& ar, const unsigned)
{
namespace cereal = tim::cereal;
if constexpr(tim::concepts::is_output_archive<ArchiveT>::value)
{
std::stringstream _addr{};
_addr << "0x" << std::hex << start_address;
ar(cereal::make_nvp("start_address", _addr.str()));
}
ar(cereal::make_nvp("address_range", address_range),
cereal::make_nvp("instructions", num_instructions),
cereal::make_nvp("module", module_name),
@@ -181,5 +196,17 @@ module_function::serialize(ArchiveT& ar, const unsigned)
cereal::make_nvp("is_num_instructions_constrained",
is_num_instructions_constrained()));
ar.finishNode();
// instructions can inflate JSON size so only output when verbosity is increased
// above default
if(verbose_level > 0)
{
std::vector<std::string> _instructions{};
_instructions.reserve(instructions.size());
for(auto&& itr : instructions)
{
_instructions.emplace_back(itr.format());
}
ar(cereal::make_nvp("instructions", _instructions));
}
}
}
+2 -1
Dosyayı Görüntüle
@@ -1954,7 +1954,8 @@ instrument_entity(const string_t& function_name)
"(std::_Sp_counted_base|std::(use|has)_facet|std::locale|::sentry|^std::_|::_(M|"
"S)_|::basic_string[a-zA-Z,<>: ]+::_M_create)",
regex_opts);
static std::regex leading("^(_|\\.|frame_dummy|\\(|targ|kmp_threadprivate_)",
static std::regex leading("^(_|\\.|frame_dummy|transaction clone|virtual "
"thunk|non-virtual thunk|\\(|targ|kmp_threadprivate_)",
regex_opts);
static std::regex trailing(
"(_|\\.part\\.[0-9]+|\\.constprop\\.[0-9]+|\\.|\\.[0-9]+)$", regex_opts);
+4
Dosyayı Görüntüle
@@ -45,6 +45,8 @@ function(OMNITRACE_ADD_BIN_TEST)
"${TEST_ENVIRONMENT}"
TIMEOUT
${TEST_TIMEOUT}
DEPENDS
"${TEST_DEPENDS}"
LABELS
"omnitrace-bin;${TEST_LABELS}"
PASS_REGULAR_EXPRESSION
@@ -66,6 +68,8 @@ function(OMNITRACE_ADD_BIN_TEST)
"${TEST_ENVIRONMENT}"
TIMEOUT
${TEST_TIMEOUT}
DEPENDS
"${TEST_DEPENDS}"
LABELS
"omnitrace-bin;${TEST_LABELS}"
PASS_REGULAR_EXPRESSION
+5
Dosyayı Görüntüle
@@ -0,0 +1,5 @@
/build*
/_build
/_doxygen
/.gitinfo
/omnitrace.dox
Dosyayı Görüntüle
+20
Dosyayı Görüntüle
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+31
Dosyayı Görüntüle
@@ -0,0 +1,31 @@
# About
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
```
[Browse Omnitrace source code on Github](https://github.com/AMDResearch/omnitrace)
> [Omnitrace](https://github.com/AMDResearch/omnitrace) is an AMD research project and should
> not be treated as an official part of the ROCm software stack.
[Omnitrace](https://github.com/AMDResearch/omnitrace) is designed for both high-level and
comprehensive application tracing and profiling on both the CPU and GPU.
[Omnitrace](https://github.com/AMDResearch/omnitrace) supports both binary instrumentation
and sampling as a means of collecting various metrics.
The comprehensive omnitrace results can be visualized in any modern web browser by visiting [ui.perfetto.dev](https://ui.perfetto.dev/)
and loading the perfetto output (`.proto` files) produced by omnitrace.
Aggregated high-level results are available in text files for human consumption and JSON files for programmatic analysis.
The JSON output files are compatible with the python package [hatchet](https://github.com/hatchet/hatchet) which converts
the performance data into pandas dataframes and facilitates multi-run comparisons, filtering, visualization in Jupyter notebooks, and much more.
[Omnitrace](https://github.com/AMDResearch/omnitrace) has two distinct configuration steps:
1. Configuring which functions and modules are instrumented in the target binaries (i.e. executable and/or libraries)
- [Instrumenting with Omnitrace](instrumenting.md)
2. Configuring what the instrumentation does when the instrumented binaries are executed
- [Customizing Omnitrace Runtime](runtime.md)
+164
Dosyayı Görüntüle
@@ -0,0 +1,164 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# sys.path.insert(0, os.path.abspath('.'))
import os
import sys
import subprocess as sp
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath(".."))
def install(package):
    """Install ``package`` with pip, run through the current Python interpreter.

    Args:
        package: requirement string handed to ``pip install``.

    The exit status of the pip process is not inspected, so a failed
    installation does not raise here.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    sp.call(pip_command)
# Check if we're running on Read the Docs' servers
read_the_docs_build = os.environ.get("READTHEDOCS", None) == "True"
# -- Project information -----------------------------------------------------
project = "omnitrace"
copyright = "2022, Advanced Micro Devices, Inc."
author = "Audacious Software Group"
version = open(os.path.join("..", "VERSION")).read().strip()
# The full version, including alpha/beta/rc tags
release = version
_docdir = os.path.realpath(os.getcwd())
_srcdir = os.path.realpath(os.path.join(os.getcwd(), ".."))
_sitedir = os.path.realpath(os.path.join(os.getcwd(), "..", "site"))
_staticdir = os.path.realpath(os.path.join(_docdir, "_static"))
_templatedir = os.path.realpath(os.path.join(_docdir, "_templates"))
if not os.path.exists(_staticdir):
os.makedirs(_staticdir)
if not os.path.exists(_templatedir):
os.makedirs(_templatedir)
# -- General configuration ---------------------------------------------------
install("sphinx_rtd_theme")
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.doctest",
"sphinx.ext.todo",
"sphinx.ext.viewcode",
"sphinx.ext.githubpages",
"sphinx.ext.mathjax",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
"sphinx_markdown_tables",
"recommonmark",
"breathe",
]
source_suffix = {
".rst": "restructuredtext",
".md": "markdown",
}
from recommonmark.parser import CommonMarkParser
source_parsers = {".md": CommonMarkParser}
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The master toctree document.
master_doc = "index"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
default_role = None
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_theme_options = {
'analytics_id': 'G-1HLBBRSTT9', # Provided by Google in your dashboard
'analytics_anonymize_ip': False,
'logo_only': False,
'display_version': True,
'prev_next_buttons_location': 'bottom',
'style_external_links': False,
'vcs_pageview_mode': '',
# 'style_nav_header_background': 'white',
# Toc options
'collapse_navigation': True,
'sticky_navigation': True,
'navigation_depth': 4,
'includehidden': True,
'titles_only': False
}
# Breathe Configuration
breathe_projects = {"omnitrace": "_doxygen/xml"}
breathe_default_project = "omnitrace"
breathe_default_members = ('members', )
breathe_projects_source = {
"auto": (
"../source",
[
"lib/omnitrace-user/omnitrace/user.h",
],
)
}
from pygments.styles import get_all_styles
# The name of the Pygments (syntax highlighting) style to use.
styles = list(get_all_styles())
preferences = ("emacs", "pastie", "colorful")
for pref in preferences:
if pref in styles:
pygments_style = pref
break
from recommonmark.transform import AutoStructify
# app setup hook
def setup(app):
    """Sphinx setup hook: register the recommonmark options and its transform.

    Args:
        app: the Sphinx application object passed in by Sphinx.
    """
    structify_options = {
        "auto_toc_tree_section": "Contents",
        "enable_eval_rst": True,
        "enable_auto_doc_ref": False,
    }
    # Third positional argument is add_config_value's ``rebuild`` parameter.
    app.add_config_value("recommonmark_config", structify_options, True)
    app.add_transform(AutoStructify)
+29
Dosyayı Görüntüle
@@ -0,0 +1,29 @@
# Generating a Critical Trace
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
```
## Overview
A critical trace is defined in omnitrace as the most time-consuming path through a parallelized code.
The steps for generating a critical trace are:
1. Enable the `OMNITRACE_CRITICAL_TRACE` setting
2. Configure any other relevant critical-trace settings, as needed
- `omnitrace-avail --categories settings::critical-trace`
3. Execute application
4. Locate the JSON files with `call-chain` in their name
5. Provide these files to the `omnitrace-critical-trace` executable
6. Open generated perfetto file in [ui.perfetto.dev](https://ui.perfetto.dev/)
## omnitrace-critical-trace Executable
The `omnitrace-critical-trace` executable post-processes one or more `call-chain` JSON files and generates a perfetto output
for visualizing the critical trace.
**INCOMPLETE**
This executable is still under-development.
+22
Dosyayı Görüntüle
@@ -0,0 +1,22 @@
name: omnitrace-docs
channels:
- conda-forge
- defaults
dependencies:
- python=3.9
- cmake
- curl
- doxygen
- git
- graphviz
- matplotlib
- mkdocs
- numpy
- openssl
- pillow
- pip
- setuptools
- breathe <4.30.0
- sphinx <4.0.0
- sphinx-markdown-tables
- docutils
+76
Dosyayı Görüntüle
@@ -0,0 +1,76 @@
# Features
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
```
## Overview
[Omnitrace](https://github.com/AMDResearch/omnitrace) is designed to be highly extensible. Internally, it leverages the
[timemory performance analysis toolkit](https://github.com/NERSC/timemory) to
manage extensions, resources, data, etc.
### Data Collection Modes
- Dynamic instrumentation
- Runtime instrumentation
- Instrument executable and shared libraries at runtime
- Binary rewriting
- Generate a new executable and/or library with instrumentation built-in
- Statistical sampling
- Periodic software interrupts per-thread
- Background thread sampling
- Record process and system-level values while an application executes
- Critical trace generation
### Data Analysis
- Critical trace generation (beta)
- Support for
### Parallelism API Support
- Built-in MPI support
- Kokkos-Tools support
### GPU Metrics
- HIP API tracing
- ROCM HSA API tracing
- Kernel runtime tracing
- System-level sampling (via rocm-smi)
- Memory usage
- Power usage
- Temperature
- Utilization
### CPU Metrics
- CPU hardware counters sampling and profiles
- CPU frequency sampling
- Various timing metrics
- Wall time
- CPU time (process and/or thread)
- CPU utilization (process and/or thread)
- User CPU time
- Kernel CPU time
- Various memory metrics
- High-water mark (sampling and profiles)
- Memory page allocation
- Virtual memory usage
- Network statistics
- I/O metrics
- ... many more
### Third-party API support
- OpenMP-Tools (OMPT)
- TAU
- LIKWID
- Caliper
- CrayPAT
- VTune
- NVTX
- ROCTX
+19
Dosyayı Görüntüle
@@ -0,0 +1,19 @@
# Generates <SOURCE_DIR>/docs-source/omnitrace.dox from omnitrace.dox.in.
#
# Inputs:
#   SOURCE_DIR     (required) root of the omnitrace source tree; made absolute below.
#   DOT_EXECUTABLE (optional) path to graphviz `dot`; searched for when not given.
#
# Variables set before the template is configured (substituted as @VAR@ only):
#   FULL_VERSION_STRING  first line of <SOURCE_DIR>/VERSION
#   OMNITRACE_VERSION    the <major>.<minor>.<patch> part of FULL_VERSION_STRING
#
# NOTE(review): there is no project() call and SOURCE_DIR must be supplied by the
# caller, so this is presumably run in script mode
# (`cmake -DSOURCE_DIR=<dir> -P <this-file>`) -- confirm against the docs workflow.

if(NOT DEFINED SOURCE_DIR)
    message(FATAL_ERROR "Please define SOURCE_DIR")
endif()

get_filename_component(SOURCE_DIR "${SOURCE_DIR}" ABSOLUTE)

find_program(DOT_EXECUTABLE NAMES dot)
if(NOT DOT_EXECUTABLE)
    message(FATAL_ERROR "Please install dot and/or specify DOT_EXECUTABLE")
endif()

# Read only the first line of VERSION. LIMIT_COUNT is a file(STRINGS) keyword (it
# is not one of file(READ)'s options), and file(STRINGS) already drops the line
# terminator, so no separate newline scrubbing is needed.
file(STRINGS "${SOURCE_DIR}/VERSION" FULL_VERSION_STRING LIMIT_COUNT 1)

# Drop anything after <major>.<minor>.<patch> (e.g. a pre-release/dev suffix).
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)" "\\1.\\2.\\3" OMNITRACE_VERSION
                     "${FULL_VERSION_STRING}")

# The generated .dox is written next to its template inside the source tree.
configure_file("${SOURCE_DIR}/docs-source/omnitrace.dox.in"
               "${SOURCE_DIR}/docs-source/omnitrace.dox" @ONLY)
+11
Dosyayı Görüntüle
@@ -0,0 +1,11 @@
# Getting Started
```eval_rst
.. toctree::
:glob:
:maxdepth: 3
instrumenting
runtime
critical_trace
```
+15
Dosyayı Görüntüle
@@ -0,0 +1,15 @@
# Welcome to the [Omnitrace](https://github.com/AMDResearch/omnitrace) Documentation!
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
:caption: Table of Contents
about
features
installation
getting_started
output
user_api
```
+162
Dosyayı Görüntüle
@@ -0,0 +1,162 @@
# Installation
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
```
- Ubuntu 18.04 or Ubuntu 20.04
- Other OS distributions may be supported but are not tested
- GCC compiler v7+
- Older GCC compilers may be supported but are not tested
- Clang compilers are generally supported for [Omnitrace](https://github.com/AMDResearch/omnitrace) but not Dyninst
- [CMake](https://cmake.org/) v3.15+
- [DynInst](https://github.com/dyninst/dyninst) for dynamic or static instrumentation
- [TBB](https://github.com/oneapi-src/oneTBB) required by Dyninst
- [ElfUtils](https://sourceware.org/elfutils/) required by Dyninst
- [LibIberty](https://github.com/gcc-mirror/gcc/tree/master/libiberty) required by Dyninst
- [Boost](https://www.boost.org/) required by Dyninst
- [OpenMP](https://www.openmp.org/) optional by Dyninst
- [ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#ubuntu) (optional)
- HIP
- Roctracer for HIP API and kernel tracing
- ROCM-SMI for GPU monitoring
- [PAPI](https://icl.utk.edu/papi/)
- [libunwind](https://www.nongnu.org/libunwind/) for call-stack sampling
- Several optional third-party profiling tools supported by timemory (e.g. TAU, Caliper, CrayPAT, etc.)
## Installing omnitrace from binary distributions
Every omnitrace release provides binary installer scripts of the form:
```shell
omnitrace-{VERSION}-{OS_DISTRIB}-{OS_VERSION}[-ROCm-{ROCM_VERSION}[-{EXTRA}]].sh
```
E.g.:
```shell
omnitrace-0.0.5-Ubuntu-18.04.sh
omnitrace-0.0.5-Ubuntu-18.04-ROCm-4.3.0.sh
omnitrace-0.0.5-Ubuntu-18.04-ROCm-4.5.0.sh
...
omnitrace-0.0.5-Ubuntu-20.04-ROCm-4.5.0-PAPI.sh
omnitrace-0.0.5-Ubuntu-20.04-ROCm-4.5.0-PAPI-MPICH.sh
omnitrace-0.0.5-Ubuntu-20.04-ROCm-4.5.0-PAPI-OpenMPI.sh
```
The EXTRA fields such as PAPI, MPICH, and OpenMPI are built against the libraries provided by the
OS package manager, e.g. `apt-get install libpapi-dev` for Ubuntu.
### Download the appropriate binary distribution
```shell
wget https://github.com/AMDResearch/omnitrace/releases/download/v<VERSION>/<SCRIPT>
```
### Create the target installation directory
```shell
mkdir /opt/omnitrace
```
### Run the installer script
```shell
./omnitrace-0.0.5-Ubuntu-18.04-ROCm-4.3.0-PAPI-MPICH.sh --prefix=/opt/omnitrace
```
### Configure the environment
```shell
source /opt/omnitrace/share/omnitrace/setup-env.sh
```
### Test the executables
```shell
omnitrace --help
omnitrace-avail --help
```
## Installing Omnitrace from source
### Installing CMake
If using Ubuntu 20.04, `apt-get install cmake` will install cmake v3.16.3. If using Ubuntu 18.04, the cmake version via apt is too old (v3.10.2). In this case,
follow the instructions [here](https://apt.kitware.com/) to add the CMake apt package repository; or alternatively (if root access is not available),
specific versions of CMake can be easily installed via the Python pip package manager:
```shell
python3 -m pip install 'cmake==3.18.4'
export PATH=${HOME}/.local/bin:${PATH}
```
> NOTE: be wary of using `python3 -m pip install cmake`. If pip installs a cmake version with a `.post<N>` suffix, it will be necessary to
> specify the root path when cmake is invoked.
### Installing DynInst
#### Building Dyninst alongside Omnitrace
The easiest way to install Dyninst is to configure omnitrace with `OMNITRACE_BUILD_DYNINST=ON`. Depending on the version of Ubuntu, the apt package manager may have current enough
versions of Dyninst's Boost, TBB, and LibIberty dependencies (i.e. `apt-get install libtbb-dev libiberty-dev libboost-dev`); however, it is possible to request Dyninst to install
its dependencies via `Dyninst_BUILD_<DEP>=ON`, e.g.:
```shell
git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source
cmake -B omnitrace-build -DOMNITRACE_BUILD_DYNINST=ON -DDyninst_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON omnitrace-source
```
where `-DDyninst_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON` is expanded by the shell to `-DDyninst_BUILD_TBB=ON -DDyninst_BUILD_BOOST=ON ...`
#### Installing Dyninst via Spack
[Spack](https://github.com/spack/spack) is another option to install Dyninst and its dependencies:
```shell
git clone https://github.com/spack/spack.git
source ./spack/share/spack/setup-env.sh
spack compiler find
spack external find
spack install dyninst
spack load -r dyninst
```
### Installing omnitrace
Omnitrace has cmake configuration options for supporting MPI (`OMNITRACE_USE_MPI` or `OMNITRACE_USE_MPI_HEADERS`), HIP kernel tracing (`OMNITRACE_USE_ROCTRACER`),
sampling ROCm devices (`OMNITRACE_USE_ROCM_SMI`), OpenMP-Tools (`OMNITRACE_USE_OMPT`), hardware counters via PAPI (`OMNITRACE_USE_PAPI`), among others.
Various additional features can be enabled via the [`TIMEMORY_USE_*` CMake options](https://timemory.readthedocs.io/en/develop/installation.html#cmake-options).
Any `OMNITRACE_USE_<VAL>` option which has a corresponding `TIMEMORY_USE_<VAL>` option means that the support within timemory for this feature has been integrated
into omnitrace's perfetto support, e.g. `OMNITRACE_USE_PAPI=<VAL>` forces `TIMEMORY_USE_PAPI=<VAL>` and the data that timemory is able to collect via this package
is passed along to perfetto and will be displayed when the `.proto` file is visualized in [ui.perfetto.dev](https://ui.perfetto.dev).
```shell
OMNITRACE_ROOT=${HOME}/sw/omnitrace
git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source
cmake \
-B omnitrace-build \
-DOMNITRACE_USE_MPI_HEADERS=ON \
-DCMAKE_INSTALL_PREFIX=${OMNITRACE_ROOT} \
omnitrace-source
cmake --build omnitrace-build --target all --parallel 8
cmake --build omnitrace-build --target install
source ${OMNITRACE_ROOT}/share/omnitrace/setup-env.sh
```
#### MPI Support within Omnitrace
[Omnitrace](https://github.com/AMDResearch/omnitrace) can have full (`OMNITRACE_USE_MPI=ON`) or partial (`OMNITRACE_USE_MPI_HEADERS=ON`) MPI support.
The only difference between these two modes is whether or not the results collected via timemory can be aggregated into one output file. The primary
benefits of partial or full MPI support are the automatic wrapping of MPI functions and the ability to label output with suffixes which correspond to the
`MPI_COMM_WORLD` rank ID instead of using the system process identifier (i.e. PID).
In general, it is recommended to use partial MPI support with the OpenMPI headers as this is the most portable configuration.
If full MPI support is selected, make sure your target application is built against the same MPI distribution as omnitrace,
i.e. do not build omnitrace with MPICH and use it on a target application built against OpenMPI.
If partial support is selected, the reason the OpenMPI headers are recommended instead of the MPICH headers is
because the `MPI_COMM_WORLD` in OpenMPI is a pointer to `ompi_communicator_t` (8 bytes), whereas `MPI_COMM_WORLD` in MPICH
is an `int` (4 bytes). Building omnitrace with partial MPI support and the MPICH headers and then using
omnitrace on an application built against OpenMPI will cause a segmentation fault due to the value of the `MPI_COMM_WORLD` being narrowed
during the function wrapping before being passed along to the underlying MPI function.
+682
Dosyayı Görüntüle
@@ -0,0 +1,682 @@
# Instrumenting with Omnitrace
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
```
## omnitrace Executable
Instrumentation is performed with the `omnitrace` executable. View the help menu with the `-h` / `--help` option:
```shell
$ omnitrace --help
[omnitrace] Usage: omnitrace [ --help (count: 0, dtype: bool)
--debug (max: 1, dtype: bool)
--verbose (max: 1, dtype: bool)
--error (max: 1, dtype: boolean)
--simulate (max: 1, dtype: bool)
--print-format (min: 1, dtype: string)
--print-dir (count: 1, dtype: string)
--print-available (count: 1)
--print-instrumented (count: 1)
--print-excluded (count: 1)
--print-overlapping (count: 1)
--output (count: 1)
--pid (count: 1, dtype: int)
--mode (count: 1)
--command (count: 1)
--prefer (count: 1)
--library (count: unlimited)
--main-function (count: 1)
--driver (max: 1, dtype: boolean)
--load (count: unlimited, dtype: string)
--load-instr (count: unlimited, dtype: filepath)
--init-functions (count: unlimited, dtype: string)
--fini-functions (count: unlimited, dtype: string)
--function-include (count: unlimited)
--function-exclude (count: unlimited)
--module-include (count: unlimited)
--module-exclude (count: unlimited)
--label (count: unlimited, dtype: string)
--default-components (count: unlimited, dtype: string)
--env (count: unlimited)
--mpi (max: 1, dtype: bool)
--instrument-loops (max: 1, dtype: boolean)
--min-address-range (count: 1, dtype: int)
--min-address-range-loop (count: 1, dtype: int)
--dynamic-callsites (max: 1, dtype: boolean)
--traps (max: 1, dtype: bool)
--loop-traps (max: 1, dtype: bool)
--allow-overlapping (count: 0, dtype: bool)
--batch-size (count: 1, dtype: int)
--dyninst-options (count: unlimited)
] -- <CMD> <ARGS>
Options:
-h, -?, --help Shows this page
[DEBUG OPTIONS]
--debug Debug output
-v, --verbose Verbose output
-e, --error All warnings produce runtime errors
--simulate Exit after outputting diagnostic {available,instrumented,excluded,overlapping} module
function lists, e.g. available-instr.txt
--print-format [ json | txt | xml ]
Output format for diagnostic {available,instrumented,excluded,overlapping} module
function lists, e.g. {print-dir}/available-instr.txt
--print-dir Output directory for diagnostic {available,instrumented,excluded,overlapping} module
function lists, e.g. {print-dir}/available-instr.txt
--print-available [ functions | functions+ | modules | pair | pair+ ]
Print the available entities for instrumentation (functions, modules, or module-function
pair) to stdout applying regular expressions and exit
--print-instrumented [ functions | functions+ | modules | pair | pair+ ]
Print the instrumented entities (functions, modules, or module-function pair) to stdout
after applying regular expressions and exit
--print-excluded [ functions | functions+ | modules | pair | pair+ ]
Print the entities for instrumentation (functions, modules, or module-function pair)
which are excluded from the instrumentation to stdout after applying regular expressions
and exit
--print-overlapping [ functions | functions+ | modules | pair | pair+ ]
Print the entities for instrumentation (functions, modules, or module-function pair)
which overlap other function calls or have multiple entry points to stdout applying
regular expressions and exit
[MODE OPTIONS]
-o, --output Enable generation of a new executable (binary-rewrite)
-p, --pid Connect to running process
-M, --mode [ sampling | trace ]
Instrumentation mode. 'trace' mode instruments the selected functions, 'sampling' mode
only instruments the main function to start and stop the sampler.
-c, --command Input executable and arguments (if '-- <CMD>' not provided)
[LIBRARY OPTIONS]
--prefer [ shared | static ] Prefer this library types when available
-L, --library Libraries with instrumentation routines (default: "libomnitrace")
-m, --main-function The primary function to instrument around, e.g. 'main'
--driver Force main or _init/_fini instrumentation
--load Supplemental instrumentation library names w/o extension (e.g. 'libinstr' for
'libinstr.so' or 'libinstr.a')
--load-instr Load {available,instrumented,excluded,overlapping}-instr JSON or XML file(s) and override
what is read from the binary
--init-functions Initialization function(s) for supplemental instrumentation libraries (see '--load'
option)
--fini-functions Finalization function(s) for supplemental instrumentation libraries (see '--load' option)
[SYMBOL SELECTION OPTIONS]
-I, -R, --function-include Regex for selecting functions
-E, --function-exclude Regex for excluding functions
-MI, -MR, --module-include Regex for selecting modules/files/libraries
-ME, --module-exclude Regex for excluding modules/files/libraries
[RUNTIME OPTIONS]
--label [ args | file | line | return ]
Labeling info for functions. By default, just the function name is recorded. Use these
options to gain more information about the function signature or location of the
functions
-d, --default-components Default components to instrument (only useful when timemory is enabled in omnitrace
library)
--env Environment variables to add to the runtime in form VARIABLE=VALUE. E.g. use '--env
OMNITRACE_USE_TIMEMORY=ON' to default to using timemory instead of perfetto
--mpi Enable MPI support (requires omnitrace built w/ MPI and GOTCHA support). NOTE: this will
automatically be activated if MPI_Init/MPI_Init_thread and MPI_Finalize are found in the
symbol table of target
[GRANULARITY OPTIONS]
-l, --instrument-loops Instrument at the loop level
-r, --min-address-range If the address range of a function is less than this value, exclude it from
instrumentation
--min-address-range-loop If the address range of a function containing a loop is less than this value, exclude it
from instrumentation
--dynamic-callsites Force instrumentation if a function has dynamic callsites (e.g. function pointers)
--traps Instrument points which require using a trap. On the x86 architecture, because
instructions are of variable size, the instruction at a point may be too small for
Dyninst to replace it with the normal code sequence used to call instrumentation. Also,
when instrumentation is placed at points other than subroutine entry, exit, or call
points, traps may be used to ensure the instrumentation fits. In this case, Dyninst
replaces the instruction with a single-byte instruction that generates a trap.
--loop-traps Instrument points within a loop which require using a trap (only relevant when
--instrument-loops is enabled).
--allow-overlapping Allow dyninst to instrument either multiple functions which overlap (share part of same
function body) or single functions with multiple entry points. For more info, see Section
2 of the DyninstAPI documentation.
[DYNINST OPTIONS]
-b, --batch-size Dyninst supports batch insertion of multiple points during runtime instrumentation. If
one large batch insertion fails, this value will be used to create smaller batches.
Larger batches generally decrease the instrumentation time
--dyninst-options [ BaseTrampDeletion | DebugParsing | DelayedParsing | InstrStackFrames | MergeTramp | SaveFPR | TrampRecursive | TypeChecking ]
Advanced dyninst options: BPatch::set<OPTION>(bool), e.g. bpatch->setTrampRecursive(true)
```
There are three ways to perform instrumentation:
1. Running the application via the omnitrace executable (analogous to `gdb --args <program> <args>`)
- This mode is the default if neither the `-p` nor `-o` command-line options are used
- Runtime instrumentation supports instrumenting not only the target executable but also the
shared libraries loaded by the target executable. Consequently, this mode consumes more memory,
takes longer to perform the instrumentation, and tends to have a more significant overhead on the
runtime of the application
- This mode is recommended if you want to analyze not only the performance of your executable and/or
libraries but also the performance of the library dependencies
2. Attaching to a process that is currently running (analogous to `gdb -p <PID>`)
- This mode is activated via `-p <PID>`
- Same caveats as 1. with respect to memory and overhead
3. Generating a new executable or library with the instrumentation built-in (binary rewrite)
- This mode is activated via the `-o <output-file>` option
- Binary rewriting is limited to the text section of the target executable or library: it will not instrument
the dynamically-linked libraries. Consequently, this mode performs the instrumentation significantly faster
and has a much lower overhead when running the instrumented executable and/or libraries
- Binary rewriting is the recommended mode when the target executable uses process-level parallelism (e.g. MPI)
- If your target executable has a minimal main and the bulk of your application is in one specific dynamic library,
see [Binary Rewriting a Library](#binary-rewriting-a-library) for help
> NOTE: Attaching to a running process is an alpha feature and support for detaching from the target process
> without ending the target process is not currently supported.
The general syntax for separating omnitrace command line arguments from the application arguments
is consistent with the LLVM style of using a standalone double-hyphen (`--`). All arguments preceding the double-hyphen
are interpreted as belonging to omnitrace and all arguments following the double-hyphen are interpreted as the
application and its arguments. In binary rewrite mode, all application arguments after the first argument
are ignored, i.e. `./omnitrace -o ls.inst -- ls -l` interprets `ls` as the target to instrument (ignores the `-l` argument)
and generates a `ls.inst` executable that you can subsequently run `ls.inst -l` with.
## Runtime Instrumentation
```shell
omnitrace <omnitrace-options> -- <exe> [<exe-options>...]
```
## Attaching to Running Process
```shell
omnitrace <omnitrace-options> -p <PID> -- <exe-name>
```
## Binary Rewrite
```shell
omnitrace <omnitrace-options> -o <name-of-new-exe-or-library> -- <exe-or-library>
```
### Binary Rewriting a Library
Many applications bundle the bulk of their functionality into one or more dynamic libraries and have a relatively simple main
which links to these libraries and simply serves as the "driver" for setting up the workflow. If you binary rewrite your
executable and find there is insufficient info because of this, you can either switch to runtime instrumentation or
binary rewrite the libraries of interest.
Support for standalone binary rewriting of a dynamic library without binary rewriting the executable is a beta feature.
In general, it is supported as long as the library contains the `_init` and `_fini` symbols but these symbols are not
standardized to the extent of `main` in an executable.
The recommended workflow is as follows:
1. Determine the names of the dynamically linked libraries of interest via `ldd`
2. Generate a binary rewrite of the executable
3. Generate a binary rewrite of the desired libraries with the same base name as the original library, e.g. `libfoo.so.2` instead of `libfoo.so`
- Output the instrumented library into a different folder than the original library
4. Prefix the `LD_LIBRARY_PATH` environment variable with the output folder from step 3
5. Verify via `ldd` that the instrumented executable resolves the location of the instrumented library
### Binary Rewriting a Library Example
`foo` executable is dynamically linked to `libfoo.so.2`:
```shell
$ pwd
/home/user
$ which foo
/usr/local/bin/foo
$ ldd /usr/local/bin/foo
...
libfoo.so.2 => /usr/local/lib/libfoo.so.2 (...)
...
```
Generate binary rewrites of `foo` and `libfoo.so.2`:
```shell
omnitrace -o ./foo.inst -- foo
omnitrace -o ./libfoo.so.2 -- /usr/local/lib/libfoo.so.2
```
At this point, the instrumented `foo.inst` executable will still dynamically load the original `libfoo.so.2` in `/usr/local/lib`:
```shell
$ ldd ./foo.inst
...
libfoo.so.2 => /usr/local/lib/libfoo.so.2 (...)
...
```
Prefix the `LD_LIBRARY_PATH` environment variable with the folder containing the instrumented `libfoo.so.2`:
```shell
export LD_LIBRARY_PATH=/home/user:${LD_LIBRARY_PATH}
```
When `foo.inst` is executed, it will now load the instrumented library:
```shell
$ ldd ./foo.inst
...
libfoo.so.2 => /home/user/libfoo.so.2 (...)
...
```
## Selective Instrumentation
The default behavior of omnitrace does not instrument every symbol in the binary. These default rules are:
- Skip instrumenting dynamic call-sites (i.e. function pointers)
- Option `--dynamic-callsites` will force instrumentation for all dynamic call-sites
- The cost of a function can be loosely approximated by the size of the function in the binary so by default, omnitrace only instruments functions which span an address range of at least 256 bytes.
- Option `--min-address-range` will modify this heuristic for all functions which do not contain loops
- Option `--min-address-range-loop` will modify this heuristic for functions which contain loops
- This separate loop option is provided because functions with loops can be compact in the binary while also being costly
- Skip instrumentation points which require using a trap
- See the description for the `--traps` and `--loop-traps` options for more information
- Skip instrumenting loops within the body of a function
- Option `--instrument-loops` will enable this behavior
- Skip instrumenting functions with overlapping function bodies and single functions with multiple entry points
- These arise from various optimizations and instrumenting these functions can be enabled via the `--allow-overlapping` option
### Viewing the Available, Instrumented, Excluded, and Overlapping Functions
Whenever omnitrace is executed with a verbosity of zero or higher, it emits files which detail which functions (and which module they were defined in)
were available for instrumentation, which functions were instrumented, which functions were excluded, and which functions contained overlapping function bodies.
The default output path of these files will be in a `omnitrace-<NAME>-output` folder where `<NAME>` is the basename of the targeted binary
(or, in the case of binary rewrite, the basename of the resulting executable), e.g.
`omnitrace -- ls` will output its files to `omnitrace-ls-output` whereas `omnitrace -o ls.inst -- ls` will output to `omnitrace-ls.inst-output`.
If you would like to generate these files without executing or generating an executable, use the `--simulate` option:
```shell
omnitrace --simulate -- foo
omnitrace --simulate -o foo.inst -- foo
```
### Excluding and Including Modules and Functions
[Omnitrace](https://github.com/AMDResearch/omnitrace) has a set of 6 command-line options which each accept one or more regular expressions for customizing the scope of which module and/or functions are
instrumented. Multiple regexes per option are treated as an OR operation, e.g. `--module-include libfoo libbar` is effectively the same as `--module-include 'libfoo|libbar'`.
If you would like to force the inclusion of certain modules and/or functions without changing any of the heuristics, use the `--module-include` and/or `--function-include` options.
Note that these options will not exclude modules and/or functions which do not satisfy their regular expression.
If you would like to narrow the scope of the instrumentation to a specific set of libraries and/or functions, use the `--module-restrict` and `--function-restrict` options.
Applying these options allows you to exclusively select the union of one or more regular expressions, regardless of whether or not the functions satisfy the
aforementioned default heuristics. Any function or module that is not within the union of these regular expressions will be excluded from instrumentation.
If you would like to avoid instrumenting a set of modules and/or functions, use the `--module-exclude` and `--function-exclude` options.
These options are always applied regardless of whether the module or function satisfied the "restrict" or "include" regular expression.
#### Example Available Module and Function Info Output
> `omnitrace -o lulesh.inst --label file line args --simulate -- lulesh`
```console
AddressRange Module Function FunctionSignature
9165 ../examples/lulesh/lulesh-comm.cc CommMonoQ CommMonoQ(domain) [lulesh-comm.cc:1891]
3396 ../examples/lulesh/lulesh-comm.cc CommRecv CommRecv(domain, int, Index_t, Index_t, Index_t, Index_t, bool, bool) [lulesh...
8666 ../examples/lulesh/lulesh-comm.cc CommSBN CommSBN(domain, int, Domain_member *) [lulesh-comm.cc:926]
10212 ../examples/lulesh/lulesh-comm.cc CommSend CommSend(domain, int, Index_t, Domain_member *, Index_t, Index_t, Index_t, bo...
6823 ../examples/lulesh/lulesh-comm.cc CommSyncPosVel CommSyncPosVel(domain) [lulesh-comm.cc:1404]
126 ../examples/lulesh/lulesh-comm.cc _GLOBAL__sub_I_lulesh_comm.cc _GLOBAL__sub_I_lulesh_comm.cc() [lulesh-comm.cc]
308 ../examples/lulesh/lulesh-init.cc .omp_outlined..26 .omp_outlined..26(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
628 ../examples/lulesh/lulesh-init.cc .omp_outlined..34 .omp_outlined..34(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
656 ../examples/lulesh/lulesh-init.cc .omp_outlined..41 .omp_outlined..41(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
662 ../examples/lulesh/lulesh-init.cc .omp_outlined..45 .omp_outlined..45(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
550 ../examples/lulesh/lulesh-init.cc .omp_outlined..55 .omp_outlined..55(const , const , const ParallelFor<Kokkos::Impl::ViewFill<Ko...
556 ../examples/lulesh/lulesh-init.cc .omp_outlined..57 .omp_outlined..57(const , const , const ParallelFor<Kokkos::Impl::ViewFill<Ko...
550 ../examples/lulesh/lulesh-init.cc .omp_outlined..78 .omp_outlined..78(const , const , const ParallelFor<Kokkos::Impl::ViewFill<Ko...
640 ../examples/lulesh/lulesh-init.cc .omp_outlined..84 .omp_outlined..84(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
646 ../examples/lulesh/lulesh-init.cc .omp_outlined..88 .omp_outlined..88(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
1840 ../examples/lulesh/lulesh-init.cc Domain::AllocateElemPersistent Domain::AllocateElemPersistent(Domain *, Int_t) [lulesh-init.cc:94]
1384 ../examples/lulesh/lulesh-init.cc Domain::AllocateNodePersistent Domain::AllocateNodePersistent(Domain *, Int_t) [lulesh-init.cc:94]
1264 ../examples/lulesh/lulesh-init.cc Domain::BuildMesh Domain::BuildMesh(Domain *, Int_t, Int_t, Int_t) [lulesh-init.cc:308]
2312 ../examples/lulesh/lulesh-init.cc Domain::CreateRegionIndexSets Domain::CreateRegionIndexSets(Domain *, Int_t, Int_t) [lulesh-init.cc:409]
7109 ../examples/lulesh/lulesh-init.cc Domain::Domain Domain::Domain(Domain *, Int_t, Index_t, Index_t, Index_t, Index_t, int, int,...
2458 ../examples/lulesh/lulesh-init.cc Domain::SetupBoundaryConditions Domain::SetupBoundaryConditions(Domain *, Int_t) [lulesh-init.cc:409]
956 ../examples/lulesh/lulesh-init.cc Domain::SetupCommBuffers Domain::SetupCommBuffers(Domain *, Int_t) [lulesh-init.cc]
1456 ../examples/lulesh/lulesh-init.cc Domain::SetupElementConnectivities Domain::SetupElementConnectivities(Domain *, Int_t) [lulesh-init.cc:409]
721 ../examples/lulesh/lulesh-init.cc Domain::SetupSymmetryPlanes Domain::SetupSymmetryPlanes(Domain *, Int_t) [lulesh-init.cc:409]
1591 ../examples/lulesh/lulesh-init.cc Domain::SetupThreadSupportStructures Domain::SetupThreadSupportStructures(Domain *) [lulesh-init.cc:376]
1644 ../examples/lulesh/lulesh-init.cc Domain::~Domain Domain::~Domain(Domain *) [lulesh-init.cc:286]
218 ../examples/lulesh/lulesh-init.cc InitMeshDecomp InitMeshDecomp(Int_t, Int_t, Int_t *, Int_t *, Int_t *, Int_t *) [lulesh-init...
260 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::CommonSubview<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokk... Kokkos::Impl::CommonSubview<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokk...
1786 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::HostIterateTile<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::R... Kokkos::Impl::HostIterateTile<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::R...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int**...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<int*,...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl...
330 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewFill<Kokkos::View<doubl...
522 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::... Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::...
232 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::... Kokkos::Impl::ParallelFor<Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::...
49 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal... Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal...
1476 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::Tile_Loop_Type<2, false, int, void, void>::apply<Kokkos::Impl::... Kokkos::Impl::Tile_Loop_Type<2, false, int, void, void>::apply<Kokkos::Impl::...
555 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic... Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic...
613 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic... Kokkos::Impl::ViewCopy<Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::Devic...
603 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<... Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<...
604 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<... Kokkos::Impl::ViewCopy<Kokkos::View<int*, Kokkos::LayoutLeft, Kokkos::Device<...
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
281 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
524 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev... Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev...
525 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev... Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev...
524 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev... Kokkos::Impl::ViewFill<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::Dev...
583 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<int* [8], Kokkos::LayoutRight>, ... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
529 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<int*, Kokkos::HostSpace>, void>:... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
529 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<int*>, void>::allocate_shared<st... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
203 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewRemap<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::... Kokkos::Impl::ViewRemap<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::...
331 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewRemap<Kokkos::View<int*>, Kokkos::View<int*>, Kokkos::OpenM... Kokkos::Impl::ViewRemap<Kokkos::View<int*>, Kokkos::View<int*>, Kokkos::OpenM...
461 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::ViewValueFunctor<Kokkos::Device<Kokkos::OpenMP, Kokkos::HostSpa... enable_if_t<std::is_trivial<int>::value && std::is_trivially_copy_assignable<...
353 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double*> Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double*>(exec_space, dst, value...
139 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double, Kokkos::LayoutRight, Ko... Kokkos::Impl::contiguous_fill<Kokkos::OpenMP, double, Kokkos::LayoutRight, Ko...
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D...
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight, Kokkos::D...
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::...
824 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::... Kokkos::Impl::view_copy<Kokkos::View<int* [8], Kokkos::LayoutRight>, Kokkos::...
697 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int*, Kokkos::LayoutRight, Kokkos::Devic... Kokkos::Impl::view_copy<Kokkos::View<int*, Kokkos::LayoutRight, Kokkos::Devic...
697 ../examples/lulesh/lulesh-init.cc Kokkos::Impl::view_copy<Kokkos::View<int*>, Kokkos::View<int*> > Kokkos::Impl::view_copy<Kokkos::View<int*>, Kokkos::View<int*> >(dst, src) [l...
2036 ../examples/lulesh/lulesh-init.cc Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, int>::R... Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, int>::R...
2506 ../examples/lulesh/lulesh-init.cc Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, long>::... Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static>, long>::...
271 ../examples/lulesh/lulesh-init.cc Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor... Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor...
470 ../examples/lulesh/lulesh-init.cc Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<... Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<...
323 ../examples/lulesh/lulesh-init.cc Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<... Kokkos::View<int* [8], Kokkos::LayoutRight>::View<std::__cxx11::basic_string<...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]>(View<int *, Kokkos::Ho...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]>(View<int *, Kokkos::Ho...
462 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<std::__cxx11::basic_string<char, ... Kokkos::View<int*, Kokkos::HostSpace>::View<std::__cxx11::basic_string<char, ...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [16]> Kokkos::View<int*>::View<char [16]>(View<int *> *, arg_label, type, const siz...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [19]> Kokkos::View<int*>::View<char [19]>(View<int *> *, arg_label, type, const siz...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [21]> Kokkos::View<int*>::View<char [21]>(View<int *> *, arg_label, type, const siz...
462 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch... Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch...
323 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch... Kokkos::View<int*>::View<std::__cxx11::basic_string<char, std::char_traits<ch...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok... Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok...
1052 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*> Kokkos::deep_copy<double*>(dst, value) [lulesh-init.cc]
1050 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,... Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,...
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM...
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O... Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko... Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K... Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K...
863 ../examples/lulesh/lulesh-init.cc Kokkos::impl_resize<, int* [8], Kokkos::LayoutRight> type Kokkos::impl_resize<, int* [8], Kokkos::LayoutRight>(v, const size_t, co...
854 ../examples/lulesh/lulesh-init.cc Kokkos::impl_resize<, int*> type Kokkos::impl_resize<, int*>(v, const size_t, const size_t, const size_t,...
697 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
706 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
912 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
944 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
839 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
126 ../examples/lulesh/lulesh-init.cc _GLOBAL__sub_I_lulesh_init.cc _GLOBAL__sub_I_lulesh_init.cc() [lulesh-init.cc]
6589 ../examples/lulesh/lulesh-util.cc Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP... Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP...
1345 ../examples/lulesh/lulesh-util.cc ParseCommandLineOptions ParseCommandLineOptions(int, char * *, int, cmdLineOpts *) [lulesh-util.cc:67]
171 ../examples/lulesh/lulesh-util.cc PrintCommandLineOptions PrintCommandLineOptions(char *, int) [lulesh-util.cc:31]
67 ../examples/lulesh/lulesh-util.cc StrToInt int StrToInt(const char *, int *) [lulesh-util.cc:13]
706 ../examples/lulesh/lulesh-util.cc VerifyAndWriteFinalOutput VerifyAndWriteFinalOutput(Real_t, locDom, Int_t, Int_t) [lulesh-util.cc:222]
126 ../examples/lulesh/lulesh-util.cc _GLOBAL__sub_I_lulesh_util.cc _GLOBAL__sub_I_lulesh_util.cc() [lulesh-util.cc]
17 ../examples/lulesh/lulesh-viz.cc DumpToVisit DumpToVisit(domain, int, int, int) [lulesh-viz.cc:415]
126 ../examples/lulesh/lulesh-viz.cc _GLOBAL__sub_I_lulesh_viz.cc _GLOBAL__sub_I_lulesh_viz.cc() [lulesh-viz.cc]
451 ../examples/lulesh/lulesh.cc .omp_outlined..103 .omp_outlined..103(const , const , const ParallelReduce<(lambda at ../example...
796 ../examples/lulesh/lulesh.cc .omp_outlined..109 .omp_outlined..109(const , const , const ParallelFor<(lambda at ../examples/l...
394 ../examples/lulesh/lulesh.cc .omp_outlined..111 .omp_outlined..111(const , const , const ParallelFor<(lambda at ../examples/l...
402 ../examples/lulesh/lulesh.cc .omp_outlined..113 .omp_outlined..113(const , const , const ParallelFor<(lambda at ../examples/l...
427 ../examples/lulesh/lulesh.cc .omp_outlined..115 .omp_outlined..115(const , const , const ParallelReduce<(lambda at ../example...
859 ../examples/lulesh/lulesh.cc .omp_outlined..119 .omp_outlined..119(const , const , const ParallelFor<(lambda at ../examples/l...
243 ../examples/lulesh/lulesh.cc .omp_outlined..122 .omp_outlined..122(const , const , const ParallelFor<(lambda at ../examples/l...
426 ../examples/lulesh/lulesh.cc .omp_outlined..124 .omp_outlined..124(const , const , const ParallelFor<(lambda at ../examples/l...
529 ../examples/lulesh/lulesh.cc .omp_outlined..127 .omp_outlined..127(const , const , const ParallelFor<(lambda at ../examples/l...
865 ../examples/lulesh/lulesh.cc .omp_outlined..130 .omp_outlined..130(const , const , const ParallelFor<(lambda at ../examples/l...
539 ../examples/lulesh/lulesh.cc .omp_outlined..132 .omp_outlined..132(const , const , const ParallelReduce<(lambda at ../example...
456 ../examples/lulesh/lulesh.cc .omp_outlined..134 .omp_outlined..134(const , const , const ParallelReduce<(lambda at ../example...
252 ../examples/lulesh/lulesh.cc .omp_outlined..20 .omp_outlined..20(const , const , const ParallelFor<(lambda at ../examples/lu...
870 ../examples/lulesh/lulesh.cc .omp_outlined..35 .omp_outlined..35(const , const , const ParallelFor<(lambda at ../examples/lu...
473 ../examples/lulesh/lulesh.cc .omp_outlined..42 .omp_outlined..42(const , const , const ParallelFor<(lambda at ../examples/lu...
252 ../examples/lulesh/lulesh.cc .omp_outlined..46 .omp_outlined..46(const , const , const ParallelFor<(lambda at ../examples/lu...
1101 ../examples/lulesh/lulesh.cc .omp_outlined..48 .omp_outlined..48(const , const , const ParallelFor<(lambda at ../examples/lu...
427 ../examples/lulesh/lulesh.cc .omp_outlined..55 .omp_outlined..55(const , const , const ParallelReduce<(lambda at ../examples...
1326 ../examples/lulesh/lulesh.cc .omp_outlined..57 .omp_outlined..57(const , const , const ParallelReduce<(lambda at ../examples...
243 ../examples/lulesh/lulesh.cc .omp_outlined..61 .omp_outlined..61(const , const , const ParallelFor<(lambda at ../examples/lu...
1101 ../examples/lulesh/lulesh.cc .omp_outlined..63 .omp_outlined..63(const , const , const ParallelFor<(lambda at ../examples/lu...
372 ../examples/lulesh/lulesh.cc .omp_outlined..66 .omp_outlined..66(const , const , const ParallelFor<(lambda at ../examples/lu...
499 ../examples/lulesh/lulesh.cc .omp_outlined..71 .omp_outlined..71(const , const , const ParallelFor<(lambda at ../examples/lu...
499 ../examples/lulesh/lulesh.cc .omp_outlined..73 .omp_outlined..73(const , const , const ParallelFor<(lambda at ../examples/lu...
499 ../examples/lulesh/lulesh.cc .omp_outlined..75 .omp_outlined..75(const , const , const ParallelFor<(lambda at ../examples/lu...
465 ../examples/lulesh/lulesh.cc .omp_outlined..78 .omp_outlined..78(const , const , const ParallelFor<(lambda at ../examples/lu...
396 ../examples/lulesh/lulesh.cc .omp_outlined..81 .omp_outlined..81(const , const , const ParallelFor<(lambda at ../examples/lu...
656 ../examples/lulesh/lulesh.cc .omp_outlined..85 .omp_outlined..85(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
662 ../examples/lulesh/lulesh.cc .omp_outlined..89 .omp_outlined..89(const , const , const ParallelFor<Kokkos::Impl::ViewCopy<Ko...
443 ../examples/lulesh/lulesh.cc .omp_outlined..93 .omp_outlined..93(const , const , const ParallelReduce<(lambda at ../examples...
243 ../examples/lulesh/lulesh.cc .omp_outlined..96 .omp_outlined..96(const , const , const ParallelFor<(lambda at ../examples/lu...
243 ../examples/lulesh/lulesh.cc .omp_outlined..99 .omp_outlined..99(const , const , const ParallelFor<(lambda at ../examples/lu...
13367 ../examples/lulesh/lulesh.cc ApplyMaterialPropertiesForElems ApplyMaterialPropertiesForElems(domain) [lulesh.cc:409]
1530 ../examples/lulesh/lulesh.cc CalcElemCharacteristicLength Real_t CalcElemCharacteristicLength(const Real_t *, const Real_t *, const Rea...
982 ../examples/lulesh/lulesh.cc CalcElemFBHourglassForce CalcElemFBHourglassForce(const Real_t *, const Real_t[] *, coefficient, Real_...
2428 ../examples/lulesh/lulesh.cc CalcElemNodeNormals CalcElemNodeNormals(Real_t *, Real_t *, Real_t *, const Real_t *, const Real_...
853 ../examples/lulesh/lulesh.cc CalcElemShapeFunctionDerivatives CalcElemShapeFunctionDerivatives(const Real_t *, const Real_t *, const Real_t...
1097 ../examples/lulesh/lulesh.cc CalcElemVolumeDerivative CalcElemVolumeDerivative(i, dvdx, dvdy, dvdz, const Real_t *, const Real_t *,...
1054 ../examples/lulesh/lulesh.cc CalcKinematicsForElems CalcKinematicsForElems(domain, Real_t, Index_t) [lulesh.cc]
14160 ../examples/lulesh/lulesh.cc CalcVolumeForceForElems CalcVolumeForceForElems(domain) [lulesh.cc:409]
366 ../examples/lulesh/lulesh.cc Domain::AllocateGradients Domain::AllocateGradients(Domain *, Int_t, Int_t) [lulesh.cc:214]
475 ../examples/lulesh/lulesh.cc Domain::DeallocateGradients Domain::DeallocateGradients(Domain *) [lulesh.cc:105]
250 ../examples/lulesh/lulesh.cc Domain::DeallocateStrains Domain::DeallocateStrains(Domain *) [lulesh.cc:105]
4356 ../examples/lulesh/lulesh.cc Domain::Domain Domain::Domain(Domain *) [lulesh.cc:78]
15 ../examples/lulesh/lulesh.cc Domain::delv_eta Domain::delv_eta(const Domain *, const Index_t) [lulesh.cc:371]
15 ../examples/lulesh/lulesh.cc Domain::delv_xi Domain::delv_xi(const Domain *, const Index_t) [lulesh.cc:368]
15 ../examples/lulesh/lulesh.cc Domain::delv_zeta Domain::delv_zeta(const Domain *, const Index_t) [lulesh.cc:374]
15 ../examples/lulesh/lulesh.cc Domain::fx Domain::fx(const Domain *, const Index_t) [lulesh.cc:303]
15 ../examples/lulesh/lulesh.cc Domain::fy Domain::fy(const Domain *, const Index_t) [lulesh.cc:306]
15 ../examples/lulesh/lulesh.cc Domain::fz Domain::fz(const Domain *, const Index_t) [lulesh.cc:309]
15 ../examples/lulesh/lulesh.cc Domain::nodalMass Domain::nodalMass(const Domain *, const Index_t) [lulesh.cc:314]
15 ../examples/lulesh/lulesh.cc Domain::x Domain::x(const Domain *, const Index_t) [lulesh.cc:257]
15 ../examples/lulesh/lulesh.cc Domain::xd Domain::xd(const Domain *, const Index_t) [lulesh.cc:272]
15 ../examples/lulesh/lulesh.cc Domain::y Domain::y(const Domain *, const Index_t) [lulesh.cc:258]
15 ../examples/lulesh/lulesh.cc Domain::yd Domain::yd(const Domain *, const Index_t) [lulesh.cc:275]
15 ../examples/lulesh/lulesh.cc Domain::z Domain::z(const Domain *, const Index_t) [lulesh.cc:259]
15 ../examples/lulesh/lulesh.cc Domain::zd Domain::zd(const Domain *, const Index_t) [lulesh.cc:278]
330 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl...
330 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl... Kokkos::Impl::ParallelConstructName<Kokkos::Impl::ViewCopy<Kokkos::View<doubl...
1508 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcEnergyForElems(double*, double*, double*, doubl... type Kokkos::Impl::ParallelFor<CalcEnergyForElems(double*, double*, double*, ...
3606 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcFBHourglassForceForElems(Domain&, double*, Kokk... type Kokkos::Impl::ParallelFor<CalcFBHourglassForceForElems(Domain&, double*,...
2917 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcKinematicsForElems(Domain&, double, int)::$_0, ... type Kokkos::Impl::ParallelFor<CalcKinematicsForElems(Domain&, double, int)::...
3119 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcMonotonicQGradientsForElems(Domain&)::{lambda(i... type Kokkos::Impl::ParallelFor<CalcMonotonicQGradientsForElems(Domain&)::{lam...
1969 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<CalcMonotonicQRegionForElems(Domain&, int, double):... type Kokkos::Impl::ParallelFor<CalcMonotonicQRegionForElems(Domain&, int, dou...
1265 ../examples/lulesh/lulesh.cc Kokkos::Impl::ParallelFor<IntegrateStressForElems(Domain&, double*, double*, ... type Kokkos::Impl::ParallelFor<IntegrateStressForElems(Domain&, double*, doub...
49 ../examples/lulesh/lulesh.cc Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal... Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, Kokkos::Impl::ViewVal...
1497 ../examples/lulesh/lulesh.cc Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP>::TeamPolicyInternal Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP>::TeamPolicyInternal(TeamPoli...
603 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi... Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi...
604 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi... Kokkos::Impl::ViewCopy<Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::Devi...
281 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
281 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<... Kokkos::Impl::ViewCtorProp<std::__cxx11::basic_string<char, std::char_traits<...
521 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewMapping<Kokkos::ViewTraits<double*>, void>::allocate_shared... SharedAllocationRecord<void, void> * Kokkos::Impl::ViewMapping<Kokkos::ViewTr...
331 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewRemap<Kokkos::View<double*>, Kokkos::View<double*>, Kokkos:... Kokkos::Impl::ViewRemap<Kokkos::View<double*>, Kokkos::View<double*>, Kokkos:...
461 ../examples/lulesh/lulesh.cc Kokkos::Impl::ViewValueFunctor<Kokkos::Device<Kokkos::OpenMP, Kokkos::HostSpa... enable_if_t<std::is_trivial<double>::value && std::is_trivially_copy_assignab...
1609 ../examples/lulesh/lulesh.cc Kokkos::Impl::runtime_check_rank_host Kokkos::Impl::runtime_check_rank_host(const size_t, const bool, const size_t,...
697 ../examples/lulesh/lulesh.cc Kokkos::Impl::view_copy<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::De... Kokkos::Impl::view_copy<Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::De...
697 ../examples/lulesh/lulesh.cc Kokkos::Impl::view_copy<Kokkos::View<double*>, Kokkos::View<double*> > Kokkos::Impl::view_copy<Kokkos::View<double*>, Kokkos::View<double*> >(dst, s...
2250 ../examples/lulesh/lulesh.cc Kokkos::RangePolicy<Kokkos::OpenMP>::RangePolicy Kokkos::RangePolicy<Kokkos::OpenMP>::RangePolicy(RangePolicy<Kokkos::OpenMP> ...
213 ../examples/lulesh/lulesh.cc Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor... Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor...
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [6]> Kokkos::View<double*>::View<char [6]>(View<double *> *, arg_label, type, cons...
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [7]> Kokkos::View<double*>::View<char [7]>(View<double *> *, arg_label, type, cons...
462 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits... Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits...
323 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits... Kokkos::View<double*>::View<std::__cxx11::basic_string<char, std::char_traits...
25 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::~View Kokkos::View<double*>::~View(View<double *> *) [lulesh.cc:409]
840 ../examples/lulesh/lulesh.cc Kokkos::abort Kokkos::abort(const const char *, const const char *) [lulesh.cc:202]
854 ../examples/lulesh/lulesh.cc Kokkos::impl_resize<, double*> type Kokkos::impl_resize<, double*>(v, const size_t, const size_t, const size...
928 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
960 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
21470 ../examples/lulesh/lulesh.cc LagrangeLeapFrog LagrangeLeapFrog(domain) [lulesh.cc]
226 ../examples/lulesh/lulesh.cc ResizeBuffer ResizeBuffer(const size_t) [lulesh.cc:23]
169 ../examples/lulesh/lulesh.cc _GLOBAL__sub_I_lulesh.cc _GLOBAL__sub_I_lulesh.cc() [lulesh.cc]
1836 ../examples/lulesh/lulesh.cc main int main(int, char * *) [lulesh.cc]
63 ../examples/lulesh/lulesh.cc std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::a... std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::a...
20 ../examples/lulesh/lulesh.cc std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloca... std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloca...
160 ../examples/lulesh/lulesh.cc std::operator+<char, std::char_traits<char>, std::allocator<char> > basic_string<char, std::char_traits<char>, std::allocator<char> > std::operat...
187 ../examples/lulesh/lulesh.cc std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloc... std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::alloc...
11 lulesh __clang_call_terminate __clang_call_terminate() [lulesh]
33 lulesh __do_global_dtors_aux __do_global_dtors_aux() [lulesh]
5 lulesh __libc_csu_fini __libc_csu_fini() [lulesh]
101 lulesh __libc_csu_init __libc_csu_init() [lulesh]
5 lulesh _dl_relocate_static_pie _dl_relocate_static_pie() [lulesh]
13 lulesh _fini _fini() [lulesh]
27 lulesh _init _init() [lulesh]
47 lulesh _start _start() [lulesh]
6 lulesh frame_dummy frame_dummy() [lulesh]
```
#### Example Instrumented Module and Function Info Output
> `omnitrace -o lulesh.inst --label file line args --simulate -- lulesh`
After the heuristics are applied in [Example Available Module and Function Info Output](#example-available-module-and-function-info-output),
the selected module/functions are:
```console
AddressRange Module Function FunctionSignature
9165 ../examples/lulesh/lulesh-comm.cc CommMonoQ CommMonoQ(domain) [lulesh-comm.cc:1891]
3396 ../examples/lulesh/lulesh-comm.cc CommRecv CommRecv(domain, int, Index_t, Index_t, Index_t, Index_t, bool, bool) [lulesh...
8666 ../examples/lulesh/lulesh-comm.cc CommSBN CommSBN(domain, int, Domain_member *) [lulesh-comm.cc:926]
10212 ../examples/lulesh/lulesh-comm.cc CommSend CommSend(domain, int, Index_t, Domain_member *, Index_t, Index_t, Index_t, bo...
6823 ../examples/lulesh/lulesh-comm.cc CommSyncPosVel CommSyncPosVel(domain) [lulesh-comm.cc:1404]
1840 ../examples/lulesh/lulesh-init.cc Domain::AllocateElemPersistent Domain::AllocateElemPersistent(Domain *, Int_t) [lulesh-init.cc:94]
1384 ../examples/lulesh/lulesh-init.cc Domain::AllocateNodePersistent Domain::AllocateNodePersistent(Domain *, Int_t) [lulesh-init.cc:94]
1264 ../examples/lulesh/lulesh-init.cc Domain::BuildMesh Domain::BuildMesh(Domain *, Int_t, Int_t, Int_t) [lulesh-init.cc:308]
2312 ../examples/lulesh/lulesh-init.cc Domain::CreateRegionIndexSets Domain::CreateRegionIndexSets(Domain *, Int_t, Int_t) [lulesh-init.cc:409]
7109 ../examples/lulesh/lulesh-init.cc Domain::Domain Domain::Domain(Domain *, Int_t, Index_t, Index_t, Index_t, Index_t, int, int,...
2458 ../examples/lulesh/lulesh-init.cc Domain::SetupBoundaryConditions Domain::SetupBoundaryConditions(Domain *, Int_t) [lulesh-init.cc:409]
956 ../examples/lulesh/lulesh-init.cc Domain::SetupCommBuffers Domain::SetupCommBuffers(Domain *, Int_t) [lulesh-init.cc]
1456 ../examples/lulesh/lulesh-init.cc Domain::SetupElementConnectivities Domain::SetupElementConnectivities(Domain *, Int_t) [lulesh-init.cc:409]
721 ../examples/lulesh/lulesh-init.cc Domain::SetupSymmetryPlanes Domain::SetupSymmetryPlanes(Domain *, Int_t) [lulesh-init.cc:409]
1591 ../examples/lulesh/lulesh-init.cc Domain::SetupThreadSupportStructures Domain::SetupThreadSupportStructures(Domain *) [lulesh-init.cc:376]
1644 ../examples/lulesh/lulesh-init.cc Domain::~Domain Domain::~Domain(Domain *) [lulesh-init.cc:286]
271 ../examples/lulesh/lulesh-init.cc Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor... Kokkos::StaticCrsGraph<int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::Memor...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [10]>(View<int *, Kokkos::Ho...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]> Kokkos::View<int*, Kokkos::HostSpace>::View<char [14]>(View<int *, Kokkos::Ho...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [16]> Kokkos::View<int*>::View<char [16]>(View<int *> *, arg_label, type, const siz...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [19]> Kokkos::View<int*>::View<char [19]>(View<int *> *, arg_label, type, const siz...
410 ../examples/lulesh/lulesh-init.cc Kokkos::View<int*>::View<char [21]> Kokkos::View<int*>::View<char [21]>(View<int *> *, arg_label, type, const siz...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok... Kokkos::deep_copy<double*, , double*, Kokkos::LayoutRight, Kokkos::Device<Kok...
1052 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double*> Kokkos::deep_copy<double*>(dst, value) [lulesh-init.cc]
1050 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,... Kokkos::deep_copy<double, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP,...
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenM...
7686 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh... Kokkos::deep_copy<int* [8], Kokkos::LayoutRight, int* [8], Kokkos::LayoutRigh...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O... Kokkos::deep_copy<int*, , int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::O...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko... Kokkos::deep_copy<int*, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::OpenMP, Ko...
6589 ../examples/lulesh/lulesh-init.cc Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K... Kokkos::deep_copy<int*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP, K...
697 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
706 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (... Kokkos::parallel_for<Kokkos::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2u, (...
912 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
791 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
944 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
839 ../examples/lulesh/lulesh-init.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
6589 ../examples/lulesh/lulesh-util.cc Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP... Kokkos::deep_copy<double*, Kokkos::LayoutRight, Kokkos::Device<Kokkos::OpenMP...
1345 ../examples/lulesh/lulesh-util.cc ParseCommandLineOptions ParseCommandLineOptions(int, char * *, int, cmdLineOpts *) [lulesh-util.cc:67]
706 ../examples/lulesh/lulesh-util.cc VerifyAndWriteFinalOutput VerifyAndWriteFinalOutput(Real_t, locDom, Int_t, Int_t) [lulesh-util.cc:222]
13367 ../examples/lulesh/lulesh.cc ApplyMaterialPropertiesForElems ApplyMaterialPropertiesForElems(domain) [lulesh.cc:409]
982 ../examples/lulesh/lulesh.cc CalcElemFBHourglassForce CalcElemFBHourglassForce(const Real_t *, const Real_t[] *, coefficient, Real_...
2428 ../examples/lulesh/lulesh.cc CalcElemNodeNormals CalcElemNodeNormals(Real_t *, Real_t *, Real_t *, const Real_t *, const Real_...
853 ../examples/lulesh/lulesh.cc CalcElemShapeFunctionDerivatives CalcElemShapeFunctionDerivatives(const Real_t *, const Real_t *, const Real_t...
1054 ../examples/lulesh/lulesh.cc CalcKinematicsForElems CalcKinematicsForElems(domain, Real_t, Index_t) [lulesh.cc]
14160 ../examples/lulesh/lulesh.cc CalcVolumeForceForElems CalcVolumeForceForElems(domain) [lulesh.cc:409]
366 ../examples/lulesh/lulesh.cc Domain::AllocateGradients Domain::AllocateGradients(Domain *, Int_t, Int_t) [lulesh.cc:214]
475 ../examples/lulesh/lulesh.cc Domain::DeallocateGradients Domain::DeallocateGradients(Domain *) [lulesh.cc:105]
4356 ../examples/lulesh/lulesh.cc Domain::Domain Domain::Domain(Domain *) [lulesh.cc:78]
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [6]> Kokkos::View<double*>::View<char [6]>(View<double *> *, arg_label, type, cons...
410 ../examples/lulesh/lulesh.cc Kokkos::View<double*>::View<char [7]> Kokkos::View<double*>::View<char [7]>(View<double *> *, arg_label, type, cons...
928 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<in...
960 ../examples/lulesh/lulesh.cc Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo... Kokkos::parallel_for<Kokkos::RangePolicy<Kokkos::OpenMP, Kokkos::IndexType<lo...
21470 ../examples/lulesh/lulesh.cc LagrangeLeapFrog LagrangeLeapFrog(domain) [lulesh.cc]
1836 ../examples/lulesh/lulesh.cc main int main(int, char * *) [lulesh.cc]
```
## Sampling
By default, omnitrace uses `--mode trace` for instrumentation. The `--mode sampling` option
will only instrument `main` in an executable and will activate both CPU call-stack sampling and
background system-level thread sampling by default.
Tracing capabilities which do not rely on instrumentation, such as the HIP API and kernel tracing
(which is collected via roctracer), will still be available.
[Omnitrace](https://github.com/AMDResearch/omnitrace)'s sampling capabilities are always available, even in trace mode, but are deactivated by default.
In order to activate sampling in trace mode, simply set `OMNITRACE_USE_SAMPLING=ON` in the environment
or in an omnitrace configuration file.
## Embedding a Default Configuration
Using the `--env` option, a default configuration can be embedded into the target. Although this option
works for runtime instrumentation, it is most useful when generating new binaries since the generated
binary may be used later in a different login session when the environment may have changed.
For example, if the following sequence of commands is run:
```shell
omnitrace -o ./foo.inst -- ./foo
export OMNITRACE_USE_SAMPLING=ON
export OMNITRACE_SAMPLING_FREQ=5
./foo.inst
```
These configuration settings will not be preserved in another session, whereas:
```shell
omnitrace -o ./foo.samp --env OMNITRACE_USE_SAMPLING=ON OMNITRACE_SAMPLING_FREQ=5 -- ./foo
```
will preserve those environment variables:
```shell
# will sample 5x per second
./foo.samp
```
while still allowing the subsequent session to override those defaults:
```shell
# will sample 100x per second
export OMNITRACE_SAMPLING_FREQ=100
./foo.samp
```
### Troubleshooting
#### Checking for RPATH
If `ldd ./foo.inst` from the [Binary Rewriting a Library Example](#binary-rewriting-a-library-example) section still returned `/usr/local/lib/libfoo.so.2`, your executable may have an rpath encoded in the binary.
This ELF entry will cause the dynamic linker to ignore `LD_LIBRARY_PATH` if it finds a `libfoo.so.2` in the rpath.
You can use the `objdump` tool to perform this query:
```shell
objdump -p <exe-or-library> | egrep 'RPATH|RUNPATH'
```
If this produces output, e.g.:
```shell
RUNPATH $ORIGIN:$ORIGIN/../lib
```
You will have to remove or modify the rpath in order to get `foo.inst` to resolve to the instrumented `libfoo.so.2`.
#### Modifying RPATH
> Requires `patchelf` package
```shell
patchelf --remove-rpath <exe-or-library>
patchelf --set-rpath '/home/user' <exe-or-library>
```
+35
Dosyayı Görüntüle
@@ -0,0 +1,35 @@
@ECHO OFF

REM Run from the directory containing this script so the relative
REM SOURCEDIR/BUILDDIR below resolve correctly. The path is quoted so that
REM directories containing spaces or special characters are handled.
pushd "%~dp0"

REM Command file for Sphinx documentation
REM
REM The first argument (%1) is forwarded to "sphinx-build -M"; when it is
REM omitted, the sphinx-build help text is shown instead.
REM SPHINXBUILD may be set in the environment to override the executable;
REM SPHINXOPTS and O are passed through to sphinx-build unchanged.

if "%SPHINXBUILD%" == "" (
    set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

REM Probe for the executable: errorlevel 9009 is what cmd reports when the
REM command could not be found.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
    echo.
    echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
    echo.installed, then set the SPHINXBUILD environment variable to point
    echo.to the full path of the 'sphinx-build' executable. Alternatively you
    echo.may add the Sphinx directory to PATH.
    echo.
    echo.If you don't have Sphinx installed, grab it from
    echo.http://sphinx-doc.org/
    REM Restore the caller's working directory before bailing out; without
    REM this the pushd above would leak into the invoking shell.
    popd
    exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
Dosya farkı çok büyük olduğundan ihmal edildi Fark Yükle
+528
Dosyayı Görüntüle
@@ -0,0 +1,528 @@
# Omnitrace Output
```eval_rst
.. toctree::
:glob:
:maxdepth: 3
```
## Overview
The general output form of omnitrace is `<OUTPUT_PATH>[/<TIMESTAMP>]/[<PREFIX>]<DATA_NAME>[-<OUTPUT_SUFFIX>].<EXT>`.
E.g. with the base configuration:
```shell
export OMNITRACE_OUTPUT_PATH=omnitrace-example-output
export OMNITRACE_TIME_OUTPUT=OFF
export OMNITRACE_USE_PID=OFF
export OMNITRACE_USE_TIMEMORY=ON
export OMNITRACE_USE_PERFETTO=ON
```
```shell
$ omnitrace -- ./foo
...
[omnitrace] Outputting 'omnitrace-example-output/perfetto-trace.proto'...
[omnitrace] Outputting 'omnitrace-example-output/wall-clock.txt'...
[omnitrace] Outputting 'omnitrace-example-output/wall-clock.json'...
```
If we enable the `OMNITRACE_USE_PID` option, then when our non-MPI executable is executed with a PID of 63453:
```shell
$ export OMNITRACE_USE_PID=ON
$ omnitrace -- ./foo
...
[omnitrace] Outputting 'omnitrace-example-output/perfetto-trace-63453.proto'...
[omnitrace] Outputting 'omnitrace-example-output/wall-clock-63453.txt'...
[omnitrace] Outputting 'omnitrace-example-output/wall-clock-63453.json'...
```
If we enable `OMNITRACE_TIME_OUTPUT`, then a job started on January 31, 2022 at 12:30 PM:
```shell
$ export OMNITRACE_TIME_OUTPUT=ON
$ omnitrace -- ./foo
...
[omnitrace] Outputting 'omnitrace-example-output/2022-01-31_12.30_PM/perfetto-trace-63453.proto'...
[omnitrace] Outputting 'omnitrace-example-output/2022-01-31_12.30_PM/wall-clock-63453.txt'...
[omnitrace] Outputting 'omnitrace-example-output/2022-01-31_12.30_PM/wall-clock-63453.json'...
```
## Metadata
[Omnitrace](https://github.com/AMDResearch/omnitrace) will output a metadata.json file.
## Configuring Output
### Core Configuration Settings
> See also: [Customizing Omnitrace Runtime](runtime.md)
| Setting | Value | Description |
|---------------------------|--------------------|---------------------------------------------------------------------------------------------------|
| `OMNITRACE_OUTPUT_PATH` | Any valid path | Path to folder where output files should be placed |
| `OMNITRACE_OUTPUT_PREFIX` | String | Useful for multiple runs with different arguments. See [Output Prefix Keys](#output-prefix-keys) |
| `OMNITRACE_OUTPUT_FILE` | Any valid filepath | Specific location for perfetto output file. |
| `OMNITRACE_TIME_OUTPUT` | Boolean | Place all output in a timestamped folder, timestamp format controlled via `OMNITRACE_TIME_FORMAT` |
| `OMNITRACE_TIME_FORMAT` | String | See `strftime` man pages for valid identifiers |
| `OMNITRACE_USE_PID` | Boolean | Append either the PID or the MPI rank to all output files (before the extension) |
#### Output Prefix Keys
Output prefix keys have many uses but are most useful when dealing with multiple profiling runs or large MPI jobs.
Their inclusion in omnitrace stems from their introduction into timemory for [compile-time-perf](https://github.com/jrmadsen/compile-time-perf)
which needed to be able to create different output files for a generic wrapper around compilation commands while still
overwriting the output from the last time a file was compiled.
If you are ever doing scaling studies and specifying options via the command line, it is highly recommended to just
use a common `OMNITRACE_OUTPUT_PATH`, disable `OMNITRACE_TIME_OUTPUT`,
set `OMNITRACE_OUTPUT_PREFIX="%argt%-"` and let omnitrace cleanly organize the output.
| String | Encoding |
|-----------------|-----------------------------------------------------------------------------------------------|
| `%arg<N>%` | Command line argument at position `<N>` (zero indexed), e.g. `%arg0%` for first argument. |
| `%arg<N>_hash%` | MD5 sum of `%arg<N>%` |
| `%argv%` | Entire command-line condensed into a single string |
| `%argv_hash%` | MD5 sum of `%argv%` |
| `%argt%` | Similar to `%argv%` except basename of first command line argument |
| `%argt_hash%` | MD5 sum of `%argt%` |
| `%args%` | All command line arguments condensed into a single string |
| `%args_hash%` | MD5 sum of `%args%` |
| `%tag%` | Basename of first command line argument |
| `%tag_hash%` | MD5 sum of `%tag%` |
| `%pid%` | Process identifier (i.e. `getpid()`) |
| `%job%` | Value of `SLURM_JOB_ID` environment variable if exists, else `0` |
| `%rank%` | Value of `SLURM_PROCID` environment variable if exists, else `MPI_Comm_rank` (or `0` non-mpi) |
| `%size%` | `MPI_Comm_size` or `1` if non-mpi |
| `%m` | Shorthand for `%argt_hash%` |
| `%p` | Shorthand for `%pid%` |
| `%j` | Shorthand for `%job%` |
| `%r` | Shorthand for `%rank%` |
| `%s` | Shorthand for `%size%` |
> NOTE: any output prefix key which contains a `/` will have the `/` characters
> replaced with `_` and any leading underscores will be stripped, e.g. if `%arg0%` is `/usr/bin/foo`, this
> will translate to `usr_bin_foo`. Additionally, any `%arg<N>%` keys which do not have a command line argument
> at position `<N>` will be ignored.
## Perfetto Output
Use the `OMNITRACE_OUTPUT_FILE` setting to specify a specific location. If this is an absolute path, then all `OMNITRACE_OUTPUT_PATH`, etc.
settings will be ignored.
## Timemory Output
Use `omnitrace-avail --components --filename` to view the base filename for each component. E.g.
```shell
$ ./omnitrace-avail wall_clock -C -f
|---------------------------------|---------------|------------------------|
| COMPONENT | AVAILABLE | FILENAME |
|---------------------------------|---------------|------------------------|
| wall_clock | true | wall_clock |
| sampling_wall_clock | true | sampling_wall_clock |
|---------------------------------|---------------|------------------------|
```
When `OMNITRACE_COLLAPSE_THREADS=ON` and/or `OMNITRACE_COLLAPSE_PROCESSES=ON` (only valid with full MPI support) is set, the timemory output
will combine the per-thread and/or per-rank data which have identical call-stacks.
The `OMNITRACE_FLAT_PROFILE` setting will remove all call stack hierarchy. Using `OMNITRACE_FLAT_PROFILE=ON` in combination
with `OMNITRACE_COLLAPSE_THREADS=ON` is a useful configuration for identifying min/max measurements regardless of calling context.
The `OMNITRACE_TIMELINE_PROFILE` setting (with `OMNITRACE_FLAT_PROFILE=OFF`) will effectively generate data similar to that found
in perfetto. Enabling timeline and flat profiling will effectively generate similar data to `strace`. However, while timemory in general
requires significantly less memory than perfetto, this is not the case in timeline mode so activate this setting with caution.
### Timemory Text Output
> Hint: the generation of text output is configurable via `OMNITRACE_TEXT_OUTPUT`
Timemory text output files are meant for human-consumption (use JSON formats for analysis)
and as such, some fields such as the `LABEL` fields may be truncated for readability.
The truncation can be adjusted via the `OMNITRACE_MAX_WIDTH` setting.
#### Timemory Text Output Example
In the below, the `NN` field in `|NN>>>` is the thread ID. If MPI support is enabled, this will be `|MM|NN>>>` and `MM` will be the rank.
If `OMNITRACE_COLLAPSE_THREADS=ON` and `OMNITRACE_COLLAPSE_PROCESSES=ON`, neither the `MM` nor the `NN` will be present unless the
component explicitly sets type-traits which specify that the data is only relevant per-thread or per-process, e.g. the `thread_cpu_clock` clock component.
```console
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| REAL-CLOCK TIMER (I.E. WALL-CLOCK TIMER) |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LABEL | COUNT | DEPTH | METRIC | UNITS | SUM | MEAN | MIN | MAX | VAR | STDDEV | % SELF |
|--------------------------------------------------------------|--------|--------|------------|--------|-----------|-----------|-----------|-----------|----------|----------|--------|
| |00>>> main | 1 | 0 | wall_clock | sec | 13.360265 | 13.360265 | 13.360265 | 13.360265 | 0.000000 | 0.000000 | 18.2 |
| |00>>> |_ompt_thread_initial | 1 | 1 | wall_clock | sec | 10.924161 | 10.924161 | 10.924161 | 10.924161 | 0.000000 | 0.000000 | 0.0 |
| |00>>> |_ompt_implicit_task | 1 | 2 | wall_clock | sec | 10.923050 | 10.923050 | 10.923050 | 10.923050 | 0.000000 | 0.000000 | 0.1 |
| |00>>> |_ompt_parallel [parallelism=12] | 1 | 3 | wall_clock | sec | 10.915026 | 10.915026 | 10.915026 | 10.915026 | 0.000000 | 0.000000 | 0.0 |
| |00>>> |_ompt_implicit_task | 1 | 4 | wall_clock | sec | 10.647951 | 10.647951 | 10.647951 | 10.647951 | 0.000000 | 0.000000 | 0.0 |
| |00>>> |_ompt_work_loop | 156 | 5 | wall_clock | sec | 0.000812 | 0.000005 | 0.000001 | 0.000212 | 0.000000 | 0.000018 | 100.0 |
| |00>>> |_ompt_work_single_executor | 40 | 5 | wall_clock | sec | 0.000016 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_ompt_sync_region_barrier_implicit | 308 | 5 | wall_clock | sec | 0.000629 | 0.000002 | 0.000001 | 0.000017 | 0.000000 | 0.000002 | 100.0 |
| |00>>> |_conj_grad | 76 | 5 | wall_clock | sec | 10.641165 | 0.140015 | 0.131894 | 0.155099 | 0.000017 | 0.004080 | 1.0 |
| |00>>> |_ompt_work_single_executor | 803 | 6 | wall_clock | sec | 0.000292 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_ompt_work_loop | 7904 | 6 | wall_clock | sec | 7.420265 | 0.000939 | 0.000005 | 0.006974 | 0.000003 | 0.001613 | 100.0 |
| |00>>> |_ompt_sync_region_barrier_implicit | 6004 | 6 | wall_clock | sec | 0.283160 | 0.000047 | 0.000001 | 0.004087 | 0.000000 | 0.000303 | 100.0 |
| |00>>> |_ompt_sync_region_barrier_implementation | 3952 | 6 | wall_clock | sec | 2.829252 | 0.000716 | 0.000007 | 0.009005 | 0.000001 | 0.000985 | 99.7 |
| |00>>> |_ompt_sync_region_reduction | 15808 | 7 | wall_clock | sec | 0.009142 | 0.000001 | 0.000000 | 0.000007 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_ompt_work_single_other | 1249 | 6 | wall_clock | sec | 0.000270 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_ompt_work_single_other | 114 | 5 | wall_clock | sec | 0.000024 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_ompt_sync_region_barrier_implementation | 76 | 5 | wall_clock | sec | 0.000876 | 0.000012 | 0.000008 | 0.000025 | 0.000000 | 0.000003 | 84.4 |
| |00>>> |_ompt_sync_region_reduction | 304 | 6 | wall_clock | sec | 0.000136 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_ompt_master | 226 | 5 | wall_clock | sec | 0.001978 | 0.000009 | 0.000000 | 0.000038 | 0.000000 | 0.000012 | 100.0 |
| |11>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.656145 | 10.656145 | 10.656145 | 10.656145 | 0.000000 | 0.000000 | 0.1 |
| |11>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649183 | 10.649183 | 10.649183 | 10.649183 | 0.000000 | 0.000000 | 0.0 |
| |11>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000852 | 0.000005 | 0.000002 | 0.000230 | 0.000000 | 0.000019 | 100.0 |
| |11>>> |_ompt_work_single_other | 149 | 6 | wall_clock | sec | 0.000035 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |11>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004135 | 0.000013 | 0.000001 | 0.001233 | 0.000000 | 0.000070 | 100.0 |
| |11>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641302 | 0.140017 | 0.131896 | 0.155102 | 0.000017 | 0.004080 | 0.6 |
| |11>>> |_ompt_work_single_other | 2023 | 7 | wall_clock | sec | 0.000458 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |11>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.253555 | 0.001044 | 0.000005 | 0.008021 | 0.000003 | 0.001790 | 100.0 |
| |11>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.263840 | 0.000044 | 0.000001 | 0.004087 | 0.000000 | 0.000297 | 100.0 |
| |11>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.059823 | 0.000521 | 0.000007 | 0.009508 | 0.000001 | 0.000863 | 100.0 |
| |11>>> |_ompt_work_single_executor | 29 | 7 | wall_clock | sec | 0.000011 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |11>>> |_ompt_work_single_executor | 5 | 6 | wall_clock | sec | 0.000002 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |11>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000975 | 0.000013 | 0.000008 | 0.000024 | 0.000000 | 0.000003 | 100.0 |
| |10>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.681664 | 10.681664 | 10.681664 | 10.681664 | 0.000000 | 0.000000 | 0.3 |
| |10>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649158 | 10.649158 | 10.649158 | 10.649158 | 0.000000 | 0.000000 | 0.0 |
| |10>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000863 | 0.000006 | 0.000002 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
| |10>>> |_ompt_work_single_other | 140 | 6 | wall_clock | sec | 0.000037 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |10>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004149 | 0.000013 | 0.000001 | 0.001221 | 0.000000 | 0.000070 | 100.0 |
| |10>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641288 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
| |10>>> |_ompt_work_single_other | 1883 | 7 | wall_clock | sec | 0.000487 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |10>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.174545 | 0.001034 | 0.000005 | 0.006899 | 0.000003 | 0.001766 | 100.0 |
| |10>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.268808 | 0.000045 | 0.000001 | 0.004087 | 0.000000 | 0.000299 | 100.0 |
| |10>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.126988 | 0.000538 | 0.000007 | 0.009843 | 0.000001 | 0.000872 | 99.9 |
| |10>>> |_ompt_sync_region_reduction | 3952 | 8 | wall_clock | sec | 0.002574 | 0.000001 | 0.000000 | 0.000014 | 0.000000 | 0.000000 | 100.0 |
| |10>>> |_ompt_work_single_executor | 169 | 7 | wall_clock | sec | 0.000072 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |10>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000954 | 0.000013 | 0.000009 | 0.000023 | 0.000000 | 0.000003 | 95.9 |
| |10>>> |_ompt_sync_region_reduction | 76 | 7 | wall_clock | sec | 0.000039 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |10>>> |_ompt_work_single_executor | 14 | 6 | wall_clock | sec | 0.000006 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |09>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.686552 | 10.686552 | 10.686552 | 10.686552 | 0.000000 | 0.000000 | 0.3 |
| |09>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649151 | 10.649151 | 10.649151 | 10.649151 | 0.000000 | 0.000000 | 0.0 |
| |09>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000880 | 0.000006 | 0.000002 | 0.000258 | 0.000000 | 0.000021 | 100.0 |
| |09>>> |_ompt_work_single_other | 148 | 6 | wall_clock | sec | 0.000034 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |09>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004129 | 0.000013 | 0.000001 | 0.001210 | 0.000000 | 0.000069 | 100.0 |
| |09>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641308 | 0.140017 | 0.131895 | 0.155102 | 0.000017 | 0.004080 | 0.7 |
| |09>>> |_ompt_work_single_other | 2043 | 7 | wall_clock | sec | 0.000473 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |09>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.977001 | 0.001009 | 0.000005 | 0.007325 | 0.000003 | 0.001732 | 100.0 |
| |09>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.242996 | 0.000040 | 0.000001 | 0.004087 | 0.000000 | 0.000284 | 100.0 |
| |09>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.350895 | 0.000595 | 0.000007 | 0.008689 | 0.000001 | 0.000926 | 100.0 |
| |09>>> |_ompt_work_single_executor | 9 | 7 | wall_clock | sec | 0.000004 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |09>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000973 | 0.000013 | 0.000008 | 0.000025 | 0.000000 | 0.000003 | 100.0 |
| |09>>> |_ompt_work_single_executor | 6 | 6 | wall_clock | sec | 0.000002 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |08>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.721622 | 10.721622 | 10.721622 | 10.721622 | 0.000000 | 0.000000 | 0.7 |
| |08>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649135 | 10.649135 | 10.649135 | 10.649135 | 0.000000 | 0.000000 | 0.0 |
| |08>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000839 | 0.000005 | 0.000001 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
| |08>>> |_ompt_work_single_other | 141 | 6 | wall_clock | sec | 0.000030 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |08>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004114 | 0.000013 | 0.000001 | 0.001198 | 0.000000 | 0.000069 | 100.0 |
| |08>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641294 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.6 |
| |08>>> |_ompt_work_single_other | 1742 | 7 | wall_clock | sec | 0.000392 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |08>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.306388 | 0.001051 | 0.000005 | 0.007886 | 0.000003 | 0.001795 | 100.0 |
| |08>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.274358 | 0.000046 | 0.000001 | 0.004090 | 0.000000 | 0.000302 | 100.0 |
| |08>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 1.991251 | 0.000504 | 0.000007 | 0.008694 | 0.000001 | 0.000844 | 99.8 |
| |08>>> |_ompt_sync_region_reduction | 7904 | 8 | wall_clock | sec | 0.003816 | 0.000000 | 0.000000 | 0.000017 | 0.000000 | 0.000000 | 100.0 |
| |08>>> |_ompt_work_single_executor | 310 | 7 | wall_clock | sec | 0.000112 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |08>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000955 | 0.000013 | 0.000009 | 0.000026 | 0.000000 | 0.000003 | 93.7 |
| |08>>> |_ompt_sync_region_reduction | 152 | 7 | wall_clock | sec | 0.000060 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |08>>> |_ompt_work_single_executor | 13 | 6 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |07>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.747282 | 10.747282 | 10.747282 | 10.747282 | 0.000000 | 0.000000 | 0.9 |
| |07>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649093 | 10.649093 | 10.649093 | 10.649093 | 0.000000 | 0.000000 | 0.0 |
| |07>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000923 | 0.000006 | 0.000002 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
| |07>>> |_ompt_work_single_other | 152 | 6 | wall_clock | sec | 0.000048 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |07>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.003981 | 0.000013 | 0.000001 | 0.001186 | 0.000000 | 0.000068 | 100.0 |
| |07>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641295 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
| |07>>> |_ompt_work_single_other | 2043 | 7 | wall_clock | sec | 0.000648 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |07>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.978811 | 0.001009 | 0.000005 | 0.006728 | 0.000003 | 0.001732 | 100.0 |
| |07>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.199939 | 0.000033 | 0.000001 | 0.004086 | 0.000000 | 0.000255 | 100.0 |
| |07>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.385843 | 0.000604 | 0.000009 | 0.009039 | 0.000001 | 0.000938 | 100.0 |
| |07>>> |_ompt_work_single_executor | 9 | 7 | wall_clock | sec | 0.000004 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |07>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000905 | 0.000012 | 0.000010 | 0.000025 | 0.000000 | 0.000003 | 100.0 |
| |07>>> |_ompt_work_single_executor | 2 | 6 | wall_clock | sec | 0.000001 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |06>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.772278 | 10.772278 | 10.772278 | 10.772278 | 0.000000 | 0.000000 | 1.1 |
| |06>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649092 | 10.649092 | 10.649092 | 10.649092 | 0.000000 | 0.000000 | 0.0 |
| |06>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000888 | 0.000006 | 0.000002 | 0.000236 | 0.000000 | 0.000020 | 100.0 |
| |06>>> |_ompt_work_single_other | 153 | 6 | wall_clock | sec | 0.000037 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |06>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004090 | 0.000013 | 0.000001 | 0.001175 | 0.000000 | 0.000067 | 100.0 |
| |06>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641317 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.8 |
| |06>>> |_ompt_work_single_other | 2041 | 7 | wall_clock | sec | 0.000476 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |06>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.467961 | 0.000945 | 0.000005 | 0.010712 | 0.000003 | 0.001627 | 100.0 |
| |06>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.250883 | 0.000042 | 0.000001 | 0.004087 | 0.000000 | 0.000285 | 100.0 |
| |06>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.838733 | 0.000718 | 0.000009 | 0.009015 | 0.000001 | 0.001015 | 99.9 |
| |06>>> |_ompt_sync_region_reduction | 3952 | 8 | wall_clock | sec | 0.003334 | 0.000001 | 0.000000 | 0.000025 | 0.000000 | 0.000001 | 100.0 |
| |06>>> |_ompt_work_single_executor | 11 | 7 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |06>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000940 | 0.000012 | 0.000009 | 0.000025 | 0.000000 | 0.000003 | 95.4 |
| |06>>> |_ompt_sync_region_reduction | 76 | 7 | wall_clock | sec | 0.000044 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |06>>> |_ompt_work_single_executor | 1 | 6 | wall_clock | sec | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |05>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.797950 | 10.797950 | 10.797950 | 10.797950 | 0.000000 | 0.000000 | 1.4 |
| |05>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649072 | 10.649072 | 10.649072 | 10.649072 | 0.000000 | 0.000000 | 0.0 |
| |05>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000879 | 0.000006 | 0.000001 | 0.000248 | 0.000000 | 0.000021 | 100.0 |
| |05>>> |_ompt_work_single_other | 142 | 6 | wall_clock | sec | 0.000034 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |05>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004062 | 0.000013 | 0.000002 | 0.001163 | 0.000000 | 0.000067 | 100.0 |
| |05>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641291 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
| |05>>> |_ompt_work_single_other | 2038 | 7 | wall_clock | sec | 0.000500 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |05>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.279191 | 0.001047 | 0.000005 | 0.006596 | 0.000003 | 0.001792 | 100.0 |
| |05>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.250939 | 0.000042 | 0.000001 | 0.004090 | 0.000000 | 0.000286 | 100.0 |
| |05>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.039013 | 0.000516 | 0.000009 | 0.008689 | 0.000001 | 0.000855 | 100.0 |
| |05>>> |_ompt_work_single_executor | 14 | 7 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |05>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000926 | 0.000012 | 0.000009 | 0.000023 | 0.000000 | 0.000003 | 100.0 |
| |05>>> |_ompt_work_single_executor | 12 | 6 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |04>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.825935 | 10.825935 | 10.825935 | 10.825935 | 0.000000 | 0.000000 | 1.6 |
| |04>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649068 | 10.649068 | 10.649068 | 10.649068 | 0.000000 | 0.000000 | 0.0 |
| |04>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000884 | 0.000006 | 0.000002 | 0.000245 | 0.000000 | 0.000020 | 100.0 |
| |04>>> |_ompt_work_single_other | 150 | 6 | wall_clock | sec | 0.000034 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |04>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004069 | 0.000013 | 0.000001 | 0.001151 | 0.000000 | 0.000066 | 100.0 |
| |04>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641300 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 1.1 |
| |04>>> |_ompt_work_single_other | 2041 | 7 | wall_clock | sec | 0.000448 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |04>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.438393 | 0.000941 | 0.000005 | 0.007090 | 0.000003 | 0.001624 | 100.0 |
| |04>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.270654 | 0.000045 | 0.000001 | 0.004090 | 0.000000 | 0.000295 | 100.0 |
| |04>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.819165 | 0.000713 | 0.000009 | 0.008379 | 0.000001 | 0.001013 | 99.9 |
| |04>>> |_ompt_sync_region_reduction | 7904 | 8 | wall_clock | sec | 0.003932 | 0.000000 | 0.000000 | 0.000015 | 0.000000 | 0.000000 | 100.0 |
| |04>>> |_ompt_work_single_executor | 11 | 7 | wall_clock | sec | 0.000005 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |04>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000936 | 0.000012 | 0.000009 | 0.000025 | 0.000000 | 0.000003 | 93.2 |
| |04>>> |_ompt_sync_region_reduction | 152 | 7 | wall_clock | sec | 0.000064 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |04>>> |_ompt_work_single_executor | 4 | 6 | wall_clock | sec | 0.000001 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |03>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.849322 | 10.849322 | 10.849322 | 10.849322 | 0.000000 | 0.000000 | 1.8 |
| |03>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649075 | 10.649075 | 10.649075 | 10.649075 | 0.000000 | 0.000000 | 0.0 |
| |03>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000861 | 0.000006 | 0.000002 | 0.000238 | 0.000000 | 0.000020 | 100.0 |
| |03>>> |_ompt_work_single_other | 120 | 6 | wall_clock | sec | 0.000028 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |03>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.003993 | 0.000013 | 0.000001 | 0.001138 | 0.000000 | 0.000065 | 100.0 |
| |03>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641302 | 0.140017 | 0.131896 | 0.155101 | 0.000017 | 0.004080 | 0.8 |
| |03>>> |_ompt_work_single_other | 1756 | 7 | wall_clock | sec | 0.000426 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |03>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 8.005617 | 0.001013 | 0.000005 | 0.011500 | 0.000003 | 0.001741 | 100.0 |
| |03>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.231485 | 0.000039 | 0.000001 | 0.004086 | 0.000000 | 0.000277 | 100.0 |
| |03>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.320428 | 0.000587 | 0.000009 | 0.010868 | 0.000001 | 0.000912 | 100.0 |
| |03>>> |_ompt_work_single_executor | 296 | 7 | wall_clock | sec | 0.000120 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |03>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000967 | 0.000013 | 0.000010 | 0.000023 | 0.000000 | 0.000003 | 100.0 |
| |03>>> |_ompt_work_single_executor | 34 | 6 | wall_clock | sec | 0.000013 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |02>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.876387 | 10.876387 | 10.876387 | 10.876387 | 0.000000 | 0.000000 | 2.1 |
| |02>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649050 | 10.649050 | 10.649050 | 10.649050 | 0.000000 | 0.000000 | 0.0 |
| |02>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000924 | 0.000006 | 0.000001 | 0.000241 | 0.000000 | 0.000020 | 100.0 |
| |02>>> |_ompt_work_single_other | 139 | 6 | wall_clock | sec | 0.000040 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |02>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.003972 | 0.000013 | 0.000001 | 0.001127 | 0.000000 | 0.000064 | 100.0 |
| |02>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641287 | 0.140017 | 0.131895 | 0.155101 | 0.000017 | 0.004080 | 0.7 |
| |02>>> |_ompt_work_single_other | 1902 | 7 | wall_clock | sec | 0.000553 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |02>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.906688 | 0.001000 | 0.000005 | 0.007068 | 0.000003 | 0.001713 | 100.0 |
| |02>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.261367 | 0.000044 | 0.000001 | 0.004088 | 0.000000 | 0.000295 | 100.0 |
| |02>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.402362 | 0.000608 | 0.000009 | 0.010399 | 0.000001 | 0.000944 | 99.9 |
| |02>>> |_ompt_sync_region_reduction | 3952 | 8 | wall_clock | sec | 0.002937 | 0.000001 | 0.000000 | 0.000021 | 0.000000 | 0.000000 | 100.0 |
| |02>>> |_ompt_work_single_executor | 150 | 7 | wall_clock | sec | 0.000073 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |02>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000895 | 0.000012 | 0.000009 | 0.000026 | 0.000000 | 0.000003 | 95.2 |
| |02>>> |_ompt_sync_region_reduction | 76 | 7 | wall_clock | sec | 0.000043 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |02>>> |_ompt_work_single_executor | 15 | 6 | wall_clock | sec | 0.000007 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |01>>> |_ompt_thread_worker | 1 | 4 | wall_clock | sec | 10.901650 | 10.901650 | 10.901650 | 10.901650 | 0.000000 | 0.000000 | 2.3 |
| |01>>> |_ompt_implicit_task | 1 | 5 | wall_clock | sec | 10.649017 | 10.649017 | 10.649017 | 10.649017 | 0.000000 | 0.000000 | 0.0 |
| |01>>> |_ompt_work_loop | 156 | 6 | wall_clock | sec | 0.000863 | 0.000006 | 0.000001 | 0.000231 | 0.000000 | 0.000019 | 100.0 |
| |01>>> |_ompt_work_single_other | 146 | 6 | wall_clock | sec | 0.000033 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
| |01>>> |_ompt_sync_region_barrier_implicit | 308 | 6 | wall_clock | sec | 0.004012 | 0.000013 | 0.000001 | 0.001115 | 0.000000 | 0.000064 | 100.0 |
| |01>>> |_conj_grad | 76 | 6 | wall_clock | sec | 10.641316 | 0.140017 | 0.131895 | 0.155101 | 0.000017 | 0.004080 | 0.8 |
| |01>>> |_ompt_work_single_other | 1811 | 7 | wall_clock | sec | 0.000403 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |01>>> |_ompt_work_loop | 7904 | 7 | wall_clock | sec | 7.410337 | 0.000938 | 0.000005 | 0.010556 | 0.000003 | 0.001610 | 100.0 |
| |01>>> |_ompt_sync_region_barrier_implicit | 6004 | 7 | wall_clock | sec | 0.202494 | 0.000034 | 0.000001 | 0.003521 | 0.000000 | 0.000256 | 100.0 |
| |01>>> |_ompt_sync_region_barrier_implementation | 3952 | 7 | wall_clock | sec | 2.943604 | 0.000745 | 0.000008 | 0.009033 | 0.000001 | 0.001024 | 100.0 |
| |01>>> |_ompt_work_single_executor | 241 | 7 | wall_clock | sec | 0.000093 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |01>>> |_ompt_sync_region_barrier_implementation | 76 | 6 | wall_clock | sec | 0.000917 | 0.000012 | 0.000009 | 0.000026 | 0.000000 | 0.000003 | 100.0 |
| |01>>> |_ompt_work_single_executor | 8 | 6 | wall_clock | sec | 0.000004 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |00>>> |_c_print_results | 1 | 2 | wall_clock | sec | 0.000049 | 0.000049 | 0.000049 | 0.000049 | 0.000000 | 0.000000 | 100.0 |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
```
### Timemory Flat JSON Output
> Hint: the generation of flat JSON output is configurable via `OMNITRACE_JSON_OUTPUT`
Timemory provides two JSON output formats. The flat JSON output files are similar to the text files: the hierarchical information
is represented by the indentation of the `"prefix"` field and the `"depth"` field. All the data entries are in a single JSON array,
e.g. the `["timemory"]["wall_clock"]["ranks"][0]["graph"][<N>]["prefix"]` entry in the below:
```json
{
"timemory": {
"wall_clock": {
"description": "Real-clock timer (i.e. wall-clock timer)",
"thread_count": 12,
"process_count": 1,
"properties": {
"cereal_class_version": 0,
"enum": "WALL_CLOCK",
"id": "wall_clock",
"value": 78,
"ids": [
"real_clock",
"virtual_clock",
"wall_clock"
]
},
"mpi_size": 0,
"num_ranks": 1,
"concurrency": 12,
"upcxx_size": 1,
"unit_value": 1000000000,
"thread_scope_only": false,
"type": "wall_clock",
"unit_repr": "sec",
"ranks": [
{
"graph_size": 173,
"rank": 0,
"graph": [
{
"depth": 0,
"stats": {
"count": 1,
"min": 13.360264917,
"sqr": 178.49667865242102,
"sum": 13.360264917,
"stddev": 0.0,
"max": 13.360264917,
"cereal_class_version": 0,
"mean": 13.360264917
},
"prefix": "|00>>> main",
"rolling_hash": 17481650134347108265,
"entry": {
"repr_display": 13.360264917,
"value": 13360264917,
"repr_data": 13.360264917,
"cereal_class_version": 0,
"accum": 13360264917,
"laps": 1
},
"hash": 17481650134347108265
},
{
"depth": 1,
"stats": {
"count": 1,
"min": 10.924160502,
"max": 10.924160502,
"sum": 10.924160502,
"stddev": 0.0,
"sqr": 119.33728267345688,
"mean": 10.924160502
},
"prefix": "|00>>> |_ompt_thread_initial",
"rolling_hash": 5142782188440775656,
"entry": {
"repr_display": 10.924160502,
"laps": 1,
"accum": 10924160502,
"repr_data": 10.924160502,
"value": 10924160502
},
"hash": 6107876127803219007
},
{
"depth": 2,
"stats": {
"count": 1,
"min": 10.923050237,
"max": 10.923050237,
"sum": 10.923050237,
"stddev": 0.0,
"sqr": 119.31302648002575,
"mean": 10.923050237
},
"prefix": "|00>>> |_ompt_implicit_task",
"rolling_hash": 2098840206724841601,
"entry": {
"repr_display": 10.923050237,
"laps": 1,
"accum": 10923050237,
"repr_data": 10.923050237,
"value": 10923050237
},
"hash": 15402802091993617561
},
{
"..." : "... etc. ..."
}
]
}
]
}
}
}
```
Compared to the hierarchical format, this format makes it easier to write a simple Python script for post-processing, e.g.:
```python
#!/usr/bin/env python3
import sys
import json
def read_json(inp):
    """Load the JSON file at path ``inp`` and return the parsed content."""
    # JSON text is UTF-8; be explicit instead of relying on the locale's
    # default encoding, which differs between platforms.
    with open(inp, "r", encoding="utf-8") as f:
        return json.load(f)
def find_max(data):
    """Find the max for any function called multiple times.

    ``data`` is an iterable of flat-JSON graph entries. Entries whose
    ``["entry"]["laps"]`` equals 1 are ignored. Of the remaining entries, the
    one with the largest ``["stats"]["mean"]`` is returned (the first such
    entry wins a tie). Returns ``None`` when no entry qualifies.
    """
    candidates = (itr for itr in data if itr["entry"]["laps"] != 1)
    # max() keeps the first maximal element, matching a strict ">" scan
    return max(candidates, key=lambda itr: itr["stats"]["mean"], default=None)
def strip_name(name):
    """Return everything after |_ if it exists

    When ``name`` does not contain ``"|_"`` it is returned unchanged.
    """
    # str.find returns -1 when the marker is absent (str.index would raise)
    idx = name.find("|_")
    return name if idx < 0 else name[(idx + 2) :]
if __name__ == "__main__":
    # every command-line argument is treated as the path of a JSON file to load
    input_data = [[x, read_json(x)] for x in sys.argv[1:]]
    for file, data in input_data:
        for metric, metric_data in data["timemory"].items():
            print(f"[{file}] Found metric: {metric}")
            for itr in metric_data["ranks"]:
                max_entry = find_max(itr["graph"])
                if max_entry is None:
                    # find_max found no entry with a lap count other than 1
                    print(f"[{file}] No function was called multiple times")
                    continue
                print(
                    "[{}] Maximum value: '{}' at depth {} was called {}x :: {:.3f} {} (mean = {:.3e} {})".format(
                        file,
                        strip_name(max_entry["prefix"]),
                        max_entry["depth"],
                        max_entry["entry"]["laps"],
                        max_entry["entry"]["repr_data"],
                        metric_data["unit_repr"],
                        max_entry["stats"]["mean"],
                        metric_data["unit_repr"],
                    )
                )
```
This script applied to the corresponding JSON output from [Text Output Example](#timemory-text-output-example) would be:
```console
[openmp-cg.inst-wall_clock.json] Found metric: wall_clock
[openmp-cg.inst-wall_clock.json] Maximum value: 'conj_grad' at depth 6 was called 76x :: 10.641 sec (mean = 1.400e-01 sec)
```
### Timemory Hierarchical JSON Output
> Hint: the generation of hierarchical JSON output is configurable via `OMNITRACE_TREE_OUTPUT`
The hierarchical JSON output (extension: `.tree.json`) contains very similar data to the flat JSON output; however,
its structure requires processing through recursion. The main use of these files is their analysis support
by [hatchet](https://github.com/hatchet/hatchet).
+506
Dosyayı Görüntüle
@@ -0,0 +1,506 @@
# Customizing Omnitrace Runtime
```eval_rst
.. toctree::
:glob:
:maxdepth: 4
```
## omnitrace-avail Executable
The `omnitrace-avail` executable provides information about the runtime settings, data collection capabilities, and
available hardware counters (when built with PAPI support). In contrast to this documentation, it is effectively
self-updating: when new capabilities and settings are added to the omnitrace source code, they are automatically
propagated to `omnitrace-avail`, thus it should be viewed as the single source of truth if any conflicting
information or missing feature is found in this documentation.
### Exploring Runtime Settings
In order to view the list of the available runtime settings, their current value, and descriptions for each setting:
```shell
omnitrace-avail --description
```
> HINT: use `--brief` to suppress printing current value and/or `-c 0` to suppress truncation of the descriptions
Any setting which is boolean (`omnitrace-avail --settings --value --brief --filter bool`) accepts a case insensitive
match to nearly all common expressions for boolean logic: ON, OFF, YES, NO, TRUE, FALSE, 0, 1, etc.
### Exploring Components
[Omnitrace](https://github.com/AMDResearch/omnitrace) uses [timemory](https://github.com/NERSC/timemory) extensively to provide various capabilities and manage
data and resources. By default, when `OMNITRACE_USE_TIMEMORY=ON`, omnitrace will only collect wall-clock
timing values; however, by modifying the `OMNITRACE_TIMEMORY_COMPONENTS` setting, omnitrace can be configured to
collect hardware counters, CPU-clock timers, memory usage, context-switches, page-faults, network statistics,
and many more. In fact, omnitrace can actually be used as a dynamic instrumentation vehicle for other 3rd-party profiling
APIs such as [Caliper](https://github.com/LLNL/Caliper) and [LIKWID](https://github.com/RRZE-HPC/likwid) by building omnitrace
from source with the CMake option(s) `TIMEMORY_USE_CALIPER=ON` and/or `TIMEMORY_USE_LIKWID=ON` and then adding
`caliper_marker` and/or `likwid_marker` to `OMNITRACE_TIMEMORY_COMPONENTS`.
View all possible components and their descriptions:
```shell
omnitrace-avail --components --description
```
Restrict to available components and view the string identifiers for `OMNITRACE_TIMEMORY_COMPONENTS`:
```shell
omnitrace-avail --components --available --string --brief
```
### Exploring Hardware Counters
[Omnitrace](https://github.com/AMDResearch/omnitrace) supports collecting hardware counters via PAPI.
View all possible hardware counters and their descriptions:
```shell
omnitrace-avail --hw-counters --description
```
### omnitrace-avail Examples
#### Settings
```console
$ omnitrace-avail -S -bd
|-----------------------------------------|-----------------------------------------|
| ENVIRONMENT VARIABLE | DESCRIPTION |
|-----------------------------------------|-----------------------------------------|
| OMNITRACE_ADD_SECONDARY | Enable/disable components adding sec... |
| OMNITRACE_BACKEND | Specify the perfetto backend to acti... |
| OMNITRACE_BUFFER_SIZE_KB | Size of perfetto buffer (in KB) |
| OMNITRACE_COLLAPSE_PROCESSES | Enable/disable combining process-spe... |
| OMNITRACE_COLLAPSE_THREADS | Enable/disable combining thread-spec... |
| OMNITRACE_CONFIG_FILE | Configuration file for omnitrace |
| OMNITRACE_COUT_OUTPUT | Write output to stdout |
| OMNITRACE_CRITICAL_TRACE | Enable generation of the critical trace |
| OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT | Number of critical trace records to ... |
| OMNITRACE_CRITICAL_TRACE_COUNT | Number of critical trace to export (... |
| OMNITRACE_CRITICAL_TRACE_DEBUG | Enable debugging for critical trace |
| OMNITRACE_CRITICAL_TRACE_NUM_THREADS | Number of threads to use when genera... |
| OMNITRACE_CRITICAL_TRACE_PER_ROW | How many critical traces per row in ... |
| OMNITRACE_CRITICAL_TRACE_SERIALIZE_N... | Include names in serialization of cr... |
| OMNITRACE_DEBUG | Enable debug output |
| OMNITRACE_DIFF_OUTPUT | Generate a difference output vs. a p... |
| OMNITRACE_ENABLED | Activation state of timemory |
| OMNITRACE_ENABLE_SIGNAL_HANDLER | Enable signals in timemory_init |
| OMNITRACE_FILE_OUTPUT | Write output to files |
| OMNITRACE_FLAT_PROFILE | Set the label hierarchy mode to defa... |
| OMNITRACE_FLAT_SAMPLING | Ignore hierarchy in all statistical ... |
| OMNITRACE_INPUT_EXTENSIONS | File extensions used when searching ... |
| OMNITRACE_INPUT_PATH | Explicitly specify the input folder ... |
| OMNITRACE_INPUT_PREFIX | Explicitly specify the prefix for in... |
| OMNITRACE_INSTRUMENTATION_INTERVAL | Instrumentation only takes measureme... |
| OMNITRACE_JSON_OUTPUT | Write json output files |
| OMNITRACE_MAX_DEPTH | Set the maximum depth of label hiera... |
| OMNITRACE_MAX_THREAD_BOOKMARKS | Maximum number of times a worker thr... |
| OMNITRACE_MAX_WIDTH | Set the maximum width for component ... |
| OMNITRACE_MEMORY_PRECISION | Set the precision for components wit... |
| OMNITRACE_MEMORY_SCIENTIFIC | Set the numerical reporting format f... |
| OMNITRACE_MEMORY_UNITS | Set the units for components with 'u... |
| OMNITRACE_MEMORY_WIDTH | Set the output width for components ... |
| OMNITRACE_NETWORK_INTERFACE | Default network interface |
| OMNITRACE_NODE_COUNT | Total number of nodes used in applic... |
| OMNITRACE_OUTPUT_FILE | Perfetto filename |
| OMNITRACE_OUTPUT_PATH | Explicitly specify the output folder... |
| OMNITRACE_OUTPUT_PREFIX | Explicitly specify a prefix for all ... |
| OMNITRACE_PAPI_EVENTS | PAPI presets and events to collect (... |
| OMNITRACE_PAPI_FAIL_ON_ERROR | Configure PAPI errors to trigger a r... |
| OMNITRACE_PAPI_MULTIPLEXING | Enable multiplexing when using PAPI |
| OMNITRACE_PAPI_OVERFLOW | Value at which PAPI hw counters trig... |
| OMNITRACE_PAPI_QUIET | Configure suppression of reporting P... |
| OMNITRACE_PAPI_THREADING | Enable multithreading support when u... |
| OMNITRACE_PRECISION | Set the global output precision for ... |
| OMNITRACE_ROCM_SMI_DEVICES | Devices to query when OMNITRACE_USE_... |
| OMNITRACE_ROCTRACER_FLAT_PROFILE | Ignore hierarchy in all kernels entr... |
| OMNITRACE_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support |
| OMNITRACE_ROCTRACER_HSA_API | Enable HSA API tracing support |
| OMNITRACE_ROCTRACER_HSA_API_TYPES | HSA API type to collect |
| OMNITRACE_ROCTRACER_TIMELINE_PROFILE | Create unique entries for every kern... |
| OMNITRACE_SAMPLING_DELAY | Number of seconds to delay activatin... |
| OMNITRACE_SAMPLING_FREQ | Number of software interrupts per se... |
| OMNITRACE_SCIENTIFIC | Set the global numerical reporting t... |
| OMNITRACE_SETTINGS_DESC | Provide descriptions when printing s... |
| OMNITRACE_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... |
| OMNITRACE_SUPPRESS_CONFIG | Disable processing of setting config... |
| OMNITRACE_SUPPRESS_PARSING | Disable parsing environment |
| OMNITRACE_TEXT_OUTPUT | Write text output files |
| OMNITRACE_TIMELINE_PROFILE | Set the label hierarchy mode to defa... |
| OMNITRACE_TIMELINE_SAMPLING | Create unique entries for every samp... |
| OMNITRACE_TIMEMORY_COMPONENTS | List of components to collect via ti... |
| OMNITRACE_TIME_FORMAT | Customize the folder generation when... |
| OMNITRACE_TIME_OUTPUT | Output data to subfolder w/ a timest... |
| OMNITRACE_TIMING_PRECISION | Set the precision for components wit... |
| OMNITRACE_TIMING_SCIENTIFIC | Set the numerical reporting format f... |
| OMNITRACE_TIMING_UNITS | Set the units for components with 'u... |
| OMNITRACE_TIMING_WIDTH | Set the output width for components ... |
| OMNITRACE_TREE_OUTPUT | Write hierarchical json output files |
| OMNITRACE_USE_KOKKOSP | Enable support for Kokkos Tools |
| OMNITRACE_USE_PERFETTO | Enable perfetto backend |
| OMNITRACE_USE_PID | Enable tagging filenames with proces... |
| OMNITRACE_USE_ROCM_SMI | Enable sampling GPU power, temp, uti... |
| OMNITRACE_USE_ROCTRACER | Enable ROCM tracing |
| OMNITRACE_USE_SAMPLING | Enable statistical sampling of call-... |
| OMNITRACE_USE_TIMEMORY | Enable timemory backend |
| OMNITRACE_VERBOSE | Verbosity level |
| OMNITRACE_WIDTH | Set the global output width for comp... |
|-----------------------------------------|-----------------------------------------|
```
#### Components
```console
$ omnitrace-avail -C -bd
|-----------------------------------|----------------------------------------------|
| COMPONENT | DESCRIPTION |
|-----------------------------------|----------------------------------------------|
| allinea_map | Controls the AllineaMAP sampler. |
| caliper_marker | Generic forwarding of markers to Caliper ... |
| caliper_config | Caliper configuration manager. |
| caliper_loop_marker | Variant of caliper_marker with support fo... |
| cpu_clock | Total CPU time spent in both user- and ke... |
| cpu_util | Percentage of CPU-clock time divided by w... |
| craypat_counters | Names and value of any counter events tha... |
| craypat_flush_buffer | Writes all the recorded contents in the d... |
| craypat_heap_stats | Undocumented by 'pat_api.h'. |
| craypat_record | Toggles CrayPAT recording on calling thread. |
| craypat_region | Adds region labels to CrayPAT output. |
| current_peak_rss | Absolute value of high-water mark of memo... |
| gperftools_cpu_profiler | Control switch for gperftools CPU profiler. |
| gperftools_heap_profiler | Control switch for the gperftools heap pr... |
| hip_event | Records the time interval between two poi... |
| kernel_mode_time | CPU time spent executing in kernel mode (... |
| likwid_marker | LIKWID perfmon (CPU) marker forwarding. |
| likwid_nvmarker | LIKWID nvmon (GPU) marker forwarding. |
| malloc_gotcha | GOTCHA wrapper for memory allocation func... |
| memory_allocations | Number of bytes allocated/freed instead o... |
| monotonic_clock | Wall-clock timer which will continue to i... |
| monotonic_raw_clock | Wall-clock timer unaffected by frequency ... |
| network_stats | Reports network bytes, packets, errors, d... |
| num_io_in | Number of times the filesystem had to per... |
| num_io_out | Number of times the filesystem had to per... |
| num_major_page_faults | Number of page faults serviced that requi... |
| num_minor_page_faults | Number of page faults serviced without an... |
| page_rss | Amount of memory allocated in pages of me... |
| papi_array<8ul> | Fixed-size array of PAPI HW counters. |
| papi_vector | Dynamically allocated array of PAPI HW co... |
| peak_rss | Measures changes in the high-water mark f... |
| perfetto_trace | Provides Perfetto Tracing SDK: system pro... |
| priority_context_switch | Number of context switch due to higher pr... |
| process_cpu_clock | CPU-clock timer for the calling process (... |
| process_cpu_util | Percentage of CPU-clock time divided by w... |
| read_bytes | Number of bytes which this process really... |
| read_char | Number of bytes which this task has cause... |
| roctx_marker | Generates high-level region markers for H... |
| system_clock | CPU time spent in kernel-mode. |
| tau_marker | Forwards markers to TAU instrumentation (... |
| thread_cpu_clock | CPU-clock timer for the calling thread. |
| thread_cpu_util | Percentage of CPU-clock time divided by w... |
| timestamp | Provides a timestamp for every sample and... |
| trip_count | Counts number of invocations. |
| user_clock | CPU time spent in user-mode. |
| user_mode_time | CPU time spent executing in user mode (vi... |
| virtual_memory | Records the change in virtual memory. |
| voluntary_context_switch | Number of context switches due to a proce... |
| vtune_event | Creates events for Intel profiler running... |
| vtune_frame | Creates frames for Intel profiler running... |
| vtune_profiler | Control switch for Intel profiler running... |
| wall_clock | Real-clock timer (i.e. wall-clock timer). |
| written_bytes | Number of bytes sent to the storage layer. |
| written_char | Number of bytes which this task has cause... |
| omnitrace | Invokes instrumentation functions 'omnitr... |
| roctracer | High-precision ROCm API and kernel tracing. |
| sampling_wall_clock | Wall-clock timing. Derived from statistic... |
| sampling_cpu_clock | CPU-clock timing. Derived from statistica... |
| sampling_percent | Fraction of wall-clock time spent in func... |
| sampling_gpu_power | GPU Power Usage via ROCm-SMI. Derived fro... |
| sampling_gpu_temp | GPU Temperature via ROCm-SMI. Derived fro... |
| sampling_gpu_busy | GPU Utilization (% busy) via ROCm-SMI. De... |
| sampling_gpu_memory_usage | GPU Memory Usage via ROCm-SMI. Derived fr... |
|-----------------------------------|----------------------------------------------|
```
#### Hardware Counters
```console
$ omnitrace-avail -H -bd
|---------------------|-------------------------------------------------|
| HARDWARE COUNTER | DESCRIPTION |
|---------------------|-------------------------------------------------|
| CPU | |
|---------------------|-------------------------------------------------|
| PAPI_L1_DCM | Level 1 data cache misses |
| PAPI_L1_ICM | Level 1 instruction cache misses |
| PAPI_L2_DCM | Level 2 data cache misses |
| PAPI_L2_ICM | Level 2 instruction cache misses |
| PAPI_L3_DCM | Level 3 data cache misses |
| PAPI_L3_ICM | Level 3 instruction cache misses |
| PAPI_L1_TCM | Level 1 cache misses |
| PAPI_L2_TCM | Level 2 cache misses |
| PAPI_L3_TCM | Level 3 cache misses |
| PAPI_CA_SNP | Requests for a snoop |
| PAPI_CA_SHR | Requests for exclusive access to shared cach... |
| PAPI_CA_CLN | Requests for exclusive access to clean cache... |
| PAPI_CA_INV | Requests for cache line invalidation |
| PAPI_CA_ITV | Requests for cache line intervention |
| PAPI_L3_LDM | Level 3 load misses |
| PAPI_L3_STM | Level 3 store misses |
| PAPI_BRU_IDL | Cycles branch units are idle |
| PAPI_FXU_IDL | Cycles integer units are idle |
| PAPI_FPU_IDL | Cycles floating point units are idle |
| PAPI_LSU_IDL | Cycles load/store units are idle |
| PAPI_TLB_DM | Data translation lookaside buffer misses |
| PAPI_TLB_IM | Instruction translation lookaside buffer misses |
| PAPI_TLB_TL | Total translation lookaside buffer misses |
| PAPI_L1_LDM | Level 1 load misses |
| PAPI_L1_STM | Level 1 store misses |
| PAPI_L2_LDM | Level 2 load misses |
| PAPI_L2_STM | Level 2 store misses |
| PAPI_BTAC_M | Branch target address cache misses |
| PAPI_PRF_DM | Data prefetch cache misses |
| PAPI_L3_DCH | Level 3 data cache hits |
| PAPI_TLB_SD | Translation lookaside buffer shootdowns |
| PAPI_CSR_FAL | Failed store conditional instructions |
| PAPI_CSR_SUC | Successful store conditional instructions |
| PAPI_CSR_TOT | Total store conditional instructions |
| PAPI_MEM_SCY | Cycles Stalled Waiting for memory accesses |
| PAPI_MEM_RCY | Cycles Stalled Waiting for memory reads |
| PAPI_MEM_WCY | Cycles Stalled Waiting for memory writes |
| PAPI_STL_ICY | Cycles with no instruction issue |
| PAPI_FUL_ICY | Cycles with maximum instruction issue |
| PAPI_STL_CCY | Cycles with no instructions completed |
| PAPI_FUL_CCY | Cycles with maximum instructions completed |
| PAPI_HW_INT | Hardware interrupts |
| PAPI_BR_UCN | Unconditional branch instructions |
| PAPI_BR_CN | Conditional branch instructions |
| PAPI_BR_TKN | Conditional branch instructions taken |
| PAPI_BR_NTK | Conditional branch instructions not taken |
| PAPI_BR_MSP | Conditional branch instructions mispredicted |
| PAPI_BR_PRC | Conditional branch instructions correctly pr... |
| PAPI_FMA_INS | FMA instructions completed |
| PAPI_TOT_IIS | Instructions issued |
| PAPI_TOT_INS | Instructions completed |
| PAPI_INT_INS | Integer instructions |
| PAPI_FP_INS | Floating point instructions |
| PAPI_LD_INS | Load instructions |
| PAPI_SR_INS | Store instructions |
| PAPI_BR_INS | Branch instructions |
| PAPI_VEC_INS | Vector/SIMD instructions (could include inte... |
| PAPI_RES_STL | Cycles stalled on any resource |
| PAPI_FP_STAL | Cycles the FP unit(s) are stalled |
| PAPI_TOT_CYC | Total cycles |
| PAPI_LST_INS | Load/store instructions completed |
| PAPI_SYC_INS | Synchronization instructions completed |
| PAPI_L1_DCH | Level 1 data cache hits |
| PAPI_L2_DCH | Level 2 data cache hits |
| PAPI_L1_DCA | Level 1 data cache accesses |
| PAPI_L2_DCA | Level 2 data cache accesses |
| PAPI_L3_DCA | Level 3 data cache accesses |
| PAPI_L1_DCR | Level 1 data cache reads |
| PAPI_L2_DCR | Level 2 data cache reads |
| PAPI_L3_DCR | Level 3 data cache reads |
| PAPI_L1_DCW | Level 1 data cache writes |
| PAPI_L2_DCW | Level 2 data cache writes |
| PAPI_L3_DCW | Level 3 data cache writes |
| PAPI_L1_ICH | Level 1 instruction cache hits |
| PAPI_L2_ICH | Level 2 instruction cache hits |
| PAPI_L3_ICH | Level 3 instruction cache hits |
| PAPI_L1_ICA | Level 1 instruction cache accesses |
| PAPI_L2_ICA | Level 2 instruction cache accesses |
| PAPI_L3_ICA | Level 3 instruction cache accesses |
| PAPI_L1_ICR | Level 1 instruction cache reads |
| PAPI_L2_ICR | Level 2 instruction cache reads |
| PAPI_L3_ICR | Level 3 instruction cache reads |
| PAPI_L1_ICW | Level 1 instruction cache writes |
| PAPI_L2_ICW | Level 2 instruction cache writes |
| PAPI_L3_ICW | Level 3 instruction cache writes |
| PAPI_L1_TCH | Level 1 total cache hits |
| PAPI_L2_TCH | Level 2 total cache hits |
| PAPI_L3_TCH | Level 3 total cache hits |
| PAPI_L1_TCA | Level 1 total cache accesses |
| PAPI_L2_TCA | Level 2 total cache accesses |
| PAPI_L3_TCA | Level 3 total cache accesses |
| PAPI_L1_TCR | Level 1 total cache reads |
| PAPI_L2_TCR | Level 2 total cache reads |
| PAPI_L3_TCR | Level 3 total cache reads |
| PAPI_L1_TCW | Level 1 total cache writes |
| PAPI_L2_TCW | Level 2 total cache writes |
| PAPI_L3_TCW | Level 3 total cache writes |
| PAPI_FML_INS | Floating point multiply instructions |
| PAPI_FAD_INS | Floating point add instructions |
| PAPI_FDV_INS | Floating point divide instructions |
| PAPI_FSQ_INS | Floating point square root instructions |
| PAPI_FNV_INS | Floating point inverse instructions |
| PAPI_FP_OPS | Floating point operations |
| PAPI_SP_OPS | Floating point operations; optimized to coun... |
| PAPI_DP_OPS | Floating point operations; optimized to coun... |
| PAPI_VEC_SP | Single precision vector/SIMD instructions |
| PAPI_VEC_DP | Double precision vector/SIMD instructions |
| PAPI_REF_CYC | Reference clock cycles |
|---------------------|-------------------------------------------------|
```
## Creating a Configuration File
[Omnitrace](https://github.com/AMDResearch/omnitrace) supports 3 configuration file formats: JSON, XML, and plain text.
Configuration files are specified via the `OMNITRACE_CONFIG_FILE` environment variable;
by default, omnitrace will look for `${HOME}/omnitrace.cfg` and `${HOME}/omnitrace.json`.
Multiple configuration files can be concatenated via `:`, e.g.:
```shell
export OMNITRACE_CONFIG_FILE=~/.config/omnitrace.cfg:~/.config/omnitrace.json
```
If a configuration variable is specified in both a configuration file and in the environment,
the environment variable takes precedence.
### Sample Text Configuration File
Text files support very basic variables and are case-insensitive.
Variables are created when an lvalue starts with a $ and are
dereferenced when they appear as rvalues.
Entries in the text configuration file which do not match to a known setting
in `omnitrace-avail` but are prefixed with `OMNITRACE_` are interpreted as
environment variables and are exported via `setenv`
but do not override an existing value for the environment variable.
```shell
# lvals starting with $ are variables
$USE = ON
# use fields
OMNITRACE_USE_PERFETTO = $USE
OMNITRACE_USE_TIMEMORY = $USE
OMNITRACE_USE_SAMPLING = $USE
OMNITRACE_USE_PID = OFF
OMNITRACE_CRITICAL_TRACE = OFF
# debug
OMNITRACE_DEBUG = OFF
OMNITRACE_VERBOSE = 1
OMNITRACE_DL_VERBOSE = 1
# output fields
OMNITRACE_OUTPUT_PREFIX = %tag%-
OMNITRACE_OUTPUT_PATH = omnitrace-example-output
OMNITRACE_TIME_OUTPUT = OFF
# timemory fields
OMNITRACE_PAPI_EVENTS = PAPI_TOT_INS PAPI_FP_INS
OMNITRACE_TIMEMORY_COMPONENTS = wall_clock trip_count
# sampling fields
OMNITRACE_SAMPLING_FREQ = 10
# rocm-smi fields
OMNITRACE_ROCM_SMI_DEVICES = 1
# misc env variables
OMNITRACE_SAMPLING_KEEP_DYNINST_SUFFIX = OFF
OMNITRACE_SAMPLING_KEEP_INTERNAL = OFF
```
### Sample XML Configuration File
The full XML specification for a configuration value contains
a lot of information:
```xml
<?xml version="1.0" encoding="utf-8"?>
<timemory_xml>
<omnitrace>
<settings>
<cereal_class_version>2</cereal_class_version>
<!-- Full setting specification -->
<OMNITRACE_ADD_SECONDARY>
<cereal_class_version>1</cereal_class_version>
<name>add_secondary</name>
<environ>OMNITRACE_ADD_SECONDARY</environ>
<description>...</description>
<count>-1</count>
<max_count>1</max_count>
<cmdline>
<value0>--timemory-add-secondary</value0>
</cmdline>
<categories>
<value0>component</value0>
<value1>data</value1>
<value2>native</value2>
</categories>
<data_type>bool</data_type>
<initial>true</initial>
<value>true</value>
</OMNITRACE_ADD_SECONDARY>
<!-- etc. -->
</settings>
</omnitrace>
</timemory_xml>
```
However, when writing an XML configuration file, the following is perfectly acceptable
to set `OMNITRACE_ADD_SECONDARY=false`:
```xml
<?xml version="1.0" encoding="utf-8"?>
<timemory_xml>
<omnitrace>
<settings>
<OMNITRACE_ADD_SECONDARY>
<value>false</value>
</OMNITRACE_ADD_SECONDARY>
</settings>
</omnitrace>
</timemory_xml>
```
### Sample JSON Configuration File
The full JSON specification for a configuration value contains the same information as the XML:
```json
{
"omnitrace": {
"settings": {
"OMNITRACE_ADD_SECONDARY": {
"count": -1,
"name": "add_secondary",
"data_type": "bool",
"initial": true,
"value": true,
"max_count": 1,
"cmdline": [
"--timemory-add-secondary"
],
"environ": "OMNITRACE_ADD_SECONDARY",
"cereal_class_version": 1,
"categories": [
"component",
"data",
"native"
],
"description": "Enable/disable components adding secondary (child) entries when available. E.g. suppress individual CUDA kernels, etc. when using Cupti components"
}
}
}
}
```
Similarly, when writing a JSON configuration file, the following is perfectly acceptable
to set `OMNITRACE_ADD_SECONDARY=false`:
```json
{
"omnitrace": {
"settings": {
"OMNITRACE_ADD_SECONDARY": {
"value": false
}
}
}
}
```
+29
Dosyayı Görüntüle
@@ -0,0 +1,29 @@
#!/bin/bash -e
message()
{
    # Print all arguments, joined by spaces, as a banner line surrounded by
    # blank lines; `echo -e` expands the "\n" escapes.
    local _text="${*}"
    echo -e "\n\n##### ${_text}... #####\n"
}
# Resolve the directory containing this script to an absolute path: WORK_DIR is
# used again after the `cd` below, so a relative value would no longer resolve
WORK_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
message "Changing directory to ${WORK_DIR}"
cd "${WORK_DIR}"
# the source directory is the parent of the directory containing this script
SOURCE_DIR=$(cd "${WORK_DIR}/.." &> /dev/null && pwd)
message "Source directory is ${SOURCE_DIR}"
message "Generating omnitrace.dox"
cmake -DSOURCE_DIR="${SOURCE_DIR}" -P "${WORK_DIR}/generate-doxyfile.cmake"
message "Generating doxygen xml files"
doxygen omnitrace.dox
message "Building html documentation"
make html
message "Removing stale documentation in ${SOURCE_DIR}/docs/"
# ${VAR:?} aborts instead of expanding to "/docs/*" if SOURCE_DIR is ever empty
rm -rf "${SOURCE_DIR:?}"/docs/*
message "Copying docs-source/_build/html/* to docs/"
cp -r "${WORK_DIR}"/_build/html/* "${SOURCE_DIR}/docs/"
+9
Dosyayı Görüntüle
@@ -0,0 +1,9 @@
#!/bin/bash -e
# Absolute path of the directory containing this script
WORK_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
# the source directory is the parent of the directory containing this script
SOURCE_DIR=$(cd "${WORK_DIR}/.." &> /dev/null && pwd)
# generate-doxyfile.cmake and omnitrace.dox are referenced relative to the
# script directory, so run from there regardless of the caller's cwd
cd "${WORK_DIR}"
cmake -DSOURCE_DIR="${SOURCE_DIR}" -P "${WORK_DIR}/generate-doxyfile.cmake"
doxygen omnitrace.dox
+212
Dosyayı Görüntüle
@@ -0,0 +1,212 @@
# User API
```eval_rst
.. doxygenfile:: omnitrace/user.h
```
By default, when omnitrace detects any `omnitrace_user_start_*` or `omnitrace_user_stop_*` function, instrumentation
is disabled at start-up -- thus, `omnitrace_user_stop_trace()` is not required at the beginning of main. This behavior
can be manually controlled via the `OMNITRACE_INIT_ENABLED` environment variable. User-defined regions are always
recorded, regardless of whether `omnitrace_user_start_*` or `omnitrace_user_stop_*` has been called.
## Example
### User API Implementation
```cpp
#include <omnitrace/user.h>
#include <atomic>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <thread>
#include <vector>
// running sum of all fib() results; added to by run() and printed by main()
std::atomic<long> total{ 0 };
// recursive fibonacci; marked noinline (presumably so the function remains
// visible to instrumentation -- TODO confirm)
long
fib(long n) __attribute__((noinline));
// calls fib() nitr times and adds the results to `total`; also noinline
void
run(size_t nitr, long) __attribute__((noinline));
// replacement "push region" callback installed by main()
int
custom_push_region(const char* name);
namespace
{
// callback retrieved in main() via omnitrace_user_get_callbacks();
// custom_push_region() forwards to it
int (*omnitrace_push_region_f)(const char*) = nullptr;
}
// Example driver. Optional arguments: argv[1] = fibonacci input, argv[2] = number
// of threads, argv[3] = iterations per run() call. Runs run() on the worker
// threads and once on the main thread, then prints the accumulated total.
int
main(int argc, char** argv)
{
    // get the internal callback to start a user-defined region
    omnitrace_user_get_callbacks(OMNITRACE_USER_REGION, (void**) &omnitrace_push_region_f,
                                 nullptr);
    // assign the custom callback to start a user-defined region
    if(omnitrace_push_region_f)
        omnitrace_user_configure(OMNITRACE_USER_REGION, (void*) &custom_push_region,
                                 nullptr);

    omnitrace_user_push_region(argv[0]);
    omnitrace_user_push_region("initialization");
    size_t nthread = std::min<size_t>(16, std::thread::hardware_concurrency());
    size_t nitr    = 50000;
    long   nfib    = 10;
    if(argc > 1) nfib = atol(argv[1]);
    if(argc > 2) nthread = atol(argv[2]);
    if(argc > 3) nitr = atol(argv[3]);
    omnitrace_user_pop_region("initialization");

    printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", argv[0],
           nthread, argv[0], nitr, argv[0], nfib);

    omnitrace_user_push_region("thread_creation");
    std::vector<std::thread> threads{};
    threads.reserve(nthread);
    // disable instrumentation for child threads
    omnitrace_user_stop_thread_trace();
    for(size_t i = 0; i < nthread; ++i)
    {
        // odd-indexed threads get 10% less work, even-indexed threads 10% more
        size_t _nitr = ((i % 2) == 1) ? (nitr - (0.1 * nitr)) : (nitr + (0.1 * nitr));
        long   _nfib = ((i % 2) == 1) ? (nfib - (0.1 * nfib)) : (nfib + (0.1 * nfib));
        threads.emplace_back(&run, _nitr, _nfib);
    }
    // re-enable instrumentation
    omnitrace_user_start_thread_trace();
    omnitrace_user_pop_region("thread_creation");

    omnitrace_user_push_region("thread_wait");
    for(auto& itr : threads)
        itr.join();
    omnitrace_user_pop_region("thread_wait");

    run(nitr, nfib);

    // nthread is a size_t, so it is printed with %zu (as in the printf above)
    printf("[%s] fibonacci(%li) x %zu = %li\n", argv[0], nfib, nthread, total.load());
    omnitrace_user_pop_region(argv[0]);
    return 0;
}
// Naive recursive fibonacci: any n below 2 is returned unchanged, otherwise
// the result is the sum of the two preceding values.
long
fib(long n)
{
    if(n < 2) return n;
    return fib(n - 1) + fib(n - 2);
}
// Expands to a C string of the form "<function>(<n>) x <nitr>".
// The expansion refers to variables named `n` and `nitr`, so it can only be
// used in a scope where both exist. The pointer comes from a temporary
// std::string and is only valid until the end of the full expression in which
// the macro is used.
#define RUN_LABEL \
std::string{ std::string{ __FUNCTION__ } + "(" + std::to_string(n) + ") x " + \
std::to_string(nitr) } \
.c_str()
// Evaluates fib(n) nitr times inside a user-defined region labelled by
// RUN_LABEL and adds the summed results to the global `total`.
void
run(size_t nitr, long n)
{
    omnitrace_user_push_region(RUN_LABEL);

    // accumulate locally so the shared atomic is only updated once
    long   _sum       = 0;
    size_t _remaining = nitr;
    while(_remaining > 0)
    {
        _sum += fib(n);
        --_remaining;
    }
    total.fetch_add(_sum);

    omnitrace_user_pop_region(RUN_LABEL);
}
// Custom "push region" callback: logs the region name, then forwards to the
// callback stored in omnitrace_push_region_f and returns its result.
int
custom_push_region(const char* name)
{
    printf("Pushing custom region :: %s\n", name);
    auto* const _forward = omnitrace_push_region_f;
    return _forward(name);
}
```
### User API Output
```console
$ omnitrace -l --min-address-range=0 --min-address-range-loop=0 --min-instructions=8 -E custom_push_region -o -- ./user-api
...
$ export OMNITRACE_USE_TIMEMORY=ON
$ export OMNITRACE_USE_PID=OFF
$ export OMNITRACE_TIME_OUTPUT=OFF
$ export OMNITRACE_OUTPUT_PATH=omnitrace-example-output
$ ./user-api.inst 20 4 100
Pushing custom region :: ./user-api.inst
[omnitrace][omnitrace_init_tooling] Instrumentation mode: Trace
______ .___ ___. .__ __. __ .___________..______ ___ ______ _______
/ __ \ | \/ | | \ | | | | | || _ \ / \ / || ____|
| | | | | \ / | | \| | | | `---| |----`| |_) | / ^ \ | ,----'| |__
| | | | | |\/| | | . ` | | | | | | / / /_\ \ | | | __|
| `--' | | | | | | |\ | | | | | | |\ \----./ _____ \ | `----.| |____
\______/ |__| |__| |__| \__| |__| |__| | _| `._____/__/ \__\ \______||_______|
Pushing custom region :: initialization
[./user-api.inst] Threads: 4
[./user-api.inst] Iterations: 100
[./user-api.inst] fibonacci(20)...
Pushing custom region :: thread_creation
Pushing custom region :: run(20) x 100
Pushing custom region :: thread_wait
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
[./user-api.inst] fibonacci(20) x 4 = 3382500
[omnitrace][2637959][0] omnitrace : 2.716905 sec wall_clock, 1.216 mb peak_rss, 3.680000 sec cpu_clock, 135.4 % cpu_util [laps: 1]
[omnitrace][2637959][0] user-api.inst/thread-0 : 2.715708 sec wall_clock, 2.354223 sec thread_cpu_clock, 86.7 % thread_cpu_util, 1.216 mb peak_rss [laps: 1]
[omnitrace][2637959][0] user-api.inst/thread-1 : 0.329802 sec wall_clock, 0.329739 sec thread_cpu_clock, 100.0 % thread_cpu_util, 0.000 mb peak_rss [laps: 1]
[omnitrace][2637959][0] user-api.inst/thread-2 : 0.355981 sec wall_clock, 0.335795 sec thread_cpu_clock, 94.3 % thread_cpu_util, 0.528 mb peak_rss [laps: 1]
[omnitrace][2637959][0] user-api.inst/thread-3 : 0.341329 sec wall_clock, 0.331214 sec thread_cpu_clock, 97.0 % thread_cpu_util, 0.456 mb peak_rss [laps: 1]
[omnitrace][2637959][0] user-api.inst/thread-4 : 0.360631 sec wall_clock, 0.330374 sec thread_cpu_clock, 91.6 % thread_cpu_util, 0.600 mb peak_rss [laps: 1]
[wall_clock]|0> Outputting 'omnitrace-example-output/wall_clock.json'...
[wall_clock]|0> Outputting 'omnitrace-example-output/wall_clock.tree.json'...
[wall_clock]|0> Outputting 'omnitrace-example-output/wall_clock.txt'...
[metadata::manager::finalize]> Outputting 'omnitrace-example-output/metadata.json' and 'omnitrace-example-output/functions.json'...
$ cat omnitrace-example-output/wall_clock.txt
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| REAL-CLOCK TIMER (I.E. WALL-CLOCK TIMER) |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LABEL | COUNT | DEPTH | METRIC | UNITS | SUM | MEAN | MIN | MAX | VAR | STDDEV | % SELF |
|-----------------------------------------------------------------------|---------|--------|------------|--------|----------|----------|----------|----------|----------|----------|--------|
| |0>>> ./user-api.inst | 1 | 0 | wall_clock | sec | 2.715611 | 2.715611 | 2.715611 | 2.715611 | 0.000000 | 0.000000 | 0.0 |
| |0>>> |_initialization | 1 | 1 | wall_clock | sec | 0.000001 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |0>>> |_thread_creation | 1 | 1 | wall_clock | sec | 0.000170 | 0.000170 | 0.000170 | 0.000170 | 0.000000 | 0.000000 | 100.0 |
| |0>>> |_thread_wait | 1 | 1 | wall_clock | sec | 0.360751 | 0.360751 | 0.360751 | 0.360751 | 0.000000 | 0.000000 | 0.0 |
| |1>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.329472 | 0.329472 | 0.329472 | 0.329472 | 0.000000 | 0.000000 | 100.0 |
| |3>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.331028 | 0.331028 | 0.331028 | 0.331028 | 0.000000 | 0.000000 | 100.0 |
| |2>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.335554 | 0.335554 | 0.335554 | 0.335554 | 0.000000 | 0.000000 | 100.0 |
| |4>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 0.330220 | 0.330220 | 0.330220 | 0.330220 | 0.000000 | 0.000000 | 100.0 |
| |0>>> |_run | 1 | 1 | wall_clock | sec | 2.354618 | 2.354618 | 2.354618 | 2.354618 | 0.000000 | 0.000000 | 0.0 |
| |0>>> |_run(20) x 100 | 1 | 2 | wall_clock | sec | 2.354600 | 2.354600 | 2.354600 | 2.354600 | 0.000000 | 0.000000 | 48.3 |
| |0>>> |_fib | 1094600 | 3 | wall_clock | sec | 1.217671 | 0.000001 | 0.000000 | 0.000055 | 0.000000 | 0.000002 | 41.3 |
| |0>>> |_fib | 418100 | 4 | wall_clock | sec | 0.714197 | 0.000002 | 0.000000 | 0.000050 | 0.000000 | 0.000002 | 38.1 |
| |0>>> |_fib | 258400 | 5 | wall_clock | sec | 0.441874 | 0.000002 | 0.000000 | 0.000047 | 0.000000 | 0.000002 | 37.9 |
| |0>>> |_fib | 159700 | 6 | wall_clock | sec | 0.274224 | 0.000002 | 0.000000 | 0.000044 | 0.000000 | 0.000002 | 37.9 |
| |0>>> |_fib | 98700 | 7 | wall_clock | sec | 0.170399 | 0.000002 | 0.000000 | 0.000042 | 0.000000 | 0.000002 | 37.7 |
| |0>>> |_fib | 61000 | 8 | wall_clock | sec | 0.106093 | 0.000002 | 0.000000 | 0.000039 | 0.000000 | 0.000002 | 37.5 |
| |0>>> |_fib | 37700 | 9 | wall_clock | sec | 0.066316 | 0.000002 | 0.000000 | 0.000036 | 0.000000 | 0.000002 | 40.2 |
| |0>>> |_fib | 23300 | 10 | wall_clock | sec | 0.039640 | 0.000002 | 0.000000 | 0.000033 | 0.000000 | 0.000002 | 38.2 |
| |0>>> |_fib | 14400 | 11 | wall_clock | sec | 0.024504 | 0.000002 | 0.000000 | 0.000030 | 0.000000 | 0.000002 | 37.9 |
| |0>>> |_fib | 8900 | 12 | wall_clock | sec | 0.015219 | 0.000002 | 0.000000 | 0.000027 | 0.000000 | 0.000002 | 38.1 |
| |0>>> |_fib | 5500 | 13 | wall_clock | sec | 0.009417 | 0.000002 | 0.000000 | 0.000024 | 0.000000 | 0.000002 | 38.3 |
| |0>>> |_fib | 3400 | 14 | wall_clock | sec | 0.005806 | 0.000002 | 0.000000 | 0.000021 | 0.000000 | 0.000002 | 38.4 |
| |0>>> |_fib | 2100 | 15 | wall_clock | sec | 0.003576 | 0.000002 | 0.000000 | 0.000019 | 0.000000 | 0.000002 | 38.4 |
| |0>>> |_fib | 1300 | 16 | wall_clock | sec | 0.002201 | 0.000002 | 0.000000 | 0.000016 | 0.000000 | 0.000002 | 40.3 |
| |0>>> |_fib | 800 | 17 | wall_clock | sec | 0.001315 | 0.000002 | 0.000000 | 0.000014 | 0.000000 | 0.000002 | 42.1 |
| |0>>> |_fib | 500 | 18 | wall_clock | sec | 0.000762 | 0.000002 | 0.000000 | 0.000010 | 0.000000 | 0.000001 | 42.1 |
| |0>>> |_fib | 300 | 19 | wall_clock | sec | 0.000441 | 0.000001 | 0.000000 | 0.000008 | 0.000000 | 0.000001 | 47.8 |
| |0>>> |_fib | 200 | 20 | wall_clock | sec | 0.000230 | 0.000001 | 0.000000 | 0.000006 | 0.000000 | 0.000001 | 49.0 |
| |0>>> |_fib | 100 | 21 | wall_clock | sec | 0.000117 | 0.000001 | 0.000001 | 0.000003 | 0.000000 | 0.000000 | 84.5 |
| |0>>> |_fib | 100 | 22 | wall_clock | sec | 0.000018 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 | 100.0 |
| |0>>> std::vector<std::thread, std::allocator<std::thread> >::~vector | 1 | 0 | wall_clock | sec | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.0 |
|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
```
+33 -58
Dosyayı Görüntüle
@@ -1,8 +1,6 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
@@ -22,36 +20,19 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#if !defined(OMNITRACE_DL_SOURCE)
# define OMNITRACE_DL_SOURCE 1
#endif
#define OMNITRACE_COMMON_LIBRARY_NAME "dl"
#include "common/defines.h"
#include "dl.hpp"
#include "common/delimit.hpp"
#include "common/environment.hpp"
#include "common/invoke.hpp"
#include "common/join.hpp"
#include "omnitrace/user.h"
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <dlfcn.h>
#include <functional>
#include <gnu/libc-version.h>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#if !defined(OMNITRACE_USE_OMPT)
# define OMNITRACE_USE_OMPT 0
#endif
//--------------------------------------------------------------------------------------//
#define OMNITRACE_DLSYM(VARNAME, HANDLE, FUNCNAME) \
if(HANDLE) \
@@ -69,38 +50,6 @@
} \
}
//--------------------------------------------------------------------------------------//
//
// omnitrace symbols
//
//--------------------------------------------------------------------------------------//
extern "C"
{
struct ompt_start_tool_result_t;
void omnitrace_init_library(void) OMNITRACE_PUBLIC_API;
void omnitrace_init(const char*, bool, const char*) OMNITRACE_PUBLIC_API;
void omnitrace_finalize(void) OMNITRACE_PUBLIC_API;
void omnitrace_set_env(const char* env_name,
const char* env_val) OMNITRACE_PUBLIC_API;
void omnitrace_set_mpi(bool use, bool attached) OMNITRACE_PUBLIC_API;
void omnitrace_push_trace(const char* name) OMNITRACE_PUBLIC_API;
void omnitrace_pop_trace(const char* name) OMNITRACE_PUBLIC_API;
int omnitrace_user_start_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_stop_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_start_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_stop_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_push_region_dl(const char*) OMNITRACE_HIDDEN_API;
int omnitrace_user_pop_region_dl(const char*) OMNITRACE_HIDDEN_API;
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
const char*) OMNITRACE_PUBLIC_API;
}
//--------------------------------------------------------------------------------------//
namespace omnitrace
@@ -405,6 +354,32 @@ extern "C"
}
}
// Begin a named region.
// Does nothing when the dl layer is inactive. When tracing is disabled on the
// calling thread, the region is not forwarded; the per-thread counter is
// incremented instead. Otherwise the call is forwarded through the indirect
// function table.
void omnitrace_push_region(const char* name)
{
    if(!dl::get_active()) return;

    if(!dl::get_thread_enabled())
    {
        // tracing is off for this thread: only record that a push occurred
        ++dl::get_thread_count();
        return;
    }

    OMNITRACE_DL_INVOKE(get_indirect().omnitrace_push_region_f, name);
}
// End a named region.
// Does nothing when the dl layer is inactive. When tracing is disabled on the
// calling thread, the per-thread counter is post-decremented and, if it was
// zero before the decrement, omnitrace_user_start_thread_trace_dl() is called.
// Otherwise the call is forwarded through the indirect function table.
void omnitrace_pop_region(const char* name)
{
    if(!dl::get_active()) return;

    if(!dl::get_thread_enabled())
    {
        // compare the value *before* the decrement against zero
        if(dl::get_thread_count()-- == 0)
        {
            omnitrace_user_start_thread_trace_dl();
        }
        return;
    }

    OMNITRACE_DL_INVOKE(get_indirect().omnitrace_pop_region_f, name);
}
void omnitrace_set_env(const char* a, const char* b)
{
setenv(a, b, 0);
+83
Dosyayı Görüntüle
@@ -0,0 +1,83 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "common/defines.h"
#include "omnitrace/user.h"
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <dlfcn.h>
#include <functional>
#include <gnu/libc-version.h>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#if !defined(OMNITRACE_USE_OMPT)
# define OMNITRACE_USE_OMPT 0
#endif
//--------------------------------------------------------------------------------------//
//
// omnitrace symbols
//
//--------------------------------------------------------------------------------------//
// C-linkage declarations for the omnitrace entry points.
// Functions tagged OMNITRACE_PUBLIC_API are always declared; the *_dl helpers
// and the OMPT entry point are declared only when OMNITRACE_DL_SOURCE is
// defined and greater than zero.
extern "C"
{
// library lifetime: initialization and finalization entry points
void omnitrace_init_library(void) OMNITRACE_PUBLIC_API;
void omnitrace_init(const char*, bool, const char*) OMNITRACE_PUBLIC_API;
void omnitrace_finalize(void) OMNITRACE_PUBLIC_API;
// configuration hooks
void omnitrace_set_env(const char* env_name,
const char* env_val) OMNITRACE_PUBLIC_API;
void omnitrace_set_mpi(bool use, bool attached) OMNITRACE_PUBLIC_API;
// push/pop pairs taking a name
void omnitrace_push_trace(const char* name) OMNITRACE_PUBLIC_API;
void omnitrace_pop_trace(const char* name) OMNITRACE_PUBLIC_API;
void omnitrace_push_region(const char*) OMNITRACE_PUBLIC_API;
void omnitrace_pop_region(const char*) OMNITRACE_PUBLIC_API;
#if defined(OMNITRACE_DL_SOURCE) && (OMNITRACE_DL_SOURCE > 0)
// helpers marked OMNITRACE_HIDDEN_API, visible only to the dl source itself
// NOTE(review): presumably these back the callbacks declared in
// "omnitrace/user.h" -- confirm against the dl implementation
int omnitrace_user_start_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_stop_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_start_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_stop_thread_trace_dl(void) OMNITRACE_HIDDEN_API;
int omnitrace_user_push_region_dl(const char*) OMNITRACE_HIDDEN_API;
int omnitrace_user_pop_region_dl(const char*) OMNITRACE_HIDDEN_API;
// forward declaration only; the full type is not needed for the prototype below
struct ompt_start_tool_result_t;
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
const char*) OMNITRACE_PUBLIC_API;
#endif
}
+3
Dosyayı Görüntüle
@@ -232,6 +232,9 @@ get_sampling_freq();
double&
get_sampling_delay();
std::string
get_sampling_cpus();
double&
get_thread_sampling_freq();
+18 -3
Dosyayı Görüntüle
@@ -172,6 +172,13 @@ configure_settings()
"increasing this value can fix deadlocks during init",
0.5, "sampling");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_SAMPLING_CPUS",
"CPUs to collect frequency information for. Values should be separated by commas "
"and can be explicit or ranges, e.g. 0,1,5-8. An empty value implies 'all' and "
"'none' suppresses all CPU frequency sampling",
"", "sampling");
auto _backend = tim::get_env_choice<std::string>(
"OMNITRACE_BACKEND",
(_system_backend)
@@ -522,9 +529,10 @@ print_settings()
if(dmp::rank() > 0) return;
static std::set<tim::string_view_t> _sample_options = {
"OMNITRACE_SAMPLING_FREQ", "OMNITRACE_SAMPLING_DELAY",
"OMNITRACE_FLAT_SAMPLING", "OMNITRACE_TIMELINE_SAMPLING",
"OMNITRACE_FLAT_SAMPLING", "OMNITRACE_TIMELINE_SAMPLING",
"OMNITRACE_SAMPLING_FREQ", "OMNITRACE_SAMPLING_DELAY",
"OMNITRACE_SAMPLING_CPUS", "OMNITRACE_FLAT_SAMPLING",
"OMNITRACE_TIMELINE_SAMPLING", "OMNITRACE_FLAT_SAMPLING",
"OMNITRACE_TIMELINE_SAMPLING",
};
static std::set<tim::string_view_t> _perfetto_options = {
"OMNITRACE_OUTPUT_FILE",
@@ -915,6 +923,13 @@ get_sampling_delay()
return static_cast<tim::tsettings<double>&>(*_v->second).get();
}
std::string
get_sampling_cpus()
{
static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUS");
return static_cast<tim::tsettings<std::string>&>(*_v->second).get();
}
int64_t
get_critical_trace_count()
{
+44 -2
Dosyayı Görüntüle
@@ -29,6 +29,7 @@
#include "library/timemory.hpp"
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
@@ -40,8 +41,9 @@ namespace
{
struct cpu_freq
{};
using freq_pair_t = std::pair<size_t, double>;
std::vector<std::deque<freq_pair_t>> cpu_frequencies = {};
using freq_pair_t = std::pair<size_t, double>;
std::vector<std::deque<freq_pair_t>> cpu_frequencies = {};
std::set<size_t> enabled_cpu_frequencies = {};
struct cpu_mem
{};
@@ -107,6 +109,42 @@ config()
_ifs.close();
// Translate the OMNITRACE_SAMPLING_CPUS setting into the set of CPU indices
// whose frequency should be recorded. Entries are separated by any of
// ",; \t" and are either a single index ("3") or an inclusive range ("5-8").
//
// NOTE(review): for "none" and "all" the set is left untouched (empty). The
// sampling loop skips a CPU only when the set is non-empty, so "none"
// currently behaves like "all" -- confirm whether suppression is handled
// elsewhere.
auto _enabled_val = get_sampling_cpus();
if(_enabled_val != "none" && _enabled_val != "all")
{
    auto _enabled = tim::delimit(_enabled_val, ",; \t");
    if(_enabled.empty())
    {
        // no explicit entries: enable every CPU
        for(size_t i = 0; i < _ncpu; ++i)
            enabled_cpu_frequencies.emplace(i);
    }
    for(auto&& _v : _enabled)
    {
        // reject anything that is not made up of digits and '-'
        if(_v.find_first_not_of("0123456789-") != std::string::npos)
        {
            OMNITRACE_VERBOSE_F(
                0,
                "Invalid CPU specification. Only numerical values (e.g., 0) or "
                "ranges (e.g., 0-7) are permitted. Ignoring %s...",
                _v.c_str());
            continue;
        }
        if(_v.find('-') != std::string::npos)
        {
            auto _vv = tim::delimit(_v, "-");
            OMNITRACE_CONDITIONAL_THROW(
                _vv.size() != 2,
                "Invalid CPU range specification: %s. Required format N-M, e.g. 0-4",
                _v.c_str());
            // parse the bounds once instead of on every loop iteration
            const size_t _first = std::stoull(_vv.at(0));
            const size_t _last  = std::stoull(_vv.at(1));
            // the range N-M is inclusive of M (previously M was dropped)
            for(size_t i = _first; i <= _last; ++i)
                enabled_cpu_frequencies.insert(i);
        }
        else
        {
            enabled_cpu_frequencies.insert(std::stoull(_v));
        }
    }
}
cpu_frequencies.resize(_ncpu);
cpu_mhz_pos = _cpu_mhz_pos;
ifs = std::make_unique<std::ifstream>("/proc/cpuinfo", std::ifstream::binary);
@@ -129,7 +167,11 @@ sample()
auto _ts = tim::get_clock_real_now<size_t, std::nano>();
for(int64_t i = 0; i < ncpu; ++i)
{
if(!enabled_cpu_frequencies.empty() && enabled_cpu_frequencies.count(i) == 0)
continue;
cpu_frequencies.at(i).emplace_back(_ts, _read_cpu_freq(i));
}
}
void
+1 -3
Dosyayı Görüntüle
@@ -1,8 +1,6 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
+1 -3
Dosyayı Görüntüle
@@ -1,8 +1,6 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
+1 -1
Dosyayı Görüntüle
@@ -206,7 +206,7 @@ function(OMNITRACE_ADD_TEST)
_TEST
baseline binary-rewrite binary-rewrite-run binary-rewrite-sampling
binary-rewrite-run-sampling runtime-instrument runtime-instrument-sampling)
string(REPLACE "-run-" "-" _prefix "${TEST_NAME}-${_TEST}/")
string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/")
set(_environ "${TEST_ENVIRONMENT}")
set(_labels "${_TEST}")
set(_timeout ${TEST_REWRITE_TIMEOUT})