OMPT Target Offload Support (#17)

- Porting from https://github.com/ROCm/omnitrace/pull/411
- Improve OMPT support
- Add OpenMP target example to testing
- Update Timemory submodule to use ROCm/Timemory rather than NERSC/Timemory
- Update `actions/upload-artifacts` to v4
- Standardize the `cmake_minimum_required` to 3.18.4 across workflows, project, and examples
- Updated Ubuntu 20.04 workflows

[ROCm/rocprofiler-systems commit: 9da7365471]
Dieser Commit ist enthalten in:
David Galiffi
2024-11-07 16:49:32 -05:00
Ursprung 86a2901239
Commit d8c98d2d4d
34 geänderte Dateien mit 800 neuen und 124 gelöschten Zeilen
@@ -63,8 +63,9 @@ jobs:
chmod +x /opt/trace_processor/bin/trace_processor_shell
fi
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
- name: Configure Env
run:
@@ -136,7 +137,7 @@ jobs:
- name: CTest Artifacts
if: failure()
continue-on-error: True
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: ctest-${{ github.job }}-${{ strategy.job-index }}-log
path: |
@@ -145,7 +146,7 @@ jobs:
- name: Data Artifacts
if: failure()
continue-on-error: True
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: data-${{ github.job }}-${{ strategy.job-index }}-files
path: |
@@ -67,8 +67,9 @@ jobs:
chmod +x /opt/trace_processor/bin/trace_processor_shell
fi
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
- name: Install ROCm Packages
if: ${{ matrix.rocm-version > 0 }}
@@ -46,7 +46,7 @@ jobs:
compiler: ['g++-7', 'g++-8']
lto: ['OFF']
strip: ['OFF']
python: ['ON']
python: ['OFF']
build-type: ['Release']
mpi-headers: ['OFF']
static-libgcc: ['OFF']
@@ -97,8 +97,9 @@ jobs:
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
apt-get -y --purge autoremove &&
apt-get -y clean &&
/opt/conda/bin/conda clean -y -a
@@ -278,8 +279,9 @@ jobs:
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
apt-get -y --purge autoremove &&
apt-get -y clean &&
/opt/conda/bin/conda clean -y -a
@@ -408,16 +410,6 @@ jobs:
ompt: ['ON']
papi: ['ON']
deps: ['libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev']
include:
- compiler: 'g++'
mpi: 'nompi'
boost: 'ON'
tbb: 'ON'
build-type: 'Release'
python: 'ON'
ompt: 'OFF'
papi: 'OFF'
deps: ''
env:
ELFUTILS_DOWNLOAD_VERSION: 0.186
@@ -440,7 +432,8 @@ jobs:
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
sudo apt-get -y --purge autoremove &&
sudo apt-get -y clean
@@ -592,8 +585,9 @@ jobs:
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done &&
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
apt-get -y --purge autoremove &&
apt-get -y clean &&
/opt/conda/bin/conda clean -y -a
@@ -111,8 +111,9 @@ jobs:
apt-get upgrade -y &&
apt-get install -y autoconf bison build-essential clang environment-modules gettext libfabric-dev libiberty-dev libomp-dev libopenmpi-dev libtool m4 openmpi-bin python3-pip texinfo ${{ matrix.compiler }} &&
python3 -m pip install --upgrade pip &&
python3 -m pip install numpy perfetto dataclasses &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
python3 -m pip install --upgrade numpy perfetto dataclasses &&
python3 -m pip install 'cmake==3.18.4' &&
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
- name: Install ROCm Packages
timeout-minutes: 25
+1 -1
Datei anzeigen
@@ -1,6 +1,6 @@
[submodule "external/timemory"]
path = external/timemory
url = https://github.com/NERSC/timemory.git
url = https://github.com/ROCm/timemory.git
[submodule "external/perfetto"]
path = external/perfetto
url = https://github.com/google/perfetto.git
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND CMAKE_CURRENT_SOURCE_DIR STREQUAL
CMAKE_SOURCE_DIR)
+1 -1
Datei anzeigen
@@ -132,7 +132,7 @@ export LD_LIBRARY_PATH=/opt/rocprofiler-systems/lib:${LD_LIBRARY_PATH}
Generate a rocprofiler-systems configuration file using `rocprof-sys-avail -G rocprof-sys.cfg`. Optionally, use `rocprof-sys-avail -G rocprof-sys.cfg --all` for
a verbose configuration file with descriptions, categories, etc. Modify the configuration file as desired, e.g. enable
[perfetto](https://perfetto.dev/), [timemory](https://github.com/NERSC/timemory), sampling, and process-level sampling by default
[perfetto](https://perfetto.dev/), [timemory](https://github.com/ROCm/timemory), sampling, and process-level sampling by default
and tweak some sampling default values:
```console
@@ -677,7 +677,7 @@ mark_as_advanced(TIMEMORY_PROJECT_NAME)
rocprofiler_systems_checkout_git_submodule(
RELATIVE_PATH external/timemory
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
REPO_URL https://github.com/NERSC/timemory.git
REPO_URL https://github.com/ROCm/timemory.git
REPO_BRANCH omnitrace)
rocprofiler_systems_save_variables(
@@ -7,7 +7,7 @@ The ROCm Systems Profiler feature set and use cases
***************************************
`ROCm Systems Profiler <https://github.com/ROCm/rocprofiler-systems>`_ is designed to be highly extensible.
Internally, it leverages the `Timemory performance analysis toolkit <https://github.com/NERSC/timemory>`_
Internally, it leverages the `Timemory performance analysis toolkit <https://github.com/ROCm/timemory>`_
to manage extensions, resources, data, and other items. It supports the following features,
modes, metrics, and APIs.
@@ -28,7 +28,7 @@ use the ``rocprof-sys-avail -G ~/.rocprof-sys.cfg --all`` option
for a verbose configuration file with descriptions, categories, and additional information.
Modify ``${HOME}/.rocprof-sys.cfg`` as required. For example, enable `Perfetto <https://perfetto.dev/>`_,
`Timemory <https://github.com/NERSC/timemory>`_, sampling, and process-level sampling by default
`Timemory <https://github.com/ROCm/timemory>`_, sampling, and process-level sampling by default
and tweak the default sampling values.
.. code-block:: shell
@@ -64,7 +64,7 @@ accepts a case insensitive match for nearly all common Boolean logic expressions
Exploring components
-----------------------------------
ROCm Systems Profiler uses `Timemory <https://github.com/NERSC/timemory>`_ extensively to provide
ROCm Systems Profiler uses `Timemory <https://github.com/ROCm/timemory>`_ extensively to provide
various capabilities and manage
data and resources. By default, with ``ROCPROFSYS_PROFILE=ON``, ROCm Systems Profiler only collects wall-clock
timing values. However, by modifying the ``ROCPROFSYS_TIMEMORY_COMPONENTS`` setting,
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-examples LANGUAGES C CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-causal-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-code-coverage-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-fork LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-lulesh-example LANGUAGES C CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-mpi-examples LANGUAGES C CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-openmp LANGUAGES CXX)
@@ -56,3 +56,5 @@ if(ROCPROFSYS_INSTALL_EXAMPLES)
DESTINATION bin
COMPONENT rocprofiler-systems-examples)
endif()
add_subdirectory(target)
@@ -0,0 +1,110 @@
#
#
#
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
# try to find a compatible HIP version
if(ROCmVersion_MAJOR_VERSION AND ROCmVersion_MAJOR_VERSION GREATER_EQUAL 6)
find_package(hip ${ROCmVersion_MAJOR_VERSION}.0.0)
else()
find_package(hip)
endif()
if(NOT hip_FOUND)
message(WARNING "ROCm >= 5.6 not found. Skipping OpenMP target example.")
return()
elseif(hip_FOUND AND hip_VERSION VERSION_LESS 5.6.0)
message(
WARNING
"ROCm >= 5.6 not found (found ${hip_VERSION}). Skipping OpenMP target example."
)
return()
endif()
if(NOT OMP_TARGET_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(OMP_TARGET_COMPILER
"${amdclangpp_EXECUTABLE}"
CACHE FILEPATH "OpenMP target compiler")
else()
message(WARNING "OpenMP target compiler not found. Skipping this example.")
return()
endif()
endif()
project(rocprofiler-systems-example-openmp-target-lib LANGUAGES CXX)
set(CMAKE_BUILD_TYPE "RelWithDebInfo")
set(DEFAULT_GPU_TARGETS
"gfx900"
"gfx906"
"gfx908"
"gfx90a"
"gfx940"
"gfx941"
"gfx942"
"gfx1030"
"gfx1010"
"gfx1100"
"gfx1101"
"gfx1102")
set(GPU_TARGETS
"${DEFAULT_GPU_TARGETS}"
CACHE STRING "GPU targets to compile for")
find_package(Threads REQUIRED)
add_library(openmp-target-lib SHARED)
target_sources(openmp-target-lib PRIVATE library.cpp)
target_link_libraries(openmp-target-lib PUBLIC Threads::Threads)
target_compile_options(openmp-target-lib PRIVATE -fopenmp -ggdb)
target_link_options(openmp-target-lib PUBLIC -fopenmp)
foreach(_TARGET ${GPU_TARGETS})
target_compile_options(openmp-target-lib PRIVATE --offload-arch=${_TARGET})
target_link_options(openmp-target-lib PUBLIC --offload-arch=${_TARGET})
endforeach()
message(STATUS "Using OpenMP target compiler: ${OMP_TARGET_COMPILER}")
get_filename_component(OMP_TARGET_COMPILER_DIR ${OMP_TARGET_COMPILER} PATH)
get_filename_component(OMP_TARGET_COMPILER_DIR ${OMP_TARGET_COMPILER_DIR} PATH)
message(STATUS "Using OpemMP target compiler directory: ${OMP_TARGET_COMPILER_DIR}")
if(NOT EXISTS ${OMP_TARGET_COMPILER_DIR}/llvm/lib)
message(FATAL_ERROR "${OMP_TARGET_COMPILER_DIR}/llvm/lib does not exist")
endif()
set_target_properties(
openmp-target-lib
PROPERTIES BUILD_RPATH
"${OMP_TARGET_COMPILER_DIR}/llvm/lib:${OMP_TARGET_COMPILER_DIR}/lib"
OUTPUT_NAME "openmp-target"
POSITION_INDEPENDENT_CODE ON)
rocprofiler_systems_custom_compilation(TARGET openmp-target-lib COMPILER
${OMP_TARGET_COMPILER})
add_executable(openmp-target)
target_sources(openmp-target PRIVATE main.cpp)
target_link_libraries(openmp-target PRIVATE openmp-target-lib)
target_compile_options(openmp-target PRIVATE -ggdb)
set_target_properties(
openmp-target
PROPERTIES BUILD_RPATH
"${OMP_TARGET_COMPILER_DIR}/llvm/lib:${OMP_TARGET_COMPILER_DIR}/lib"
POSITION_INDEPENDENT_CODE ON)
rocprofiler_systems_custom_compilation(TARGET openmp-target COMPILER
${OMP_TARGET_COMPILER})
@@ -0,0 +1,149 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <math.h>
#include <sstream>
#include <stdio.h>
#include <string>
#include <thread>
#include <unistd.h>
constexpr float EPS_FLOAT = 1.0e-7f;
constexpr double EPS_DOUBLE = 1.0e-15;
#pragma omp declare target
template <typename T>
T
mul(T a, T b)
{
T c;
c = a * b;
return c;
}
#pragma omp end declare target
template <typename T>
void
vmul(T* a, T* b, T* c, int N)
{
#pragma omp target map(to : a [0:N], b [0:N]) map(from : c [0:N])
#pragma omp teams distribute parallel for
for(int i = 0; i < N; i++)
{
c[i] = mul(a[i], b[i]);
}
}
int
run_impl()
{
std::this_thread::sleep_for(std::chrono::milliseconds{ 50 });
constexpr int N = 100000;
constexpr int Nc = N / 100;
int a_i[N], b_i[N], c_i[N], validate_i[N];
float a_f[N], b_f[N], c_f[N], validate_f[N];
double a_d[N], b_d[N], c_d[N], validate_d[N];
int N_errors = 0;
bool flag = false;
#pragma omp parallel for schedule(dynamic, Nc)
for(int i = 0; i < N; ++i)
{
a_f[i] = a_i[i] = i + 1;
b_f[i] = b_i[i] = i + 2;
a_d[i] = a_i[i];
b_d[i] = b_i[i];
validate_i[i] = a_i[i] * b_i[i];
validate_f[i] = a_f[i] * b_f[i];
validate_d[i] = a_d[i] * b_d[i];
}
for(int i = 0; i < 2; ++i)
{
vmul(a_i, b_i, c_i, N);
vmul(a_f, b_f, c_f, N);
vmul(a_d, b_d, c_d, N);
}
for(int i = 0; i < N; i++)
{
if(c_i[i] != validate_i[i])
{
++N_errors;
// print 1st bad index
if(!flag)
{
printf("First fail: c_i[%d](%d) != validate_i[%d](%d)\n", i, c_i[i], i,
validate_i[i]);
flag = true;
}
}
}
flag = false;
for(int i = 0; i < N; i++)
{
if(fabs(c_f[i] - validate_f[i]) > EPS_FLOAT)
{
++N_errors;
// print 1st bad index
if(!flag)
{
printf("First fail: c_f[%d](%f) != validate_f[%d](%f)\n", i,
static_cast<double>(c_f[i]), i,
static_cast<double>(validate_f[i]));
flag = true;
}
}
}
flag = false;
for(int i = 0; i < N; i++)
{
if(fabs(c_d[i] - validate_d[i]) > EPS_DOUBLE)
{
++N_errors;
// print 1st bad index
if(!flag)
{
printf("First fail: c_d[%d](%f) != validate_d[%d](%f)\n", i, c_d[i], i,
validate_d[i]);
flag = true;
}
}
}
return N_errors;
}
int
run()
{
#pragma omp parallel
{
run_impl();
}
return 0;
}
@@ -0,0 +1,52 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <math.h>
#include <sstream>
#include <stdio.h>
#include <string>
#include <unistd.h>
extern int
run();
int
main()
{
auto N_errors = run();
auto _ec = EXIT_SUCCESS;
if(N_errors == 0)
{
printf("Success\n");
}
else
{
printf("Total %d failures\n", N_errors);
printf("Fail\n");
_ec = EXIT_FAILURE;
}
return _ec;
}
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-python)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-rccl-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-transpose-example LANGUAGES CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
project(rocprofiler-systems-user-api-example LANGUAGES CXX)
Submodul projects/rocprofiler-systems/external/timemory aktualisiert: 9ce43f3293...68ce420086
@@ -207,7 +207,7 @@ EOF
verbose-run cd ${BINARY_DIR}
cat << EOF > dashboard.cmake
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
include("\${CMAKE_CURRENT_LIST_DIR}/CTestCustom.cmake")
@@ -112,7 +112,7 @@ verbose-run cp -v -r ${EXAMPLE_DIR}/${EXAMPLE_NAME}/* ${SOURCE_DIR}/
verbose-run pushd ${SOURCE_DIR}
cat << EOF > CMakeLists.txt
cmake_minimum_required(VERSION 3.16.0 FATAL_ERROR)
cmake_minimum_required(VERSION 3.18.4.0 FATAL_ERROR)
project(test LANGUAGES C CXX)
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.8)
cmake_minimum_required(VERSION 3.18.4)
if(NOT DEFINED ROCPROFSYS_VERSION)
file(READ "${CMAKE_CURRENT_LIST_DIR}/../VERSION" FULL_VERSION_STRING LIMIT_COUNT 1)
@@ -30,27 +30,171 @@
#if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0
# include "binary/link_map.hpp"
# include "core/components/fwd.hpp"
# include "library/components/category_region.hpp"
# include "library/tracing.hpp"
# include <timemory/components/ompt.hpp>
# include <timemory/components/ompt/backends.hpp>
# include <timemory/components/ompt/context.hpp>
# include <timemory/components/ompt/context_handler.hpp>
# include <timemory/components/ompt/extern.hpp>
# include <timemory/components/ompt/tool.hpp>
# include <timemory/mpl/type_traits.hpp>
# include <timemory/timemory.hpp>
# include <timemory/units.hpp>
# include <timemory/unwind/addr2line.hpp>
# include <timemory/utility/demangle.hpp>
# include <timemory/utility/join.hpp>
# include <timemory/utility/types.hpp>
# include <dlfcn.h>
# include <memory>
# include <sys/mman.h>
# include <sys/types.h>
using api_t = TIMEMORY_API;
using ompt_handle_t = tim::component::ompt_handle<api_t>;
using ompt_context_t = tim::openmp::context_handler<api_t>;
using ompt_toolset_t = typename ompt_handle_t::toolset_type;
using ompt_bundle_t = tim::component_tuple<ompt_handle_t>;
using api_t = tim::project::rocprofsys;
extern "C"
namespace rocprofsys
{
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
const char*) ROCPROFSYS_PUBLIC_API;
}
namespace component
{
struct ompt : comp::base<ompt, void>
{
using value_type = void;
using base_type = comp::base<ompt, void>;
using context_info_t = tim::openmp::context_info;
static std::string label() { return "ompt"; }
static std::string description() { return "OpenMP tools tracing"; }
ompt() = default;
~ompt() = default;
ompt(const ompt&) = default;
ompt(ompt&&) noexcept = default;
ompt& operator=(const ompt&) = default;
ompt& operator=(ompt&&) noexcept = default;
template <typename... Args>
void start(const context_info_t& _ctx_info, Args&&...) const
{
category_region<category::ompt>::start<tim::quirk::timemory>(m_prefix);
auto _ts = tracing::now();
uint64_t _cid =
(_ctx_info.target_arguments) ? _ctx_info.target_arguments->host_op_id : 0;
auto _annotate = [&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(ctx, "begin_ns", _ts);
for(const auto& itr : _ctx_info.arguments)
tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
}
};
if(_cid > 0)
{
category_region<category::ompt>::start<tim::quirk::perfetto>(
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
::perfetto::Flow::ProcessScoped(_cid), std::move(_annotate));
}
else
{
category_region<category::ompt>::start<tim::quirk::perfetto>(
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
std::move(_annotate));
}
}
template <typename... Args>
void stop(const context_info_t& _ctx_info, Args&&...) const
{
category_region<category::ompt>::stop<tim::quirk::timemory>(m_prefix);
auto _ts = tracing::now();
uint64_t _cid =
(_ctx_info.target_arguments) ? _ctx_info.target_arguments->host_op_id : 0;
auto _annotate = [&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(ctx, "end_ns", _ts);
for(const auto& itr : _ctx_info.arguments)
tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
}
};
if(_cid > 0)
{
category_region<category::ompt>::stop<tim::quirk::perfetto>(
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
std::move(_annotate));
}
else
{
category_region<category::ompt>::stop<tim::quirk::perfetto>(
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
std::move(_annotate));
}
}
template <typename... Args>
void store(const context_info_t& _ctx_info, Args&&... _args) const
{
start(_ctx_info, std::forward<Args>(_args)...);
stop(_ctx_info, std::forward<Args>(_args)...);
}
static void record(std::string_view name, ompt_id_t id, uint64_t beg_time,
uint64_t end_time, uint64_t thrd_id, uint64_t targ_id,
const context_info_t& common)
{
(void) thrd_id;
(void) targ_id;
auto _annotate = [&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
for(const auto& itr : common.arguments)
tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
}
};
auto _track = tracing::get_perfetto_track(
category::ompt{},
[](uint64_t _targ_id_v) {
return ::timemory::join::join("", "OMP Target ", _targ_id_v);
},
targ_id);
category_region<category::ompt>::start<tim::quirk::perfetto>(
name, _track, beg_time, ::perfetto::Flow::ProcessScoped(id),
std::move(_annotate));
category_region<category::ompt>::stop<tim::quirk::perfetto>(name, _track,
end_time);
}
void set_prefix(std::string_view _v) { m_prefix = _v; }
private:
std::string_view m_prefix = {};
};
} // namespace component
} // namespace rocprofsys
namespace tim
{
namespace trait
{
template <>
struct ompt_handle<api_t>
{
using type = component_tuple<::rocprofsys::component::ompt>;
};
} // namespace trait
} // namespace tim
namespace rocprofsys
{
@@ -58,6 +202,11 @@ namespace ompt
{
namespace
{
using ompt_handle_t = tim::component::ompt_handle<api_t>;
using ompt_context_t = tim::openmp::context_handler<api_t>;
using ompt_toolset_t = typename ompt_handle_t::toolset_type;
using ompt_bundle_t = tim::component_tuple<ompt_handle_t>;
std::unique_ptr<ompt_bundle_t> f_bundle = {};
bool _init_toolset_off = (trait::runtime_enabled<ompt_toolset_t>::set(false),
trait::runtime_enabled<ompt_context_t>::set(false), true);
@@ -70,10 +219,7 @@ setup()
if(!tim::settings::enabled()) return;
trait::runtime_enabled<ompt_toolset_t>::set(true);
trait::runtime_enabled<ompt_context_t>::set(true);
comp::user_ompt_bundle::global_init();
comp::user_ompt_bundle::reset();
tim::auto_lock_t lk{ tim::type_mutex<ompt_handle_t>() };
comp::user_ompt_bundle::configure<component::local_category_region<category::ompt>>();
f_bundle = std::make_unique<ompt_bundle_t>("rocprofsys/ompt",
quirk::config<quirk::auto_start>{});
}
@@ -91,10 +237,15 @@ shutdown()
ompt_context_t::cleanup();
trait::runtime_enabled<ompt_toolset_t>::set(false);
trait::runtime_enabled<ompt_context_t>::set(false);
comp::user_ompt_bundle::reset();
pthread_gotcha::shutdown();
// call the OMPT finalize callback
if(f_finalize) (*f_finalize)();
if(f_finalize)
{
for(const auto& itr : tim::openmp::get_ompt_device_functions<api_t>())
if(itr.second.stop_trace) itr.second.stop_trace(itr.second.device);
(*f_finalize)();
f_finalize = nullptr;
}
}
f_bundle.reset();
_protect = false;
@@ -115,21 +266,232 @@ tool_initialize(ompt_function_lookup_t lookup, int initial_device_num,
{
if(!rocprofsys::settings_are_configured())
{
ROCPROFSYS_BASIC_WARNING(
ROCPROFSYS_BASIC_WARNING_F(
0,
"[%s] invoked before rocprof-sys was initialized. In instrumentation mode, "
"settings exported to the environment have not been propagated yet...\n",
__FUNCTION__);
rocprofsys::configure_settings();
use_tool() = get_env("ROCPROFSYS_USE_OMPT", true, false);
}
else
{
use_tool() = rocprofsys::config::get_use_ompt();
}
use_tool() = rocprofsys::config::get_use_ompt();
if(use_tool())
{
TIMEMORY_PRINTF(stderr, "OpenMP-tools configuring for initial device %i\n\n",
initial_device_num);
f_finalize = tim::ompt::configure<TIMEMORY_OMPT_API_TAG>(
lookup, initial_device_num, tool_data);
ROCPROFSYS_BASIC_VERBOSE_F(
2, "OpenMP-tools configuring for initial device %i\n\n", initial_device_num);
static auto _generate_key = [](std::string_view _key_v,
const ::tim::openmp::argument_array_t& _args_v) {
return std::string{ _key_v };
(void) _args_v;
};
tim::openmp::get_codeptr_ra_resolver<api_t>() =
[](tim::openmp::context_info& _ctx_info) {
const auto& _key = _ctx_info.label;
const auto* codeptr_ra = _ctx_info.codeptr_ra;
auto& _args = _ctx_info.arguments;
ROCPROFSYS_BASIC_VERBOSE(2, "resolving codeptr return address for %s\n",
_key.data());
if(!codeptr_ra) return _generate_key(_key, _args);
static thread_local auto _once = std::once_flag{};
std::call_once(_once, []() { ::tim::unwind::update_file_maps(); });
auto _info = ::rocprofsys::binary::lookup_ipaddr_entry<false>(
reinterpret_cast<uintptr_t>(codeptr_ra));
if(_info)
{
_ctx_info.func = tim::demangle(_info->name);
if(_info->lineno > 0)
{
auto _linfo = _info->lineinfo.rget([](const auto& _v) -> bool {
return (_v && !_v.location.empty() && _v.line > 0);
});
if(_linfo)
{
_ctx_info.file = _linfo.location;
_ctx_info.line = _linfo.line;
_args.emplace_back("file", _ctx_info.file);
_args.emplace_back("lineinfo",
::timemory::join::join("@", _ctx_info.file,
_ctx_info.line));
}
else
{
_ctx_info.file = _info->location;
_args.emplace_back("file", _ctx_info.file);
}
return _generate_key(
::timemory::join::join(" @ ", _key, _ctx_info.func), _args);
}
else
{
return _generate_key(
::timemory::join::join(" @ ", _key, _ctx_info.func), _args);
}
}
else
{
auto _dl_info = Dl_info{ nullptr, nullptr, nullptr, nullptr };
if(dladdr(codeptr_ra, &_dl_info) != 0)
{
_ctx_info.file = _dl_info.dli_fname;
_ctx_info.func = tim::demangle(_dl_info.dli_sname);
_args.emplace_back("file", _ctx_info.file);
return _generate_key(
::timemory::join::join(
" @ ", _key,
::timemory::join::join("", _ctx_info.func, " [",
_ctx_info.file, "]")),
_args);
}
}
// since no line info could be deduced, include the codeptr return address
auto _args_codeptr_v = _args;
_args_codeptr_v.emplace_back("codeptr_ra", codeptr_ra);
return _generate_key(_key, _args_codeptr_v);
};
tim::openmp::get_function_lookup_callback<
api_t>() = [](ompt_function_lookup_t,
const std::optional<tim::openmp::function_lookup_params>&
params) {
if(!params) return;
ROCPROFSYS_VERBOSE(3, "[ompt] configuring device %i...\n",
params->device_num);
auto& device_funcs =
tim::openmp::get_ompt_device_functions<api_t>().at(params->device_num);
device_funcs.set_trace_ompt(params->device, 1, ompt_callback_target_data_op);
device_funcs.set_trace_ompt(params->device, 1, ompt_callback_target_submit);
static ompt_callback_buffer_request_t request =
[](int device_num, ompt_buffer_t** buffer, size_t* bytes) {
ROCPROFSYS_VERBOSE(3, "[ompt] buffer request...\n");
*bytes = ::tim::units::get_page_size();
*buffer = mmap(nullptr, *bytes, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
(void) device_num;
};
static ompt_callback_buffer_complete_t complete = [](int device_num,
ompt_buffer_t* buffer,
size_t bytes,
ompt_buffer_cursor_t
begin,
int buffer_owned) {
ROCPROFSYS_VERBOSE(3, "[ompt] buffer complete...\n");
tim::consume_parameters(device_num, buffer, bytes, begin, buffer_owned);
auto _funcs =
tim::openmp::get_ompt_device_functions<api_t>().at(device_num);
auto _skew = rocprofsys::tracing::get_clock_skew(
[&_funcs]() { return _funcs.get_device_time(_funcs.device); });
ompt_buffer_cursor_t _cursor = begin;
size_t _nrecords = 0;
do
{
if(_cursor == 0) break;
++_nrecords;
auto* _record = _funcs.get_record_ompt(buffer, _cursor);
if(_record)
{
const char* _type = tim::openmp::get_enum_label(_record->type);
auto _thrd_id = _record->thread_id;
auto _targ_id = _record->target_id;
unsigned long beg_time = _record->time + _skew;
unsigned long end_time = 0;
ompt_id_t id = 0;
const char* _name = tim::openmp::get_enum_label(_record->type);
if(_record->type == ompt_callback_target_submit)
{
auto& _data = _record->record.target_kernel;
end_time = _data.end_time + _skew;
id = _data.host_op_id;
auto _ctx_info = tim::openmp::argument_array_t{
{ "begin_ns", beg_time },
{ "end_ns", end_time },
{ "type", _type },
{ "thread_id", _thrd_id },
{ "target_id", _targ_id },
{ "host_op_id", id },
{ "requested_num_teams", _data.requested_num_teams },
{ "granted_num_teams", _data.granted_num_teams }
};
component::ompt::record(
_name, id, beg_time, end_time, _thrd_id, _targ_id,
tim::openmp::context_info{ _name, nullptr, _ctx_info });
}
else if(_record->type == ompt_callback_target_data_op)
{
auto& _data = _record->record.target_data_op;
end_time = _data.end_time + _skew;
id = _data.host_op_id;
const auto* _opname =
tim::openmp::get_enum_label(_data.optype);
auto _ctx_info = tim::openmp::argument_array_t{
{ "begin_ns", beg_time },
{ "end_ns", end_time },
{ "type", _type },
{ "thread_id", _thrd_id },
{ "target_id", _targ_id },
{ "host_op_id", id },
{ "optype", _opname },
{ "src_addr", reinterpret_cast<void*>(_data.src_addr) },
{ "dst_addr", reinterpret_cast<void*>(_data.dest_addr) },
{ "src_device_num", _data.src_device_num },
{ "dst_device_num", _data.dest_device_num },
{ "bytes", _data.bytes },
};
component::ompt::record(
_opname, id, beg_time, end_time, _thrd_id, _targ_id,
tim::openmp::context_info{ _name, nullptr, _ctx_info });
}
ROCPROFSYS_VERBOSE(
3,
"type=%i, type_name=%s, start=%lu, end=%lu, delta=%lu, "
"tid=%lu, target_id=%lu, host_id=%lu\n",
_record->type, tim::openmp::get_enum_label(_record->type),
beg_time, end_time, (end_time - beg_time), _record->thread_id,
_record->target_id, id);
}
_funcs.advance_buffer_cursor(_funcs.device, buffer, bytes, _cursor,
&_cursor);
} while(_cursor != 0);
ROCPROFSYS_VERBOSE(3, "[ompt] number of records: %zu\n", _nrecords);
if(buffer_owned == 1)
{
::munmap(buffer, bytes);
}
};
device_funcs.start_trace(params->device, request, complete);
};
f_finalize = tim::ompt::configure<api_t>(lookup, initial_device_num, tool_data);
}
return 1; // success
}
@@ -143,18 +505,23 @@ tool_finalize(ompt_data_t*)
} // namespace ompt
} // namespace rocprofsys
extern "C" ompt_start_tool_result_t*
ompt_start_tool(unsigned int omp_version, const char* runtime_version)
extern "C"
{
ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n",
omp_version, runtime_version);
ROCPROFSYS_METADATA("OMP_VERSION", omp_version);
ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version);
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
const char*) ROCPROFSYS_PUBLIC_API;
static auto* data = new ompt_start_tool_result_t{ &rocprofsys::ompt::tool_initialize,
&rocprofsys::ompt::tool_finalize,
{ 0 } };
return data;
ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version,
const char* runtime_version)
{
ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n",
omp_version, runtime_version);
ROCPROFSYS_METADATA("OMP_VERSION", omp_version);
ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version);
static auto* data = new ompt_start_tool_result_t{
&rocprofsys::ompt::tool_initialize, &rocprofsys::ompt::tool_finalize, { 0 }
};
return data;
}
}
#else
@@ -163,55 +163,18 @@ get_clock_skew()
static auto _use = tim::get_env("ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW", true);
if(!_use) return 0;
static auto _v = []() {
namespace cpu = tim::cpu;
// synchronize timestamps
// We'll take a CPU timestamp before and after taking a GPU timestmp, then
// take the average of those two, hoping that it's roughly at the same time
// as the GPU timestamp.
static auto _cpu_now = []() {
cpu::fence();
return comp::wall_clock::record();
};
static auto _gpu_now = []() {
cpu::fence();
auto _gpu_now = []() {
uint64_t _ts = 0;
ROCPROFSYS_ROCTRACER_CALL(roctracer_get_timestamp(&_ts));
roctracer_get_timestamp(&_ts);
return _ts;
};
do
{
// warm up cache and allow for any static initialization
(void) _cpu_now();
(void) _gpu_now();
} while(false);
// discard (warm-up)
(void) tracing::get_clock_skew(_gpu_now, 1);
auto _compute = [](volatile uint64_t& _cpu_ts, volatile uint64_t& _gpu_ts) {
_cpu_ts = 0;
_gpu_ts = 0;
_cpu_ts += _cpu_now() / 2;
_gpu_ts += _gpu_now() / 1;
_cpu_ts += _cpu_now() / 2;
return static_cast<int64_t>(_cpu_ts) - static_cast<int64_t>(_gpu_ts);
};
constexpr int64_t _n = 10;
int64_t _cpu_ave = 0;
int64_t _gpu_ave = 0;
int64_t _diff = 0;
for(int64_t i = 0; i < _n; ++i)
{
volatile uint64_t _cpu_ts = 0;
volatile uint64_t _gpu_ts = 0;
_diff += _compute(_cpu_ts, _gpu_ts);
_cpu_ave += _cpu_ts / _n;
_gpu_ave += _gpu_ts / _n;
}
ROCPROFSYS_BASIC_VERBOSE(2, "CPU timestamp: %li\n", _cpu_ave);
ROCPROFSYS_BASIC_VERBOSE(2, "HIP timestamp: %li\n", _gpu_ave);
auto _diff = tracing::get_clock_skew(_gpu_now, 10);
ROCPROFSYS_BASIC_VERBOSE(1, "CPU/HIP timestamp skew: %li (used: %s)\n", _diff,
_use ? "yes" : "no");
_diff /= _n;
return _diff;
}();
return _v;
@@ -651,5 +651,41 @@ mark_perfetto_track(CategoryT, const char* name, ::perfetto::Track _track, uint6
TRACE_EVENT_INSTANT(trait::name<CategoryT>::value, ::perfetto::DynamicString{ name },
_track, _ts, std::forward<Args>(args)...);
}
template <typename FuncT>
int64_t
get_clock_skew(FuncT&& _timestamp_func, int64_t _n = 1)
{
namespace cpu = tim::cpu;
// synchronize timestamps
// We'll take a CPU timestamp before and after taking a GPU timestmp, then
// take the average of those two, hoping that it's roughly at the same time
// as the GPU timestamp.
auto _cpu_now = []() {
cpu::fence();
return now();
};
auto _gpu_now = [&_timestamp_func]() {
cpu::fence();
return std::forward<FuncT>(_timestamp_func)();
};
auto _compute = [&_cpu_now, &_gpu_now]() {
volatile uint64_t _cpu_ts = 0;
volatile uint64_t _gpu_ts = 0;
_cpu_ts += _cpu_now();
_gpu_ts += _gpu_now();
_cpu_ts += _cpu_now();
return static_cast<int64_t>(_cpu_ts / 2) - static_cast<int64_t>(_gpu_ts);
};
int64_t _diff = 0;
for(int64_t i = 0; i < _n; ++i)
{
_diff += _compute();
}
return (_diff / _n);
}
} // namespace tracing
} // namespace rocprofsys