OMPT Target Offload Support (#17)
- Porting from https://github.com/ROCm/omnitrace/pull/411
- Improve OMPT support
- Add OpenMP target example to testing
- Update Timemory submodule to use ROCm/Timemory rather than NERSC/Timemory
- Update `actions/upload-artifacts` to v4
- Standardize the `cmake_minimum_required` to 3.18.4 across workflows, project, and examples
- Updated Ubuntu 20.04 workflows
[ROCm/rocprofiler-systems commit: 9da7365471]
Dieser Commit ist enthalten in:
@@ -63,8 +63,9 @@ jobs:
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell
|
||||
fi
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
|
||||
|
||||
- name: Configure Env
|
||||
run:
|
||||
@@ -136,7 +137,7 @@ jobs:
|
||||
- name: CTest Artifacts
|
||||
if: failure()
|
||||
continue-on-error: True
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ctest-${{ github.job }}-${{ strategy.job-index }}-log
|
||||
path: |
|
||||
@@ -145,7 +146,7 @@ jobs:
|
||||
- name: Data Artifacts
|
||||
if: failure()
|
||||
continue-on-error: True
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: data-${{ github.job }}-${{ strategy.job-index }}-files
|
||||
path: |
|
||||
|
||||
@@ -67,8 +67,9 @@ jobs:
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell
|
||||
fi
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
|
||||
|
||||
- name: Install ROCm Packages
|
||||
if: ${{ matrix.rocm-version > 0 }}
|
||||
|
||||
@@ -46,7 +46,7 @@ jobs:
|
||||
compiler: ['g++-7', 'g++-8']
|
||||
lto: ['OFF']
|
||||
strip: ['OFF']
|
||||
python: ['ON']
|
||||
python: ['OFF']
|
||||
build-type: ['Release']
|
||||
mpi-headers: ['OFF']
|
||||
static-libgcc: ['OFF']
|
||||
@@ -97,8 +97,9 @@ jobs:
|
||||
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
|
||||
apt-get -y --purge autoremove &&
|
||||
apt-get -y clean &&
|
||||
/opt/conda/bin/conda clean -y -a
|
||||
@@ -278,8 +279,9 @@ jobs:
|
||||
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
|
||||
apt-get -y --purge autoremove &&
|
||||
apt-get -y clean &&
|
||||
/opt/conda/bin/conda clean -y -a
|
||||
@@ -408,16 +410,6 @@ jobs:
|
||||
ompt: ['ON']
|
||||
papi: ['ON']
|
||||
deps: ['libtbb-dev libboost-{atomic,system,thread,date-time,filesystem,timer}-dev']
|
||||
include:
|
||||
- compiler: 'g++'
|
||||
mpi: 'nompi'
|
||||
boost: 'ON'
|
||||
tbb: 'ON'
|
||||
build-type: 'Release'
|
||||
python: 'ON'
|
||||
ompt: 'OFF'
|
||||
papi: 'OFF'
|
||||
deps: ''
|
||||
|
||||
env:
|
||||
ELFUTILS_DOWNLOAD_VERSION: 0.186
|
||||
@@ -440,7 +432,8 @@ jobs:
|
||||
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
sudo apt-get -y --purge autoremove &&
|
||||
sudo apt-get -y clean
|
||||
|
||||
@@ -592,8 +585,9 @@ jobs:
|
||||
wget https://commondatastorage.googleapis.com/perfetto-luci-artifacts/v47.0/linux-amd64/trace_processor_shell -P /opt/trace_processor/bin &&
|
||||
chmod +x /opt/trace_processor/bin/trace_processor_shell &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done &&
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done &&
|
||||
apt-get -y --purge autoremove &&
|
||||
apt-get -y clean &&
|
||||
/opt/conda/bin/conda clean -y -a
|
||||
|
||||
@@ -111,8 +111,9 @@ jobs:
|
||||
apt-get upgrade -y &&
|
||||
apt-get install -y autoconf bison build-essential clang environment-modules gettext libfabric-dev libiberty-dev libomp-dev libopenmpi-dev libtool m4 openmpi-bin python3-pip texinfo ${{ matrix.compiler }} &&
|
||||
python3 -m pip install --upgrade pip &&
|
||||
python3 -m pip install numpy perfetto dataclasses &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
python3 -m pip install --upgrade numpy perfetto dataclasses &&
|
||||
python3 -m pip install 'cmake==3.18.4' &&
|
||||
for i in 6 7 8 9 10 11; do /opt/conda/envs/py3.${i}/bin/python -m pip install --upgrade numpy perfetto dataclasses; done
|
||||
|
||||
- name: Install ROCm Packages
|
||||
timeout-minutes: 25
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[submodule "external/timemory"]
|
||||
path = external/timemory
|
||||
url = https://github.com/NERSC/timemory.git
|
||||
url = https://github.com/ROCm/timemory.git
|
||||
[submodule "external/perfetto"]
|
||||
path = external/perfetto
|
||||
url = https://github.com/google/perfetto.git
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND CMAKE_CURRENT_SOURCE_DIR STREQUAL
|
||||
CMAKE_SOURCE_DIR)
|
||||
|
||||
@@ -132,7 +132,7 @@ export LD_LIBRARY_PATH=/opt/rocprofiler-systems/lib:${LD_LIBRARY_PATH}
|
||||
|
||||
Generate a rocprofiler-systems configuration file using `rocprof-sys-avail -G rocprof-sys.cfg`. Optionally, use `rocprof-sys-avail -G rocprof-sys.cfg --all` for
|
||||
a verbose configuration file with descriptions, categories, etc. Modify the configuration file as desired, e.g. enable
|
||||
[perfetto](https://perfetto.dev/), [timemory](https://github.com/NERSC/timemory), sampling, and process-level sampling by default
|
||||
[perfetto](https://perfetto.dev/), [timemory](https://github.com/ROCm/timemory), sampling, and process-level sampling by default
|
||||
and tweak some sampling default values:
|
||||
|
||||
```console
|
||||
|
||||
@@ -677,7 +677,7 @@ mark_as_advanced(TIMEMORY_PROJECT_NAME)
|
||||
rocprofiler_systems_checkout_git_submodule(
|
||||
RELATIVE_PATH external/timemory
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
REPO_URL https://github.com/NERSC/timemory.git
|
||||
REPO_URL https://github.com/ROCm/timemory.git
|
||||
REPO_BRANCH omnitrace)
|
||||
|
||||
rocprofiler_systems_save_variables(
|
||||
|
||||
@@ -7,7 +7,7 @@ The ROCm Systems Profiler feature set and use cases
|
||||
***************************************
|
||||
|
||||
`ROCm Systems Profiler <https://github.com/ROCm/rocprofiler-systems>`_ is designed to be highly extensible.
|
||||
Internally, it leverages the `Timemory performance analysis toolkit <https://github.com/NERSC/timemory>`_
|
||||
Internally, it leverages the `Timemory performance analysis toolkit <https://github.com/ROCm/timemory>`_
|
||||
to manage extensions, resources, data, and other items. It supports the following features,
|
||||
modes, metrics, and APIs.
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ use the ``rocprof-sys-avail -G ~/.rocprof-sys.cfg --all`` option
|
||||
for a verbose configuration file with descriptions, categories, and additional information.
|
||||
|
||||
Modify ``${HOME}/.rocprof-sys.cfg`` as required. For example, enable `Perfetto <https://perfetto.dev/>`_,
|
||||
`Timemory <https://github.com/NERSC/timemory>`_, sampling, and process-level sampling by default
|
||||
`Timemory <https://github.com/ROCm/timemory>`_, sampling, and process-level sampling by default
|
||||
and tweak the default sampling values.
|
||||
|
||||
.. code-block:: shell
|
||||
@@ -64,7 +64,7 @@ accepts a case insensitive match for nearly all common Boolean logic expressions
|
||||
Exploring components
|
||||
-----------------------------------
|
||||
|
||||
ROCm Systems Profiler uses `Timemory <https://github.com/NERSC/timemory>`_ extensively to provide
|
||||
ROCm Systems Profiler uses `Timemory <https://github.com/ROCm/timemory>`_ extensively to provide
|
||||
various capabilities and manage
|
||||
data and resources. By default, with ``ROCPROFSYS_PROFILE=ON``, ROCm Systems Profiler only collects wall-clock
|
||||
timing values. However, by modifying the ``ROCPROFSYS_TIMEMORY_COMPONENTS`` setting,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-examples LANGUAGES C CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-causal-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-code-coverage-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-fork LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-lulesh-example LANGUAGES C CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-mpi-examples LANGUAGES C CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-openmp LANGUAGES CXX)
|
||||
|
||||
@@ -56,3 +56,5 @@ if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
DESTINATION bin
|
||||
COMPONENT rocprofiler-systems-examples)
|
||||
endif()
|
||||
|
||||
add_subdirectory(target)
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
#
|
||||
#
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
# try to find a compatible HIP version
|
||||
if(ROCmVersion_MAJOR_VERSION AND ROCmVersion_MAJOR_VERSION GREATER_EQUAL 6)
|
||||
find_package(hip ${ROCmVersion_MAJOR_VERSION}.0.0)
|
||||
else()
|
||||
find_package(hip)
|
||||
endif()
|
||||
|
||||
if(NOT hip_FOUND)
|
||||
message(WARNING "ROCm >= 5.6 not found. Skipping OpenMP target example.")
|
||||
return()
|
||||
elseif(hip_FOUND AND hip_VERSION VERSION_LESS 5.6.0)
|
||||
message(
|
||||
WARNING
|
||||
"ROCm >= 5.6 not found (found ${hip_VERSION}). Skipping OpenMP target example."
|
||||
)
|
||||
return()
|
||||
endif()
|
||||
|
||||
if(NOT OMP_TARGET_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(OMP_TARGET_COMPILER
|
||||
"${amdclangpp_EXECUTABLE}"
|
||||
CACHE FILEPATH "OpenMP target compiler")
|
||||
else()
|
||||
message(WARNING "OpenMP target compiler not found. Skipping this example.")
|
||||
return()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-systems-example-openmp-target-lib LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_BUILD_TYPE "RelWithDebInfo")
|
||||
|
||||
set(DEFAULT_GPU_TARGETS
|
||||
"gfx900"
|
||||
"gfx906"
|
||||
"gfx908"
|
||||
"gfx90a"
|
||||
"gfx940"
|
||||
"gfx941"
|
||||
"gfx942"
|
||||
"gfx1030"
|
||||
"gfx1010"
|
||||
"gfx1100"
|
||||
"gfx1101"
|
||||
"gfx1102")
|
||||
|
||||
set(GPU_TARGETS
|
||||
"${DEFAULT_GPU_TARGETS}"
|
||||
CACHE STRING "GPU targets to compile for")
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
add_library(openmp-target-lib SHARED)
|
||||
target_sources(openmp-target-lib PRIVATE library.cpp)
|
||||
target_link_libraries(openmp-target-lib PUBLIC Threads::Threads)
|
||||
target_compile_options(openmp-target-lib PRIVATE -fopenmp -ggdb)
|
||||
target_link_options(openmp-target-lib PUBLIC -fopenmp)
|
||||
|
||||
foreach(_TARGET ${GPU_TARGETS})
|
||||
target_compile_options(openmp-target-lib PRIVATE --offload-arch=${_TARGET})
|
||||
target_link_options(openmp-target-lib PUBLIC --offload-arch=${_TARGET})
|
||||
endforeach()
|
||||
|
||||
message(STATUS "Using OpenMP target compiler: ${OMP_TARGET_COMPILER}")
|
||||
|
||||
get_filename_component(OMP_TARGET_COMPILER_DIR ${OMP_TARGET_COMPILER} PATH)
|
||||
get_filename_component(OMP_TARGET_COMPILER_DIR ${OMP_TARGET_COMPILER_DIR} PATH)
|
||||
|
||||
message(STATUS "Using OpemMP target compiler directory: ${OMP_TARGET_COMPILER_DIR}")
|
||||
|
||||
if(NOT EXISTS ${OMP_TARGET_COMPILER_DIR}/llvm/lib)
|
||||
message(FATAL_ERROR "${OMP_TARGET_COMPILER_DIR}/llvm/lib does not exist")
|
||||
endif()
|
||||
set_target_properties(
|
||||
openmp-target-lib
|
||||
PROPERTIES BUILD_RPATH
|
||||
"${OMP_TARGET_COMPILER_DIR}/llvm/lib:${OMP_TARGET_COMPILER_DIR}/lib"
|
||||
OUTPUT_NAME "openmp-target"
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
rocprofiler_systems_custom_compilation(TARGET openmp-target-lib COMPILER
|
||||
${OMP_TARGET_COMPILER})
|
||||
|
||||
add_executable(openmp-target)
|
||||
target_sources(openmp-target PRIVATE main.cpp)
|
||||
target_link_libraries(openmp-target PRIVATE openmp-target-lib)
|
||||
target_compile_options(openmp-target PRIVATE -ggdb)
|
||||
|
||||
set_target_properties(
|
||||
openmp-target
|
||||
PROPERTIES BUILD_RPATH
|
||||
"${OMP_TARGET_COMPILER_DIR}/llvm/lib:${OMP_TARGET_COMPILER_DIR}/lib"
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
rocprofiler_systems_custom_compilation(TARGET openmp-target COMPILER
|
||||
${OMP_TARGET_COMPILER})
|
||||
@@ -0,0 +1,149 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <sstream>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unistd.h>
|
||||
|
||||
constexpr float EPS_FLOAT = 1.0e-7f;
|
||||
constexpr double EPS_DOUBLE = 1.0e-15;
|
||||
|
||||
#pragma omp declare target
|
||||
template <typename T>
|
||||
T
|
||||
mul(T a, T b)
|
||||
{
|
||||
T c;
|
||||
c = a * b;
|
||||
return c;
|
||||
}
|
||||
#pragma omp end declare target
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
vmul(T* a, T* b, T* c, int N)
|
||||
{
|
||||
#pragma omp target map(to : a [0:N], b [0:N]) map(from : c [0:N])
|
||||
#pragma omp teams distribute parallel for
|
||||
for(int i = 0; i < N; i++)
|
||||
{
|
||||
c[i] = mul(a[i], b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
run_impl()
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds{ 50 });
|
||||
|
||||
constexpr int N = 100000;
|
||||
constexpr int Nc = N / 100;
|
||||
int a_i[N], b_i[N], c_i[N], validate_i[N];
|
||||
float a_f[N], b_f[N], c_f[N], validate_f[N];
|
||||
double a_d[N], b_d[N], c_d[N], validate_d[N];
|
||||
int N_errors = 0;
|
||||
bool flag = false;
|
||||
|
||||
#pragma omp parallel for schedule(dynamic, Nc)
|
||||
for(int i = 0; i < N; ++i)
|
||||
{
|
||||
a_f[i] = a_i[i] = i + 1;
|
||||
b_f[i] = b_i[i] = i + 2;
|
||||
a_d[i] = a_i[i];
|
||||
b_d[i] = b_i[i];
|
||||
validate_i[i] = a_i[i] * b_i[i];
|
||||
validate_f[i] = a_f[i] * b_f[i];
|
||||
validate_d[i] = a_d[i] * b_d[i];
|
||||
}
|
||||
|
||||
for(int i = 0; i < 2; ++i)
|
||||
{
|
||||
vmul(a_i, b_i, c_i, N);
|
||||
vmul(a_f, b_f, c_f, N);
|
||||
vmul(a_d, b_d, c_d, N);
|
||||
}
|
||||
|
||||
for(int i = 0; i < N; i++)
|
||||
{
|
||||
if(c_i[i] != validate_i[i])
|
||||
{
|
||||
++N_errors;
|
||||
// print 1st bad index
|
||||
if(!flag)
|
||||
{
|
||||
printf("First fail: c_i[%d](%d) != validate_i[%d](%d)\n", i, c_i[i], i,
|
||||
validate_i[i]);
|
||||
flag = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
flag = false;
|
||||
for(int i = 0; i < N; i++)
|
||||
{
|
||||
if(fabs(c_f[i] - validate_f[i]) > EPS_FLOAT)
|
||||
{
|
||||
++N_errors;
|
||||
// print 1st bad index
|
||||
if(!flag)
|
||||
{
|
||||
printf("First fail: c_f[%d](%f) != validate_f[%d](%f)\n", i,
|
||||
static_cast<double>(c_f[i]), i,
|
||||
static_cast<double>(validate_f[i]));
|
||||
flag = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
flag = false;
|
||||
for(int i = 0; i < N; i++)
|
||||
{
|
||||
if(fabs(c_d[i] - validate_d[i]) > EPS_DOUBLE)
|
||||
{
|
||||
++N_errors;
|
||||
// print 1st bad index
|
||||
if(!flag)
|
||||
{
|
||||
printf("First fail: c_d[%d](%f) != validate_d[%d](%f)\n", i, c_d[i], i,
|
||||
validate_d[i]);
|
||||
flag = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return N_errors;
|
||||
}
|
||||
|
||||
int
|
||||
run()
|
||||
{
|
||||
#pragma omp parallel
|
||||
{
|
||||
run_impl();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <sstream>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
|
||||
extern int
|
||||
run();
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
auto N_errors = run();
|
||||
auto _ec = EXIT_SUCCESS;
|
||||
if(N_errors == 0)
|
||||
{
|
||||
printf("Success\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Total %d failures\n", N_errors);
|
||||
printf("Fail\n");
|
||||
_ec = EXIT_FAILURE;
|
||||
}
|
||||
|
||||
return _ec;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-python)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-rccl-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-transpose-example LANGUAGES CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-user-api-example LANGUAGES CXX)
|
||||
|
||||
|
||||
+1
-1
Submodul projects/rocprofiler-systems/external/timemory aktualisiert: 9ce43f3293...68ce420086
@@ -207,7 +207,7 @@ EOF
|
||||
verbose-run cd ${BINARY_DIR}
|
||||
|
||||
cat << EOF > dashboard.cmake
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4 FATAL_ERROR)
|
||||
|
||||
include("\${CMAKE_CURRENT_LIST_DIR}/CTestCustom.cmake")
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ verbose-run cp -v -r ${EXAMPLE_DIR}/${EXAMPLE_NAME}/* ${SOURCE_DIR}/
|
||||
verbose-run pushd ${SOURCE_DIR}
|
||||
|
||||
cat << EOF > CMakeLists.txt
|
||||
cmake_minimum_required(VERSION 3.16.0 FATAL_ERROR)
|
||||
cmake_minimum_required(VERSION 3.18.4.0 FATAL_ERROR)
|
||||
|
||||
project(test LANGUAGES C CXX)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.8)
|
||||
cmake_minimum_required(VERSION 3.18.4)
|
||||
|
||||
if(NOT DEFINED ROCPROFSYS_VERSION)
|
||||
file(READ "${CMAKE_CURRENT_LIST_DIR}/../VERSION" FULL_VERSION_STRING LIMIT_COUNT 1)
|
||||
|
||||
@@ -30,27 +30,171 @@
|
||||
|
||||
#if defined(ROCPROFSYS_USE_OMPT) && ROCPROFSYS_USE_OMPT > 0
|
||||
|
||||
# include "binary/link_map.hpp"
|
||||
# include "core/components/fwd.hpp"
|
||||
# include "library/components/category_region.hpp"
|
||||
# include "library/tracing.hpp"
|
||||
|
||||
# include <timemory/components/ompt.hpp>
|
||||
# include <timemory/components/ompt/backends.hpp>
|
||||
# include <timemory/components/ompt/context.hpp>
|
||||
# include <timemory/components/ompt/context_handler.hpp>
|
||||
# include <timemory/components/ompt/extern.hpp>
|
||||
# include <timemory/components/ompt/tool.hpp>
|
||||
# include <timemory/mpl/type_traits.hpp>
|
||||
# include <timemory/timemory.hpp>
|
||||
# include <timemory/units.hpp>
|
||||
# include <timemory/unwind/addr2line.hpp>
|
||||
# include <timemory/utility/demangle.hpp>
|
||||
# include <timemory/utility/join.hpp>
|
||||
# include <timemory/utility/types.hpp>
|
||||
|
||||
# include <dlfcn.h>
|
||||
# include <memory>
|
||||
# include <sys/mman.h>
|
||||
# include <sys/types.h>
|
||||
|
||||
using api_t = TIMEMORY_API;
|
||||
using ompt_handle_t = tim::component::ompt_handle<api_t>;
|
||||
using ompt_context_t = tim::openmp::context_handler<api_t>;
|
||||
using ompt_toolset_t = typename ompt_handle_t::toolset_type;
|
||||
using ompt_bundle_t = tim::component_tuple<ompt_handle_t>;
|
||||
using api_t = tim::project::rocprofsys;
|
||||
|
||||
extern "C"
|
||||
namespace rocprofsys
|
||||
{
|
||||
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
|
||||
const char*) ROCPROFSYS_PUBLIC_API;
|
||||
}
|
||||
namespace component
|
||||
{
|
||||
struct ompt : comp::base<ompt, void>
|
||||
{
|
||||
using value_type = void;
|
||||
using base_type = comp::base<ompt, void>;
|
||||
using context_info_t = tim::openmp::context_info;
|
||||
|
||||
static std::string label() { return "ompt"; }
|
||||
static std::string description() { return "OpenMP tools tracing"; }
|
||||
|
||||
ompt() = default;
|
||||
~ompt() = default;
|
||||
ompt(const ompt&) = default;
|
||||
ompt(ompt&&) noexcept = default;
|
||||
|
||||
ompt& operator=(const ompt&) = default;
|
||||
ompt& operator=(ompt&&) noexcept = default;
|
||||
|
||||
template <typename... Args>
|
||||
void start(const context_info_t& _ctx_info, Args&&...) const
|
||||
{
|
||||
category_region<category::ompt>::start<tim::quirk::timemory>(m_prefix);
|
||||
|
||||
auto _ts = tracing::now();
|
||||
uint64_t _cid =
|
||||
(_ctx_info.target_arguments) ? _ctx_info.target_arguments->host_op_id : 0;
|
||||
auto _annotate = [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "begin_ns", _ts);
|
||||
for(const auto& itr : _ctx_info.arguments)
|
||||
tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
|
||||
}
|
||||
};
|
||||
|
||||
if(_cid > 0)
|
||||
{
|
||||
category_region<category::ompt>::start<tim::quirk::perfetto>(
|
||||
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
|
||||
::perfetto::Flow::ProcessScoped(_cid), std::move(_annotate));
|
||||
}
|
||||
else
|
||||
{
|
||||
category_region<category::ompt>::start<tim::quirk::perfetto>(
|
||||
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
|
||||
std::move(_annotate));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void stop(const context_info_t& _ctx_info, Args&&...) const
|
||||
{
|
||||
category_region<category::ompt>::stop<tim::quirk::timemory>(m_prefix);
|
||||
|
||||
auto _ts = tracing::now();
|
||||
uint64_t _cid =
|
||||
(_ctx_info.target_arguments) ? _ctx_info.target_arguments->host_op_id : 0;
|
||||
auto _annotate = [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "end_ns", _ts);
|
||||
for(const auto& itr : _ctx_info.arguments)
|
||||
tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
|
||||
}
|
||||
};
|
||||
|
||||
if(_cid > 0)
|
||||
{
|
||||
category_region<category::ompt>::stop<tim::quirk::perfetto>(
|
||||
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
|
||||
std::move(_annotate));
|
||||
}
|
||||
else
|
||||
{
|
||||
category_region<category::ompt>::stop<tim::quirk::perfetto>(
|
||||
(_ctx_info.func.empty()) ? m_prefix : _ctx_info.func, _ts,
|
||||
std::move(_annotate));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void store(const context_info_t& _ctx_info, Args&&... _args) const
|
||||
{
|
||||
start(_ctx_info, std::forward<Args>(_args)...);
|
||||
stop(_ctx_info, std::forward<Args>(_args)...);
|
||||
}
|
||||
|
||||
static void record(std::string_view name, ompt_id_t id, uint64_t beg_time,
|
||||
uint64_t end_time, uint64_t thrd_id, uint64_t targ_id,
|
||||
const context_info_t& common)
|
||||
{
|
||||
(void) thrd_id;
|
||||
(void) targ_id;
|
||||
|
||||
auto _annotate = [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
for(const auto& itr : common.arguments)
|
||||
tracing::add_perfetto_annotation(ctx, itr.label, itr.value);
|
||||
}
|
||||
};
|
||||
|
||||
auto _track = tracing::get_perfetto_track(
|
||||
category::ompt{},
|
||||
[](uint64_t _targ_id_v) {
|
||||
return ::timemory::join::join("", "OMP Target ", _targ_id_v);
|
||||
},
|
||||
targ_id);
|
||||
|
||||
category_region<category::ompt>::start<tim::quirk::perfetto>(
|
||||
name, _track, beg_time, ::perfetto::Flow::ProcessScoped(id),
|
||||
std::move(_annotate));
|
||||
|
||||
category_region<category::ompt>::stop<tim::quirk::perfetto>(name, _track,
|
||||
end_time);
|
||||
}
|
||||
|
||||
void set_prefix(std::string_view _v) { m_prefix = _v; }
|
||||
|
||||
private:
|
||||
std::string_view m_prefix = {};
|
||||
};
|
||||
} // namespace component
|
||||
} // namespace rocprofsys
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace trait
|
||||
{
|
||||
template <>
|
||||
struct ompt_handle<api_t>
|
||||
{
|
||||
using type = component_tuple<::rocprofsys::component::ompt>;
|
||||
};
|
||||
} // namespace trait
|
||||
} // namespace tim
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
@@ -58,6 +202,11 @@ namespace ompt
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using ompt_handle_t = tim::component::ompt_handle<api_t>;
|
||||
using ompt_context_t = tim::openmp::context_handler<api_t>;
|
||||
using ompt_toolset_t = typename ompt_handle_t::toolset_type;
|
||||
using ompt_bundle_t = tim::component_tuple<ompt_handle_t>;
|
||||
|
||||
std::unique_ptr<ompt_bundle_t> f_bundle = {};
|
||||
bool _init_toolset_off = (trait::runtime_enabled<ompt_toolset_t>::set(false),
|
||||
trait::runtime_enabled<ompt_context_t>::set(false), true);
|
||||
@@ -70,10 +219,7 @@ setup()
|
||||
if(!tim::settings::enabled()) return;
|
||||
trait::runtime_enabled<ompt_toolset_t>::set(true);
|
||||
trait::runtime_enabled<ompt_context_t>::set(true);
|
||||
comp::user_ompt_bundle::global_init();
|
||||
comp::user_ompt_bundle::reset();
|
||||
tim::auto_lock_t lk{ tim::type_mutex<ompt_handle_t>() };
|
||||
comp::user_ompt_bundle::configure<component::local_category_region<category::ompt>>();
|
||||
f_bundle = std::make_unique<ompt_bundle_t>("rocprofsys/ompt",
|
||||
quirk::config<quirk::auto_start>{});
|
||||
}
|
||||
@@ -91,10 +237,15 @@ shutdown()
|
||||
ompt_context_t::cleanup();
|
||||
trait::runtime_enabled<ompt_toolset_t>::set(false);
|
||||
trait::runtime_enabled<ompt_context_t>::set(false);
|
||||
comp::user_ompt_bundle::reset();
|
||||
pthread_gotcha::shutdown();
|
||||
// call the OMPT finalize callback
|
||||
if(f_finalize) (*f_finalize)();
|
||||
if(f_finalize)
|
||||
{
|
||||
for(const auto& itr : tim::openmp::get_ompt_device_functions<api_t>())
|
||||
if(itr.second.stop_trace) itr.second.stop_trace(itr.second.device);
|
||||
(*f_finalize)();
|
||||
f_finalize = nullptr;
|
||||
}
|
||||
}
|
||||
f_bundle.reset();
|
||||
_protect = false;
|
||||
@@ -115,21 +266,232 @@ tool_initialize(ompt_function_lookup_t lookup, int initial_device_num,
|
||||
{
|
||||
if(!rocprofsys::settings_are_configured())
|
||||
{
|
||||
ROCPROFSYS_BASIC_WARNING(
|
||||
ROCPROFSYS_BASIC_WARNING_F(
|
||||
0,
|
||||
"[%s] invoked before rocprof-sys was initialized. In instrumentation mode, "
|
||||
"settings exported to the environment have not been propagated yet...\n",
|
||||
__FUNCTION__);
|
||||
rocprofsys::configure_settings();
|
||||
use_tool() = get_env("ROCPROFSYS_USE_OMPT", true, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
use_tool() = rocprofsys::config::get_use_ompt();
|
||||
}
|
||||
|
||||
use_tool() = rocprofsys::config::get_use_ompt();
|
||||
if(use_tool())
|
||||
{
|
||||
TIMEMORY_PRINTF(stderr, "OpenMP-tools configuring for initial device %i\n\n",
|
||||
initial_device_num);
|
||||
f_finalize = tim::ompt::configure<TIMEMORY_OMPT_API_TAG>(
|
||||
lookup, initial_device_num, tool_data);
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(
|
||||
2, "OpenMP-tools configuring for initial device %i\n\n", initial_device_num);
|
||||
|
||||
static auto _generate_key = [](std::string_view _key_v,
|
||||
const ::tim::openmp::argument_array_t& _args_v) {
|
||||
return std::string{ _key_v };
|
||||
(void) _args_v;
|
||||
};
|
||||
|
||||
tim::openmp::get_codeptr_ra_resolver<api_t>() =
|
||||
[](tim::openmp::context_info& _ctx_info) {
|
||||
const auto& _key = _ctx_info.label;
|
||||
const auto* codeptr_ra = _ctx_info.codeptr_ra;
|
||||
auto& _args = _ctx_info.arguments;
|
||||
|
||||
ROCPROFSYS_BASIC_VERBOSE(2, "resolving codeptr return address for %s\n",
|
||||
_key.data());
|
||||
|
||||
if(!codeptr_ra) return _generate_key(_key, _args);
|
||||
|
||||
static thread_local auto _once = std::once_flag{};
|
||||
std::call_once(_once, []() { ::tim::unwind::update_file_maps(); });
|
||||
|
||||
auto _info = ::rocprofsys::binary::lookup_ipaddr_entry<false>(
|
||||
reinterpret_cast<uintptr_t>(codeptr_ra));
|
||||
|
||||
if(_info)
|
||||
{
|
||||
_ctx_info.func = tim::demangle(_info->name);
|
||||
if(_info->lineno > 0)
|
||||
{
|
||||
auto _linfo = _info->lineinfo.rget([](const auto& _v) -> bool {
|
||||
return (_v && !_v.location.empty() && _v.line > 0);
|
||||
});
|
||||
|
||||
if(_linfo)
|
||||
{
|
||||
_ctx_info.file = _linfo.location;
|
||||
_ctx_info.line = _linfo.line;
|
||||
_args.emplace_back("file", _ctx_info.file);
|
||||
_args.emplace_back("lineinfo",
|
||||
::timemory::join::join("@", _ctx_info.file,
|
||||
_ctx_info.line));
|
||||
}
|
||||
else
|
||||
{
|
||||
_ctx_info.file = _info->location;
|
||||
_args.emplace_back("file", _ctx_info.file);
|
||||
}
|
||||
|
||||
return _generate_key(
|
||||
::timemory::join::join(" @ ", _key, _ctx_info.func), _args);
|
||||
}
|
||||
else
|
||||
{
|
||||
return _generate_key(
|
||||
::timemory::join::join(" @ ", _key, _ctx_info.func), _args);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
auto _dl_info = Dl_info{ nullptr, nullptr, nullptr, nullptr };
|
||||
if(dladdr(codeptr_ra, &_dl_info) != 0)
|
||||
{
|
||||
_ctx_info.file = _dl_info.dli_fname;
|
||||
_ctx_info.func = tim::demangle(_dl_info.dli_sname);
|
||||
_args.emplace_back("file", _ctx_info.file);
|
||||
return _generate_key(
|
||||
::timemory::join::join(
|
||||
" @ ", _key,
|
||||
::timemory::join::join("", _ctx_info.func, " [",
|
||||
_ctx_info.file, "]")),
|
||||
_args);
|
||||
}
|
||||
}
|
||||
|
||||
// since no line info could be deduced, include the codeptr return address
|
||||
auto _args_codeptr_v = _args;
|
||||
_args_codeptr_v.emplace_back("codeptr_ra", codeptr_ra);
|
||||
return _generate_key(_key, _args_codeptr_v);
|
||||
};
|
||||
|
||||
tim::openmp::get_function_lookup_callback<
|
||||
api_t>() = [](ompt_function_lookup_t,
|
||||
const std::optional<tim::openmp::function_lookup_params>&
|
||||
params) {
|
||||
if(!params) return;
|
||||
|
||||
ROCPROFSYS_VERBOSE(3, "[ompt] configuring device %i...\n",
|
||||
params->device_num);
|
||||
|
||||
auto& device_funcs =
|
||||
tim::openmp::get_ompt_device_functions<api_t>().at(params->device_num);
|
||||
|
||||
device_funcs.set_trace_ompt(params->device, 1, ompt_callback_target_data_op);
|
||||
device_funcs.set_trace_ompt(params->device, 1, ompt_callback_target_submit);
|
||||
|
||||
static ompt_callback_buffer_request_t request =
|
||||
[](int device_num, ompt_buffer_t** buffer, size_t* bytes) {
|
||||
ROCPROFSYS_VERBOSE(3, "[ompt] buffer request...\n");
|
||||
*bytes = ::tim::units::get_page_size();
|
||||
*buffer = mmap(nullptr, *bytes, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||
(void) device_num;
|
||||
};
|
||||
|
||||
static ompt_callback_buffer_complete_t complete = [](int device_num,
|
||||
ompt_buffer_t* buffer,
|
||||
size_t bytes,
|
||||
ompt_buffer_cursor_t
|
||||
begin,
|
||||
int buffer_owned) {
|
||||
ROCPROFSYS_VERBOSE(3, "[ompt] buffer complete...\n");
|
||||
tim::consume_parameters(device_num, buffer, bytes, begin, buffer_owned);
|
||||
|
||||
auto _funcs =
|
||||
tim::openmp::get_ompt_device_functions<api_t>().at(device_num);
|
||||
auto _skew = rocprofsys::tracing::get_clock_skew(
|
||||
[&_funcs]() { return _funcs.get_device_time(_funcs.device); });
|
||||
|
||||
ompt_buffer_cursor_t _cursor = begin;
|
||||
size_t _nrecords = 0;
|
||||
do
|
||||
{
|
||||
if(_cursor == 0) break;
|
||||
++_nrecords;
|
||||
auto* _record = _funcs.get_record_ompt(buffer, _cursor);
|
||||
if(_record)
|
||||
{
|
||||
const char* _type = tim::openmp::get_enum_label(_record->type);
|
||||
auto _thrd_id = _record->thread_id;
|
||||
auto _targ_id = _record->target_id;
|
||||
|
||||
unsigned long beg_time = _record->time + _skew;
|
||||
unsigned long end_time = 0;
|
||||
ompt_id_t id = 0;
|
||||
const char* _name = tim::openmp::get_enum_label(_record->type);
|
||||
|
||||
if(_record->type == ompt_callback_target_submit)
|
||||
{
|
||||
auto& _data = _record->record.target_kernel;
|
||||
end_time = _data.end_time + _skew;
|
||||
id = _data.host_op_id;
|
||||
|
||||
auto _ctx_info = tim::openmp::argument_array_t{
|
||||
{ "begin_ns", beg_time },
|
||||
{ "end_ns", end_time },
|
||||
{ "type", _type },
|
||||
{ "thread_id", _thrd_id },
|
||||
{ "target_id", _targ_id },
|
||||
{ "host_op_id", id },
|
||||
{ "requested_num_teams", _data.requested_num_teams },
|
||||
{ "granted_num_teams", _data.granted_num_teams }
|
||||
};
|
||||
|
||||
component::ompt::record(
|
||||
_name, id, beg_time, end_time, _thrd_id, _targ_id,
|
||||
tim::openmp::context_info{ _name, nullptr, _ctx_info });
|
||||
}
|
||||
else if(_record->type == ompt_callback_target_data_op)
|
||||
{
|
||||
auto& _data = _record->record.target_data_op;
|
||||
end_time = _data.end_time + _skew;
|
||||
id = _data.host_op_id;
|
||||
const auto* _opname =
|
||||
tim::openmp::get_enum_label(_data.optype);
|
||||
|
||||
auto _ctx_info = tim::openmp::argument_array_t{
|
||||
{ "begin_ns", beg_time },
|
||||
{ "end_ns", end_time },
|
||||
{ "type", _type },
|
||||
{ "thread_id", _thrd_id },
|
||||
{ "target_id", _targ_id },
|
||||
{ "host_op_id", id },
|
||||
{ "optype", _opname },
|
||||
{ "src_addr", reinterpret_cast<void*>(_data.src_addr) },
|
||||
{ "dst_addr", reinterpret_cast<void*>(_data.dest_addr) },
|
||||
{ "src_device_num", _data.src_device_num },
|
||||
{ "dst_device_num", _data.dest_device_num },
|
||||
{ "bytes", _data.bytes },
|
||||
};
|
||||
|
||||
component::ompt::record(
|
||||
_opname, id, beg_time, end_time, _thrd_id, _targ_id,
|
||||
tim::openmp::context_info{ _name, nullptr, _ctx_info });
|
||||
}
|
||||
|
||||
ROCPROFSYS_VERBOSE(
|
||||
3,
|
||||
"type=%i, type_name=%s, start=%lu, end=%lu, delta=%lu, "
|
||||
"tid=%lu, target_id=%lu, host_id=%lu\n",
|
||||
_record->type, tim::openmp::get_enum_label(_record->type),
|
||||
beg_time, end_time, (end_time - beg_time), _record->thread_id,
|
||||
_record->target_id, id);
|
||||
}
|
||||
|
||||
_funcs.advance_buffer_cursor(_funcs.device, buffer, bytes, _cursor,
|
||||
&_cursor);
|
||||
} while(_cursor != 0);
|
||||
|
||||
ROCPROFSYS_VERBOSE(3, "[ompt] number of records: %zu\n", _nrecords);
|
||||
|
||||
if(buffer_owned == 1)
|
||||
{
|
||||
::munmap(buffer, bytes);
|
||||
}
|
||||
};
|
||||
|
||||
device_funcs.start_trace(params->device, request, complete);
|
||||
};
|
||||
|
||||
f_finalize = tim::ompt::configure<api_t>(lookup, initial_device_num, tool_data);
|
||||
}
|
||||
return 1; // success
|
||||
}
|
||||
@@ -143,18 +505,23 @@ tool_finalize(ompt_data_t*)
|
||||
} // namespace ompt
|
||||
} // namespace rocprofsys
|
||||
|
||||
extern "C" ompt_start_tool_result_t*
|
||||
ompt_start_tool(unsigned int omp_version, const char* runtime_version)
|
||||
extern "C"
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n",
|
||||
omp_version, runtime_version);
|
||||
ROCPROFSYS_METADATA("OMP_VERSION", omp_version);
|
||||
ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version);
|
||||
ompt_start_tool_result_t* ompt_start_tool(unsigned int,
|
||||
const char*) ROCPROFSYS_PUBLIC_API;
|
||||
|
||||
static auto* data = new ompt_start_tool_result_t{ &rocprofsys::ompt::tool_initialize,
|
||||
&rocprofsys::ompt::tool_finalize,
|
||||
{ 0 } };
|
||||
return data;
|
||||
ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version,
|
||||
const char* runtime_version)
|
||||
{
|
||||
ROCPROFSYS_BASIC_VERBOSE_F(0, "OpenMP version: %u, runtime version: %s\n",
|
||||
omp_version, runtime_version);
|
||||
ROCPROFSYS_METADATA("OMP_VERSION", omp_version);
|
||||
ROCPROFSYS_METADATA("OMP_RUNTIME_VERSION", runtime_version);
|
||||
static auto* data = new ompt_start_tool_result_t{
|
||||
&rocprofsys::ompt::tool_initialize, &rocprofsys::ompt::tool_finalize, { 0 }
|
||||
};
|
||||
return data;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@@ -163,55 +163,18 @@ get_clock_skew()
|
||||
static auto _use = tim::get_env("ROCPROFSYS_USE_ROCTRACER_CLOCK_SKEW", true);
|
||||
if(!_use) return 0;
|
||||
static auto _v = []() {
|
||||
namespace cpu = tim::cpu;
|
||||
// synchronize timestamps
|
||||
// We'll take a CPU timestamp before and after taking a GPU timestmp, then
|
||||
// take the average of those two, hoping that it's roughly at the same time
|
||||
// as the GPU timestamp.
|
||||
static auto _cpu_now = []() {
|
||||
cpu::fence();
|
||||
return comp::wall_clock::record();
|
||||
};
|
||||
|
||||
static auto _gpu_now = []() {
|
||||
cpu::fence();
|
||||
auto _gpu_now = []() {
|
||||
uint64_t _ts = 0;
|
||||
ROCPROFSYS_ROCTRACER_CALL(roctracer_get_timestamp(&_ts));
|
||||
roctracer_get_timestamp(&_ts);
|
||||
return _ts;
|
||||
};
|
||||
|
||||
do
|
||||
{
|
||||
// warm up cache and allow for any static initialization
|
||||
(void) _cpu_now();
|
||||
(void) _gpu_now();
|
||||
} while(false);
|
||||
// discard (warm-up)
|
||||
(void) tracing::get_clock_skew(_gpu_now, 1);
|
||||
|
||||
auto _compute = [](volatile uint64_t& _cpu_ts, volatile uint64_t& _gpu_ts) {
|
||||
_cpu_ts = 0;
|
||||
_gpu_ts = 0;
|
||||
_cpu_ts += _cpu_now() / 2;
|
||||
_gpu_ts += _gpu_now() / 1;
|
||||
_cpu_ts += _cpu_now() / 2;
|
||||
return static_cast<int64_t>(_cpu_ts) - static_cast<int64_t>(_gpu_ts);
|
||||
};
|
||||
constexpr int64_t _n = 10;
|
||||
int64_t _cpu_ave = 0;
|
||||
int64_t _gpu_ave = 0;
|
||||
int64_t _diff = 0;
|
||||
for(int64_t i = 0; i < _n; ++i)
|
||||
{
|
||||
volatile uint64_t _cpu_ts = 0;
|
||||
volatile uint64_t _gpu_ts = 0;
|
||||
_diff += _compute(_cpu_ts, _gpu_ts);
|
||||
_cpu_ave += _cpu_ts / _n;
|
||||
_gpu_ave += _gpu_ts / _n;
|
||||
}
|
||||
ROCPROFSYS_BASIC_VERBOSE(2, "CPU timestamp: %li\n", _cpu_ave);
|
||||
ROCPROFSYS_BASIC_VERBOSE(2, "HIP timestamp: %li\n", _gpu_ave);
|
||||
auto _diff = tracing::get_clock_skew(_gpu_now, 10);
|
||||
ROCPROFSYS_BASIC_VERBOSE(1, "CPU/HIP timestamp skew: %li (used: %s)\n", _diff,
|
||||
_use ? "yes" : "no");
|
||||
_diff /= _n;
|
||||
return _diff;
|
||||
}();
|
||||
return _v;
|
||||
|
||||
@@ -651,5 +651,41 @@ mark_perfetto_track(CategoryT, const char* name, ::perfetto::Track _track, uint6
|
||||
TRACE_EVENT_INSTANT(trait::name<CategoryT>::value, ::perfetto::DynamicString{ name },
|
||||
_track, _ts, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename FuncT>
|
||||
int64_t
|
||||
get_clock_skew(FuncT&& _timestamp_func, int64_t _n = 1)
|
||||
{
|
||||
namespace cpu = tim::cpu;
|
||||
// synchronize timestamps
|
||||
// We'll take a CPU timestamp before and after taking a GPU timestmp, then
|
||||
// take the average of those two, hoping that it's roughly at the same time
|
||||
// as the GPU timestamp.
|
||||
auto _cpu_now = []() {
|
||||
cpu::fence();
|
||||
return now();
|
||||
};
|
||||
|
||||
auto _gpu_now = [&_timestamp_func]() {
|
||||
cpu::fence();
|
||||
return std::forward<FuncT>(_timestamp_func)();
|
||||
};
|
||||
|
||||
auto _compute = [&_cpu_now, &_gpu_now]() {
|
||||
volatile uint64_t _cpu_ts = 0;
|
||||
volatile uint64_t _gpu_ts = 0;
|
||||
_cpu_ts += _cpu_now();
|
||||
_gpu_ts += _gpu_now();
|
||||
_cpu_ts += _cpu_now();
|
||||
return static_cast<int64_t>(_cpu_ts / 2) - static_cast<int64_t>(_gpu_ts);
|
||||
};
|
||||
|
||||
int64_t _diff = 0;
|
||||
for(int64_t i = 0; i < _n; ++i)
|
||||
{
|
||||
_diff += _compute();
|
||||
}
|
||||
return (_diff / _n);
|
||||
}
|
||||
} // namespace tracing
|
||||
} // namespace rocprofsys
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren