* Delete core critical-trace files

* Update docs and README

* Update workflows

* Update testing

* Update cmake

* Remove critical trace usage in source code

* Update source/docs/critical_trace.md

- fix spelling

* Formatting

* Update bin/omnitrace-avail/avail.cpp

- statically allocate shared pointers for timemory manager and hash id/aliases to prevent use-after-free errors
Этот коммит содержится в:
Jonathan R. Madsen
2024-04-23 09:35:44 -05:00
коммит произвёл GitHub
родитель b81db80926
Коммит 9499e2f521
45 изменённых файлов: 55 добавлений и 2954 удалений
-2
Просмотреть файл
@@ -107,8 +107,6 @@ jobs:
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
ldd $(which omnitrace)
omnitrace-instrument --help
+1 -1
Просмотреть файл
@@ -125,7 +125,7 @@ jobs:
run: |
set -v
source /opt/omnitrace/share/omnitrace/setup-env.sh
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,rewrite,runtime,critical-trace,python}=1
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,rewrite,runtime,python}=1
- name: Test User API
timeout-minutes: 10
-2
Просмотреть файл
@@ -138,8 +138,6 @@ jobs:
ldd $(which omnitrace-avail)
omnitrace-avail --help
omnitrace-avail -a
which omnitrace-critical-trace
ldd $(which omnitrace-critical-trace)
which omnitrace
ldd $(which omnitrace)
omnitrace-instrument --help
+4 -4
Просмотреть файл
@@ -196,7 +196,7 @@ jobs:
module use /opt/omnitrace/share/modulefiles
module avail
module load omnitrace
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,rewrite,runtime,critical-trace}=1 --test-omnitrace-python=${{ matrix.python }}
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,rewrite,runtime}=1 --test-omnitrace-python=${{ matrix.python }}
- name: Test User API
timeout-minutes: 10
@@ -362,7 +362,7 @@ jobs:
shell: bash
run: |
source /opt/omnitrace/share/omnitrace/setup-env.sh
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime,critical-trace}=1
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime}=1
- name: Test User API
timeout-minutes: 10
@@ -525,7 +525,7 @@ jobs:
run: |
set -v
source /opt/omnitrace/share/omnitrace/setup-env.sh
${{ github.workspace }}/scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime,critical-trace}=1
${{ github.workspace }}/scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime}=1
- name: Test Install with Modulefile
timeout-minutes: 15
@@ -534,7 +534,7 @@ jobs:
source /usr/share/modules/init/$(basename ${SHELL})
module use /opt/omnitrace/share/modulefiles
module load omnitrace
${{ github.workspace }}/scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime,critical-trace}=1
${{ github.workspace }}/scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime}=1
- name: Test User API
timeout-minutes: 10
+1 -1
Просмотреть файл
@@ -216,7 +216,7 @@ jobs:
module use /opt/omnitrace/share/modulefiles
module avail
module load omnitrace
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime,critical-trace}=1
./scripts/test-install.sh --test-omnitrace-{instrument,avail,sample,python,rewrite,runtime}=1
- name: Test User API
timeout-minutes: 10
-2
Просмотреть файл
@@ -35,7 +35,6 @@ such as the memory usage, page-faults, and context-switches, and thread-level me
- Background thread records process-, system- and device-level metrics while the application executes
- Causal profiling
- Quantifies the potential impact of optimizations in parallel codes
- Critical trace generation
### Data Analysis
@@ -45,7 +44,6 @@ such as the memory usage, page-faults, and context-switches, and thread-level me
- Comprehensive traces
- Every individual event/measurement
- Application speedup predictions resulting from potential optimizations in functions and lines of code (causal profiling)
- Critical trace analysis (alpha)
### Parallelism API Support
-13
Просмотреть файл
@@ -42,7 +42,6 @@ fi
: ${ENABLE_OMNITRACE_PYTHON:=0}
: ${ENABLE_OMNITRACE_REWRITE:=1}
: ${ENABLE_OMNITRACE_RUNTIME:=1}
: ${ENABLE_OMNITRACE_CRITICAL_TRACE:=1}
usage()
{
@@ -55,7 +54,6 @@ usage()
print_option test-omnitrace-python "0|1" "Enable testing omnitrace-python" "${ENABLE_OMNITRACE_PYTHON}"
print_option test-omnitrace-rewrite "0|1" "Enable testing omnitrace-instrument binary rewrite" "${ENABLE_OMNITRACE_REWRITE}"
print_option test-omnitrace-runtime "0|1" "Enable testing omnitrace-instrument runtime instrumentation" "${ENABLE_OMNITRACE_RUNTIME}"
print_option test-omnitrace-critial-trace "0|1" "Enable testing omnitrace-instrument critical trace" "${ENABLE_OMNITRACE_CRITICAL_TRACE}"
}
cat << EOF > ${CONFIG_DIR}/omnitrace.cfg
@@ -126,10 +124,6 @@ do
ENABLE_OMNITRACE_RUNTIME=${VAL}
continue
;;
--test-omnitrace-critical-trace)
ENABLE_OMNITRACE_CRITICAL_TRACE=${VAL}
continue
;;
--source-dir)
SOURCE_DIR=${VAL}
continue
@@ -204,16 +198,9 @@ test-omnitrace-runtime()
verbose-run omnitrace-instrument -e -v 1 -- ${LS_NAME} ${LS_ARGS}
}
# Smoke test for the omnitrace-critical-trace executable: prints where it
# resolves on PATH and then lists the shared libraries it is linked against.
test-omnitrace-critical-trace()
{
    which omnitrace-critical-trace
    # quote the command substitution so that a resolved path containing
    # whitespace is handed to ldd as a single argument
    ldd "$(which omnitrace-critical-trace)"
}
if [ "${ENABLE_OMNITRACE_INSTRUMENT}" -ne 0 ]; then verbose-run test-omnitrace; fi
if [ "${ENABLE_OMNITRACE_AVAIL}" -ne 0 ]; then verbose-run test-omnitrace-avail; fi
if [ "${ENABLE_OMNITRACE_SAMPLE}" -ne 0 ]; then verbose-run test-omnitrace-sample; fi
if [ "${ENABLE_OMNITRACE_PYTHON}" -ne 0 ]; then verbose-run test-omnitrace-python; fi
if [ "${ENABLE_OMNITRACE_REWRITE}" -ne 0 ]; then verbose-run test-omnitrace-rewrite; fi
if [ "${ENABLE_OMNITRACE_RUNTIME}" -ne 0 ]; then verbose-run test-omnitrace-runtime; fi
if [ "${ENABLE_OMNITRACE_CRITICAL_TRACE}" -ne 0 ]; then verbose-run test-omnitrace-critical-trace; fi
-1
Просмотреть файл
@@ -15,7 +15,6 @@ endif()
# executables
add_subdirectory(omnitrace-avail)
add_subdirectory(omnitrace-critical-trace)
add_subdirectory(omnitrace-causal)
add_subdirectory(omnitrace-sample)
add_subdirectory(omnitrace-instrument)
+11
Просмотреть файл
@@ -41,6 +41,8 @@
#include <timemory/components/placeholder.hpp>
#include <timemory/components/properties.hpp>
#include <timemory/components/skeletons.hpp>
#include <timemory/hash/types.hpp>
#include <timemory/manager/manager.hpp>
#include <timemory/mpl/types.hpp>
#include <timemory/timemory.hpp>
#include <timemory/unwind/bfd.hpp>
@@ -118,6 +120,11 @@ namespace
{
// initialize HIP before main so that libomnitrace is not HSA_TOOLS_LIB
int gpu_count = omnitrace::gpu::hip_device_count();
// statically allocated shared_ptrs to prevent use after free errors
auto timemory_manager = tim::manager::master_instance();
auto timemory_hash_ids = tim::hash::get_main_hash_ids();
auto timemory_hash_aliases = tim::hash::get_main_hash_aliases();
} // namespace
//--------------------------------------------------------------------------------------//
@@ -125,6 +132,10 @@ int gpu_count = omnitrace::gpu::hip_device_count();
int
main(int argc, char** argv)
{
(void) timemory_manager; // suppress unused variables
(void) timemory_hash_ids; //
(void) timemory_hash_aliases; //
tim::unwind::set_bfd_verbose(3);
tim::set_env("OMNITRACE_INIT_TOOLING", "OFF", 1);
omnitrace_init_library();
-1
Просмотреть файл
@@ -200,7 +200,6 @@ get_initial_environment()
update_env(_env, "OMNITRACE_TRACE", false);
update_env(_env, "OMNITRACE_PROFILE", false);
update_env(_env, "OMNITRACE_USE_PROCESS_SAMPLING", false);
update_env(_env, "OMNITRACE_CRITICAL_TRACE", false);
update_env(_env, "OMNITRACE_THREAD_POOL_SIZE",
get_env<int>("OMNITRACE_THREAD_POOL_SIZE", 0));
update_env(_env, "OMNITRACE_LAUNCHER", "omnitrace-causal");
-25
Просмотреть файл
@@ -1,25 +0,0 @@
# ------------------------------------------------------------------------------#
#
# omnitrace-critical-trace target
#
# ------------------------------------------------------------------------------#
# Build the omnitrace-critical-trace executable from the source and header
# located next to this CMakeLists.txt
add_executable(omnitrace-critical-trace ${CMAKE_CURRENT_LIST_DIR}/critical-trace.cpp
${CMAKE_CURRENT_LIST_DIR}/critical-trace.hpp)
# This directory is added to the include path for this target only (PRIVATE)
target_include_directories(omnitrace-critical-trace PRIVATE ${CMAKE_CURRENT_LIST_DIR})
# Compile this target with OMNITRACE_EXTERN_COMPONENTS defined to 0
target_compile_definitions(omnitrace-critical-trace PRIVATE OMNITRACE_EXTERN_COMPONENTS=0)
# Link against the omnitrace interface/header targets, timemory, and the
# static build of libomnitrace
target_link_libraries(
omnitrace-critical-trace
PRIVATE omnitrace::omnitrace-compile-definitions
omnitrace::omnitrace-interface-library omnitrace::omnitrace-headers
omnitrace::omnitrace-timemory omnitrace::libomnitrace-static)
# Build tree: search the executable's own directory and the sibling library
# directory at runtime; install tree: use the project-wide executable RPATH
set_target_properties(
omnitrace-critical-trace
PROPERTIES BUILD_RPATH "\$ORIGIN:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
INSTALL_RPATH "${OMNITRACE_EXE_INSTALL_RPATH}")
# Install into the binary directory; OPTIONAL makes it a non-error if the
# target file does not exist at install time
install(
TARGETS omnitrace-critical-trace
DESTINATION ${CMAKE_INSTALL_BINDIR}
OPTIONAL)
Разница между файлами не показана из-за своего большого размера Загрузить разницу
-113
Просмотреть файл
@@ -1,113 +0,0 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/defines.hpp"
#include "core/perfetto.hpp"
#include "library/critical_trace.hpp"
#include "library/ptl.hpp"
#include <PTL/ThreadPool.hh>
#include <timemory/backends/dmp.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/hash/types.hpp>
#include <timemory/tpls/cereal/cereal/archives/json.hpp>
#include <timemory/tpls/cereal/cereal/cereal.hpp>
#include <timemory/utility/macros.hpp>
#include <timemory/utility/types.hpp>
#include <timemory/utility/utility.hpp>
#include <cctype>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <sstream>
#include <stdexcept>
#include <utility>
namespace omnitrace
{
namespace critical_trace
{
namespace
{
// Aliases for the timemory graph holding critical-trace entries and for the
// iterator kinds used to walk it.
using call_graph_t              = tim::graph<entry>;
using call_graph_itr_t          = typename call_graph_t::iterator;
using call_graph_sibling_itr_t  = typename call_graph_t::sibling_iterator;
using call_graph_preorder_itr_t = typename call_graph_t::pre_order_iterator;

// State private to the including translation unit (anonymous namespace).
// NOTE(review): complete_call_mutex presumably guards complete_call_chain;
// confirm against the definitions in critical-trace.cpp.
hash_ids   complete_hash_ids{};
call_chain complete_call_chain{};
std::mutex complete_call_mutex{};

// Forward declarations; the definitions are not part of this header.

void
update_critical_path(call_chain _chain, int64_t _tid);

bool
load_call_chain(const std::string& _fname, const std::string& _label,
                call_chain& _call_chain);

// declared exactly once (the redundant second declaration was dropped)
void
compute_critical_trace();

void
find_children(PTL::ThreadPool& _tp, call_graph_t& _graph, const call_chain& _chain);

void
find_sequences(PTL::ThreadPool& _tp, call_graph_t& _graph,
               std::vector<call_chain>& _chain);

void
find_sequences(PTL::ThreadPool& _tp, call_graph_t& _graph, call_graph_itr_t _root,
               std::vector<call_chain>& _chain);

template <typename ArchiveT, typename T, typename AllocatorT>
void
serialize_graph(ArchiveT& ar, const tim::graph<T, AllocatorT>& _graph);

template <typename ArchiveT, typename T, typename AllocatorT>
void
serialize_subgraph(ArchiveT& ar, const tim::graph<T, AllocatorT>& _graph,
                   typename tim::graph<T, AllocatorT>::iterator _root);

template <Device DevT>
void
generate_perfetto(const std::vector<call_chain>& _data);

// Passes every entry of complete_hash_ids to tim::hash::add_hash_id so that
// all of the collected hash ids exist in timemory's hash table.
inline void
copy_hash_ids()
{
    // make copy to avoid parallel iteration issues
    auto _hash_ids = complete_hash_ids;
    // ensure all hash ids exist
    for(const auto& itr : _hash_ids)
        tim::hash::add_hash_id(itr);
}
}  // namespace
}  // namespace critical_trace
}  // namespace omnitrace
-1
Просмотреть файл
@@ -301,7 +301,6 @@ omnitrace_add_bin_test(
ARGS -R
omnitrace
~timemory
~critical_trace
-r
_P
~PERFETTO
+3 -22
Просмотреть файл
@@ -1,4 +1,4 @@
# Generating a Critical Trace
# Critical Trace Support
```eval_rst
.. toctree::
@@ -6,24 +6,5 @@
:maxdepth: 4
```
## Overview
A critical trace is defined in omnitrace as the most time-consuming path through a parallelized code.
The steps for generating a critical trace are:
1. Enable the `OMNITRACE_CRITICAL_TRACE` setting
2. Configure any other relevant critical-trace settings, as needed
   - `omnitrace-avail --categories settings::critical_trace`
3. Execute the application
4. Locate the JSON files with `call-chain` in their names
5. Provide these files to the `omnitrace-critical-trace` executable
6. Open the generated perfetto file in [ui.perfetto.dev](https://ui.perfetto.dev/)
## omnitrace-critical-trace Executable
The `omnitrace-critical-trace` executable post-processes one or more `call-chain` JSON files and generates a perfetto output
for visualizing the critical trace.
**INCOMPLETE**
This executable is still under development.
Critical trace support has been superseded by causal profiling support.
Critical trace support was removed in Omnitrace v1.11.0 due to an incomplete implementation.
-4
Просмотреть файл
@@ -50,10 +50,6 @@ for each variant:
- For a binary rewrite: outputs new instrumented binary and exits
- For runtime instrumentation or attaching to a process: instructs the application to resume executing and then waits for the application to exit
### omnitrace-critical-trace: [source/bin/omnitrace-critical-trace](https://github.com/ROCm/omnitrace/tree/main/source/bin/omnitrace-critical-trace)
Post-processing tool for critical-trace data output by omnitrace.
## Libraries
### Common Library: [source/lib/common](https://github.com/ROCm/omnitrace/tree/main/source/lib/common)
-2
Просмотреть файл
@@ -25,7 +25,6 @@ manage extensions, resources, data, etc.
- Background thread records process-, system- and device-level metrics while the application executes
- Causal profiling
- Quantifies the potential impact of optimizations in parallel codes
- Critical trace generation
### Data Analysis
@@ -35,7 +34,6 @@ manage extensions, resources, data, etc.
- Comprehensive traces
- Every individual event/measurement
- Application speedup predictions resulting from potential optimizations in functions and lines of code (causal profiling)
- Critical trace analysis (alpha)
### Parallelism API Support
-12
Просмотреть файл
@@ -191,13 +191,7 @@ OMNITRACE_USE_PID = true
OMNITRACE_OUTPUT_PATH = omnitrace-%tag%-output
OMNITRACE_OUTPUT_PREFIX =
OMNITRACE_CI = false
OMNITRACE_CRITICAL_TRACE = false
OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT = 2000
OMNITRACE_CRITICAL_TRACE_COUNT = 0
OMNITRACE_CRITICAL_TRACE_DEBUG = false
OMNITRACE_THREAD_POOL_SIZE = 8
OMNITRACE_CRITICAL_TRACE_PER_ROW = 0
OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES = false
OMNITRACE_DEBUG = false
OMNITRACE_DL_VERBOSE = 0
OMNITRACE_INSTRUMENTATION_INTERVAL = 1
@@ -283,13 +277,7 @@ $ omnitrace-avail -S -bd
| OMNITRACE_CONFIG_FILE | Configuration file for omnitrace |
| OMNITRACE_COUT_OUTPUT | Write output to stdout |
| OMNITRACE_CPU_AFFINITY | Enable pinning threads to CPUs (Linu... |
| OMNITRACE_CRITICAL_TRACE | Enable generation of the critical trace |
| OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT | Number of critical trace records to ... |
| OMNITRACE_CRITICAL_TRACE_COUNT | Number of critical trace to export (... |
| OMNITRACE_CRITICAL_TRACE_DEBUG | Enable debugging for critical trace |
| OMNITRACE_THREAD_POOL_SIZE | Number of threads to use when genera... |
| OMNITRACE_CRITICAL_TRACE_PER_ROW | How many critical traces per row in ... |
| OMNITRACE_CRITICAL_TRACE_SERIALIZE_N... | Include names in serialization of cr... |
| OMNITRACE_DEBUG | Enable debug output |
| OMNITRACE_DIFF_OUTPUT | Generate a difference output vs. a p... |
| OMNITRACE_DL_VERBOSE | Verbosity within the omnitrace-dl li... |
-4
Просмотреть файл
@@ -209,7 +209,6 @@ $ omnitrace-sample -- ./parallel-overhead-locks 30 4 100
HSA_TOOLS_LIB=/opt/omnitrace/lib/libomnitrace-dl.so.1.7.1
HSA_TOOLS_REPORT_LOAD_FAILURE=1
LD_PRELOAD=/opt/omnitrace/lib/libomnitrace-dl.so.1.7.1
OMNITRACE_CRITICAL_TRACE=false
OMNITRACE_USE_PROCESS_SAMPLING=false
OMNITRACE_USE_SAMPLING=true
OMP_TOOL_LIBRARIES=/opt/omnitrace/lib/libomnitrace-dl.so.1.7.1
@@ -228,7 +227,6 @@ HSA_TOOLS_REPORT_LOAD_FAILURE=1
KOKKOS_PROFILE_LIBRARY=/opt/omnitrace/lib/libomnitrace.so.1.7.1
LD_PRELOAD=/opt/omnitrace/lib/libomnitrace-dl.so.1.7.1
OMNITRACE_CPU_FREQ_ENABLED=true
OMNITRACE_CRITICAL_TRACE=false
OMNITRACE_TRACE_THREAD_LOCKS=true
OMNITRACE_TRACE_THREAD_RW_LOCKS=true
OMNITRACE_TRACE_THREAD_SPIN_LOCKS=true
@@ -258,7 +256,6 @@ $ omnitrace-sample -PTDH -E all -o omnitrace-output %tag% -- ./parallel-overhead
LD_PRELOAD=/opt/omnitrace/lib/libomnitrace-dl.so.1.7.1
OMNITRACE_CPU_FREQ_ENABLED=true
OMNITRACE_CRITICAL_TRACE=false
OMNITRACE_OUTPUT_PATH=omnitrace-output
OMNITRACE_OUTPUT_PREFIX=%tag%
OMNITRACE_TRACE_THREAD_LOCKS=false
@@ -288,7 +285,6 @@ $ omnitrace-sample -PTDH -E all -o omnitrace-output %tag% -c -- ./parallel-overh
LD_PRELOAD=/opt/omnitrace/lib/libomnitrace-dl.so.1.7.1
OMNITRACE_CONFIG_FILE=
OMNITRACE_CPU_FREQ_ENABLED=true
OMNITRACE_CRITICAL_TRACE=false
OMNITRACE_OUTPUT_PATH=omnitrace-output
OMNITRACE_OUTPUT_PREFIX=%tag%
OMNITRACE_TRACE_THREAD_LOCKS=false
-1
Просмотреть файл
@@ -1225,7 +1225,6 @@ add_core_arguments(parser_t& _parser, parser_data& _data)
add_group_arguments(_parser, "perfetto", _data, true);
add_group_arguments(_parser, "timemory", _data, true);
add_group_arguments(_parser, "rocm", _data, true);
add_group_arguments(_parser, "critical_trace", _data, true);
_parser.start_group("MISCELLANEOUS OPTIONS", "");
-6
Просмотреть файл
@@ -110,9 +110,6 @@ OMNITRACE_DEFINE_CATEGORY(category, mpi, OMNITRACE_CATEGORY_MPI, "mpi", "MPI reg
OMNITRACE_DEFINE_CATEGORY(category, ompt, OMNITRACE_CATEGORY_OMPT, "ompt", "OpenMP tools regions")
OMNITRACE_DEFINE_CATEGORY(category, process_sampling, OMNITRACE_CATEGORY_PROCESS_SAMPLING, "process_sampling", "Process-level data")
OMNITRACE_DEFINE_CATEGORY(category, comm_data, OMNITRACE_CATEGORY_COMM_DATA, "comm_data", "MPI/RCCL counters for tracking amount of data sent or received")
OMNITRACE_DEFINE_CATEGORY(category, critical_trace, OMNITRACE_CATEGORY_CRITICAL_TRACE, "critical-trace", "Critical trace data")
OMNITRACE_DEFINE_CATEGORY(category, host_critical_trace, OMNITRACE_CATEGORY_HOST_CRITICAL_TRACE, "host-critical-trace", "Host-side critical trace data")
OMNITRACE_DEFINE_CATEGORY(category, device_critical_trace, OMNITRACE_CATEGORY_DEVICE_CRITICAL_TRACE, "device-critical-trace", "Device-side critical trace data")
OMNITRACE_DEFINE_CATEGORY(category, causal, OMNITRACE_CATEGORY_CAUSAL, "causal", "Causal profiling data")
OMNITRACE_DEFINE_CATEGORY(category, cpu_freq, OMNITRACE_CATEGORY_CPU_FREQ, "cpu_frequency", "CPU frequency (collected in background thread)")
OMNITRACE_DEFINE_CATEGORY(category, process_page, OMNITRACE_CATEGORY_PROCESS_PAGE, "process_page_fault", "Memory page faults in process (collected in background thread)")
@@ -174,9 +171,6 @@ using name = perfetto_category<Tp...>;
OMNITRACE_PERFETTO_CATEGORY(category::sampling), \
OMNITRACE_PERFETTO_CATEGORY(category::process_sampling), \
OMNITRACE_PERFETTO_CATEGORY(category::comm_data), \
OMNITRACE_PERFETTO_CATEGORY(category::critical_trace), \
OMNITRACE_PERFETTO_CATEGORY(category::host_critical_trace), \
OMNITRACE_PERFETTO_CATEGORY(category::device_critical_trace), \
OMNITRACE_PERFETTO_CATEGORY(category::causal), \
OMNITRACE_PERFETTO_CATEGORY(category::cpu_freq), \
OMNITRACE_PERFETTO_CATEGORY(category::process_page), \
-75
Просмотреть файл
@@ -520,10 +520,6 @@ configure_settings(bool _init)
_backend, "perfetto")
->set_choices({ "inprocess", "system", "all" });
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE",
"Enable generation of the critical trace", false, "backend",
"critical_trace");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_TRACE_THREAD_LOCKS",
"Enable tracing calls to pthread_mutex_lock, "
"pthread_mutex_unlock, pthread_mutex_trylock",
@@ -652,15 +648,6 @@ configure_settings(bool _init)
"busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm",
"process_sampling", "advanced");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE_DEBUG",
"Enable debugging for critical trace", _omnitrace_debug,
"debugging", "critical_trace", "advanced");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES",
"Include names in serialization of critical trace (mainly for debugging)",
_omnitrace_debug, "debugging", "critical_trace", "advanced");
OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB",
"Hint for shared-memory buffer size in perfetto (in KB)",
size_t{ 4096 }, "perfetto", "data", "advanced");
@@ -726,21 +713,6 @@ configure_settings(bool _init)
1),
"parallelism", "advanced");
OMNITRACE_CONFIG_EXT_SETTING(int64_t, "OMNITRACE_CRITICAL_TRACE_COUNT",
"Number of critical trace to export (0 == all)",
int64_t{ 0 }, "critical_trace",
"omnitrace-critical-trace", "advanced");
OMNITRACE_CONFIG_SETTING(uint64_t, "OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT",
"Number of critical trace records to store in thread-local "
"memory before submitting to shared buffer",
uint64_t{ 2000 }, "critical_trace", "advanced");
OMNITRACE_CONFIG_EXT_SETTING(
int64_t, "OMNITRACE_CRITICAL_TRACE_PER_ROW",
"How many critical traces per row in perfetto (0 == all in one row)",
int64_t{ 0 }, "critical_trace", "omnitrace-critical-trace", "advanced");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_TIMEMORY_COMPONENTS",
"List of components to collect via timemory (see `omnitrace-avail -C`)",
@@ -1162,14 +1134,12 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
_set("OMNITRACE_USE_OMPT", false);
_set("OMNITRACE_USE_SAMPLING", false);
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
_set("OMNITRACE_CRITICAL_TRACE", false);
}
else if(get_mode() == Mode::Causal)
{
_set("OMNITRACE_USE_CAUSAL", true);
_set("OMNITRACE_TRACE", false);
_set("OMNITRACE_PROFILE", false);
_set("OMNITRACE_CRITICAL_TRACE", false);
_set("OMNITRACE_USE_SAMPLING", false);
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
}
@@ -1228,7 +1198,6 @@ configure_mode_settings(const std::shared_ptr<settings>& _config)
_set("OMNITRACE_USE_SAMPLING", false);
_set("OMNITRACE_USE_PROCESS_SAMPLING", false);
_set("OMNITRACE_USE_CODE_COVERAGE", false);
_set("OMNITRACE_CRITICAL_TRACE", false);
set_setting_value("OMNITRACE_TIMEMORY_COMPONENTS", std::string{});
set_setting_value("OMNITRACE_PAPI_EVENTS", std::string{});
}
@@ -1409,7 +1378,6 @@ configure_disabled_settings(const std::shared_ptr<settings>& _config)
_handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi");
_handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer");
_handle_use_option("OMNITRACE_USE_ROCPROFILER", "rocprofiler");
_handle_use_option("OMNITRACE_CRITICAL_TRACE", "critical_trace");
#if !defined(OMNITRACE_USE_ROCTRACER) || OMNITRACE_USE_ROCTRACER == 0
_config->find("OMNITRACE_USE_ROCTRACER")->second->set_hidden(true);
@@ -1976,13 +1944,6 @@ get_use_mpip()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool&
get_use_critical_trace()
{
static auto _v = get_config()->find("OMNITRACE_CRITICAL_TRACE");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_use_kokkosp()
{
@@ -2029,20 +1990,6 @@ get_num_threads_hint()
return static_cast<tim::tsettings<size_t>&>(*_v->second).get();
}
bool
get_critical_trace_debug()
{
static auto _v = get_config()->find("OMNITRACE_CRITICAL_TRACE_DEBUG");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_critical_trace_serialize_names()
{
static auto _v = get_config()->find("OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_sampling_keep_internal()
{
@@ -2099,13 +2046,6 @@ get_trace_hsa_activity()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
int64_t
get_critical_trace_per_row()
{
static auto _v = get_config()->find("OMNITRACE_CRITICAL_TRACE_PER_ROW");
return static_cast<tim::tsettings<int64_t>&>(*_v->second).get();
}
size_t
get_perfetto_shmem_size_hint()
{
@@ -2215,14 +2155,6 @@ get_perfetto_annotations()
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
// Returns the OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT setting. The value is read
// from the configuration on the first call and cached in a static, so later
// changes to the setting are not reflected by this function.
uint64_t
get_critical_trace_update_freq()
{
    static uint64_t _buffer_count =
        get_config()->get<uint64_t>("OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT");
    return _buffer_count;
}
uint64_t
get_thread_pool_size()
{
@@ -2394,13 +2326,6 @@ get_sampling_allocator_size()
return std::max<size_t>(static_cast<tim::tsettings<size_t>&>(*_v->second).get(), 1);
}
int64_t
get_critical_trace_count()
{
static auto _v = get_config()->find("OMNITRACE_CRITICAL_TRACE_COUNT");
return static_cast<tim::tsettings<int64_t>&>(*_v->second).get();
}
double
get_process_sampling_freq()
{
-18
Просмотреть файл
@@ -218,9 +218,6 @@ get_use_pid();
bool&
get_use_mpip();
bool&
get_use_critical_trace() OMNITRACE_HOT;
bool
get_use_kokkosp();
@@ -251,12 +248,6 @@ get_trace_hsa_api();
bool
get_trace_hsa_activity();
bool
get_critical_trace_debug();
bool
get_critical_trace_serialize_names();
size_t
get_perfetto_shmem_size_hint();
@@ -278,9 +269,6 @@ get_disabled_categories();
bool
get_perfetto_annotations() OMNITRACE_HOT;
uint64_t
get_critical_trace_update_freq();
uint64_t
get_thread_pool_size();
@@ -297,9 +285,6 @@ get_perfetto_output_filename();
bool
get_perfetto_roctracer_per_stream() OMNITRACE_HOT;
int64_t
get_critical_trace_count();
double
get_trace_delay();
@@ -360,9 +345,6 @@ get_process_sampling_duration();
std::string
get_sampling_gpus();
int64_t
get_critical_trace_per_row();
bool
get_trace_thread_locks();
-9
Просмотреть файл
@@ -67,9 +67,6 @@ get_debug_tid() OMNITRACE_HOT;
bool
get_debug_pid() OMNITRACE_HOT;
bool
get_critical_trace_debug() OMNITRACE_HOT;
} // namespace config
namespace debug
@@ -560,12 +557,6 @@ as_hex<void*>(void*, size_t);
#define OMNITRACE_BASIC_DEBUG_F(...) \
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(::omnitrace::get_debug_env(), __VA_ARGS__)
#define OMNITRACE_CT_DEBUG(...) \
OMNITRACE_CONDITIONAL_PRINT(::omnitrace::get_critical_trace_debug(), __VA_ARGS__)
#define OMNITRACE_CT_DEBUG_F(...) \
OMNITRACE_CONDITIONAL_PRINT_F(::omnitrace::get_critical_trace_debug(), __VA_ARGS__)
//--------------------------------------------------------------------------------------//
//
// Verbose macros
+1 -1
Просмотреть файл
@@ -50,6 +50,7 @@ template <typename... Args>
auto
get_backtrace(Args... _arg)
{
consume_args(_arg...);
auto _bt = std::stringstream{};
if constexpr(sizeof...(Args) > 0)
{
@@ -57,7 +58,6 @@ get_backtrace(Args... _arg)
}
tim::unwind::detailed_backtrace<2>(_bt, true);
return strdup(_bt.str().c_str());
consume_args(_arg...);
}
} // namespace
-3
Просмотреть файл
@@ -63,9 +63,6 @@ extern "C"
OMNITRACE_CATEGORY_OMPT,
OMNITRACE_CATEGORY_PROCESS_SAMPLING,
OMNITRACE_CATEGORY_COMM_DATA,
OMNITRACE_CATEGORY_CRITICAL_TRACE,
OMNITRACE_CATEGORY_HOST_CRITICAL_TRACE,
OMNITRACE_CATEGORY_DEVICE_CRITICAL_TRACE,
OMNITRACE_CATEGORY_CAUSAL,
OMNITRACE_CATEGORY_CPU_FREQ,
OMNITRACE_CATEGORY_PROCESS_PAGE,
-52
Просмотреть файл
@@ -48,7 +48,6 @@
#include "library/components/pthread_gotcha.hpp"
#include "library/components/rocprofiler.hpp"
#include "library/coverage.hpp"
#include "library/critical_trace.hpp"
#include "library/ompt.hpp"
#include "library/process_sampler.hpp"
#include "library/ptl.hpp"
@@ -203,9 +202,6 @@ ensure_finalization(bool _static_init = false)
return scope::destructor{ []() { omnitrace_finalize_hidden(); } };
}
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
template <typename... Tp>
struct fini_bundle
{
@@ -402,11 +398,6 @@ omnitrace_init_library_hidden()
if(_debug_init) config::set_setting_value("OMNITRACE_DEBUG", _debug_value);
} };
// below will effectively do:
// get_cpu_cid_stack(0)->emplace_back(-1);
// plus query some env variables
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0);
tim::trait::runtime_enabled<comp::roctracer>::set(get_use_roctracer());
tim::trait::runtime_enabled<comp::roctracer_data>::set(get_use_roctracer() &&
get_use_timemory());
@@ -920,55 +911,12 @@ omnitrace_finalize_hidden(void)
causal::finish_experimenting();
}
if(get_use_critical_trace() || (get_use_rocm_smi() && get_use_roctracer()))
{
OMNITRACE_VERBOSE_F(1, "Generating the critical trace...\n");
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
{
using critical_trace_hash_data =
thread_data<critical_trace::hash_ids, critical_trace::id>;
if(i < critical_trace_hash_data::get()->size() &&
critical_trace_hash_data::get()->at(i))
{
OMNITRACE_DEBUG_F("Copying the hash id data for thread %zu...\n", i);
critical_trace::add_hash_id(*critical_trace_hash_data::get()->at(i));
}
}
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
{
using critical_trace_chain_data = thread_data<critical_trace::call_chain>;
if(i < critical_trace_chain_data::get()->size() &&
critical_trace_chain_data::get()->at(i))
{
OMNITRACE_DEBUG_F(
"Updating the critical trace call-chains for thread %zu...\n", i);
critical_trace::update(i); // launch update task
}
}
OMNITRACE_VERBOSE_F(1, "Waiting on critical trace updates...\n");
tasking::join();
}
if(get_use_process_sampling())
{
OMNITRACE_VERBOSE_F(1, "Post-processing the system-level samples...\n");
process_sampler::post_process();
}
if(get_use_critical_trace())
{
// launch compute task
OMNITRACE_VERBOSE_F(1, "Launching critical trace compute task...\n");
critical_trace::compute();
OMNITRACE_VERBOSE_F(1, "Waiting on critical trace computation...\n");
tasking::join();
}
// shutdown tasking before timemory is finalized, especially the roctracer thread-pool
OMNITRACE_VERBOSE_F(1, "Shutting down thread-pools...\n");
tasking::shutdown();
-2
Просмотреть файл
@@ -2,7 +2,6 @@
set(library_sources
${CMAKE_CURRENT_LIST_DIR}/coverage.cpp
${CMAKE_CURRENT_LIST_DIR}/cpu_freq.cpp
${CMAKE_CURRENT_LIST_DIR}/critical_trace.cpp
${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp
${CMAKE_CURRENT_LIST_DIR}/ompt.cpp
${CMAKE_CURRENT_LIST_DIR}/perf.cpp
@@ -17,7 +16,6 @@ set(library_sources
set(library_headers
${CMAKE_CURRENT_LIST_DIR}/coverage.hpp
${CMAKE_CURRENT_LIST_DIR}/cpu_freq.hpp
${CMAKE_CURRENT_LIST_DIR}/critical_trace.hpp
${CMAKE_CURRENT_LIST_DIR}/ompt.hpp
${CMAKE_CURRENT_LIST_DIR}/process_sampler.hpp
${CMAKE_CURRENT_LIST_DIR}/perf.hpp
-49
Просмотреть файл
@@ -27,7 +27,6 @@
#include "core/state.hpp"
#include "core/timemory.hpp"
#include "library/causal/data.hpp"
#include "library/critical_trace.hpp"
#include "library/runtime.hpp"
#include "library/tracing.hpp"
#include "library/tracing/annotation.hpp"
@@ -68,12 +67,6 @@ using tracing_count_categories_t =
type_list<category::host, category::mpi, category::pthread, category::rocm_hip,
category::rocm_hsa, category::rocm_rccl>;
// these categories are added to the critical trace
using critical_trace_categories_t =
type_list<category::host, category::mpi, category::pthread, category::rocm_hip,
category::rocm_hsa, category::rocm_rccl, category::device_hip,
category::device_hsa, category::numa, category::python>;
// convert these categories to throughput points
using causal_throughput_categories_t =
type_list<category::host, category::kokkos, category::ompt, category::rocm_hip,
@@ -195,24 +188,6 @@ category_region<CategoryT>::start(std::string_view name, Args&&... args)
tracing::push_perfetto(CategoryT{}, name.data(), std::forward<Args>(args)...);
}
}
if constexpr(is_one_of<CategoryT, critical_trace_categories_t>::value)
{
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
if(get_use_critical_trace())
{
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint32_t _depth = 0;
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
auto _ts = comp::wall_clock::record();
add_critical_trace<Device::CPU, Phase::BEGIN>(
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, 0,
critical_trace::add_hash_id(name.data()), _depth);
}
}
}
template <typename CategoryT>
@@ -278,30 +253,6 @@ category_region<CategoryT>::stop(std::string_view name, Args&&... args)
if(get_use_causal()) causal::pop_progress_point(name);
}
}
if constexpr(is_one_of<CategoryT, critical_trace_categories_t>::value)
{
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
if(get_use_critical_trace())
{
if(get_cpu_cid_stack() && !get_cpu_cid_stack()->empty())
{
auto _cid = get_cpu_cid_stack()->back();
if(get_cpu_cid_parents()->find(_cid) != get_cpu_cid_parents()->end())
{
uint64_t _parent_cid = 0;
uint32_t _depth = 0;
auto _ts = comp::wall_clock::record();
std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid);
add_critical_trace<Device::CPU, Phase::END>(
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, 0,
critical_trace::add_hash_id(name.data()), _depth);
}
}
}
}
}
else
{
+2 -26
Просмотреть файл
@@ -25,7 +25,6 @@
#include "core/debug.hpp"
#include "core/utility.hpp"
#include "library/components/category_region.hpp"
#include "library/critical_trace.hpp"
#include "library/runtime.hpp"
#include "library/thread_info.hpp"
@@ -41,9 +40,6 @@ namespace omnitrace
{
namespace component
{
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
pthread_mutex_gotcha::hash_array_t&
pthread_mutex_gotcha::get_hashes()
{
@@ -76,7 +72,7 @@ pthread_mutex_gotcha::get_hashes()
{
auto&& _id = _data.at(i).tool_id;
if(!_id.empty())
_init.at(i) = critical_trace::add_hash_id(_id.c_str());
_init.at(i) = tim::add_hash_id(_id.c_str());
else
{
if(_skip.count(i) > 0) continue;
@@ -176,7 +172,7 @@ pthread_mutex_gotcha::pthread_mutex_gotcha(const gotcha_data_t& _data)
template <typename... Args>
auto
pthread_mutex_gotcha::operator()(uintptr_t&& _id, int (*_callee)(Args...),
pthread_mutex_gotcha::operator()(uintptr_t&&, int (*_callee)(Args...),
Args... _args) const
{
using bundle_t = category_region<category::pthread>;
@@ -203,30 +199,10 @@ pthread_mutex_gotcha::operator()(uintptr_t&& _id, int (*_callee)(Args...),
bool& _protect;
} _dtor{ m_protect = true };
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint32_t _depth = 0;
int64_t _ts = 0;
if(_id < std::numeric_limits<uintptr_t>::max() && get_use_critical_trace())
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
_ts = comp::wall_clock::record();
}
bundle_t::audit(std::string_view{ m_data->tool_id }, audit::incoming{}, _args...);
auto _ret = (*_callee)(_args...);
bundle_t::audit(std::string_view{ m_data->tool_id }, audit::outgoing{}, _ret);
if(_id < std::numeric_limits<uintptr_t>::max() && get_use_critical_trace())
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
add_critical_trace<Device::CPU, Phase::DELTA>(
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0,
_id, get_hashes().at(m_data->index), _depth);
}
tim::consume_parameters(_id, _cid, _parent_cid, _depth, _ts);
return _ret;
}
-753
Просмотреть файл
@@ -1,753 +0,0 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/critical_trace.hpp"
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/defines.hpp"
#include "core/perfetto.hpp"
#include "library/ptl.hpp"
#include "library/runtime.hpp"
#include "library/thread_data.hpp"
#include "library/tracing.hpp"
#include "library/tracing/annotation.hpp"
#include <PTL/ThreadPool.hh>
#include <timemory/backends/dmp.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/hash/types.hpp>
#include <timemory/operations/types/file_output_message.hpp>
#include <timemory/tpls/cereal/cereal/archives/json.hpp>
#include <timemory/tpls/cereal/cereal/cereal.hpp>
#include <timemory/utility/macros.hpp>
#include <timemory/utility/types.hpp>
#include <timemory/utility/utility.hpp>
#include <cctype>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <sstream>
#include <stdexcept>
#include <utility>
namespace omnitrace
{
namespace critical_trace
{
namespace
{
using call_graph_t = tim::graph<entry>;
using call_graph_itr_t = typename call_graph_t::iterator;
using call_graph_sibling_itr_t = typename call_graph_t::sibling_iterator;
using call_graph_preorder_itr_t = typename call_graph_t::pre_order_iterator;
// labels collected from all threads; filled by the tasks submitted from
// add_hash_id(const hash_ids&) and read by copy_hash_ids()
hash_ids complete_hash_ids{};
// process-wide call chain; appended to by combine_critical_path(), sorted and
// written out by compute_critical_trace(), squashed by get_entries()
call_chain complete_call_chain{};
// held while combine_critical_path() appends to complete_call_chain and for the
// duration of compute_critical_trace()
std::mutex complete_call_mutex{};
// held while add_hash_id(), update() and compute() submit work to the task group
std::mutex tasking_mutex{};
void
update_critical_path(call_chain _chain, int64_t _tid);
void
compute_critical_trace();
void
copy_hash_ids()
{
// make copy to avoid parallel iteration issues
auto _hash_ids = complete_hash_ids;
// ensure all hash ids exist
for(const auto& itr : _hash_ids)
tim::hash::add_hash_id(itr);
}
} // namespace
} // namespace critical_trace
namespace critical_trace
{
namespace
{
/// Forwards two or more values to tim::hash::get_hash_id and returns its result.
/// Requiring at least two arguments keeps this from being used for a single value.
template <typename Arg0, typename Arg1, typename... Args>
size_t
get_combined_hash(Arg0&& _first, Arg1&& _second, Args&&... _rest)
{
    return tim::hash::get_hash_id(std::forward<Arg0>(_first),
                                  std::forward<Arg1>(_second),
                                  std::forward<Args>(_rest)...);
}
} // namespace
//--------------------------------------------------------------------------------------//
//
// ENTRY
//
//--------------------------------------------------------------------------------------//
/// Identity comparison of two entries.
///
/// Compares device, correlation ids, name hash, thread/device/process ids, queue id,
/// depth and priority. The phase, the timestamps and parent_cid are not compared.
bool
entry::operator==(const entry& rhs) const
{
    return device == rhs.device && cpu_cid == rhs.cpu_cid && gpu_cid == rhs.gpu_cid &&
           hash == rhs.hash && tid == rhs.tid && devid == rhs.devid &&
           queue_id == rhs.queue_id && depth == rhs.depth &&
           priority == rhs.priority && pid == rhs.pid;
}
/// Ordering used when sorting call chains.
///
/// Keys are compared in this order, the first differing key deciding the result:
/// pid, devid, cpu_cid, gpu_cid (only when both sides are non-zero), parent_cid,
/// queue_id, priority and finally begin_ns.
bool
entry::operator<(const entry& rhs) const
{
    // process id
    if(pid != rhs.pid) return (pid < rhs.pid);

    // device id
    if(devid != rhs.devid) return (devid < rhs.devid);

    // CPU correlation id
    if(cpu_cid != rhs.cpu_cid) return (cpu_cid < rhs.cpu_cid);

    // GPU correlation id participates only when both entries have one
    if(gpu_cid > 0 && rhs.gpu_cid > 0 && gpu_cid != rhs.gpu_cid)
        return (gpu_cid < rhs.gpu_cid);

    // parent correlation id
    if(parent_cid != rhs.parent_cid) return (parent_cid < rhs.parent_cid);

    // queue id
    if(queue_id != rhs.queue_id) return (queue_id < rhs.queue_id);

    // priority
    if(priority != rhs.priority) return (priority < rhs.priority);

    // timestamp is the last resort
    return (begin_ns < rhs.begin_ns);
}
/// Greater-than, defined as the exact mirror of operator< so that
/// `a > b` holds if and only if `b < a`.
///
/// The previous formulation (`!(a < b)` plus a partial inequality test on
/// begin_ns/cpu_cid/gpu_cid) was not consistent with operator<: two entries that
/// differ only in a zero/non-zero gpu_cid compared greater than each other, and
/// entries differing only in pid could be neither `a > b` nor `a < b` while `b < a`
/// was true.
bool
entry::operator>(const entry& rhs) const
{
    return (rhs < *this);
}
/// Merges an END entry into this BEGIN entry, turning it into a DELTA entry.
///
/// On success end_ns is taken from @p rhs and the phase becomes Phase::DELTA.
/// For any other phase combination a verbose warning is printed and the entry is
/// left untouched.
///
/// @return reference to this entry
entry&
entry::operator+=(const entry& rhs)
{
    const bool _begin_plus_end = (phase == Phase::BEGIN && rhs.phase == Phase::END);
    if(!_begin_plus_end)
    {
        OMNITRACE_VERBOSE(
            2, "Warning! Incorrect phase. entry::operator+=(entry) is only valid for "
               "Phase::BEGIN += Phase::END\n");
        return *this;
    }

    // the closing timestamp must not precede the opening one
    assert(rhs.end_ns >= begin_ns);
    end_ns = rhs.end_ns;
    phase  = Phase::DELTA;
    return *this;
}
// Combines the name hash, device, phase, device/process/thread ids, correlation
// ids, queue id and priority into a single hash via get_combined_hash().
// parent_cid, depth and the timestamps are not part of the hash.
size_t
entry::get_hash() const
{
return get_combined_hash(hash, static_cast<short>(device), static_cast<short>(phase),
devid, pid, tid, cpu_cid, gpu_cid, queue_id, priority);
}
/// Returns the phase-dependent timestamp of the entry:
/// begin_ns for BEGIN, end_ns for END, the duration (end_ns - begin_ns) for DELTA
/// and 0 for NONE.
int64_t
entry::get_timestamp() const
{
    if(phase == Phase::BEGIN) return begin_ns;
    if(phase == Phase::END) return end_ns;
    if(phase == Phase::DELTA) return (end_ns - begin_ns);
    // Phase::NONE carries no timestamp
    return 0;
}
/// Returns the duration (end_ns - begin_ns) of a DELTA entry; every other phase
/// has a cost of 0.
int64_t
entry::get_cost() const
{
    if(phase != Phase::DELTA) return 0;
    return (end_ns - begin_ns);
}
// Writes a single-line, human-readable description of the entry to `_os`.
// This is the implementation behind the stream insertion operator of `entry`.
void
entry::write(std::ostream& _os) const
{
// prefix: device tag followed by the correlation id(s)
if(device == Device::GPU)
_os << "[GPU][" << cpu_cid << "][" << gpu_cid << "]";
else
_os << "[CPU][" << cpu_cid << "]";
// parent_cid is unsigned; it is printed as a signed value
_os << " parent: " << static_cast<int64_t>(parent_cid);
_os << ", device: " << devid;
_os << ", pid: " << pid;
_os << ", tid: " << tid;
_os << ", depth: " << depth;
_os << ", queue: " << queue_id;
_os << ", priority: " << priority;
if(phase == Phase::DELTA)
{
// for DELTA entries get_timestamp() is the duration in nanoseconds; it is
// formatted in a separate stream so the precision/notation flags do not leak
// into `_os`
std::stringstream _cost{};
_cost << std::setprecision(4) << std::scientific << (get_timestamp() / 1.0e9);
_os << ", cost: [" << std::setw(8) << _cost.str() << " sec]";
}
else
{
// non-DELTA entries print the phase (nothing for NONE) and the raw timestamps
_os << ", phase: ";
if(phase == Phase::BEGIN)
_os << "begin ";
else if(phase == Phase::END)
_os << "end ";
_os << "[" << begin_ns << ":" << end_ns << "]";
}
// numeric hash followed by the demangled identifier registered for it
_os << ", hash: " << hash << " :: " << tim::demangle(tim::get_hash_identifier(hash));
}
//--------------------------------------------------------------------------------------//
//
// CALL CHAIN
//
//--------------------------------------------------------------------------------------//
/// Two call chains are equal when they have the same length and every pair of
/// entries at the same position compares equal (entry::operator!= is false).
bool
call_chain::operator==(const call_chain& rhs) const
{
    const auto _count = size();
    if(_count != rhs.size()) return false;

    size_t _idx = 0;
    while(_idx < _count)
    {
        if(at(_idx) != rhs.at(_idx)) return false;
        ++_idx;
    }
    return true;
}
/// Sums entry::get_cost() over the chain.
///
/// @param _tid  when negative, every entry contributes; otherwise only entries whose
///              tid equals @p _tid are counted
/// @return accumulated cost in the units of the entry timestamps
int64_t
call_chain::get_cost(int64_t _tid) const
{
    const bool _all_threads = (_tid < 0);
    int64_t    _total       = 0;
    for(const auto& _entry : *this)
    {
        if(_all_threads || _entry.tid == _tid) _total += _entry.get_cost();
    }
    return _total;
}
/// Emits one begin/end perfetto slice per entry of the chain onto @p _track.
///
/// @tparam DevT   Device::NONE emits slices named "CPU" / "GPU" according to each
///                entry's device; Device::CPU / Device::GPU emit only entries of that
///                device, named after the entry's demangled hash identifier;
///                Device::ANY emits every entry, named the same way.
/// @param _track  perfetto track receiving the slices
/// @param _used   entries already emitted; an entry is skipped when it is already in
///                this set and is added to it otherwise
template <Device DevT>
void
call_chain::generate_perfetto(::perfetto::Track _track, std::set<entry>& _used) const
{
    OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);

    // slice names are handed to perfetto as C strings, so they are interned in a
    // function-local static set and never erased
    static std::set<std::string> _static_strings{};
    static std::mutex            _static_mutex{};

    for(const auto& itr : *this)
    {
        // emit each entry at most once across calls sharing the same _used set
        if(!_used.emplace(itr).second) continue;

        auto&& _annotater = [&](::perfetto::EventContext ctx) {
            if(config::get_perfetto_annotations())
            {
                tracing::add_perfetto_annotation(ctx, "begin_ns", itr.begin_ns);
                tracing::add_perfetto_annotation(ctx, "end_ns", itr.end_ns);
            }
        };

        if constexpr(DevT == Device::NONE)
        {
            if(itr.device == Device::CPU)
            {
                tracing::push_perfetto_track(category::host_critical_trace{}, "CPU",
                                             _track, itr.begin_ns, std::move(_annotater));
                tracing::pop_perfetto_track(category::host_critical_trace{}, "CPU",
                                            _track, itr.end_ns);
            }
            else if(itr.device == Device::GPU)
            {
                tracing::push_perfetto_track(category::device_critical_trace{}, "GPU",
                                             _track, itr.begin_ns, std::move(_annotater));
                tracing::pop_perfetto_track(category::device_critical_trace{}, "GPU",
                                            _track, itr.end_ns);
            }
        }
        else
        {
            using category_t = std::conditional_t<
                DevT == Device::ANY, omnitrace::category::critical_trace,
                std::conditional_t<DevT == Device::CPU,
                                   omnitrace::category::host_critical_trace,
                                   omnitrace::category::device_critical_trace>>;

            if constexpr(DevT != Device::ANY)
            {
                if(itr.device != DevT) continue;
            }

            std::string _name  = tim::demangle(tim::get_hash_identifier(itr.hash));
            const char* _label = nullptr;
            {
                // scoped guard: the mutex is released even if emplace() throws
                // (the previous manual lock()/unlock() pair would have left it locked)
                std::lock_guard<std::mutex> _guard{ _static_mutex };
                // std::set never relocates its elements, so the pointer stays valid
                // after the guard is released
                _label = _static_strings.emplace(_name).first->c_str();
            }

            tracing::push_perfetto_track(category_t{}, _label, _track, itr.begin_ns,
                                         std::move(_annotater));
            tracing::pop_perfetto_track(category_t{}, _label, _track, itr.end_ns);
        }
    }
}
// explicit instantiations
template void
call_chain::generate_perfetto<Device::NONE>(::perfetto::Track, std::set<entry>&) const;
template void
call_chain::generate_perfetto<Device::CPU>(::perfetto::Track, std::set<entry>&) const;
template void
call_chain::generate_perfetto<Device::GPU>(::perfetto::Track, std::set<entry>&) const;
template void
call_chain::generate_perfetto<Device::ANY>(::perfetto::Track, std::set<entry>&) const;
//--------------------------------------------------------------------------------------//
//
// FREE FUNCTIONS
//
//--------------------------------------------------------------------------------------//
// Returns the value of get_critical_trace_update_freq(). add_critical_trace()
// caches it and uses it to decide how often critical_trace::update() is triggered.
uint64_t
get_update_frequency()
{
return get_critical_trace_update_freq();
}
// Returns a reference to the call-chain slot of thread `_tid`.
unique_ptr_t<call_chain>&
get(int64_t _tid)
{
// table of per-thread call chains; looked up once (function-local static)
static auto* _v = thread_data<call_chain>::get();
// One-time initialization per calling thread (thread_local), performed with the
// `_tid` of that first call: allocates slot 0 and slot `_tid` when they are empty
// and, for `_tid > 0`, copies the contents of slot 0 into slot `_tid`.
// NOTE(review): the initializer does not run again when the same thread later
// passes a different `_tid`, so nothing in this function guarantees that such a
// slot has been allocated -- confirm that callers passing another thread's id
// (e.g. update(_targ_tid)) only do so after that thread has called get() itself.
static thread_local auto _once = [_tid]() {
if(!_v->at(0)) _v->at(0) = unique_ptr_t<call_chain>{ new call_chain{} };
if(!_v->at(_tid)) _v->at(_tid) = unique_ptr_t<call_chain>{ new call_chain{} };
if(_tid > 0) *_v->at(_tid) = *_v->at(0);
return true;
}();
// silence unused-variable warnings; the value itself is irrelevant
(void) _once;
return _v->at(_tid);
}
/// Schedules a task that merges @p _labels into the process-wide label set
/// (complete_hash_ids).
///
/// Does nothing when the critical-trace task group has no thread pool. The labels
/// are copied into the task, so the caller's container may be modified or destroyed
/// once this function returns.
void
add_hash_id(const hash_ids& _labels)
{
    OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
    if(!tasking::critical_trace::get_task_group().pool()) return;

    // serialize task submission
    std::unique_lock<std::mutex> _lk{ tasking_mutex };
    tasking::critical_trace::get_task_group().exec([_labels]() {
        // serializes concurrent merge tasks; a scoped guard is used so the mutex is
        // released even if an insertion throws
        static std::mutex           _mtx{};
        std::lock_guard<std::mutex> _guard{ _mtx };
        for(const auto& itr : _labels)
            complete_hash_ids.emplace(itr);
    });
}
size_t
add_hash_id(const std::string& _label)
{
using critical_trace_hash_data =
thread_data<critical_trace::hash_ids, critical_trace::id>;
auto _hash = tim::hash::add_hash_id(_label);
if(get_use_critical_trace() || get_use_rocm_smi())
{
critical_trace_hash_data::construct();
critical_trace_hash_data::instance()->emplace(_label);
}
return _hash;
}
void
update(int64_t _tid)
{
if(!get_use_critical_trace() && !get_use_rocm_smi()) return;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
if(!tasking::critical_trace::get_task_group().pool()) return;
std::unique_lock<std::mutex> _lk{ tasking_mutex };
call_chain _data{};
std::swap(_data, *critical_trace::get(_tid));
tasking::critical_trace::get_task_group().exec(update_critical_path, _data, _tid);
}
// Flushes the pending entries of thread `_tid` via update() and then submits
// compute_critical_trace() as a task. The second step is skipped when the task
// group has no thread pool.
void
compute(int64_t _tid)
{
// update() acquires and releases tasking_mutex itself, so it is called before
// the lock below is taken
update(_tid);
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
if(!tasking::critical_trace::get_task_group().pool()) return;
// serialize task submission
std::unique_lock<std::mutex> _lk{ tasking_mutex };
tasking::critical_trace::get_task_group().exec(compute_critical_trace);
}
//--------------------------------------------------------------------------------------//
//
// HELPER FUNCTIONS
//
//--------------------------------------------------------------------------------------//
namespace
{
/// Converts a function-style identifier into a display label: every underscore
/// becomes a space and the first character is upper-cased,
/// e.g. "save_call_chain_json" -> "Save call chain json".
///
/// @param _func  identifier to convert (taken by value and modified in place)
/// @return the converted label; an empty input yields an empty string
std::string
get_perf_name(std::string _func)
{
    // single pass instead of restarting the search from the beginning after every
    // replacement
    for(auto& _c : _func)
    {
        if(_c == '_') _c = ' ';
    }

    if(!_func.empty())
    {
        // std::toupper requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behavior
        _func.front() =
            static_cast<char>(std::toupper(static_cast<unsigned char>(_func.front())));
    }
    return _func;
}
// Serializes `_call_chain` together with the (demangled) hash-id map as JSON into
// the file `_fname`.
//
// _fname       output file path
// _label       JSON key under which the call chain is stored
// _call_chain  entries to serialize
// _msg         when true, report the output file (if verbosity >= 0) and print the
//              timing/memory statistics of this call as a debug message
// _func        name reported in the output message; defaults to this function's name
void
save_call_chain_json(const std::string& _fname, const std::string& _label,
const call_chain& _call_chain, bool _msg = false,
std::string _func = {})
{
OMNITRACE_CT_DEBUG("[%s][%s] saving %zu call chain entries to '%s'\n", __FUNCTION__,
_label.c_str(), _call_chain.size(), _fname.c_str());
// wall-clock and memory statistics for this call
using perfstats_t =
tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::page_rss>;
perfstats_t _perf{ get_perf_name(__FUNCTION__) };
_perf.start();
// writes the JSON document to an arbitrary output stream
auto _save = [&](std::ostream& _os) {
namespace cereal = tim::cereal;
auto ar = tim::policy::output_archive<cereal::MinimalJSONOutputArchive>::get(_os);
// copy of the hash-id map with every identifier demangled
auto _hash_map = *tim::hash::get_hash_ids();
for(auto& itr : _hash_map)
itr.second = tim::demangle(itr.second);
ar->setNextName("omnitrace");
ar->startNode();
(*ar)(cereal::make_nvp("hash_map", _hash_map),
cereal::make_nvp(_label.c_str(), _call_chain));
ar->finishNode();
};
std::ofstream ofs{};
if(tim::filepath::open(ofs, _fname))
{
if(_msg)
{
if(_func.empty()) _func = __FUNCTION__;
if(get_verbose() >= 0)
operation::file_output_message<critical_trace::call_chain>{}(
_fname, std::string{ _func });
}
std::stringstream oss{};
// chains with more than 100000 entries are streamed straight to the file;
// smaller ones are assembled in memory first and written with a trailing newline
if(_call_chain.size() > 100000)
{
_save(ofs);
}
else
{
_save(oss);
ofs << oss.str() << std::endl;
}
}
_perf.stop();
if(_msg)
{
OMNITRACE_CT_DEBUG("%s\n", JOIN("", _perf).c_str());
}
}
// Linear search through `_vec` for the first element matching `_v`.
//
// _v     value to look for
// _vec   container to search (any class template whose first argument is Tp)
// _func  binary predicate called as _func(_v, element); defaults to operator==
//
// Returns an iterator to the first match, or `_vec.end()` (after emitting a debug
// message) when no element matches.
template <typename Tp, template <typename...> class ContainerT, typename... Args,
typename FuncT = bool (*)(const Tp&, const Tp&)>
inline auto
find(
const Tp& _v, ContainerT<Tp, Args...>& _vec,
FuncT&& _func = [](const Tp& _lhs, const Tp& _rhs) { return (_lhs == _rhs); })
{
for(auto itr = _vec.begin(); itr != _vec.end(); ++itr)
{
if(std::forward<FuncT>(_func)(_v, *itr))
{
return itr;
}
}
OMNITRACE_CT_DEBUG("[%s] no match found in %zu entries...\n", __FUNCTION__,
_vec.size());
return _vec.end();
}
// Overload of find() for call_chain. call_chain is not itself a class template
// specialization, so it is viewed as its std::vector<entry> base and passed to the
// generic overload.
// NOTE(review): std::vector<entry> is a *private* base of call_chain, hence the
// reinterpret_cast instead of a static_cast; this presumably relies on call_chain
// adding no data members of its own -- confirm before changing call_chain's layout.
template <typename FuncT = bool (*)(const entry&, const entry&)>
inline auto
find(
const entry& _v, call_chain& _vec,
FuncT&& _func = [](const entry& _lhs, const entry& _rhs) { return (_lhs == _rhs); })
{
return find(_v, reinterpret_cast<std::vector<entry>&>(_vec),
std::forward<FuncT>(_func));
}
// Collapses `_targ` in place: duplicate BEGIN entries are dropped and entries that
// are neither DELTA nor BEGIN are merged into a matching, already-kept entry.
// `_targ` is sorted on entry and again on exit.
void
squash_critical_path(call_chain& _targ)
{
OMNITRACE_CT_DEBUG("[%s]\n", __FUNCTION__);
// stricter than entry::operator==: additionally requires the same parent_cid and,
// when both entries are in the same phase, the same get_timestamp() value
static auto _strict_equal = [](const entry& _lhs, const entry& _rhs) {
auto _same_phase = (_lhs.phase == _rhs.phase);
bool _phase_check = true;
if(_same_phase) _phase_check = (_lhs.get_timestamp() == _rhs.get_timestamp());
return (_lhs == _rhs && _lhs.parent_cid == _rhs.parent_cid && _phase_check);
};
std::sort(_targ.begin(), _targ.end());
call_chain _squashed{};
for(auto& itr : _targ)
{
if(itr.phase == Phase::DELTA)
{
// completed entries are always kept
_squashed.emplace_back(itr);
}
else if(itr.phase == Phase::BEGIN)
{
// keep a BEGIN entry only if no strictly-equal entry was kept already
if(find(itr, _squashed, _strict_equal) == _squashed.end())
_squashed.emplace_back(itr);
}
else
{
// remaining phases (END / NONE): merge into the first kept entry that compares
// equal via operator==, otherwise keep the entry as is
auto mitr = find(itr, _squashed);
if(mitr != _squashed.end())
*mitr += itr;
else
_squashed.emplace_back(itr);
}
}
std::swap(_targ, _squashed);
std::sort(_targ.begin(), _targ.end());
}
// Pairs up the BEGIN and END entries of `_chain` into DELTA entries and appends the
// result (sorted) to `_targ`.
//
// _targ   destination chain; only modified while complete_call_mutex is held
// _chain  entries to merge (taken by value)
//
// BEGIN/END entries without a partner are appended unchanged.
void
combine_critical_path(call_chain& _targ, call_chain _chain)
{
OMNITRACE_CT_DEBUG("[%s]\n", __FUNCTION__);
OMNITRACE_CT_DEBUG("[%s] adding %zu entries to existing call-chain of %zu...\n",
__FUNCTION__, _chain.size(), _targ.size());
// use a deque here because when combining _begin and _end, you end
// up erasing entries from the front of _begin. When _begin is large, it
// takes a lot of time to move all the elements each iteration
std::deque<entry> _begin{};
std::deque<entry> _end{};
call_chain _delta{};
_delta.reserve(_chain.size() / 2); // estimated total deltas
// partition the input by phase; Phase::NONE entries are dropped here
for(auto& itr : _chain)
{
if(itr.phase == Phase::DELTA)
_delta.emplace_back(itr);
else if(itr.phase == Phase::BEGIN)
_begin.emplace_back(itr);
else if(itr.phase == Phase::END)
_end.emplace_back(itr);
}
OMNITRACE_CT_DEBUG("[%s] sorting %zu begin and %zu end call-chain entries...\n",
__FUNCTION__, _begin.size(), _end.size());
std::sort(_begin.begin(), _begin.end());
std::sort(_end.begin(), _end.end());
// move the END entries into _tmp so that _end can collect the unmatched ones
std::deque<entry> _tmp{};
std::swap(_end, _tmp);
for(auto& eitr : _tmp)
{
// first BEGIN entry comparing equal (operator==) to this END entry
auto mitr = find(eitr, _begin);
if(mitr == _begin.end())
_end.emplace_back(eitr);
else
{
// BEGIN += END turns the BEGIN entry into a DELTA entry; it is then removed
// from the candidates so it cannot be matched twice
*mitr += eitr;
_delta.emplace_back(*mitr);
_begin.erase(mitr);
}
}
_tmp.clear();
OMNITRACE_CT_DEBUG(
"[%s] %zu begin and %zu end call-chain entries were not matched...\n",
__FUNCTION__, _begin.size(), _end.size());
call_chain _combined{};
_combined.reserve(_delta.size() + _begin.size() + _end.size());
for(auto& itr : _delta)
_combined.emplace_back(itr);
for(auto& itr : _begin)
_combined.emplace_back(itr);
for(auto& itr : _end)
_combined.emplace_back(itr);
OMNITRACE_CT_DEBUG("[%s] sorting %zu combined call-chain entries...\n", __FUNCTION__,
_combined.size());
std::sort(_combined.begin(), _combined.end());
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
// everything above worked on local data; only the append to _targ is guarded
std::unique_lock<std::mutex> _lk{ complete_call_mutex };
for(auto& itr : _combined)
_targ.emplace_back(itr);
// squash_critical_path(_targ);
}
/// Task body that merges @p _chain into the process-wide call chain.
///
/// The thread id argument is accepted for signature compatibility but unused.
/// Exceptions derived from std::exception are reported on stderr together with a
/// backtrace and are not propagated.
void
update_critical_path(call_chain _chain, int64_t)
{
    OMNITRACE_CT_DEBUG("[%s] updating critical path with %zu entries...\n", __FUNCTION__,
                       _chain.size());
    try
    {
        // entries registered for other threads are intentionally kept
        combine_critical_path(complete_call_chain, std::move(_chain));
    } catch(const std::exception& _err)
    {
        std::cerr << "Thread exited with exception: " << _err.what() << std::endl;
        TIMEMORY_CONDITIONAL_DEMANGLED_BACKTRACE(true, 32);
    }
}
// Task body that sorts the process-wide call chain and writes it to the
// "call-chain" JSON output file, printing timing/memory statistics along the way.
// complete_call_mutex is held for the whole function.
void
compute_critical_trace()
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
// NOTE(review): `_computed` is never assigned anywhere in this function, so the
// early return below is never taken and every invocation regenerates the output.
// Confirm whether a run-once guard was intended before changing this: compute()
// submits this task once per call and later calls may include additional data.
static bool _computed = false;
std::unique_lock<std::mutex> _lk{ complete_call_mutex };
if(_computed) return;
OMNITRACE_CONDITIONAL_PRINT(get_critical_trace_debug() || get_verbose() >= 0,
"[%s] Generating critical trace...\n", __FUNCTION__);
// ensure all hash ids exist
copy_hash_ids();
using perfstats_t =
tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::page_rss>;
// statistics for the whole function
perfstats_t _ct_perf{};
_ct_perf.start();
try
{
OMNITRACE_VERBOSE_F(1, "[%s] initial call chain: %zu entries\n", __FUNCTION__,
complete_call_chain.size());
// statistics for the individual steps; reset and relabelled between steps
perfstats_t _perf{ get_perf_name(__FUNCTION__) };
_perf.start();
std::sort(complete_call_chain.begin(), complete_call_chain.end());
_perf.stop().rekey("Sorting critical trace");
OMNITRACE_VERBOSE_F(1, "%s\n", JOIN("", _perf).c_str());
_perf.reset().start();
save_call_chain_json(
tim::settings::compose_output_filename("call-chain", ".json"), "call_chain",
complete_call_chain, true, __FUNCTION__);
_perf.stop().rekey("Save call-chain");
OMNITRACE_VERBOSE_F(1, "%s\n", JOIN("", _perf).c_str());
} catch(std::exception& e)
{
// report and swallow: the exception is not propagated out of the task
OMNITRACE_PRINT_F("Thread exited '%s' with exception: %s\n", __FUNCTION__,
e.what());
TIMEMORY_CONDITIONAL_DEMANGLED_BACKTRACE(true, 32);
}
OMNITRACE_PRINT_F("%s\n", _ct_perf.stop().as_string<false, false>().c_str());
}
} // namespace
// Returns the completed (Phase::DELTA) entries of the process-wide call chain for
// which `_eval` returns true, each paired with the identifier registered for its
// hash. Entries appear in sorted order.
std::vector<std::pair<std::string, entry>>
get_entries(const std::function<bool(const entry&)>& _eval)
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
// wait for outstanding tasks before touching the shared chain
// NOTE(review): complete_call_chain is modified below without taking
// complete_call_mutex; this presumably relies on the join above -- confirm
tasking::join();
copy_hash_ids();
squash_critical_path(complete_call_chain);
std::sort(complete_call_chain.begin(), complete_call_chain.end());
auto _v = std::vector<std::pair<std::string, entry>>{};
for(const auto& itr : complete_call_chain)
{
// unmatched BEGIN/END entries are not reported
if(itr.phase != Phase::DELTA) continue;
if(_eval(itr)) _v.emplace_back(tim::get_hash_identifier(itr.hash), itr);
}
return _v;
}
} // namespace critical_trace
} // namespace omnitrace
-370
Просмотреть файл
@@ -1,370 +0,0 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "core/common.hpp"
#include "core/config.hpp"
#include "core/defines.hpp"
#include "core/perfetto.hpp"
#include "library/runtime.hpp"
#include "library/thread_data.hpp"
#include <timemory/backends/process.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/hash/types.hpp>
#include <timemory/macros/language.hpp>
#include <timemory/tpls/cereal/cereal.hpp>
#include <timemory/utility/demangle.hpp>
#include <timemory/utility/utility.hpp>
#include <cstdint>
#include <cstdlib>
#include <mutex>
#include <ostream>
#include <string>
#include <vector>
namespace omnitrace
{
namespace critical_trace
{
/// Device an entry executed on. Also used as a template selector by
/// call_chain::generate_perfetto(), where ANY selects both CPU and GPU entries.
enum class Device : uint8_t
{
NONE = 0,
CPU,
GPU,
ANY,
};
/// Kind of record an entry represents: the start of a region (BEGIN), its end
/// (END), or a completed region carrying both timestamps (DELTA).
enum class Phase : uint8_t
{
NONE = 0,
BEGIN,
END,
DELTA,
};
/// A single critical-trace record. The struct is declared with the `packed`
/// attribute, which is why the serialization functions copy each field into a local
/// before handing it to the archive.
struct OMNITRACE_ATTRIBUTE(packed) entry
{
entry() = default;
~entry() = default;
entry(const entry&) = default;
entry(entry&&) noexcept = default;
entry& operator=(const entry&) = default;
entry& operator=(entry&&) noexcept = default;
Device device = Device::CPU; /// which device it executed on
Phase phase = Phase::NONE; /// start / stop / unspecified
uint16_t priority = 0; /// priority value (for sorting)
uint32_t depth = 0; /// call-stack depth
int32_t devid = 0; /// device id
int32_t pid = 0; /// process id
int32_t tid = 0; /// thread id it was registered on
uint64_t cpu_cid = 0; /// CPU correlation id
uint64_t gpu_cid = 0; /// GPU correlation id
uint64_t parent_cid = 0; /// parent CPU correlation id
int64_t begin_ns = 0; /// timestamp of start
int64_t end_ns = 0; /// timestamp of end
uintptr_t queue_id = 0; /// stream id (GPU) or mutex id
size_t hash = 0; /// hash for name
// comparison operators; ==, <, > are defined out of line, the others derive from them
bool operator==(const entry& rhs) const;
bool operator!=(const entry& rhs) const { return !(*this == rhs); }
bool operator<(const entry& rhs) const;
bool operator>(const entry& rhs) const;
bool operator<=(const entry& rhs) const { return !(*this > rhs); }
bool operator>=(const entry& rhs) const { return !(*this < rhs); }
// merges an END entry into a BEGIN entry, producing a DELTA entry
entry& operator+=(const entry& rhs);
size_t get_hash() const;
int64_t get_timestamp() const;
int64_t get_cost() const;
// human-readable, single-line description
void write(std::ostream& _os) const;
friend std::ostream& operator<<(std::ostream& _os, const entry& _v)
{
_v.write(_os);
return _os;
}
// cereal serialization
template <typename Archive>
void save(Archive& ar, unsigned int) const;
template <typename Archive>
void load(Archive& ar, unsigned int);
};
// Writes every field of the entry to the archive as a name/value pair, followed by
// the identifier registered for `hash` ("name") and its demangled form
// ("demangled_name"). Both names are empty when `hash` is 0.
template <typename Archive>
void
entry::save(Archive& ar, unsigned int) const
{
namespace cereal = tim::cereal;
// each field is copied into a local first and the copy is handed to the archive
#define SAVE_PACKED_ENTRY_FIELD(VAR) \
{ \
auto _val = VAR; \
ar(cereal::make_nvp(#VAR, _val)); \
}
SAVE_PACKED_ENTRY_FIELD(priority);
SAVE_PACKED_ENTRY_FIELD(device);
SAVE_PACKED_ENTRY_FIELD(phase);
SAVE_PACKED_ENTRY_FIELD(depth);
SAVE_PACKED_ENTRY_FIELD(devid);
SAVE_PACKED_ENTRY_FIELD(pid);
SAVE_PACKED_ENTRY_FIELD(tid);
SAVE_PACKED_ENTRY_FIELD(cpu_cid);
SAVE_PACKED_ENTRY_FIELD(gpu_cid);
SAVE_PACKED_ENTRY_FIELD(parent_cid);
SAVE_PACKED_ENTRY_FIELD(begin_ns);
SAVE_PACKED_ENTRY_FIELD(end_ns);
SAVE_PACKED_ENTRY_FIELD(queue_id);
SAVE_PACKED_ENTRY_FIELD(hash);
#undef SAVE_PACKED_ENTRY_FIELD
std::string _name{};
auto _hash = hash;
if(_hash > 0) _name = tim::get_hash_identifier(_hash);
ar(cereal::make_nvp("name", _name),
cereal::make_nvp("demangled_name", tim::demangle(_name)));
}
// Reads every field of the entry from the archive (same names and order as save())
// and registers the stored "name" under the entry's hash in timemory's hash-id map.
// The "demangled_name" value is read but not used.
template <typename Archive>
void
entry::load(Archive& ar, unsigned int)
{
namespace cereal = tim::cereal;
// the archive fills a local copy, which is then assigned back to the field
#define LOAD_PACKED_ENTRY_FIELD(VAR) \
{ \
auto _val = VAR; \
ar(cereal::make_nvp(#VAR, _val)); \
VAR = _val; \
}
LOAD_PACKED_ENTRY_FIELD(priority);
LOAD_PACKED_ENTRY_FIELD(device);
LOAD_PACKED_ENTRY_FIELD(phase);
LOAD_PACKED_ENTRY_FIELD(depth);
LOAD_PACKED_ENTRY_FIELD(devid);
LOAD_PACKED_ENTRY_FIELD(pid);
LOAD_PACKED_ENTRY_FIELD(tid);
LOAD_PACKED_ENTRY_FIELD(cpu_cid);
LOAD_PACKED_ENTRY_FIELD(gpu_cid);
LOAD_PACKED_ENTRY_FIELD(parent_cid);
LOAD_PACKED_ENTRY_FIELD(begin_ns);
LOAD_PACKED_ENTRY_FIELD(end_ns);
LOAD_PACKED_ENTRY_FIELD(queue_id);
LOAD_PACKED_ENTRY_FIELD(hash);
#undef LOAD_PACKED_ENTRY_FIELD
std::string _name{};
std::string _demangled_name{};
ar(cereal::make_nvp("name", _name),
cereal::make_nvp("demangled_name", _demangled_name));
auto _hash = hash;
// emplace() leaves an existing mapping for this hash untouched
tim::get_hash_ids()->emplace(_hash, _name);
}
/// Sequence of critical-trace entries. Implemented on top of std::vector<entry>
/// through private inheritance; only the vector members re-exported by the
/// using-declarations below are part of the public interface.
struct call_chain : private std::vector<entry>
{
using base_type = std::vector<entry>;
using base_type::at;
using base_type::back;
using base_type::begin;
using base_type::cbegin;
using base_type::cend;
using base_type::clear;
using base_type::emplace_back;
using base_type::empty;
using base_type::end;
using base_type::erase;
using base_type::front;
using base_type::pop_back;
using base_type::push_back;
using base_type::rbegin;
using base_type::rend;
using base_type::reserve;
using base_type::size;
// total cost of the chain; a negative thread id (the default) selects all threads
int64_t get_cost(int64_t _tid = -1) const;
bool operator==(const call_chain& rhs) const;
bool operator!=(const call_chain& rhs) const { return !(*this == rhs); }
// prints one indexed line per entry
friend std::ostream& operator<<(std::ostream& _os, const call_chain& _v)
{
size_t _n = 0;
for(const auto& itr : _v)
_os << " [" << _n++ << "] " << itr << "\n";
return _os;
}
// cereal serialization: the entries are stored as an array named "call_chain"
template <typename Archive>
void serialize(Archive& ar, unsigned int)
{
namespace cereal = tim::cereal;
ar(cereal::make_nvp("call_chain", static_cast<base_type&>(*this)));
}
// emits the entries as perfetto slices on the given track; `_used` records the
// entries that have been emitted
template <Device DevT>
void generate_perfetto(::perfetto::Track, std::set<entry>& _used) const;
// returns BoolV if the predicate holds for any entry, !BoolV otherwise
template <bool BoolV = true, typename FuncT>
bool query(FuncT&&) const;
};
/// Tests whether any entry of the chain satisfies a predicate.
///
/// @tparam BoolV  value returned when a matching entry is found
/// @param _func   unary predicate invoked with each entry, in order, until it
///                returns true
/// @return BoolV if @p _func returned true for some entry, !BoolV otherwise
///         (including for an empty chain)
template <bool BoolV, typename FuncT>
bool
call_chain::query(FuncT&& _func) const
{
    for(const auto& itr : *this)
    {
        // invoke as an lvalue: the predicate is called once per entry, so it must
        // not be forwarded (and potentially consumed as an rvalue) inside the loop
        if(_func(itr)) return BoolV;
    }
    return !BoolV;
}
/// set of region labels
using hash_ids = std::unordered_set<std::string>;
/// value of the critical-trace update-frequency setting
uint64_t
get_update_frequency();
/// call-chain slot of the given thread (defaults to the calling thread)
unique_ptr_t<call_chain>&
get(int64_t _tid = threading::get_id());
/// registers a label with timemory's hash table and returns its hash
size_t
add_hash_id(const std::string& _label);
/// schedules the merge of a set of labels into the process-wide label set
void
add_hash_id(const hash_ids&);
/// hands the pending entries of the given thread to the task group
void
update(int64_t _tid = threading::get_id());
/// update() followed by scheduling the generation of the call-chain output
void
compute(int64_t _tid = threading::get_id());
/// completed entries accepted by `_eval` (default: all), paired with their names
std::vector<std::pair<std::string, entry>>
get_entries(const std::function<bool(const entry&)>& _eval = [](const entry&) {
return true;
});
/// tag type used to distinguish the thread_data instance holding hash_ids
struct id
{};
} // namespace critical_trace
/// Records a critical-trace entry for the calling thread and, optionally, maintains
/// the CPU correlation-id stack of the target thread.
///
/// @tparam DevID        device the entry executed on
/// @tparam PhaseID      phase of the entry; for Phase::NONE no entry is recorded
/// @tparam UpdateStack  when true, the correlation-id stack of @p _targ_tid is
///                      updated: NONE and BEGIN push @p _cpu_cid, END pops the top
///                      element, DELTA leaves the stack unchanged
///
/// @param _targ_tid    thread id stored in the entry and whose stack is updated
/// @param _cpu_cid     CPU correlation id
/// @param _gpu_cid     GPU correlation id (0 when not applicable)
/// @param _parent_cid  parent CPU correlation id
/// @param _ts_beg      value stored in entry::begin_ns
/// @param _ts_val      value stored in entry::end_ns
/// @param _devid       device id
/// @param _queue       queue id (stream or mutex id)
/// @param _hash        hash of the entry's name
/// @param _depth       call-stack depth
/// @param _prio        priority used for sorting
///
/// On Phase::END with no GPU correlation id, critical_trace::update() is triggered
/// for every `update frequency`-th CPU correlation id.
template <critical_trace::Device DevID, critical_trace::Phase PhaseID,
          bool UpdateStack = true>
inline void
add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
                   size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, int32_t _devid,
                   uintptr_t _queue, size_t _hash, uint32_t _depth, uint16_t _prio = 0)
{
    // clang-format off
    // these are used to create unique type mutexes
    struct critical_insert {};
    struct cpu_cid_stack {};
    // clang-format on

    OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);

    static constexpr auto num_mutexes  = max_supported_threads;
    static auto           _update_freq = critical_trace::get_update_frequency();
    static auto           _pid         = process::get_id();
    auto                  _self_tid    = threading::get_id();

    if constexpr(PhaseID != critical_trace::Phase::NONE)
    {
        auto& _self_mtx =
            type_mutex<critical_insert, project::omnitrace, num_mutexes>(_self_tid);
        auto_lock_t _self_lk{ _self_mtx, std::defer_lock };

        // unique lock per thread
        if(!_self_lk.owns_lock()) _self_lk.lock();

        // the entry is appended to the chain of the *calling* thread
        auto& _critical_trace = critical_trace::get(_self_tid);
        _critical_trace->emplace_back(critical_trace::entry{
            DevID, PhaseID, _prio, _depth, _devid, _pid, _targ_tid, _cpu_cid, _gpu_cid,
            _parent_cid, _ts_beg, _ts_val, _queue, _hash });
    }

    if constexpr(UpdateStack)
    {
        auto&       _self_mtx = get_cpu_cid_stack_lock(_self_tid);
        auto&       _targ_mtx = get_cpu_cid_stack_lock(_targ_tid);
        auto_lock_t _self_lk{ _self_mtx, std::defer_lock };
        auto_lock_t _targ_lk{ _targ_mtx, std::defer_lock };

        // unique lock per thread; the self lock is skipped when both ids refer to
        // the same thread
        auto _lock = [&_self_lk, &_targ_lk, _self_tid, _targ_tid]() {
            if(!_self_lk.owns_lock() && _self_tid != _targ_tid) _self_lk.lock();
            if(!_targ_lk.owns_lock()) _targ_lk.lock();
        };

        if constexpr(PhaseID == critical_trace::Phase::NONE)
        {
            _lock();
            get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid);
        }
        else if constexpr(PhaseID == critical_trace::Phase::BEGIN)
        {
            _lock();
            get_cpu_cid_stack(_targ_tid)->emplace_back(_cpu_cid);
        }
        else if constexpr(PhaseID == critical_trace::Phase::END)
        {
            _lock();
            // an END without a matching BEGIN must not pop an empty stack
            if(!get_cpu_cid_stack(_targ_tid)->empty())
                get_cpu_cid_stack(_targ_tid)->pop_back();
            // a frequency of zero would be a modulo-by-zero; treat it as "never"
            if(_update_freq > 0 && _gpu_cid == 0 &&
               _cpu_cid % _update_freq == (_update_freq - 1))
                critical_trace::update(_targ_tid);
        }
        tim::consume_parameters(_lock);
    }

    // silence unused-variable warnings for the template combinations that do not
    // use every argument
    tim::consume_parameters(_pid, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg,
                            _ts_val, _devid, _queue, _hash, _depth, _prio, num_mutexes);
}
} // namespace omnitrace
namespace std
{
// Converts a critical-trace device category into its display label:
//   NONE -> "" (empty), CPU -> "CPU", GPU -> "GPU", ANY -> "CPU + GPU".
// Any value outside the enumerators listed in the switch yields "Unknown Device".
//
// NOTE(review): this declares a new overload inside namespace std, which the C++
// standard does not permit for user code; it is left in place because callers may
// rely on spelling the call as std::to_string -- confirm before relocating.
inline std::string
to_string(::omnitrace::critical_trace::Device _v)
{
using device_t = ::omnitrace::critical_trace::Device;
const char* _label = "Unknown Device";
switch(_v)
{
case device_t::NONE: _label = ""; break;
case device_t::CPU: _label = "CPU"; break;
case device_t::GPU: _label = "GPU"; break;
case device_t::ANY: _label = "CPU + GPU"; break;
}
return std::string{ _label };
}
} // namespace std
+5
Просмотреть файл
@@ -633,6 +633,11 @@ perf_event::record::locate_field() const
if constexpr(SampleT == sample::last) return reinterpret_cast<Tp>(p);
OMNITRACE_FATAL << "Unsupported sample field requested!";
if constexpr(std::is_pointer<Tp>::value)
return nullptr;
else
return Tp{};
}
namespace
-53
Просмотреть файл
@@ -129,19 +129,6 @@ get_thread_pool_state()
} // namespace
} // namespace roctracer
namespace critical_trace
{
namespace
{
auto&
get_thread_pool_state()
{
static auto _v = State::PreInit;
return _v;
}
} // namespace
} // namespace critical_trace
void
setup()
{
@@ -164,17 +151,6 @@ join()
OMNITRACE_DEBUG_F("roctracer thread-pool is not active...\n");
}
if(critical_trace::get_thread_pool_state() == State::Active)
{
OMNITRACE_DEBUG_F("waiting for all critical trace tasks to complete...\n");
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
critical_trace::get_task_group(i).join();
}
else
{
OMNITRACE_DEBUG_F("critical-trace thread-pool is not active...\n");
}
if(general::get_thread_pool_state() == State::Active)
{
OMNITRACE_DEBUG_F("waiting for all general tasks to complete...\n");
@@ -202,22 +178,6 @@ shutdown()
OMNITRACE_DEBUG_F("roctracer thread-pool is not active...\n");
}
if(critical_trace::get_thread_pool_state() == State::Active)
{
OMNITRACE_DEBUG_F("Waiting on completion of critical trace tasks...\n");
for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i)
{
critical_trace::get_task_group(i).join();
critical_trace::get_task_group(i).clear();
critical_trace::get_task_group(i).set_pool(nullptr);
}
critical_trace::get_thread_pool_state() = State::Finalized;
}
else
{
OMNITRACE_DEBUG_F("critical-trace thread-pool is not active...\n");
}
if(general::get_thread_pool_state() == State::Active)
{
OMNITRACE_DEBUG_F("Waiting on completion of general tasks...\n");
@@ -270,18 +230,5 @@ roctracer::get_task_group(int64_t _tid)
&tasking::get_thread_pool()));
return *_v;
}
// Returns the critical-trace task group bound to the calling thread.
//
// The reference `_v` is a static thread_local, so its initializer runs only on the
// first call made from each thread. Consequently `_tid` is only consulted on that
// first call; later calls from the same thread return the same task group
// regardless of the `_tid` argument.
PTL::TaskGroup<void>&
critical_trace::get_task_group(int64_t _tid)
{
// empty tag type used as the second template argument of thread_data
struct local
{};
using thread_data_t = thread_data<PTL::TaskGroup<void>, local>;
// comma expression: as a side effect of the first initialization the pool state
// is set to State::Active, then the thread_data instance is constructed with the
// address of the shared thread pool
static thread_local auto& _v =
(critical_trace::get_thread_pool_state() = State::Active,
thread_data_t::instance(construct_on_thread{ _tid },
&tasking::get_thread_pool()));
return *_v;
}
} // namespace tasking
} // namespace omnitrace
-12
Просмотреть файл
@@ -67,17 +67,5 @@ namespace roctracer
PTL::TaskGroup<void>&
get_task_group(int64_t _tid = utility::get_thread_index());
} // namespace roctracer
//--------------------------------------------------------------------------------------//
//
// critical trace
//
//--------------------------------------------------------------------------------------//
namespace critical_trace
{
PTL::TaskGroup<void>&
get_task_group(int64_t _tid = utility::get_thread_index());
} // namespace critical_trace
} // namespace tasking
} // namespace omnitrace
-50
Просмотреть файл
@@ -38,7 +38,6 @@
#include "core/gpu.hpp"
#include "core/perfetto.hpp"
#include "core/state.hpp"
#include "library/critical_trace.hpp"
#include "library/runtime.hpp"
#include "library/thread_info.hpp"
@@ -326,55 +325,6 @@ data::post_process(uint32_t _dev_id)
};
if(get_use_perfetto()) _process_perfetto();
if(!get_use_timemory()) return;
#if !defined(TIMEMORY_USE_MPI)
// timemory + MPI here causes hangs for some reason. it is unclear why
using samp_bundle_t = tim::lightweight_tuple<sampling_gpu_busy, sampling_gpu_temp,
sampling_gpu_power, sampling_gpu_memory>;
trait::runtime_enabled<sampling_gpu_busy>::set(_settings.busy);
trait::runtime_enabled<sampling_gpu_temp>::set(_settings.temp);
trait::runtime_enabled<sampling_gpu_power>::set(_settings.power);
trait::runtime_enabled<sampling_gpu_memory>::set(_settings.mem_usage);
using entry_t = critical_trace::entry;
auto _gpu_entries = critical_trace::get_entries(
[](const entry_t& _e) { return (_e.device == critical_trace::Device::GPU); });
for(auto& itr : _rocm_smi)
{
auto _ts = itr.m_ts;
if(!_thread_info->is_valid_time(_ts)) continue;
auto _entries = std::vector<std::pair<std::string_view, const entry_t*>>{};
for(const auto& eitr : _gpu_entries)
{
if(_ts >= eitr.second.begin_ns && _ts <= eitr.second.end_ns)
_entries.emplace_back(std::string_view{ eitr.first }, &eitr.second);
}
std::vector<samp_bundle_t> _tc{};
_tc.reserve(_entries.size());
for(auto& eitr : _entries)
{
auto& _v = _tc.emplace_back(eitr.first);
_v.push();
_v.start();
_v.stop();
GPU_METRIC(sampling_gpu_busy, m_busy_perc)
GPU_METRIC(sampling_gpu_temp, m_temp / 1.0e3) // provided in milli-degree C
GPU_METRIC(sampling_gpu_power,
m_power * units::microwatt / static_cast<double>(units::watt))
GPU_METRIC(sampling_gpu_memory,
m_mem_usage / static_cast<double>(units::megabyte))
_v.pop();
}
}
#endif
}
//--------------------------------------------------------------------------------------//
+12 -168
Просмотреть файл
@@ -28,7 +28,6 @@
#include "core/debug.hpp"
#include "core/locking.hpp"
#include "library/components/category_region.hpp"
#include "library/critical_trace.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
@@ -129,32 +128,6 @@ get_roctracer_tid_data()
return _v;
}
// Tuple layout backing cid_data: <0> cid, <1> pcid, <2> depth, <3> queue
// (field names taken from the accessors below).
using cid_tuple_t = std::tuple<uint64_t, uint64_t, uint32_t, uintptr_t>;
// Thin named-accessor wrapper around cid_tuple_t. Because it derives from the
// tuple it can still be unpacked with std::tie / std::get.
struct cid_data : cid_tuple_t
{
// inherit all std::tuple constructors
using cid_tuple_t::cid_tuple_t;
// presumably declares the defaulted special member functions -- see macro definition
OMNITRACE_DEFAULT_OBJECT(cid_data)
// mutable accessors (return references into the tuple)
auto& cid() { return std::get<0>(*this); }
auto& pcid() { return std::get<1>(*this); }
auto& depth() { return std::get<2>(*this); }
auto& queue() { return std::get<3>(*this); }
// const accessors (return copies of the stored values)
auto cid() const { return std::get<0>(*this); }
auto pcid() const { return std::get<1>(*this); }
auto depth() const { return std::get<2>(*this); }
auto queue() const { return std::get<3>(*this); }
};
auto&
get_roctracer_cid_data(int64_t _tid = threading::get_id())
{
using thread_data_t =
thread_data<std::unordered_map<uint64_t, cid_data>, category::roctracer>;
return thread_data_t::instance(construct_on_thread{ _tid });
}
auto&
get_hip_activity_callbacks(int64_t _tid = threading::get_id())
{
@@ -562,9 +535,6 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
assert(domain == ACTIVITY_DOMAIN_HIP_API);
const char* op_name = roctracer_op_string(domain, cid, 0);
if(op_name == nullptr) op_name = hip_api_name(cid);
@@ -591,88 +561,12 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
op_name, cid, data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
int64_t _ts = comp::wall_clock::record();
auto _tid = threading::get_id();
uint64_t _crit_cid = 0;
uint64_t _parent_crit_cid = 0;
uint32_t _depth = 0;
uintptr_t _queue = 0;
auto _roct_cid = data->correlation_id;
#define OMNITRACE_HIP_API_QUEUE_CASE(API_FUNC, VARIABLE) \
case HIP_API_ID_##API_FUNC: \
_queue = reinterpret_cast<uintptr_t>(data->args.API_FUNC.VARIABLE); \
break;
#define OMNITRACE_HIP_API_QUEUE_CASE_ALT(API_FUNC, UNION, VARIABLE) \
case HIP_API_ID_##API_FUNC: \
_queue = reinterpret_cast<uintptr_t>(data->args.UNION.VARIABLE); \
break;
switch(cid)
{
OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipModuleLaunchKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipHccModuleLaunchKernel, hStream)
OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchCooperativeKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtLaunchKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtModuleLaunchKernel, hStream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamCreateWithCUMask, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamGetCUMask, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamSynchronize, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipConfigureCall, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipDrvMemcpy3DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipEventRecord, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemPrefetchAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DFromArrayAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy3DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoDAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoHAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyFromSymbolAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyHtoDAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyParam2DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyPeerAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyToSymbolAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyWithStream, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemset2DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemset3DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD16Async, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD32Async, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD8Async, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAddCallback, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAttachMemAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamDestroy, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetFlags, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetPriority, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamQuery, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitEvent, stream)
#if OMNITRACE_HIP_VERSION >= 40300
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DToArrayAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue32, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue64, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue32, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue64, stream)
#endif
#if OMNITRACE_HIP_VERSION >= 40500
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphLaunch, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsMapResources, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsUnmapResources, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipSignalExternalSemaphoresAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamBeginCapture, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamEndCapture, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipWaitExternalSemaphoresAsync, stream)
#endif
#if OMNITRACE_HIP_VERSION >= 50000
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamIsCapturing, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo_v2, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamUpdateCaptureDependencies, stream)
#endif
default: break;
}
int64_t _ts = comp::wall_clock::record();
auto _tid = threading::get_id();
uint64_t _crit_cid = 0;
uint64_t _parent_crit_cid = 0;
uint32_t _depth = 0;
auto _roct_cid = data->correlation_id;
auto& _device_id = get_current_device();
@@ -863,15 +757,6 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
get_roctracer_hip_data()->erase(itr.first);
}
}
if(get_use_critical_trace() || get_use_rocm_smi())
{
add_critical_trace<Device::CPU, Phase::BEGIN>(
_tid, _crit_cid, _roct_cid, _parent_crit_cid, _ts, 0, _device_id, _queue,
critical_trace::add_hash_id(op_name), _depth);
}
get_roctracer_cid_data(_tid)->emplace(
_roct_cid, cid_data{ _crit_cid, _parent_crit_cid, _depth, _queue });
hip_exec_activity_callbacks(_tid);
}
@@ -879,9 +764,6 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
{
hip_exec_activity_callbacks(_tid);
std::tie(_crit_cid, _parent_crit_cid, _depth, std::ignore) =
get_roctracer_cid_data(_tid)->at(_roct_cid);
if(get_use_perfetto())
{
tracing::pop_perfetto_ts(
@@ -913,12 +795,6 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
}
}
}
if(get_use_critical_trace() || get_use_rocm_smi())
{
add_critical_trace<Device::CPU, Phase::END>(
_tid, _crit_cid, _roct_cid, _parent_crit_cid, _ts, _ts, _device_id,
_queue, critical_trace::add_hash_id(op_name), _depth);
}
}
tim::consume_parameters(arg);
}
@@ -935,9 +811,6 @@ hip_activity_callback(const char* begin, const char* end, void* arg)
auto&& _protect = comp::roctracer::protect_flush_activity();
(void) _protect;
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
if(!trait::runtime_enabled<comp::roctracer>::get()) return;
static auto _kernel_names = std::unordered_map<const char*, std::string>{};
static auto _indexes = std::unordered_map<uint64_t, int>{};
@@ -982,17 +855,12 @@ hip_activity_callback(const char* begin, const char* end, void* arg)
auto& _keys = get_roctracer_key_data();
auto& _tids = get_roctracer_tid_data();
int16_t _depth = 0; // depth of kernel launch
int64_t _tid = 0; // thread id
uint64_t _crit_cid = 0; // correlation id
uint64_t _pcid = 0; // parent corr_id
int32_t _devid = record->device_id; // device id
int64_t _queid = record->queue_id; // queue id
uintptr_t _queue = 0; // Host queue (stream)
auto _laps = _indexes[_roct_cid]++; // see note #1
const char* _name = nullptr;
bool _found = false;
bool _critical_trace = get_use_critical_trace() || get_use_rocm_smi();
int64_t _tid = 0; // thread id
int32_t _devid = record->device_id; // device id
int64_t _queid = record->queue_id; // queue id
uintptr_t _queue = 0; // Host queue (stream)
const char* _name = nullptr;
bool _found = false;
{
locking::atomic_lock _lk{ roctracer_type_mutex<key_data_mutex_t>() };
@@ -1008,21 +876,6 @@ hip_activity_callback(const char* begin, const char* end, void* arg)
if(_name == nullptr && op_name == nullptr) continue;
if(_name == nullptr) _name = op_name;
if(_critical_trace)
{
auto& _crit_cids = get_roctracer_cid_data(_tid);
if(_crit_cids->find(_roct_cid) != _crit_cids->end())
std::tie(_crit_cid, _pcid, _depth, _queue) = _crit_cids->at(_roct_cid);
else
{
OMNITRACE_VERBOSE_F(3,
"No critical trace entry generated for \"%s\" :: "
"unknown correlation id...\n",
_name);
_critical_trace = false;
}
}
static auto _op_id_names =
std::array<const char*, 3>{ "DISPATCH", "COPY", "BARRIER" };
@@ -1094,15 +947,6 @@ hip_activity_callback(const char* begin, const char* end, void* arg)
tracing::pop_perfetto_track(category::device_hip{}, "", _track, _end_ns);
}
if(_critical_trace)
{
auto _hash = critical_trace::add_hash_id(_name);
uint16_t _prio = _laps + 1; // priority
add_critical_trace<Device::GPU, Phase::DELTA, false>(
_tid, _crit_cid, _roct_cid, _crit_cid, _beg_ns, _end_ns, _devid, _queid,
_hash, _depth + 1, _prio);
}
if(_found && _name != nullptr && get_use_timemory())
{
auto _func = [_beg_ns, _end_ns, _name]() {
-1
Просмотреть файл
@@ -17,7 +17,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-openmp-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-code-coverage-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-fork-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-time-window-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-critical-trace-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-attach-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-rccl-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-overflow-tests.cmake)
-54
Просмотреть файл
@@ -1,54 +0,0 @@
# -------------------------------------------------------------------------------------- #
#
# critical-trace tests
#
# -------------------------------------------------------------------------------------- #
# Registers the "parallel-overhead-critical-trace" test for the parallel-overhead
# target. The baseline, runtime and sampling variants are skipped; presumably only
# the binary-rewrite variant remains (the follow-up test depends on
# "parallel-overhead-critical-trace-binary-rewrite-run") -- confirm against
# omnitrace_add_test.
omnitrace_add_test(
SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING
NAME parallel-overhead-critical-trace
TARGET parallel-overhead
LABELS "critical-trace"
REWRITE_ARGS
-e
-i
8
-E
"^fib"
-v
2
--print-instrumented
functions
RUN_ARGS 10 4 100
ENVIRONMENT "${_critical_trace_environment}")
# Runs the omnitrace-critical-trace executable on the call-chain.json file located
# in the output directory of the binary-rewrite test, using the project binary
# directory as the working directory.
add_test(
NAME parallel-overhead-process-critical-trace
COMMAND
$<TARGET_FILE:omnitrace-critical-trace>
${PROJECT_BINARY_DIR}/omnitrace-tests-output/parallel-overhead-critical-trace-binary-rewrite/call-chain.json
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
# Environment for the post-processing test: a CMake list of NAME=VALUE entries that
# is applied via the ENVIRONMENT test property of
# parallel-overhead-process-critical-trace.
set(_parallel_overhead_critical_trace_environ
"OMNITRACE_OUTPUT_PATH=omnitrace-tests-output"
"OMNITRACE_OUTPUT_PREFIX=parallel-overhead-critical-trace/"
"OMNITRACE_CRITICAL_TRACE_DEBUG=ON"
"OMNITRACE_VERBOSE=4"
"OMNITRACE_USE_PID=OFF"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_CI=ON"
"OMNITRACE_CI_TIMEOUT=300")
# Configures the post-processing test:
# - passes only if the output mentions critical-trace-cpu.json followed later by
#   critical-trace-any.json (both after an "Outputting" message);
# - is limited to 300 seconds;
# - is scheduled after the binary-rewrite run that generates its input file.
#   NOTE(review): the DEPENDS property only orders tests, it does not require the
#   dependency to succeed -- confirm this is the intended behavior.
set_tests_properties(
parallel-overhead-process-critical-trace
PROPERTIES
ENVIRONMENT
"${_parallel_overhead_critical_trace_environ}"
TIMEOUT
300
LABELS
"parallel-overhead;critical-trace"
PASS_REGULAR_EXPRESSION
"Outputting.*(critical-trace-cpu.json).*Outputting.*(critical-trace-any.json)"
DEPENDS
parallel-overhead-critical-trace-binary-rewrite-run)
+1 -1
Просмотреть файл
@@ -10,7 +10,7 @@ omnitrace_add_test(
REWRITE_ARGS -e -v 2 --print-instrumented modules -i 16
RUNTIME_ARGS -e -v 1 --label file -i 16
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=ON;OMNITRACE_SAMPLING_FREQ=250;OMNITRACE_SAMPLING_REALTIME=ON"
"${_base_environment};OMNITRACE_SAMPLING_FREQ=250;OMNITRACE_SAMPLING_REALTIME=ON"
SAMPLING_PASS_REGEX "fork.. called on PID"
RUNTIME_PASS_REGEX "fork.. called on PID"
REWRITE_RUN_PASS_REGEX "fork.. called on PID"
+1 -1
Просмотреть файл
@@ -40,7 +40,7 @@ omnitrace_add_test(
return
args
RUN_ARGS 10 ${NUM_THREADS} 1000
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF")
ENVIRONMENT "${_base_environment}")
omnitrace_add_test(
SKIP_BASELINE SKIP_RUNTIME
+6 -9
Просмотреть файл
@@ -26,7 +26,7 @@ omnitrace_add_test(
LABELS "kokkos;kokkos-profile-library"
RUN_ARGS -i 25 -s 20 -p
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so"
"${_base_environment};OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so"
REWRITE_RUN_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]"
RUNTIME_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]")
@@ -40,7 +40,7 @@ omnitrace_add_test(
LABELS "kokkos;kokkos-profile-library"
RUN_ARGS -i 10 -s 20 -p
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace.so"
"${_base_environment};OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace.so"
BASELINE_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]")
omnitrace_add_test(
@@ -53,7 +53,7 @@ omnitrace_add_test(
LABELS "kokkos;kokkos-profile-library"
RUN_ARGS -i 10 -s 20 -p
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so"
"${_base_environment};OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so"
BASELINE_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]")
omnitrace_add_test(
@@ -77,8 +77,7 @@ omnitrace_add_test(
-ME
[==[lib(gomp|m-)]==]
RUN_ARGS -i 10 -s 20 -p
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON")
ENVIRONMENT "${_base_environment};OMNITRACE_USE_KOKKOSP=ON")
omnitrace_add_test(
SKIP_BASELINE
@@ -100,8 +99,7 @@ omnitrace_add_test(
-ME
[==[libgomp]==]
RUN_ARGS -i 10 -s 20 -p
ENVIRONMENT
"${_perfetto_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF")
ENVIRONMENT "${_perfetto_environment};OMNITRACE_USE_KOKKOSP=OFF")
omnitrace_add_test(
NAME lulesh-timemory
@@ -122,6 +120,5 @@ omnitrace_add_test(
--env
OMNITRACE_TIMEMORY_COMPONENTS="wall_clock peak_rss"
RUN_ARGS -i 10 -s 20 -p
ENVIRONMENT
"${_timemory_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF"
ENVIRONMENT "${_timemory_environment};OMNITRACE_USE_KOKKOSP=OFF"
REWRITE_FAIL_REGEX "0 instrumented loops in procedure")
+5 -5
Просмотреть файл
@@ -26,7 +26,7 @@ omnitrace_add_test(
args
-E
uniform_int_distribution
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON")
ENVIRONMENT "${_base_environment}")
omnitrace_add_test(
SKIP_REWRITE SKIP_RUNTIME
@@ -37,7 +37,7 @@ omnitrace_add_test(
NUM_PROCS 1
RUN_ARGS 1 2 2
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_ROCTRACER_HSA_ACTIVITY=OFF;OMNITRACE_ROCTRACER_HSA_API=OFF"
"${_base_environment};OMNITRACE_ROCTRACER_HSA_ACTIVITY=OFF;OMNITRACE_ROCTRACER_HSA_API=OFF"
)
omnitrace_add_test(
@@ -61,7 +61,7 @@ omnitrace_add_test(
-E
uniform_int_distribution
RUN_ARGS 2 100 50
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF"
ENVIRONMENT "${_base_environment}"
REWRITE_FAIL_REGEX "0 instrumented loops in procedure transpose")
if(OMNITRACE_USE_ROCPROFILER)
@@ -75,7 +75,7 @@ if(OMNITRACE_USE_ROCPROFILER)
NUM_PROCS ${NUM_PROCS}
REWRITE_ARGS -e -v 2 -E uniform_int_distribution
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}"
"${_base_environment};OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}"
REWRITE_RUN_PASS_REGEX
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt"
)
@@ -90,7 +90,7 @@ if(OMNITRACE_USE_ROCPROFILER)
NUM_PROCS ${NUM_PROCS}
REWRITE_ARGS -e -v 2 -E uniform_int_distribution
ENVIRONMENT
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_ROCTRACER=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}"
"${_base_environment};OMNITRACE_USE_ROCTRACER=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}"
REWRITE_RUN_PASS_REGEX
"rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt"
REWRITE_RUN_FAIL_REGEX "roctracer.txt|OMNITRACE_ABORT_FAIL_REGEX")
+1 -18
Просмотреть файл
@@ -80,7 +80,6 @@ set(_lock_environment
"OMNITRACE_USE_SAMPLING=ON"
"OMNITRACE_USE_PROCESS_SAMPLING=OFF"
"OMNITRACE_SAMPLING_FREQ=750"
"OMNITRACE_CRITICAL_TRACE=ON"
"OMNITRACE_COLLAPSE_THREADS=ON"
"OMNITRACE_TRACE_THREAD_LOCKS=ON"
"OMNITRACE_TRACE_THREAD_SPIN_LOCKS=ON"
@@ -91,26 +90,11 @@ set(_lock_environment
"OMNITRACE_VERBOSE=2"
"${_test_library_path}")
# Environment list for critical-trace tests: turns critical tracing (and its debug
# output) on, turns both sampling modes off, enables tracing of thread locks,
# spin locks and read/write locks, and appends the shared test library path.
set(_critical_trace_environment
"OMNITRACE_VERBOSE=2"
"OMNITRACE_USE_SAMPLING=OFF"
"OMNITRACE_USE_PROCESS_SAMPLING=OFF"
"OMNITRACE_CRITICAL_TRACE=ON"
"OMNITRACE_CRITICAL_TRACE_DEBUG=ON"
"OMNITRACE_TRACE_THREAD_LOCKS=ON"
"OMNITRACE_TRACE_THREAD_SPIN_LOCKS=ON"
"OMNITRACE_TRACE_THREAD_RW_LOCKS=ON"
"OMNITRACE_COUT_OUTPUT=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_TIMELINE_PROFILE=OFF"
"${_test_library_path}")
set(_ompt_environment
"OMNITRACE_TRACE=ON"
"OMNITRACE_PROFILE=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_USE_OMPT=ON"
"OMNITRACE_CRITICAL_TRACE=OFF"
"OMNITRACE_TIMEMORY_COMPONENTS=wall_clock,trip_count,peak_rss"
"${_test_openmp_env}"
"${_test_library_path}")
@@ -136,7 +120,7 @@ set(_timemory_environment
"${_test_openmp_env}"
"${_test_library_path}")
set(_test_environment ${_base_environment} "OMNITRACE_CRITICAL_TRACE=OFF")
set(_test_environment ${_base_environment})
set(_causal_environment
"${_test_openmp_env}" "${_test_library_path}" "OMNITRACE_TIME_OUTPUT=OFF"
@@ -159,7 +143,6 @@ set(_attach_environment
"OMNITRACE_PROFILE=ON"
"OMNITRACE_USE_SAMPLING=OFF"
"OMNITRACE_USE_PROCESS_SAMPLING=ON"
"OMNITRACE_USE_CRITICAL_TRACE=OFF"
"OMNITRACE_USE_OMPT=ON"
"OMNITRACE_USE_KOKKOSP=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@ omnitrace_add_test(
return
args
RUN_ARGS 10 ${NUM_THREADS} 1000
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF"
ENVIRONMENT "${_base_environment}"
REWRITE_RUN_PASS_REGEX "Pushing custom region :: run.10. x 1000"
RUNTIME_PASS_REGEX "Pushing custom region :: run.10. x 1000"
SAMPLING_PASS_REGEX "Pushing custom region :: run.10. x 1000"