From 9de3a6b0b4243bf8ec10164babdd99f64dbc65f2 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 13 Apr 2023 02:14:35 -0500 Subject: [PATCH] Linux Perf Support + Causal Profiling Updates (#276) * causal backtrace updates - fix initial causal sampling period value * causal delay updates - tweak handling of sleep_for_overhead * Fix experiment global scaling for prog pts - results in drastically improved predictions * pthread_mutex_gotcha updates - disable all wrappers during causal profiling * validate-causal-json.py updates - support decimal stddev - fix setting stddev from command-line * causal perform_experiment_impl update - handle start failing because finalizing * deprecate causal::component::sample_rate - appears to not help at all * Rework sample info * Increase causal unwind_depth - use OMNITRACE_MAX_UNWIND_DEPTH * validate-causal-json updates - min experiments - exclude reporting predictions with less than X experiments at a given speedup - percent samples - only print samples within X% of the peak (default: 95%) * Update timemory submodule - extensions to sampling for signals delivered via non-timer method - e.g. 
via HW counter overflow * dwarf_entry::operator< updates - sort via file * causal profiling docs updates - info about backends - info about installing/enabling perf * config updates: causal backend - CausalBackend enum - OMNITRACE_CAUSAL_BACKEND: perf, timer, auto - omnitrace-causal option: --backend * debug update - use spin_mutex instead of std::mutex * address_range::contains update - range from 0-100 contains range from 10-100 but was returning false because high was == 100 not < 100 * symbol::operator< update - handle load address differences * sampling updates (non-causal) - update get_timer to get_trigger + dynamic_cast * container::static_vector updates - support construction from container::c_array - update_size private member func for handling atomic m_size * Move perf files - moved library/causal/perf.{hpp,cpp} to library/perf.{hpp,cpp} * causal example update - created impl.hpp (forward decls) - renamed {cpu,rng}_func_impl to {cpu,rng}_impl_func - only create two threads which run N iterations instead of two threads each iteration * Update timemory submodule - updates to unwind::processed_entry - updates to procfs::maps * Updated causal documentation - fixed line numbers changed by modifications to causal example * omnitrace-causal exe updates - set OMNITRACE_THREAD_POOL_SIZE to zero by default * core/containers updates - static_vector: provide data() member function - c_array pop_front() and pop_back() member functions * core: config and argparse updates + perf - core/perf.{hpp,cpp} - forward decl of enums - config-related capabilities - argparse: --sample-overflow - renamed some config functions - e.g. 
get_sampling_cpu_freq -> get_sampling_cputime_freq - added config settings related to overflow sampling via perf - added timer_sampling and overflow_sampling categories * Update timemory submodule - sampling allocator flushing * binary updates - lookup_ipaddr_entry - use bfd_find_nearest_line instead of bfd_find_nearest_line_discriminator - discriminators are not used - explicit instantiations of inlined_symbol::serialize * Bump VERSION to 1.10.0 * sampling and perf updates - support overflow sampling via Linux Perf - update perf namespace - update perf::perf_event - update record ctor: pointer instead of const ref - update open member func: return optional string - add m_batch_size member variable - sampling updates - support overflow sampling - flush allocators - increase buffer size from 1024 to 2048 - restructure post-processing in light of perf overflow supports - improve offload memory usage only load buffers for thread - load_offload_buffer(tid) uses thread-specific filepos - component updates - backtrace_metrics::operator-= - backtrace_metrics::operator- - backtrace::sample does not record for overflow signal - callchain: perf overflow sample * core updates - component::sampling_percent does not report self + uses_percent_units * causal updates - tweak get_line_info - overloads for set_current_selection (uint64_t, c_array, std::array) - delay - use sampling::pause/sampling::resume - experiment - experiment::sample derives from unwind::processed_entry - experiment::samples is vector instead of set - fixed samples - overloads for is_selected (uint64_t, c_array, std::array) - scaling factor defaults to 100 instead of 50 - serialize updates follow change to experiment::sample - modify algorithm for increasing/decreasing experiment length - sample_data - use map instead of set - get_samples returns vector instead of set - sampling - support overflow via Linux Perf - update causal_offload_buffer - flush sampling allocator - backtrace - overflow component * 
libomnitrace-dl updates - handle dl::InstrumentMode::PythonProfile * testing updates (causal) - causal line 155 -> causal line 100 - causal line 165 -> causal line 110 * formatting * exit_gotcha updates - exit_info for abort() - message about non-zero exit code * testing updates - fail regex for causal tests - validate-causal-json: >= min_experiments instead of > min_experiments - handle OMNITRACE_DEBUG_SETTINGS in omnitrace_write_test_config * causal sampling updates - add new lines where appropriate * causal data updates - reorder diagnostic info when experiment fails to start * binary updates - symbol address range from address to address + symsize + 1 - add 1 based on debug info * causal data updates - sample_selection wait_ns defaults to 1,000 instead of 10,000 - sample_selection wait scaled by iteration number - save_line_info_impl verbosity - print latest_eligible_pc when experiment does not start * causal sampling + component updates - perf backend disables component::backtrace - ensure get_sampling_(realtime|cputime|overflow)_signal do not malloc * causal: remove period stats * validate-causal-json update - fix --help * causal data updates - improve eligible pc history reporting when experiment fails to start * causal data updates - fix compute_eligible_lines_impl - eligible address ranges returning too many ranges - occasionally, overwrite all *true* eligible address ranges * causal data updates - reduce scoped ranges to symbol ranges - is_eligible_address() returns true contains (not just coarse) - revert some sample_selection behavior * binary address_multirange updates - make coarse_range private - fix operator+=(pair) * causal example update - fix nsync to default to once per iteration * binary analysis updates - tweak header file includes * causal updates - remove factoring in sleep_for_overhead - invoke delay::process() even if experiment is not active * causal data updates - update latest_eligible_pc structure * update omnitrace-install.py.in - fix 
support for fedora - /etc/os-release does not have ID_LIKE - fallback to RHEL 8.7 if version not specified * update omnitrace-install.py.in - fix support for debian - /etc/os-release does not have ID_LIKE - version mapping * Update documentation - update docs on installation * causal data and experiment updates - data: reset_sample_selection * causal set_current_selection debugging - debug messages for failed e2e runs * causal data and backtrace component updates - data: set_current_selection returns the number of eligible addresses added - backtrace: if cputime signal has selected zero IPs > 5x, then realtime signal starts contributing call-stacks * core library updates - move config::parse_numeric_range to utility namespace - add core/utility.cpp - support range:increment, e.g. 5-25:10 expands to '5 15 25' instead of '5 10 15 20 25' * omnitrace-causal update - end-to-end expands all speedups - support range:increment in speedups * causal backtrace updates - remove select_ival (realtime signal always contributes when select_count == 0) * containers: static_vector update - explicit c_array constructor - explicit std::array constructor * causal data updates - remove set_current_selection(uint64_t) - remove set_current_selection(std::array) - sample_selection increase default wait time - report eligible PC candidates - move reset_sample_selection to perform_experiment_impl - decrease latest_eligible_pc array size - set_current_selection does not guard for experiment::active * core debug updates - OMNITRACE_PRINT_COLOR macros * causal data updates - tweak to experiment never started message * causal gotcha updates - remove unused code * critical trace updates - remove unused code * omnitrace-causal - OMNITRACE_LAUNCHER * causal data updates - don't fail on end-to-end + omnitrace-causal * causal backtrace updates - reintroduce select_ival behavior * causal data updates - tweak verbose messages about number of PC candidates * core mproc updates - utilities for waiting 
on child PID and diagnosing status - omnitrace::mproc::wait_pid - omnitrace::mproc::diagnose_status * omnitrace-run updates - support --fork argument for executing via fork in current process + execvpe on child instead of execvpe in current process * omnitrace-causal updates - wait_pid and diagnose_status just call equivalent functions in omnitrace::mproc * ubuntu-focal workflow update - attempt to launch ubuntu-focal-codecov job with CAP_SYS_ADMIN and use perf backend * tests reorg and updates - remove binary-rewrite-sampling and runtime-instrument-sampling tests - rename *-preload tests (which use omnitrace-sample exe) to *-sampling - split tests/CMakeLists.txt into several tests/omnitrace-*-tests.cmake files - tweak to causal-both-omni-func test - add args: -n 2 -b timer * update validate-causal-json.py - better reasoning info for adjusting tolerance - always apply tolerance adjustments in CI mode * causal e2e tests update - add "causal-e2e" label - tweak params - old: 80 12 432525 500000000 - new: 80 50 432525 100000000 - disable processor affinity for slow-func/line-100 tests - artificially inflates some speedups with perf * unblocking_gotcha updates - overload operator() according to gotcha function index * blocking_gotcha updates - overload operator() according to gotcha function index - fix bug where post-block functors (e.g. pthread_mutex_trylock) could throw an error if the lock is not acquired. 
* parse_numeric_range update - support unordered_set * config update - OMNITRACE_DEBUG_{TIDS,PIDS} use parse_numeric_range --- .cmake-format.yaml | 10 +- .github/workflows/ubuntu-focal.yml | 2 + README.md | 16 +- VERSION | 2 +- cmake/Templates/omnitrace-install.py.in | 71 +- examples/causal/causal.cpp | 137 +- examples/causal/impl.cpp | 20 +- examples/causal/impl.hpp | 97 ++ external/timemory | 2 +- source/bin/omnitrace-causal/CMakeLists.txt | 2 +- source/bin/omnitrace-causal/impl.cpp | 182 +-- source/bin/omnitrace-run/impl.cpp | 11 +- source/bin/omnitrace-run/omnitrace-run.cpp | 38 +- source/bin/omnitrace-run/omnitrace-run.hpp | 2 +- source/bin/tests/CMakeLists.txt | 1 + source/docs/causal_profiling.md | 71 +- source/docs/installation.md | 27 +- source/lib/binary/address_multirange.cpp | 8 +- source/lib/binary/address_multirange.hpp | 11 +- source/lib/binary/analysis.cpp | 104 ++ source/lib/binary/analysis.hpp | 4 + source/lib/binary/dwarf_entry.cpp | 4 +- source/lib/binary/symbol.cpp | 54 +- source/lib/core/CMakeLists.txt | 5 +- source/lib/core/argparse.cpp | 48 +- source/lib/core/binary/address_range.cpp | 2 +- source/lib/core/categories.hpp | 4 + source/lib/core/components/fwd.hpp | 5 + source/lib/core/config.cpp | 305 ++-- source/lib/core/config.hpp | 37 +- source/lib/core/containers/c_array.hpp | 8 + source/lib/core/containers/static_vector.hpp | 57 +- source/lib/core/debug.cpp | 4 +- source/lib/core/debug.hpp | 40 +- source/lib/core/mproc.cpp | 120 ++ source/lib/core/mproc.hpp | 6 + source/lib/core/perf.cpp | 244 +++ source/lib/core/perf.hpp | 291 ++++ source/lib/core/state.hpp | 7 + source/lib/core/utility.cpp | 123 ++ source/lib/core/utility.hpp | 14 + source/lib/omnitrace-dl/dl.cpp | 3 + .../lib/omnitrace-user/omnitrace/categories.h | 2 + source/lib/omnitrace/library/CMakeLists.txt | 2 + .../omnitrace/library/causal/CMakeLists.txt | 20 +- .../library/causal/components/backtrace.cpp | 191 ++- .../library/causal/components/backtrace.hpp | 41 +- 
.../causal/components/blocking_gotcha.cpp | 43 +- .../causal/components/blocking_gotcha.hpp | 30 +- .../causal/components/causal_gotcha.cpp | 14 - .../causal/components/causal_gotcha.hpp | 2 - .../causal/components/unblocking_gotcha.cpp | 35 +- .../causal/components/unblocking_gotcha.hpp | 18 +- source/lib/omnitrace/library/causal/data.cpp | 381 +++-- source/lib/omnitrace/library/causal/data.hpp | 8 +- source/lib/omnitrace/library/causal/delay.cpp | 8 +- .../omnitrace/library/causal/experiment.cpp | 191 ++- .../omnitrace/library/causal/experiment.hpp | 53 +- source/lib/omnitrace/library/causal/fwd.hpp | 3 +- .../omnitrace/library/causal/sample_data.cpp | 41 +- .../omnitrace/library/causal/sample_data.hpp | 9 +- .../lib/omnitrace/library/causal/sampling.cpp | 289 +++- .../lib/omnitrace/library/causal/sampling.hpp | 6 + .../library/components/CMakeLists.txt | 2 + .../library/components/backtrace.cpp | 10 +- .../library/components/backtrace_metrics.cpp | 29 + .../library/components/backtrace_metrics.hpp | 8 + .../library/components/callchain.cpp | 210 +++ .../library/components/callchain.hpp | 95 ++ .../library/components/exit_gotcha.cpp | 7 + .../components/pthread_mutex_gotcha.cpp | 5 +- .../lib/omnitrace/library/critical_trace.cpp | 146 -- .../lib/omnitrace/library/critical_trace.hpp | 16 - .../omnitrace/library/{causal => }/perf.cpp | 262 ++- .../omnitrace/library/{causal => }/perf.hpp | 134 +- source/lib/omnitrace/library/sampling.cpp | 891 ++++++---- source/lib/omnitrace/library/sampling.hpp | 15 +- tests/CMakeLists.txt | 1437 +---------------- tests/omnitrace-attach-tests.cmake | 53 + tests/omnitrace-causal-tests.cmake | 180 +++ tests/omnitrace-code-coverage-tests.cmake | 137 ++ tests/omnitrace-config-tests.cmake | 40 + tests/omnitrace-critical-trace-tests.cmake | 52 + tests/omnitrace-fork-tests.cmake | 23 + tests/omnitrace-instrument-tests.cmake | 54 + tests/omnitrace-kokkos-tests.cmake | 128 ++ tests/omnitrace-mpi-tests.cmake | 122 ++ 
tests/omnitrace-openmp-tests.cmake | 99 ++ tests/omnitrace-pthread-tests.cmake | 34 + tests/omnitrace-python-tests.cmake | 267 +++ tests/omnitrace-rccl-tests.cmake | 60 + tests/omnitrace-rocm-tests.cmake | 85 + tests/omnitrace-testing.cmake | 103 +- tests/omnitrace-time-window-tests.cmake | 114 ++ tests/omnitrace-user-api-tests.cmake | 31 + tests/validate-causal-json.py | 67 +- 96 files changed, 5489 insertions(+), 3013 deletions(-) create mode 100644 examples/causal/impl.hpp create mode 100644 source/lib/core/perf.cpp create mode 100644 source/lib/core/perf.hpp create mode 100644 source/lib/core/utility.cpp create mode 100644 source/lib/omnitrace/library/components/callchain.cpp create mode 100644 source/lib/omnitrace/library/components/callchain.hpp rename source/lib/omnitrace/library/{causal => }/perf.cpp (62%) rename source/lib/omnitrace/library/{causal => }/perf.hpp (71%) create mode 100644 tests/omnitrace-attach-tests.cmake create mode 100644 tests/omnitrace-causal-tests.cmake create mode 100644 tests/omnitrace-code-coverage-tests.cmake create mode 100644 tests/omnitrace-config-tests.cmake create mode 100644 tests/omnitrace-critical-trace-tests.cmake create mode 100644 tests/omnitrace-fork-tests.cmake create mode 100644 tests/omnitrace-instrument-tests.cmake create mode 100644 tests/omnitrace-kokkos-tests.cmake create mode 100644 tests/omnitrace-mpi-tests.cmake create mode 100644 tests/omnitrace-openmp-tests.cmake create mode 100644 tests/omnitrace-pthread-tests.cmake create mode 100644 tests/omnitrace-python-tests.cmake create mode 100644 tests/omnitrace-rccl-tests.cmake create mode 100644 tests/omnitrace-rocm-tests.cmake create mode 100644 tests/omnitrace-time-window-tests.cmake create mode 100644 tests/omnitrace-user-api-tests.cmake diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 6c7d80a1d2..9cda2c7d1a 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -21,10 +21,9 @@ parse: omnitrace_add_test: flags: - SKIP_BASELINE - - SKIP_PRELOAD + 
- SKIP_SAMPLING - SKIP_REWRITE - SKIP_RUNTIME - - SKIP_SAMPLING kwargs: NAME: '*' TARGET: '*' @@ -33,15 +32,16 @@ parse: NUM_PROCS: '*' REWRITE_TIMEOUT: '*' RUNTIME_TIMEOUT: '*' - PRELOAD_TIMEOUT: '*' + SAMPLING_TIMEOUT: '*' + SAMPLING_ARGS: '*' REWRITE_ARGS: '*' RUNTIME_ARGS: '*' RUN_ARGS: '*' ENVIRONMENT: '*' LABELS: '*' PROPERTIES: '*' - PRELOAD_PASS_REGEX: '*' - PRELOAD_FAIL_REGEX: '*' + SAMPLING_PASS_REGEX: '*' + SAMPLING_FAIL_REGEX: '*' RUNTIME_PASS_REGEX: '*' RUNTIME_FAIL_REGEX: '*' REWRITE_PASS_REGEX: '*' diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 5d002c6d31..8f69029059 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -554,9 +554,11 @@ jobs: container: image: jrmadsen/omnitrace:ci-base-ubuntu-20.04 + options: --cap-add CAP_SYS_ADMIN env: OMNITRACE_VERBOSE: 2 + OMNITRACE_CAUSAL_BACKEND: perf steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index decc444894..8437fe50f7 100755 --- a/README.md +++ b/README.md @@ -99,9 +99,19 @@ See the [Getting Started documentation](https://amdresearch.github.io/omnitrace/ - Visit [Releases](https://github.com/AMDResearch/omnitrace/releases) page - Select appropriate installer (recommendation: `.sh` scripts do not require super-user priviledges unlike the DEB/RPM installers) - If targeting a ROCm application, find the installer script with the matching ROCm version - - If you are unsure about your Linux distro, check `/etc/os-release` - - If no installer script matches your target OS, try one of the Ubuntu 18.04 `*.sh` installers - - This installation may be built against older library versions supported on your distro via backwards compatibility + - If you are unsure about your Linux distro, check `/etc/os-release` or use the `omnitrace-install.py` script + +If the above recommendation is not desired, download the `omnitrace-install.py` and specify `--prefix ` when +executing it. 
This script will attempt to auto-detect a compatible OS distribution and version. +If ROCm support is desired, specify `--rocm X.Y` where `X` is the ROCm major version and `Y` +is the ROCm minor version, e.g. `--rocm 5.4`. + +```console +wget https://github.com/AMDResearch/omnitrace/releases/latest/download/omnitrace-install.py +python3 ./omnitrace-install.py --prefix /opt/omnitrace/rocm-5.4 --rocm 5.4 +``` + +See the [Installation Documentation](https://amdresearch.github.io/omnitrace/installation) for detailed information. ### Setup diff --git a/VERSION b/VERSION index 8fdcf38694..81c871de46 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.9.2 +1.10.0 diff --git a/cmake/Templates/omnitrace-install.py.in b/cmake/Templates/omnitrace-install.py.in index 5a8b17a941..ae5a406f8f 100755 --- a/cmake/Templates/omnitrace-install.py.in +++ b/cmake/Templates/omnitrace-install.py.in @@ -63,30 +63,73 @@ def get_os_info(os_distrib, os_version): _key, _data = line.split("=", 1) _os_info[_key] = _data.strip('"') + def _parse_version(_v): + _version = re.split(r"[\\.-]", _v) + return ( + "{}.{}".format(_version[0], _version[1]) + if len(_version) > 1 + else "{}".format(_version[0]) + ) + if os_distrib is None or os_distrib == "auto": - if "debian" in _os_info["ID_LIKE"]: + if "ubuntu" in _os_info["ID"]: os_distrib = "ubuntu" - elif "suse" in _os_info["ID_LIKE"]: + elif "opensuse" in _os_info["ID"]: os_distrib = "opensuse" - elif "rhel" in _os_info["ID_LIKE"]: + elif "rhel" in _os_info["ID"]: os_distrib = "rhel" - elif "fedora" in _os_info["ID_LIKE"]: + elif "centos" in _os_info["ID"]: os_distrib = "rhel" - elif "centos" in _os_info["ID_LIKE"]: + elif "rockylinux" in _os_info["ID"]: os_distrib = "rhel" + elif "debian" in _os_info["ID"]: + os_distrib = "ubuntu" + if "debian" in _os_info["ID"] and os_version is None: + _debian_version = float(_parse_version(_os_info["VERSION_ID"])) + if _debian_version >= 11.0: + os_version = "20.04" + else: + os_version = "18.04" + elif "fedora" in 
_os_info["ID"]: + os_distrib = "rhel" + # fedora has different versioning system so fallback to 8.7 + if os_version is None: + os_version = "8.7" else: - raise RuntimeError( - "Unknown ID_LIKE value in /etc/os-release: {}".format(_os_info["ID_LIKE"]) - ) - elif os_distrib == "fedora" or os_distrib == "centos": + # if we don't have an exact match, check ID_LIKE + if "ID_LIKE" not in _os_info.keys(): + _os_info["ID_LIKE"] = _os_info["ID"] + + if "debian" in _os_info["ID_LIKE"]: + os_distrib = "ubuntu" + if os_version is None: + # fallback on 18.04 if ID is not ubuntu but debian-like + os_version = "18.04" + elif "suse" in _os_info["ID_LIKE"]: + os_distrib = "opensuse" + # fallback on 15.3 if ID is not opensuse but suse-like + if os_version is None: + os_version = "15.3" + elif "rhel" in _os_info["ID_LIKE"] or "centos" in _os_info["ID_LIKE"]: + os_distrib = "rhel" + if os_version is None: + os_version = "8.7" + else: + raise RuntimeError( + "Unknown ID_LIKE value in /etc/os-release: {}".format( + _os_info["ID_LIKE"] + ) + ) + elif os_distrib == "centos": os_distrib = "rhel" + # uses same versioning system + elif os_distrib == "fedora": + os_distrib = "rhel" + if os_version is None: + # fedora has different versioning system so fallback to 8.7 + os_version = "8.7" if os_version is None: - - def _parse_version(_v): - _version = re.split(r"[\\.-]", _v) - return "{}.{}".format(_version[0], _version[1]) - os_version = _parse_version(_os_info["VERSION_ID"]) return (os_distrib, os_version) diff --git a/examples/causal/causal.cpp b/examples/causal/causal.cpp index 807eb1c970..a8a98a996f 100644 --- a/examples/causal/causal.cpp +++ b/examples/causal/causal.cpp @@ -1,122 +1,67 @@ #include "causal.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using mutex_t = std::timed_mutex; -using auto_lock_t = std::unique_lock; -using clock_type = std::chrono::high_resolution_clock; -using nanosec = 
std::chrono::nanoseconds; +#include "impl.hpp" namespace { std::chrono::duration t_ms; std::chrono::duration slow_ms; std::chrono::duration fast_ms; - -template -inline void -consume_variables(Args&&...) -{} } // namespace -template -bool -rng_func_impl(int64_t n, uint64_t rseed); - -template -bool -cpu_func_impl(int64_t n, int nloop); - -void -rng_slow_func(int64_t n, uint64_t rseed) __attribute__((noinline)); - -void -rng_fast_func(int64_t n, uint64_t rseed) __attribute__((noinline)); - -void -cpu_slow_func(int64_t n, int nloop) __attribute__((noinline)); - -void -cpu_fast_func(int64_t n, int nloop) __attribute__((noinline)); - -#if USE_CPU > 0 -# define CPU_SLOW_FUNC(...) cpu_slow_func(__VA_ARGS__) -# define CPU_FAST_FUNC(...) cpu_fast_func(__VA_ARGS__) -#else -# define CPU_SLOW_FUNC(...) consume_variables(__VA_ARGS__) -# define CPU_FAST_FUNC(...) consume_variables(__VA_ARGS__) -#endif - -#if USE_RNG > 0 -# define RNG_SLOW_FUNC(...) rng_slow_func(__VA_ARGS__) -# define RNG_FAST_FUNC(...) rng_fast_func(__VA_ARGS__) -#else -# define RNG_SLOW_FUNC(...) consume_variables(__VA_ARGS__) -# define RNG_FAST_FUNC(...) 
consume_variables(__VA_ARGS__) -#endif - int main(int argc, char** argv) { uint64_t rseed = std::random_device{}(); - int nitr = 200; + size_t nitr = 50; double frac = 70; - int64_t slow_val = 100000000L; + int64_t slow_val = 200000000L; + size_t nsync = 1; if(argc > 1) frac = std::stod(argv[1]); - if(argc > 2) nitr = std::stoi(argv[2]); + if(argc > 2) nitr = std::stoull(argv[2]); if(argc > 3) rseed = std::stoul(argv[3]); if(argc > 4) slow_val = std::stol(argv[4]); + if(argc > 5) nsync = std::stoull(argv[5]); + nsync = std::min(std::max(nsync, 1), nitr); int64_t fast_val = (frac / 100.0) * slow_val; double rfrac = (fast_val / static_cast(slow_val)); if(argc > 5) fast_val = std::stol(argv[5]); - printf("\nIterations: %i, fraction: %6.2f, random seed: %lu :: slow = %zu, " - "fast = %zu, expected ratio = %6.2f\n", - nitr, frac, rseed, slow_val, fast_val, rfrac * 100.0); + printf("\nFraction: %6.2f, iterations: %zu, random seed: %lu :: slow = %zu, " + "fast = %zu, expected ratio = %6.2f, sync every %lu iterations\n", + frac, nitr, rseed, slow_val, fast_val, rfrac * 100.0, nsync); - auto _t = clock_type::now(); - for(int i = 0; i < nitr; ++i) + auto _wait_barrier = pthread_barrier_t{}; + pthread_barrier_init(&_wait_barrier, nullptr, 3); + auto _thread_func = [nitr, nsync, &_wait_barrier](const auto& _func, auto* _timer, + auto _nsec, auto _nseed, + auto _nloop) { + pthread_barrier_wait(&_wait_barrier); + for(size_t i = 0; i < nitr; ++i) + { + auto _t = clock_type::now(); + _func(_nsec, _nseed, _nloop); + (*_timer) += (clock_type::now() - _t); + CAUSAL_PROGRESS_NAMED("iteration"); + if(i % nsync == (nsync - 1)) pthread_barrier_wait(&_wait_barrier); + } + }; + + auto _t = clock_type::now(); + auto _threads = std::vector{}; + _threads.emplace_back(_thread_func, SLOW_FUNC, &slow_ms, slow_val, rseed, 10000); + _threads.emplace_back(_thread_func, FAST_FUNC, &fast_ms, fast_val, rseed, 10000); + pthread_barrier_wait(&_wait_barrier); + for(size_t i = 0; i < nitr; ++i) { if(i == 
0 || i + 1 == nitr || i % (nitr / 5) == 0) - printf("executing iteration: %i\n", i); - // - auto&& _slow_func = [](auto _nsec, auto _seed, auto _nloop) { - auto _t = clock_type::now(); - CPU_SLOW_FUNC(_nsec, _nloop); - RNG_SLOW_FUNC(_nsec / 5, _seed); - slow_ms += (clock_type::now() - _t); - }; - // - auto&& _fast_func = [](auto _nsec, auto _seed, auto _nloop) { - auto _t = clock_type::now(); - CPU_FAST_FUNC(_nsec, _nloop); - RNG_FAST_FUNC(_nsec / 5, _seed); - fast_ms += (clock_type::now() - _t); - }; - // - CAUSAL_BEGIN("main_iteration"); - // - auto _threads = std::vector{}; - _threads.emplace_back(std::move(_slow_func), slow_val, rseed, 10000); - _threads.emplace_back(std::move(_fast_func), fast_val, rseed, 10000); - for(auto& itr : _threads) - itr.join(); - CAUSAL_END("main_iteration"); - CAUSAL_PROGRESS; + (printf("executing iteration: %zu\n", i), fflush(stdout)); + if(i % nsync == (nsync - 1)) pthread_barrier_wait(&_wait_barrier); } + for(auto& itr : _threads) + itr.join(); + t_ms += clock_type::now() - _t; auto rms = (fast_ms.count() / slow_ms.count()); printf("slow_func() took %10.3f ms\n", slow_ms.count()); @@ -132,7 +77,7 @@ void rng_slow_func(int64_t n, uint64_t rseed) { // clang-format off - while(rng_func_impl(n, rseed) != false) {} + while(rng_impl_func(n, rseed) != false) {} // clang-format on } // @@ -142,7 +87,7 @@ void rng_fast_func(int64_t n, uint64_t rseed) { // clang-format off - while(rng_func_impl(n, rseed) != true) {} + while(rng_impl_func(n, rseed) != true) {} // clang-format on } // @@ -152,7 +97,7 @@ void cpu_slow_func(int64_t n, int nloop) { // clang-format off - while(cpu_func_impl(n, nloop) != false) {} + while(cpu_impl_func(n, nloop) != false) {} // clang-format on } // @@ -162,6 +107,6 @@ void cpu_fast_func(int64_t n, int nloop) { // clang-format off - while(cpu_func_impl(n, nloop) != true) {} + while(cpu_impl_func(n, nloop) != true) {} // clang-format on } diff --git a/examples/causal/impl.cpp b/examples/causal/impl.cpp index 
0c70519a2c..1839715e2c 100644 --- a/examples/causal/impl.cpp +++ b/examples/causal/impl.cpp @@ -66,7 +66,7 @@ get_clock_cpu_now() noexcept; // template bool -rng_func_impl(int64_t n, uint64_t rseed) +rng_impl_func(int64_t n, uint64_t rseed) { int64_t _n = 0; auto _rng = std::mt19937_64{ rseed }; @@ -77,8 +77,8 @@ rng_func_impl(int64_t n, uint64_t rseed) return V; } -template bool rng_func_impl(int64_t, uint64_t); -template bool rng_func_impl(int64_t, uint64_t); +template bool rng_impl_func(int64_t, uint64_t); +template bool rng_impl_func(int64_t, uint64_t); // // This implementation works well for COZ @@ -86,25 +86,25 @@ template bool rng_func_impl(int64_t, uint64_t); // template bool -cpu_func_impl(int64_t n, int nloop) +cpu_impl_func(int64_t n, int nloop) { auto _t = clock_type::now(); auto _cpu_now = get_clock_cpu_now(); auto _cpu_end = _cpu_now + n; // clang-format off - while(get_clock_cpu_now() < _cpu_end) - { - for(volatile int i = 0; i < nloop; ++i) {} - CAUSAL_PROGRESS_NAMED("cpu_impl"); + while(get_clock_cpu_now() < _cpu_end) + { + for(volatile int i = 0; i < nloop; ++i) {} + CAUSAL_PROGRESS_NAMED("cpu_impl"); } // clang-format on return V; } template bool -cpu_func_impl(int64_t, int); +cpu_impl_func(int64_t, int); template bool -cpu_func_impl(int64_t, int); +cpu_impl_func(int64_t, int); namespace { diff --git a/examples/causal/impl.hpp b/examples/causal/impl.hpp new file mode 100644 index 0000000000..fdb4ceda1e --- /dev/null +++ b/examples/causal/impl.hpp @@ -0,0 +1,97 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using mutex_t = std::timed_mutex; +using auto_lock_t = std::unique_lock; +using clock_type = std::chrono::high_resolution_clock; +using nanosec = std::chrono::nanoseconds; + +namespace +{ +template +inline void +consume_variables(Args&&...) +{} +} // namespace + +template +bool +rng_impl_func(int64_t n, uint64_t rseed); + +template +bool +cpu_impl_func(int64_t n, int nloop); + +void +rng_slow_func(int64_t n, uint64_t rseed) __attribute__((noinline)); + +void +rng_fast_func(int64_t n, uint64_t rseed) __attribute__((noinline)); + +void +cpu_slow_func(int64_t n, int nloop) __attribute__((noinline)); + +void +cpu_fast_func(int64_t n, int nloop) __attribute__((noinline)); + +#if USE_CPU > 0 +# define CPU_SLOW_FUNC(...) 
cpu_slow_func(__VA_ARGS__) +# define CPU_FAST_FUNC(...) cpu_fast_func(__VA_ARGS__) +#else +# define CPU_SLOW_FUNC(...) consume_variables(__VA_ARGS__) +# define CPU_FAST_FUNC(...) consume_variables(__VA_ARGS__) +#endif + +#if USE_RNG > 0 +# define RNG_SLOW_FUNC(...) rng_slow_func(__VA_ARGS__) +# define RNG_FAST_FUNC(...) rng_fast_func(__VA_ARGS__) +#else +# define RNG_SLOW_FUNC(...) consume_variables(__VA_ARGS__) +# define RNG_FAST_FUNC(...) consume_variables(__VA_ARGS__) +#endif + +#define SLOW_FUNC \ + [](auto _nsec_v, auto _nseed_v, auto _nloop_v) { \ + CPU_SLOW_FUNC(_nsec_v, _nloop_v); \ + RNG_SLOW_FUNC(_nsec_v / 5, _nseed_v); \ + } + +#define FAST_FUNC \ + [](auto _nsec_v, auto _nseed_v, auto _nloop_v) { \ + CPU_FAST_FUNC(_nsec_v, _nloop_v); \ + RNG_FAST_FUNC(_nsec_v / 5, _nseed_v); \ + } diff --git a/external/timemory b/external/timemory index 2b92a966d7..58536c55d7 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 2b92a966d795d8a01476d080bcbd0573dd9fb21f +Subproject commit 58536c55d73904401059a839f9355a2150ba170a diff --git a/source/bin/omnitrace-causal/CMakeLists.txt b/source/bin/omnitrace-causal/CMakeLists.txt index ca8e59f117..0d0ed3763d 100644 --- a/source/bin/omnitrace-causal/CMakeLists.txt +++ b/source/bin/omnitrace-causal/CMakeLists.txt @@ -14,7 +14,7 @@ target_include_directories(omnitrace-causal PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries( omnitrace-causal PRIVATE omnitrace::omnitrace-compile-definitions omnitrace::omnitrace-headers - omnitrace::omnitrace-common-library) + omnitrace::omnitrace-common-library omnitrace::omnitrace-core) set_target_properties( omnitrace-causal PROPERTIES BUILD_RPATH "\$ORIGIN:\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}" INSTALL_RPATH "${OMNITRACE_EXE_INSTALL_RPATH}") diff --git a/source/bin/omnitrace-causal/impl.cpp b/source/bin/omnitrace-causal/impl.cpp index 355ebcd19c..6d57bd0656 100644 --- a/source/bin/omnitrace-causal/impl.cpp +++ b/source/bin/omnitrace-causal/impl.cpp @@ 
-27,12 +27,14 @@ #include "common/environment.hpp" #include "common/join.hpp" #include "common/setup.hpp" +#include "core/mproc.hpp" +#include "core/utility.hpp" -#include #include #include #include #include +#include #include #include @@ -45,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -57,10 +60,11 @@ namespace color = ::tim::log::color; namespace filepath = ::tim::filepath; namespace console = ::tim::utility::console; namespace argparse = ::tim::argparse; -using namespace timemory::join; -using tim::get_env; -using tim::log::monochrome; -using tim::log::stream; +using namespace ::timemory::join; +using ::omnitrace::utility::parse_numeric_range; +using ::tim::get_env; +using ::tim::log::monochrome; +using ::tim::log::stream; namespace std { @@ -147,117 +151,13 @@ remove_child_pid(pid_t _v) int wait_pid(pid_t _pid, int _opts) { - int _status = 0; - pid_t _pid_v = -1; - _opts |= WUNTRACED; - do - { - if((_opts & WNOHANG) > 0) - std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); - _pid_v = waitpid(_pid, &_status, _opts); - } while(_pid <= 0); - return _status; + return ::omnitrace::mproc::wait_pid(_pid, _opts); } int diagnose_status(pid_t _pid, int _status) { - auto _verbose = get_verbose(); - if(_verbose >= 3) - { - fflush(stderr); - fflush(stdout); - std::cout << std::flush; - std::cerr << std::flush; - } - - bool _normal_exit = (WIFEXITED(_status) > 0); - bool _unhandled_signal = (WIFSIGNALED(_status) > 0); - bool _core_dump = (WCOREDUMP(_status) > 0); - bool _stopped = (WIFSTOPPED(_status) > 0); - int _exit_status = WEXITSTATUS(_status); - int _stop_signal = (_stopped) ? WSTOPSIG(_status) : 0; - int _ec = (_unhandled_signal) ? WTERMSIG(_status) : 0; - - if(_verbose >= 4) - { - TIMEMORY_PRINTF_INFO( - stderr, - "diagnosing status for process %i :: status: %i... 
normal exit: %s, " - "unhandled signal: %s, core dump: %s, stopped: %s, exit status: %i, stop " - "signal: %i, exit code: %i\n", - _pid, _status, std::to_string(_normal_exit).c_str(), - std::to_string(_unhandled_signal).c_str(), std::to_string(_core_dump).c_str(), - std::to_string(_stopped).c_str(), _exit_status, _stop_signal, _ec); - } - else if(_verbose >= 3) - { - TIMEMORY_PRINTF_INFO(stderr, - "diagnosing status for process %i :: status: %i ...\n", _pid, - _status); - } - - if(!_normal_exit) - { - if(_ec == 0) _ec = EXIT_FAILURE; - if(_verbose >= 0) - { - TIMEMORY_PRINTF_FATAL( - stderr, "process %i terminated abnormally. exit code: %i\n", _pid, _ec); - } - } - - if(_stopped) - { - if(_verbose >= 0) - { - TIMEMORY_PRINTF_FATAL(stderr, - "process %i stopped with signal %i. exit code: %i\n", - _pid, _stop_signal, _ec); - } - } - - if(_core_dump) - { - if(_verbose >= 0) - { - TIMEMORY_PRINTF_FATAL( - stderr, "process %i terminated and produced a core dump. exit code: %i\n", - _pid, _ec); - } - } - - if(_unhandled_signal) - { - if(_verbose >= 0) - { - TIMEMORY_PRINTF_FATAL(stderr, - "process %i terminated because it received a signal " - "(%i) that was not handled. exit code: %i\n", - _pid, _ec, _ec); - } - } - - if(!_normal_exit && _exit_status > 0) - { - if(_verbose >= 0) - { - if(_exit_status == 127) - { - TIMEMORY_PRINTF_FATAL( - stderr, "execv in process %i failed. exit code: %i\n", _pid, _ec); - } - else - { - TIMEMORY_PRINTF_FATAL( - stderr, - "process %i terminated with a non-zero status. 
exit code: %i\n", _pid, - _ec); - } - } - } - - return _ec; + return ::omnitrace::mproc::diagnose_status(_pid, _status, get_verbose()); } std::string @@ -301,6 +201,9 @@ get_initial_environment() update_env(_env, "OMNITRACE_USE_TIMEMORY", false); update_env(_env, "OMNITRACE_USE_PROCESS_SAMPLING", false); update_env(_env, "OMNITRACE_CRITICAL_TRACE", false); + update_env(_env, "OMNITRACE_THREAD_POOL_SIZE", + get_env("OMNITRACE_THREAD_POOL_SIZE", 0)); + update_env(_env, "OMNITRACE_LAUNCHER", "omnitrace-causal"); return _env; } @@ -634,7 +537,12 @@ parse_args(int argc, char** argv, std::vector& _env, parser.start_group("CAUSAL PROFILING OPTIONS (General)", "These settings will be applied to all causal profiling runs"); - parser.add_argument({ "-m", "--mode" }, "Causal profiling mode") + parser + .add_argument({ "-m", "--mode" }, + "Causal profiling mode. Function mode tends to resolve statistics " + "faster than line mode (due to smaller sampling space). Ideally, " + "use function mode first to identify a function to target and then " + "switch to line mode + function scope setting") .count(1) .dtype("string") .choices({ "function", "line" }) @@ -643,6 +551,14 @@ parse_args(int argc, char** argv, std::vector& _env, update_env(_env, "OMNITRACE_CAUSAL_MODE", p.get("mode")); }); + parser.add_argument({ "-b", "--backend" }, "Causal profiling sampling backend.") + .count(1) + .dtype("string") + .choices({ "auto", "perf", "timer" }) + .action([&](parser_t& p) { + update_env(_env, "OMNITRACE_CAUSAL_BACKEND", p.get("backend")); + }); + parser .add_argument({ "-o", "--output-name" }, "Output filename of causal profiling data w/o extension") @@ -717,16 +633,42 @@ parse_args(int argc, char** argv, std::vector& _env, "scopes (MAIN+foo, MAIN+bar, MAIN+foo, MAIN+bar)"); parser - .add_argument({ "-s", "--speedups" }, - "Pool of virtual speedups to sample from during experimentation. " - "Each space designates a group and multiple speedups can be " - "grouped together by commas, e.g. 
-s 0 0,10,20-50 is two groups: " - "group #1 is '0' and group #2 is '0 10 20 25 30 35 40 45 50'") - .min_count(0) + .add_argument( + { "-s", "--speedups" }, + "Pool of virtual speedups to sample from during experimentation. " + "Each space designates a group and multiple speedups can be " + "grouped together by commas, e.g. '-s 0 0,10,20-50' is two groups: " + "group #1 is '0' and group #2 is '0 10 20 25 30 35 40 45 50' -- " + "unless end-to-end mode is activated: in end-to-end mode, only one " + "speedup is selected for the entire run so all groups are " + "expanded. If a range is specified, the default increment is 5, " + "however, this can be overridden by suffixing the range with a colon and the " + "desired increment, e.g., '0-40:10' would expand to '0 10 20 30 40'") + .min_count(1) .max_count(-1) - .dtype("integers") + .dtype("integer | range | range:increment") .action([&](parser_t& p) { - _virtual_speedups = p.get>("speedups"); + auto _val = p.get>("speedups"); + if(p.get("end-to-end")) + { + _virtual_speedups.clear(); + for(const auto& itr : _val) + { + for(const auto& ditr : tim::delimit(itr, ",; \t\n\r")) + { + for(auto nitr : + parse_numeric_range>( + ditr, "virtual speedup", 5L)) + { + _virtual_speedups.emplace_back(std::to_string(nitr)); + } + } + } + } + else + { + _virtual_speedups = _val; + } }); parser diff --git a/source/bin/omnitrace-run/impl.cpp b/source/bin/omnitrace-run/impl.cpp index 44239498cd..58e0e8f94e 100644 --- a/source/bin/omnitrace-run/impl.cpp +++ b/source/bin/omnitrace-run/impl.cpp @@ -247,7 +247,7 @@ print_updated_environment(parser_data_t& _data, std::string_view _prefix) } parser_data_t& -parse_args(int argc, char** argv, parser_data_t& _parser_data) +parse_args(int argc, char** argv, parser_data_t& _parser_data, bool& _fork_exec) { get_initial_environment(_parser_data); @@ -305,6 +305,13 @@ parse_args(int argc, char** argv, parser_data_t& _parser_data) omnitrace::argparse::add_core_arguments(parser, _parser_data); 
omnitrace::argparse::add_extended_arguments(parser, _parser_data); + parser.start_group("EXECUTION OPTIONS", ""); + parser.add_argument({ "--fork" }, "Execute via fork + execvpe instead of execvpe") + .min_count(0) + .max_count(1) + .dtype("boolean") + .action([&](parser_t& p) { _fork_exec = p.get("fork"); }); + auto _inpv = std::vector{}; auto& _outv = _parser_data.command; bool _hash = false; @@ -335,6 +342,8 @@ parse_args(int argc, char** argv, parser_data_t& _parser_data) exit(EXIT_FAILURE); } + tim::log::monochrome() = _parser_data.monochrome; + return _parser_data; } diff --git a/source/bin/omnitrace-run/omnitrace-run.cpp b/source/bin/omnitrace-run/omnitrace-run.cpp index a870960d6e..789c0eabdb 100644 --- a/source/bin/omnitrace-run/omnitrace-run.cpp +++ b/source/bin/omnitrace-run/omnitrace-run.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "omnitrace-run.hpp" +#include "core/mproc.hpp" #include #include @@ -61,7 +62,8 @@ main(int argc, char** argv) } auto _parse_data = parser_data_t{}; - parse_args(argc, argv, _parse_data); + auto _fork_exec = false; + parse_args(argc, argv, _parse_data, _fork_exec); prepare_command_for_run(argv[0], _parse_data); prepare_environment_for_run(_parse_data); @@ -73,7 +75,39 @@ main(int argc, char** argv) print_command(_parse_data, "OMNITRACE: "); _argv.emplace_back(nullptr); _envp.emplace_back(nullptr); - return execvpe(_argv.front(), _argv.data(), _envp.data()); + + if(_fork_exec) + { + auto _main_pid = getpid(); + auto _pid = fork(); + + if(_pid == 0) + { + return execvpe(_argv.front(), _argv.data(), _envp.data()); + } + else + { + auto _status = omnitrace::mproc::wait_pid(_pid); + auto _ec = omnitrace::mproc::diagnose_status(_pid, _status); + if(_ec != 0 && _parse_data.verbose >= 0) + { + TIMEMORY_PRINTF_FATAL( + stderr, "process %i exiting with non-zero exit code: %i\n", _pid, + _ec); + } + else if(_parse_data.verbose >= 2) + { + TIMEMORY_PRINTF_FATAL( + stderr, "omnitrace run in process %i completed. 
exit code: %i\n", + _pid, _ec); + } + return _ec; + } + } + else + { + return execvpe(_argv.front(), _argv.data(), _envp.data()); + } } _print_usage(); diff --git a/source/bin/omnitrace-run/omnitrace-run.hpp b/source/bin/omnitrace-run/omnitrace-run.hpp index c62090e067..cc7578d0e0 100644 --- a/source/bin/omnitrace-run/omnitrace-run.hpp +++ b/source/bin/omnitrace-run/omnitrace-run.hpp @@ -47,7 +47,7 @@ void prepare_environment_for_run(parser_data_t&); parser_data_t& -parse_args(int argc, char** argv, parser_data_t&); +parse_args(int argc, char** argv, parser_data_t&, bool&); parser_data_t& parse_command(int argc, char** argv, parser_data_t&); diff --git a/source/bin/tests/CMakeLists.txt b/source/bin/tests/CMakeLists.txt index b226e7bb5f..60a9e382fd 100644 --- a/source/bin/tests/CMakeLists.txt +++ b/source/bin/tests/CMakeLists.txt @@ -521,6 +521,7 @@ omnitrace_add_bin_test( cpu_clock peak_rss page_rss + --fork -- $ 5 diff --git a/source/docs/causal_profiling.md b/source/docs/causal_profiling.md index adb3a28df2..b44e531fa4 100644 --- a/source/docs/causal_profiling.md +++ b/source/docs/causal_profiling.md @@ -76,6 +76,7 @@ be found in the future. 
| Concept | Setting | Options | Description | |------------------|-----------------------------------|----------------------------------|--------------------------------------------------------------------------------------------------------------------| +| Backend | `OMNITRACE_CAUSAL_BACKEND` | `perf`, `timer` | Backend for recording samples required to calculate the virtual speed-up | | Mode | `OMNITRACE_CAUSAL_MODE` | `function`, `line` | Select entire function or individual line of code for causal experiments | | End-to-End | `OMNITRACE_CAUSAL_END_TO_END` | boolean | Perform a single experiment during the entire run (does not require progress-points) | | Fixed speedup(s) | `OMNITRACE_CAUSAL_FIXED_SPEEDUP` | one or more values from [0, 100] | Virtual speedup or pool of virtual speedups to randomly select | @@ -89,6 +90,58 @@ be found in the future. 2. `` and `:` support requires debug info (i.e. code was compiled with `-g` or, preferably, `-g3`) 3. Function mode does not require debug info but does not support stripped binaries +### Backends + +Both causal profiling backends interrupt each thread 1000x per second of CPU-time to apply virtual speedups. +The difference between the backends is how the samples which are responsible for calculating the virtual speedup are recorded. +There are 3 key differences between the two backends: + +1. `perf` backend requires Linux Perf and elevated security privileges +2. `perf` backend interrupts the application less frequently whereas the `timer` backend will interrupt the application 1000x per second of realtime +3. `timer` backend has less accurate call-stacks due to instruction pointer skid + +In general, the `"perf"` backend is preferred over the `"timer"` backend when sufficient security privileges permit its usage.
+If `"OMNITRACE_CAUSAL_BACKEND"` is set to `"auto"`, Omnitrace will fall back to using the `"timer"` backend only if +using the `"perf"` backend fails; if `"OMNITRACE_CAUSAL_BACKEND"` is set to `"perf"` and using this backend fails, Omnitrace +will abort. + +#### Instruction Pointer Skid + +Instruction pointer (IP) skid is how many instructions execute between an event of interest +happening and where the IP is when the kernel is able to stop the application. +For the `"timer"` backend, this translates to the +difference between the IP when the timer generated a signal and the IP when the +signal was actually delivered. Although IP skid does still occur with the `"perf"` backend, +the overhead of pausing the entire thread with the `"timer"` backend makes this much more pronounced +and, as such, the `"timer"` backend tends to have a lower resolution than the `"perf"` backend, +especially in `"line"` mode. + +#### Installing Linux Perf + +Linux Perf is built into the kernel and may already be installed (e.g., included in the default kernel for OpenSUSE). +The official method of checking whether Linux Perf is installed is checking for the existence of the file +`/proc/sys/kernel/perf_event_paranoid` -- if the file exists, the kernel has Perf installed. + +If this file does not exist, on Debian-based systems like Ubuntu, install (as superuser): + +```console +apt-get install linux-tools-common linux-tools-generic linux-tools-$(uname -r) +``` + +and reboot your computer. In order to use the `"perf"` backend, the value of `/proc/sys/kernel/perf_event_paranoid` +should be <= 2. If the value in this file is greater than 2, you will likely be unable to use the perf backend.
+ +To update the paranoid level temporarily (until the system is rebooted), run one of the following methods +as a superuser (where `PARANOID_LEVEL=<N>` with `<N>` in the range `[-1, 2]`): + +```console +echo ${PARANOID_LEVEL} | sudo tee /proc/sys/kernel/perf_event_paranoid +sysctl kernel.perf_event_paranoid=${PARANOID_LEVEL} +``` + +To make the paranoid level persistent after a reboot, add `kernel.perf_event_paranoid=<N>` +(where `<N>` is the desired paranoid level) to the `/etc/sysctl.conf` file. + ### Speedup Prediction Variability and `omnitrace-causal` Executable Causal profiling typically require executing the application several times in order to adequately sample all the domains of executing code, experiment speedups, etc. and resolve statistical fluctuations. @@ -259,7 +312,7 @@ omnitrace-causal \ # 20 iterations in line mode with 1 speedup group -# and source scope restricted to lines 155 and 165 +# and source scope restricted to lines 100 and 110 # in the causal.cpp file. # # outputs to files: @@ -273,7 +326,7 @@ omnitrace-causal \ -s ${SPEEDUPS} \ -m line \ -o experiments.line \ - -S "causal\\.cpp:(155|165)" \ + -S "causal\\.cpp:(100|110)" \ -- \ ./causal-omni-cpu "${@}" @@ -302,8 +355,8 @@ omnitrace-causal \ # 3 iterations in line mode of 15 singular speedups # in end-to-end mode with 2 different source scopes -# where one is restricted to line 155 in causal.cpp -# and another is restricted to line 165 in causal.cpp. +# where one is restricted to line 100 in causal.cpp +# and another is restricted to line 110 in causal.cpp.
# # outputs to files: # - causal/experiments.line.e2e.coz @@ -317,8 +370,8 @@ omnitrace-causal \ -m line \ -e \ -o experiments.line.e2e \ - -S "causal\\.cpp:155" \ - "causal\\.cpp:165" \ + -S "causal\\.cpp:100" \ + "causal\\.cpp:110" \ -- \ ./causal-omni-cpu "${@}" @@ -465,9 +518,7 @@ OmniTrace provides several additional features and utilities for causal profilin | Scope options | Supports binary and source scopes | Supports binary, source, and function scopes | See Note #4, #5, and #6 below | | Scope inclusion | Uses `%` as wildcard for binary and source scopes | Full regex support for binary, source, and function scopes | | | Scope exclusion | Not supported | Supports regexes for excluding binary/source/function | See Note #7 below | -| Call-stack sampling | Linux perf | libunwind | See Note #8 below | - -### Notes +| Call-stack sampling | Linux perf | Linux perf, libunwind | See Note #8 below | 1. OmniTrace supports a "function" mode which does not require debug info 2. OmniTrace supports selecting entire range of instruction pointers for a function instead of instruction pointer for one line. In large codes, "function" mode @@ -478,3 +529,5 @@ OmniTrace provides several additional features and utilities for causal profilin 6. OmniTrace supports a "function" scope which narrows the functions/lines which are eligible for causal experiments to those within the matching functions 7. OmniTrace supports a second filter on scopes for removing binary/source/function caught by inclusive match, e.g. `BINARY_SCOPE=.*` + `BINARY_EXCLUDE=libmpi.*` initially includes all binaries but exclude regex removes MPI libraries +8. In Omnitrace, the Linux perf backend is preferred over libunwind. However, Linux perf usage can be restricted for security reasons. + Omnitrace will fall back to using a second POSIX timer and libunwind if Linux perf is not available.
diff --git a/source/docs/installation.md b/source/docs/installation.md index cd87c08687..c02a8c72dc 100644 --- a/source/docs/installation.md +++ b/source/docs/installation.md @@ -6,15 +6,38 @@ :maxdepth: 4 ``` +## Quick Start (Latest Release, Binary Installer) + +Download the `omnitrace-install.py` and specify `--prefix `. This script +will attempt to auto-detect the appropriate OS distribution and OS version. +If ROCm support is desired, specify `--rocm X.Y` where `X` is the ROCm major version and `Y` +is the ROCm minor version, e.g. `--rocm 5.4`. + +```console +wget https://github.com/AMDResearch/omnitrace/releases/latest/download/omnitrace-install.py +python3 ./omnitrace-install.py --prefix /opt/omnitrace --rocm 5.4 +``` + +This script supports installation on Ubuntu, OpenSUSE, RedHat, Debian, CentOS, and Fedora. +If the target OS is compatible with one of the [operating system versions](#operating-system) below, +specify `-d -v `, e.g. if the OS is compatible with Ubuntu 18.04, pass +`-d ubuntu -v 18.04` to the script. + ## Operating System -OmniTrace is only supported on Linux. +OmniTrace is only supported on Linux. The following distributions are tested: - Ubuntu 18.04 - Ubuntu 20.04 +- Ubuntu 22.04 - OpenSUSE 15.2 - OpenSUSE 15.3 -- Other OS distributions may be supported but are not tested +- OpenSUSE 15.4 +- RedHat 8.7 +- RedHat 9.0 +- RedHat 9.1 + +Other OS distributions may be supported but are not tested. 
### Identifying the Operating System diff --git a/source/lib/binary/address_multirange.cpp b/source/lib/binary/address_multirange.cpp index 39c90df2b4..76507bed54 100644 --- a/source/lib/binary/address_multirange.cpp +++ b/source/lib/binary/address_multirange.cpp @@ -34,16 +34,16 @@ namespace binary address_multirange& address_multirange::operator+=(std::pair&& _v) { - coarse_range = address_range{ std::min(coarse_range.low, _v.second), - std::max(coarse_range.high, _v.second) }; + m_coarse_range = address_range{ std::min(m_coarse_range.low, _v.second), + std::max(m_coarse_range.high, _v.second + 1) }; return *this; } address_multirange& address_multirange::operator+=(std::pair&& _v) { - coarse_range = address_range{ std::min(coarse_range.low, _v.second.low), - std::max(coarse_range.high, _v.second.high) }; + m_coarse_range = address_range{ std::min(m_coarse_range.low, _v.second.low), + std::max(m_coarse_range.high, _v.second.high) }; return *this; } diff --git a/source/lib/binary/address_multirange.hpp b/source/lib/binary/address_multirange.hpp index 51f8fb06f5..4c5d62c8c7 100644 --- a/source/lib/binary/address_multirange.hpp +++ b/source/lib/binary/address_multirange.hpp @@ -49,14 +49,15 @@ struct address_multirange template bool contains(Tp&& _v) const; - address_range coarse_range = {}; - auto size() const { return m_fine_ranges.size(); } auto empty() const { return m_fine_ranges.empty(); } - auto range_size() const { return coarse_range.size(); } + auto range_size() const { return m_coarse_range.size(); } + auto get_coarse_range() const { return m_coarse_range; } + auto get_ranges() const { return m_fine_ranges; } private: - std::set m_fine_ranges = {}; + address_range m_coarse_range = {}; + std::set m_fine_ranges = {}; }; template @@ -68,7 +69,7 @@ address_multirange::contains(Tp&& _v) const std::is_same::value, "Error! 
operator+= supports only integrals or address_ranges"); - if(!coarse_range.contains(_v)) return false; + if(!m_coarse_range.contains(_v)) return false; return std::any_of(m_fine_ranges.begin(), m_fine_ranges.end(), [_v](auto&& itr) { return itr.contains(_v); }); } diff --git a/source/lib/binary/analysis.cpp b/source/lib/binary/analysis.cpp index 8d5dedb15a..bdc4204c28 100644 --- a/source/lib/binary/analysis.cpp +++ b/source/lib/binary/analysis.cpp @@ -37,15 +37,18 @@ #include "core/common.hpp" #include "core/config.hpp" #include "core/debug.hpp" +#include "core/locking.hpp" #include "core/state.hpp" #include "core/utility.hpp" #include "dwarf_entry.hpp" +#include "link_map.hpp" #include "scope_filter.hpp" #include "symbol.hpp" #include #include #include +#include #include #include #include @@ -200,5 +203,106 @@ get_binary_info(const std::vector& _files, return _data; } + +template +std::optional +lookup_ipaddr_entry(uintptr_t _addr, unw_context_t* _context_p, + tim::unwind::cache* _cache_p) +{ + static auto _mutex = locking::atomic_mutex{}; + static auto _cache_v = tim::unwind::cache{ true }; + static auto _context_v = []() { + auto _v = unw_context_t{}; + unw_getcontext(&_v); + return _v; + }(); + + if constexpr(ExcludeInternal) + { + static auto _exclude_range = []() { + auto _maps = ::tim::procfs::maps::iterate_program_headers(); + auto _exclude_range_v = std::set{}; + auto _insert_exclude_range = [&_maps, + &_exclude_range_v](const std::string& _v) { + auto _base_v = std::string_view{ filepath::basename(_v) }; + auto _real_v = filepath::realpath(_v); + for(const auto& mitr : _maps) + { + if(std::string_view{ filepath::basename(mitr.pathname) } == _base_v || + _real_v == _v) + { + _exclude_range_v.emplace( + address_range_t{ mitr.load_address, mitr.last_address }); + } + } + }; + + for(const auto& itr : binary::get_link_map("libomnitrace.so", "", "")) + _insert_exclude_range(itr.real()); + + for(const auto& itr : binary::get_link_map("libomnitrace-dl.so", "", 
"")) + _insert_exclude_range(itr.real()); + + return _exclude_range_v; + }(); + + for(auto itr : _exclude_range) + if(itr.contains(_addr)) return std::optional{}; + } + + // NOLINTNEXTLINE(readability-misleading-indentation) + if(_addr == 0) return std::optional{}; + + auto _lk = locking::atomic_lock{ _mutex, std::defer_lock }; + + if(!_context_p) _context_p = &_context_v; + if(!_cache_p) + { + _cache_p = &_cache_v; + // prevent concurrent access to cache + _lk.lock(); + } + + auto _entry = tim::unwind::entry{ _addr }; + + auto citr = _cache_p->entries.find(_entry); + if(citr != _cache_p->entries.end()) + { + if(citr->second.error == 0) return citr->second; + return std::optional{}; + } + + auto _v = tim::unwind::processed_entry{}; + _v.address = _entry.address(); + _v.name = _entry.template get_name<4096, true>(*_context_p, &_v.offset, &_v.error); + + tim::unwind::processed_entry::construct(_v, &_cache_p->files); + + if(_v.error != 0 && _v.lineinfo) + { + auto _lineinfo = _v.lineinfo.get(); + if(_lineinfo) + { + _v.name = _lineinfo.name; + _v.error = 0; + } + } + else if(_v.info && _v.info.symbol) + { + _v.name = _v.info.symbol.name; + _v.error = 0; + } + + _cache_p->entries.emplace(_entry, _v); + + return (_v.error == 0) ? 
std::optional{ _v } + : std::optional{}; +} + +template std::optional +lookup_ipaddr_entry(uintptr_t, unw_context_t*, tim::unwind::cache*); + +template std::optional +lookup_ipaddr_entry(uintptr_t, unw_context_t*, tim::unwind::cache*); } // namespace binary } // namespace omnitrace diff --git a/source/lib/binary/analysis.hpp b/source/lib/binary/analysis.hpp index 501094a5ce..d70d58e255 100644 --- a/source/lib/binary/analysis.hpp +++ b/source/lib/binary/analysis.hpp @@ -57,5 +57,9 @@ std::vector get_binary_info(const std::vector&, const std::vector&, bool _process_dwarf = true, bool _process_bfd = true, bool _include_all = false); + +template +std::optional +lookup_ipaddr_entry(uintptr_t, unw_context_t* = nullptr, tim::unwind::cache* = nullptr); } // namespace binary } // namespace omnitrace diff --git a/source/lib/binary/dwarf_entry.cpp b/source/lib/binary/dwarf_entry.cpp index e74c10c42e..8210f729b3 100644 --- a/source/lib/binary/dwarf_entry.cpp +++ b/source/lib/binary/dwarf_entry.cpp @@ -132,8 +132,8 @@ get_dwarf_entry(Dwarf_Die* _die) bool dwarf_entry::operator<(const dwarf_entry& _rhs) const { - return std::tie(address, line, col, discriminator) < - std::tie(_rhs.address, _rhs.line, _rhs.col, _rhs.discriminator); + return std::tie(address, file, line, col, discriminator) < + std::tie(_rhs.address, _rhs.file, _rhs.line, _rhs.col, _rhs.discriminator); } bool diff --git a/source/lib/binary/symbol.cpp b/source/lib/binary/symbol.cpp index 62cfb299e9..6ff4531aa5 100644 --- a/source/lib/binary/symbol.cpp +++ b/source/lib/binary/symbol.cpp @@ -82,7 +82,7 @@ read_inliner_info(bfd* _inp) symbol::symbol(const base_type& _v) : base_type{ _v } -, address{ _v.address, _v.address + _v.symsize } +, address{ _v.address, _v.address + _v.symsize + 1 } , func{ std::string{ base_type::name } } {} @@ -96,6 +96,15 @@ symbol::operator==(const symbol& _rhs) const bool symbol::operator<(const symbol& _rhs) const { + // if both have non-zero load addresses that are not equal, compare 
based on load + // addresses + if(load_address > 0 && _rhs.load_address > 0 && load_address != _rhs.load_address) + return (load_address < _rhs.load_address); + + // if address is same and name is same, return true if load_address is higher + if(address == _rhs.address && base_type::name == _rhs.base_type::name) + return load_address > _rhs.load_address; + return std::tie(address, base_type::binding, base_type::visibility, base_type::name) < std::tie(_rhs.address, _rhs.base_type::binding, base_type::visibility, base_type::name); @@ -122,6 +131,10 @@ symbol::operator+=(const symbol& _rhs) address += _rhs.address; utility::combine(inlines, _rhs.inlines); utility::combine(dwarf_info, _rhs.dwarf_info); + if(_rhs.binding < binding) binding = _rhs.binding; + if(_rhs.visibility < visibility) visibility = _rhs.visibility; + if(load_address == 0 && _rhs.load_address > load_address) + load_address = _rhs.load_address; } else { @@ -171,6 +184,20 @@ symbol::read_dwarf_entries(const std::deque& _info) _get_next_address(itr, itr->address.low) }; } + std::sort(dwarf_info.begin(), dwarf_info.end(), + [](const auto& _lhs, const auto& _rhs) { + return std::tie(_lhs.address, _lhs.file, _lhs.line, _lhs.col) < + std::tie(_rhs.address, _rhs.file, _rhs.line, _rhs.col); + }); + + dwarf_info.erase(std::unique(dwarf_info.begin(), dwarf_info.end(), + [](const auto& _lhs, const auto& _rhs) { + return std::tie(_lhs.address, _lhs.file, + _lhs.line) == + std::tie(_rhs.address, _rhs.file, _rhs.line); + }), + dwarf_info.end()); + return dwarf_info.size(); } @@ -206,15 +233,12 @@ symbol::read_bfd_line_info(bfd_file& _bfd) auto* _syms = reinterpret_cast(_bfd.syms); { - const char* _file = nullptr; - const char* _func = nullptr; - unsigned int _line = 0; - unsigned int _discriminator = 0; + const char* _file = nullptr; + const char* _func = nullptr; + unsigned int _line = 0; - // if(bfd_find_nearest_line(_inp, _section, _syms, _pc - _vma, &_file, - // &_func, &_line) != 0) - 
if(bfd_find_nearest_line_discriminator(_inp, _section, _syms, _pc - _vma, &_file, - &_func, &_line, &_discriminator) != 0) + if(bfd_find_nearest_line(_inp, _section, _syms, _pc - _vma, &_file, &_func, + &_line) != 0) { if(_file) file = _file; if(_func) func = _func; @@ -340,6 +364,18 @@ symbol::serialize(ArchiveT& ar, const unsigned int) ar(cereal::make_nvp("dfunc", demangle(func))); } +template void +inlined_symbol::serialize(cereal::JSONInputArchive&, + const unsigned int); + +template void +inlined_symbol::serialize( + cereal::MinimalJSONOutputArchive&, const unsigned int); + +template void +inlined_symbol::serialize( + cereal::PrettyJSONOutputArchive&, const unsigned int); + template void symbol::serialize(cereal::JSONInputArchive&, const unsigned int); diff --git a/source/lib/core/CMakeLists.txt b/source/lib/core/CMakeLists.txt index 05998737f0..b18973c605 100644 --- a/source/lib/core/CMakeLists.txt +++ b/source/lib/core/CMakeLists.txt @@ -12,9 +12,11 @@ set(core_sources ${CMAKE_CURRENT_LIST_DIR}/exception.cpp ${CMAKE_CURRENT_LIST_DIR}/gpu.cpp ${CMAKE_CURRENT_LIST_DIR}/mproc.cpp + ${CMAKE_CURRENT_LIST_DIR}/perf.cpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.cpp ${CMAKE_CURRENT_LIST_DIR}/state.cpp - ${CMAKE_CURRENT_LIST_DIR}/timemory.cpp) + ${CMAKE_CURRENT_LIST_DIR}/timemory.cpp + ${CMAKE_CURRENT_LIST_DIR}/utility.cpp) set(core_headers ${CMAKE_CURRENT_LIST_DIR}/argparse.hpp @@ -29,6 +31,7 @@ set(core_headers ${CMAKE_CURRENT_LIST_DIR}/gpu.hpp ${CMAKE_CURRENT_LIST_DIR}/locking.hpp ${CMAKE_CURRENT_LIST_DIR}/mproc.hpp + ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp ${CMAKE_CURRENT_LIST_DIR}/redirect.hpp ${CMAKE_CURRENT_LIST_DIR}/state.hpp diff --git a/source/lib/core/argparse.cpp b/source/lib/core/argparse.cpp index d56b10af4a..5cce80680f 100644 --- a/source/lib/core/argparse.cpp +++ b/source/lib/core/argparse.cpp @@ -272,6 +272,15 @@ add_core_arguments(parser_t& _parser, parser_data& _data) %{INDENT}% to consume more resources since, while 
idle, the real-clock time increases (and therefore triggers taking samples) %{INDENT}% whereas the CPU-clock time does not.)"; + const auto* _overflow_desc = + R"(Sample based on an overflow event. Accepts zero or more arguments: + %{INDENT}%0. Enables sampling based on overflow. + %{INDENT}%1. Overflow metric, e.g. PERF_COUNT_HW_INSTRUCTIONS + %{INDENT}%2. Overflow value. E.g., if metric == PERF_COUNT_HW_INSTRUCTIONS, then 10000000 == sample every 10,000,000 instructions. + %{INDENT}%3+ Thread IDs to target for sampling, starting at 0 (the main thread). + %{INDENT}% May be specified as index or range, e.g., '0 2-4' will be interpreted as: + %{INDENT}% sample the main thread (0), do not sample the first child thread but sample the 2nd, 3rd, and 4th child threads)"; + const auto* _hsa_interrupt_desc = R"(Set the value of the HSA_ENABLE_INTERRUPT environment variable. %{INDENT}% ROCm version 5.2 and older have a bug which will cause a deadlock if a sample is taken while waiting for the signal @@ -1075,7 +1084,7 @@ add_core_arguments(parser_t& _parser, parser_data& _data) "SAMPLING TIMER OPTIONS", "These options determine the heuristic for deciding when to take a sample"); - if(_data.environ_filter("sample_cputime", _data)) + if(_data.environ_filter("sampling_cputime", _data)) { _parser.add_argument({ "--sample-cputime" }, _cputime_desc) .min_count(0) @@ -1103,7 +1112,7 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _data.processed_environs.emplace("sampling_cputime"); } - if(_data.environ_filter("sample_realtime", _data)) + if(_data.environ_filter("sampling_realtime", _data)) { _parser.add_argument({ "--sample-realtime" }, _realtime_desc) .min_count(0) @@ -1132,6 +1141,41 @@ add_core_arguments(parser_t& _parser, parser_data& _data) _data.processed_environs.emplace("sampling_realtime"); } + if(_data.environ_filter("sampling_overflow", _data)) + { + _parser.add_argument({ "--sample-overflow" }, _overflow_desc) + .min_count(0) + .dtype("[event] [freq] 
[tids...]") + .action([&](parser_t& p) { + auto _v = p.get>("sample-overflow"); + update_env(_data, "OMNITRACE_SAMPLING_OVERFLOW", true); + + if(!_v.empty()) + { + if(p.exists("sampling-overflow-event") && + _v.front() != p.get("sampling-overflow-event")) + throw exception(join( + "", "'--sample-overflow ", _v.front(), + " ...' conflicts with '--sampling-overflow-event ", + p.get("sampling-overflow-event"), "' option")); + update_env(_data, "OMNITRACE_SAMPLING_OVERFLOW_EVENT", _v.front()); + _v.pop_front(); + } + if(!_v.empty()) + { + update_env(_data, "OMNITRACE_SAMPLING_OVERFLOW_FREQ", _v.front()); + _v.pop_front(); + } + if(!_v.empty()) + { + update_env(_data, "OMNITRACE_SAMPLING_OVERFLOW_TIDS", + join(array_config_t{ "," }, _v)); + } + }); + + _data.processed_environs.emplace("sampling_overflow"); + } + _parser.start_group( "ADVANCED SAMPLING OPTIONS", "These options determine the heuristic for deciding when to take a sample"); diff --git a/source/lib/core/binary/address_range.cpp b/source/lib/core/binary/address_range.cpp index 2cd2940a2b..d5d0211f48 100644 --- a/source/lib/core/binary/address_range.cpp +++ b/source/lib/core/binary/address_range.cpp @@ -80,7 +80,7 @@ address_range::contains(uintptr_t _v) const bool address_range::contains(address_range _v) const { - return (*this == _v) || (contains(_v.low) && contains(_v.high)); + return (*this == _v) || (contains(_v.low) && (contains(_v.high) || _v.high == high)); } bool diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index 4e946b8006..ce1106823b 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -130,6 +130,8 @@ OMNITRACE_DEFINE_CATEGORY(category, thread_context_switch, OMNITRACE_CATEGORY_TH OMNITRACE_DEFINE_CATEGORY(category, thread_hardware_counter, OMNITRACE_CATEGORY_THREAD_HARDWARE_COUNTER, "thread_hardware_counter", "Hardware counter value on thread (derived from sampling)") OMNITRACE_DEFINE_CATEGORY(category, kernel_hardware_counter, 
OMNITRACE_CATEGORY_KERNEL_HARDWARE_COUNTER, "kernel_hardware_counter", "Hardware counter value for kernel (deterministic)") OMNITRACE_DEFINE_CATEGORY(category, numa, OMNITRACE_CATEGORY_NUMA, "numa", "Non-unified memory architecture") +OMNITRACE_DEFINE_CATEGORY(category, timer_sampling, OMNITRACE_CATEGORY_TIMER_SAMPLING, "timer_sampling", "Sampling based on a timer") +OMNITRACE_DEFINE_CATEGORY(category, overflow_sampling, OMNITRACE_CATEGORY_OVERFLOW_SAMPLING, "overflow_sampling", "Sampling based on a counter overflow") OMNITRACE_DECLARE_CATEGORY(category, sampling, OMNITRACE_CATEGORY_SAMPLING, "sampling", "Host-side call-stack sampling") // clang-format on @@ -192,6 +194,8 @@ using name = perfetto_category; OMNITRACE_PERFETTO_CATEGORY(category::thread_hardware_counter), \ OMNITRACE_PERFETTO_CATEGORY(category::kernel_hardware_counter), \ OMNITRACE_PERFETTO_CATEGORY(category::numa), \ + OMNITRACE_PERFETTO_CATEGORY(category::timer_sampling), \ + OMNITRACE_PERFETTO_CATEGORY(category::overflow_sampling), \ ::perfetto::Category("timemory").SetDescription("Events from the timemory API") #if defined(TIMEMORY_USE_PERFETTO) diff --git a/source/lib/core/components/fwd.hpp b/source/lib/core/components/fwd.hpp index df9f2afaa2..de68523b18 100644 --- a/source/lib/core/components/fwd.hpp +++ b/source/lib/core/components/fwd.hpp @@ -233,6 +233,8 @@ OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_timing_units, component::sampling_cpu_clock // enable percent units OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_percent_units, component::sampling_gpu_busy, true_type) +OMNITRACE_DEFINE_CONCRETE_TRAIT(uses_percent_units, component::sampling_percent, + true_type) // enable memory units OMNITRACE_DEFINE_CONCRETE_TRAIT(is_memory_category, component::sampling_gpu_memory, @@ -253,6 +255,9 @@ OMNITRACE_DEFINE_CONCRETE_TRAIT(report_mean, component::sampling_percent, false_ OMNITRACE_DEFINE_CONCRETE_TRAIT(report_statistics, component::sampling_percent, false_type) +// reporting categories (self) 
+OMNITRACE_DEFINE_CONCRETE_TRAIT(report_self, component::sampling_percent, false_type) + #define OMNITRACE_DECLARE_EXTERN_COMPONENT(NAME, HAS_DATA, ...) \ TIMEMORY_DECLARE_EXTERN_TEMPLATE( \ struct tim::component::base #include @@ -109,59 +111,7 @@ get_available_categories() return _v; } -template , typename Up = Tp> -ContainerT -parse_numeric_range(std::string _input_string, const std::string& _label, Up _incr) -{ - auto _get_value = [](const std::string& _inp) { - std::stringstream iss{ _inp }; - auto var = Tp{}; - iss >> var; - return var; - }; - - for(auto& itr : _input_string) - itr = tolower(itr); - auto _result = ContainerT{}; - for(const auto& _v : tim::delimit(_input_string, ",; \t\n\r")) - { - if(_v.find_first_not_of("0123456789-") != std::string::npos) - { - OMNITRACE_VERBOSE_F( - 0, - "Invalid %s specification. Only numerical values (e.g., 0) or " - "ranges (e.g., 0-7) are permitted. Ignoring %s...", - _label.c_str(), _v.c_str()); - continue; - } - if(_v.find('-') != std::string::npos) - { - auto _vv = tim::delimit(_v, "-"); - OMNITRACE_CONDITIONAL_THROW( - _vv.size() != 2, - "Invalid %s range specification: %s. Required format N-M, e.g. 0-4", - _label.c_str(), _v.c_str()); - Tp _vn = _get_value(_vv.at(0)); - Tp _vN = _get_value(_vv.at(1)); - do - { - if constexpr(std::is_same>::value) - _result.emplace(_vn); - else - _result.emplace_back(_vn); - _vn += _incr; - } while(_vn <= _vN); - } - else - { - if constexpr(std::is_same>::value) - _result.emplace(std::stol(_v)); - else - _result.emplace_back(std::stol(_v)); - } - } - return _result; -} +using utility::parse_numeric_range; #define OMNITRACE_CONFIG_SETTING(TYPE, ENV_NAME, DESCRIPTION, INITIAL_VALUE, ...) \ [&]() { \ @@ -454,6 +404,11 @@ configure_settings(bool _init) "Defaults to OMNITRACE_SAMPLING_FREQ when <= 0.0", -1.0, "sampling", "advanced"); + OMNITRACE_CONFIG_SETTING(double, "OMNITRACE_SAMPLING_OVERFLOW_FREQ", + "Number of events in between each sample. 
" + "Defaults to OMNITRACE_SAMPLING_FREQ when <= 0.0", + -1.0, "sampling", "advanced"); + OMNITRACE_CONFIG_SETTING( double, "OMNITRACE_SAMPLING_DELAY", "Time (in seconds) to wait before the first sampling signal is delivered, " @@ -518,15 +473,22 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING( std::string, "OMNITRACE_SAMPLING_CPUTIME_TIDS", "Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose " - "timers are based on the CPU-time. This is useful when both " - "OMNITRACE_SAMPLING_CPUTIME=ON and OMNITRACE_SAMPLING_REALTIME=ON", + "timers are based on the CPU-time. This is useful when you want to restrict " + "samples to particular threads.", std::string{}, "sampling", "advanced"); OMNITRACE_CONFIG_SETTING( std::string, "OMNITRACE_SAMPLING_REALTIME_TIDS", "Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose " - "timers are based on the real (wall) time. This is useful when both " - "OMNITRACE_SAMPLING_CPUTIME=ON and OMNITRACE_SAMPLING_REALTIME=ON", + "timers are based on the real (wall) time. This is useful when you want to " + "restrict samples to particular threads.", + std::string{}, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING( + std::string, "OMNITRACE_SAMPLING_OVERFLOW_TIDS", + "Same as OMNITRACE_SAMPLING_TIDS but applies specifically to samplers whose " + "samples are based on the overflow of a particular event. This is useful when " + "you want to restrict samples to particular threads.", std::string{}, "sampling", "advanced"); auto _backend = tim::get_env_choice( @@ -593,29 +555,45 @@ configure_settings(bool _init) "thread started by the application.", 8, "sampling", "debugging", "advanced"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_SAMPLING_OVERFLOW", + "Enable sampling via an overflow of a HW counter. 
This " + "requires Linux perf (/proc/sys/kernel/perf_event_paranoid " + "created by OS) with a value of 2 or less in that file", + false, "sampling", "advanced"); + OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_SAMPLING_REALTIME", - "Enable sampling frequency via a wall-clock timer on child threads. This may " - "result in typically idle child threads consuming an unnecessary large amount of " - "CPU time. The main thread always has this enabled.", + "Enable sampling frequency via a wall-clock timer. This may result in typically " + "idle child threads consuming an unnecessary large amount of CPU time.", false, "sampling", "advanced"); - OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_SAMPLING_CPUTIME", - "Enable sampling frequency via a timer that measures both " - "CPU time used by the current process, " - "and CPU time expended on behalf of the process by the " - "system. This is recommended.", - true, "sampling", "advanced"); - - auto _sigrt_range = SIGRTMAX - SIGRTMIN; - OMNITRACE_CONFIG_SETTING( - int, "OMNITRACE_SAMPLING_REALTIME_OFFSET", - std::string{ - "Modify this value only if the target process is also using SIGRTMIN. E.g. " - "the signal used is SIGRTMIN + . Value must be <= " } + - std::to_string(_sigrt_range), - 0, "sampling", "advanced"); + bool, "OMNITRACE_SAMPLING_CPUTIME", + "Enable sampling frequency via a timer that measures both CPU time used by the " + "current process, and CPU time expended on behalf of the process by the system. 
" + "This is recommended.", + false, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING(int, "OMNITRACE_SAMPLING_CPUTIME_SIGNAL", + "Modify this value only if the target process is also using " + "the same signal (SIGPROF)", + SIGPROF, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING(int, "OMNITRACE_SAMPLING_REALTIME_SIGNAL", + "Modify this value only if the target process is also using " + "the same signal (SIGRTMIN)", + SIGRTMIN, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING(int, "OMNITRACE_SAMPLING_OVERFLOW_SIGNAL", + "Modify this value only if the target process is also using " + "the same signal (SIGRTMIN + 1)", + SIGRTMIN + 1, "sampling", "advanced"); + + OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_SAMPLING_OVERFLOW_EVENT", + "Metric for overflow sampling", + std::string{ "perf::PERF_COUNT_HW_CACHE_REFERENCES" }, + "sampling", "hardware_counters") + ->set_choices(perf::get_config_choices()); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HIP_API", "Enable HIP API tracing support", true, "roctracer", "rocm", @@ -757,12 +735,22 @@ configure_settings(bool _init) std::string, "OMNITRACE_TMPDIR", "Base directory for temporary files", get_env("TMPDIR", "/tmp"), "io", "data", "advanced"); + OMNITRACE_CONFIG_SETTING( + std::string, "OMNITRACE_CAUSAL_BACKEND", + "Backend for call-stack sampling. See " + "https://amdresearch.github.io/omnitrace/causal_profiling.html#backends for more " + "info. If set to \"auto\", omnitrace will attempt to use the perf backend and " + "fallback on the timer backend if unavailable", + std::string{ "auto" }, "causal", "analysis") + ->set_choices({ "auto", "perf", "timer" }); + OMNITRACE_CONFIG_SETTING( std::string, "OMNITRACE_CAUSAL_MODE", "Perform causal experiments at the function-scope or line-scope. 
Ideally, use " "function first to locate function with highest impact and then switch to line " "mode + OMNITRACE_CAUSAL_FUNCTION_SCOPE set to the function being targeted.", - std::string{ "function" }, "causal", "analysis", "advanced"); + std::string{ "function" }, "causal", "analysis") + ->set_choices({ "func", "line", "function" }); OMNITRACE_CONFIG_SETTING( double, "OMNITRACE_CAUSAL_DELAY", @@ -1306,16 +1294,25 @@ configure_signal_handler(const std::shared_ptr& _config) } } -int -get_realtime_signal() +bool +get_use_sampling_overflow() { - return SIGRTMIN + get_sampling_rtoffset(); + static auto _v = get_config()->find("OMNITRACE_SAMPLING_OVERFLOW"); + return static_cast&>(*_v->second).get(); } -int -get_cputime_signal() +bool +get_use_sampling_realtime() { - return SIGPROF; + static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME"); + return static_cast&>(*_v->second).get(); +} + +bool +get_use_sampling_cputime() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME"); + return static_cast&>(*_v->second).get(); } std::set get_sampling_signals(int64_t) @@ -1323,13 +1320,22 @@ std::set get_sampling_signals(int64_t) auto _v = std::set{}; if(get_use_causal()) { - _v.emplace(get_cputime_signal()); - _v.emplace(get_realtime_signal()); + _v.emplace(get_sampling_cputime_signal()); + _v.emplace(get_sampling_realtime_signal()); } else { - if(get_use_sampling_cputime()) _v.emplace(get_cputime_signal()); - if(get_use_sampling_realtime()) _v.emplace(get_realtime_signal()); + if(get_use_sampling() && !get_use_sampling_cputime() && + !get_use_sampling_realtime() && !get_use_sampling_overflow()) + { + OMNITRACE_VERBOSE_F(1, "sampling enabled by cputime/realtime/overflow not " + "specified. 
defaulting to cputime...\n"); + set_setting_value("OMNITRACE_SAMPLING_CPUTIME", true); + } + + if(get_use_sampling_cputime()) _v.emplace(get_sampling_cputime_signal()); + if(get_use_sampling_realtime()) _v.emplace(get_sampling_realtime_signal()); + if(get_use_sampling_overflow()) _v.emplace(get_sampling_overflow_signal()); } return _v; @@ -2008,24 +2014,24 @@ get_sampling_keep_internal() return static_cast&>(*_v->second).get(); } -bool -get_use_sampling_realtime() +int +get_sampling_overflow_signal() { - static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME"); - return static_cast&>(*_v->second).get(); -} - -bool -get_use_sampling_cputime() -{ - static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME"); - return static_cast&>(*_v->second).get(); + static auto _v = get_config()->find("OMNITRACE_SAMPLING_OVERFLOW_SIGNAL"); + return static_cast&>(*_v->second).get(); } int -get_sampling_rtoffset() +get_sampling_realtime_signal() { - static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_OFFSET"); + static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_SIGNAL"); + return static_cast&>(*_v->second).get(); +} + +int +get_sampling_cputime_signal() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_SIGNAL"); return static_cast&>(*_v->second).get(); } @@ -2241,7 +2247,7 @@ get_sampling_freq() } double -get_sampling_cpu_freq() +get_sampling_cputime_freq() { static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_FREQ"); auto& _val = static_cast&>(*_v->second).get(); @@ -2250,7 +2256,7 @@ get_sampling_cpu_freq() } double -get_sampling_real_freq() +get_sampling_realtime_freq() { static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_FREQ"); auto& _val = static_cast&>(*_v->second).get(); @@ -2258,6 +2264,15 @@ get_sampling_real_freq() return _val; } +double +get_sampling_overflow_freq() +{ + static auto _v = get_config()->find("OMNITRACE_SAMPLING_OVERFLOW_FREQ"); + auto& _val = 
static_cast&>(*_v->second).get(); + if(_val <= 0.0) _val = get_sampling_freq(); + return _val; +} + double get_sampling_delay() { @@ -2266,7 +2281,7 @@ get_sampling_delay() } double -get_sampling_cpu_delay() +get_sampling_cputime_delay() { static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_DELAY"); auto& _val = static_cast&>(*_v->second).get(); @@ -2275,7 +2290,7 @@ get_sampling_cpu_delay() } double -get_sampling_real_delay() +get_sampling_realtime_delay() { static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_DELAY"); auto& _val = static_cast&>(*_v->second).get(); @@ -2293,32 +2308,40 @@ get_sampling_duration() std::string get_sampling_cpus() { - static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUS"); + auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUS"); return static_cast&>(*_v->second).get(); } std::set get_sampling_tids() { - static auto _v = get_config()->find("OMNITRACE_SAMPLING_TIDS"); + auto _v = get_config()->find("OMNITRACE_SAMPLING_TIDS"); return parse_numeric_range<>( - static_cast&>(*_v->second).get(), "thread IDs", 1); + static_cast&>(*_v->second).get(), "thread IDs", 1L); } std::set -get_sampling_cpu_tids() +get_sampling_cputime_tids() { - static auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_TIDS"); + auto _v = get_config()->find("OMNITRACE_SAMPLING_CPUTIME_TIDS"); return parse_numeric_range<>( - static_cast&>(*_v->second).get(), "thread IDs", 1); + static_cast&>(*_v->second).get(), "thread IDs", 1L); } std::set -get_sampling_real_tids() +get_sampling_realtime_tids() { - static auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_TIDS"); + auto _v = get_config()->find("OMNITRACE_SAMPLING_REALTIME_TIDS"); return parse_numeric_range<>( - static_cast&>(*_v->second).get(), "thread IDs", 1); + static_cast&>(*_v->second).get(), "thread IDs", 1L); +} + +std::set +get_sampling_overflow_tids() +{ + auto _v = get_config()->find("OMNITRACE_SAMPLING_OVERFLOW_TIDS"); + return parse_numeric_range<>( + 
static_cast&>(*_v->second).get(), "thread IDs", 1L); } bool @@ -2415,14 +2438,8 @@ get_trace_thread_join() bool get_debug_tid() { - static auto _vlist = []() { - std::unordered_set _tids{}; - for(auto itr : tim::delimit>( - tim::get_env("OMNITRACE_DEBUG_TIDS", ""), - ",: ", [](const std::string& _v) { return std::stoll(_v); })) - _tids.insert(itr); - return _tids; - }(); + static auto _vlist = parse_numeric_range>( + tim::get_env("OMNITRACE_DEBUG_TIDS", ""), "debug tids", 1L); static thread_local bool _v = _vlist.empty() || _vlist.count(tim::threading::get_id()) > 0; return _v; @@ -2431,14 +2448,8 @@ get_debug_tid() bool get_debug_pid() { - static auto _vlist = []() { - std::unordered_set _pids{}; - for(auto itr : tim::delimit>( - tim::get_env("OMNITRACE_DEBUG_PIDS", ""), - ",: ", [](const std::string& _v) { return std::stoll(_v); })) - _pids.insert(itr); - return _pids; - }(); + static auto _vlist = parse_numeric_range>( + tim::get_env("OMNITRACE_DEBUG_PIDS", ""), "debug pids", 1L); static bool _v = _vlist.empty() || _vlist.count(tim::process::get_id()) > 0 || _vlist.count(dmp::rank()) > 0; return _v; @@ -2589,6 +2600,31 @@ get_tmp_file(std::string _basename, std::string _ext) return _existing_files.at(_fname); } +CausalBackend +get_causal_backend() +{ + static auto _m = std::unordered_map{ + { "auto", CausalBackend::Auto }, + { "perf", CausalBackend::Perf }, + { "timer", CausalBackend::Timer }, + }; + + auto _v = get_config()->find("OMNITRACE_CAUSAL_BACKEND"); + try + { + return _m.at(static_cast&>(*_v->second).get()); + } catch(std::runtime_error& _e) + { + auto _mode = static_cast&>(*_v->second).get(); + OMNITRACE_THROW("[%s] invalid causal backend %s. 
Choices: %s\n", __FUNCTION__, + _mode.c_str(), + timemory::join::join(timemory::join::array_config{ ", ", "", "" }, + _v->second->get_choices()) + .c_str()); + } + return CausalBackend::Auto; +} + CausalMode get_causal_mode() { @@ -2612,12 +2648,11 @@ get_causal_mode() } catch(std::runtime_error& _e) { auto _mode = static_cast&>(*_v->second).get(); - std::stringstream _ss{}; - for(const auto& itr : _v->second->get_choices()) - _ss << ", " << itr; - auto _msg = (_ss.str().length() > 2) ? _ss.str().substr(2) : std::string{}; - OMNITRACE_THROW("[%s] invalid causal mode %s. Choices: %s\n", __FUNCTION__, - _mode.c_str(), _msg.c_str()); + OMNITRACE_THROW( + "[%s] invalid causal mode %s. Choices: %s\n", __FUNCTION__, _mode.c_str(), + timemory::join::join(timemory::join::array_config{ ", ", "", "" }, + _v->second->get_choices()) + .c_str()); } return CausalMode::Function; }(); @@ -2637,7 +2672,7 @@ get_causal_fixed_speedup() static auto _v = get_config()->find("OMNITRACE_CAUSAL_FIXED_SPEEDUP"); return parse_numeric_range>( static_cast&>(*_v->second).get(), - "causal fixed speedup", 5); + "causal fixed speedup", 5L); } std::string diff --git a/source/lib/core/config.hpp b/source/lib/core/config.hpp index bba79009e6..5ed92bb2f1 100644 --- a/source/lib/core/config.hpp +++ b/source/lib/core/config.hpp @@ -64,10 +64,13 @@ void configure_disabled_settings(const std::shared_ptr&); int -get_realtime_signal(); +get_sampling_overflow_signal(); int -get_cputime_signal(); +get_sampling_realtime_signal(); + +int +get_sampling_cputime_signal(); std::set get_sampling_signals(int64_t _tid = 0); @@ -233,15 +236,6 @@ get_use_code_coverage(); bool get_sampling_keep_internal(); -bool -get_use_sampling_realtime(); - -bool -get_use_sampling_cputime(); - -int -get_sampling_rtoffset(); - bool get_use_rcclp(); @@ -316,19 +310,22 @@ double get_sampling_freq(); double -get_sampling_cpu_freq(); +get_sampling_cputime_freq(); double -get_sampling_real_freq(); +get_sampling_realtime_freq(); + +double 
+get_sampling_overflow_freq(); double get_sampling_delay(); double -get_sampling_cpu_delay(); +get_sampling_cputime_delay(); double -get_sampling_real_delay(); +get_sampling_realtime_delay(); double get_sampling_duration(); @@ -337,10 +334,13 @@ std::string get_sampling_cpus(); std::set -get_sampling_cpu_tids(); +get_sampling_cputime_tids(); std::set -get_sampling_real_tids(); +get_sampling_realtime_tids(); + +std::set +get_sampling_overflow_tids(); bool get_sampling_include_inlines(); @@ -408,6 +408,9 @@ struct tmp_file std::shared_ptr get_tmp_file(std::string _basename, std::string _ext = "dat"); +CausalBackend +get_causal_backend(); + CausalMode get_causal_mode(); diff --git a/source/lib/core/containers/c_array.hpp b/source/lib/core/containers/c_array.hpp index b04f6efe8d..252d5df6fb 100644 --- a/source/lib/core/containers/c_array.hpp +++ b/source/lib/core/containers/c_array.hpp @@ -82,6 +82,14 @@ struct c_array return c_array(&m_base[start], end - start); } + void pop_front() + { + ++m_base; + --m_size; + } + + void pop_back() { --m_size; } + operator Tp*() const { return m_base; } // Iterator class for convenient range-based for loop support diff --git a/source/lib/core/containers/static_vector.hpp b/source/lib/core/containers/static_vector.hpp index a54ba49ac9..8eee0e8643 100644 --- a/source/lib/core/containers/static_vector.hpp +++ b/source/lib/core/containers/static_vector.hpp @@ -23,6 +23,7 @@ #pragma once #include "core/common.hpp" +#include "core/containers/c_array.hpp" #include "core/debug.hpp" #include "core/exception.hpp" @@ -50,6 +51,10 @@ struct static_vector static_vector& operator=(static_vector&&) noexcept = default; static_vector(size_t _n, Tp _v = {}); + explicit static_vector(c_array&&); + + template + explicit static_vector(std::array&&); static_vector& operator=(std::initializer_list&& _v); static_vector& operator=(std::pair, size_t>&&); @@ -92,10 +97,16 @@ struct static_vector decltype(auto) back() { return *(m_data.begin() + size() - 1); } 
decltype(auto) back() const { return *(m_data.begin() + size() - 1); } + auto* data() { return m_data.data(); } + const auto* data() const { return m_data.data(); } + void swap(this_type& _v); friend void swap(this_type& _lhs, this_type& _rhs) { _lhs.swap(_rhs); } +private: + void update_size(size_t); + private: count_type m_size = count_type{ 0 }; std::array m_data = {}; @@ -104,8 +115,25 @@ private: template static_vector::static_vector(size_t _n, Tp _v) { - m_size.store(_n); m_data.fill(_v); + update_size(_n); +} + +template +static_vector::static_vector(c_array&& _v) +{ + auto _n = std::min(N, _v.size()); + for(size_t i = 0; i < _n; ++i, ++m_size) + m_data[i] = _v[i]; +} + +template +template +static_vector::static_vector(std::array&& _v) +{ + auto _n = std::min(N, M); + for(size_t i = 0; i < _n; ++i, ++m_size) + m_data[i] = _v[i]; } template @@ -129,14 +157,9 @@ template static_vector& static_vector::operator=(std::pair, size_t>&& _v) { - if constexpr(AtomicSizeV) m_size.store(0); - + update_size(0); m_data = std::move(_v.first); - - if constexpr(AtomicSizeV) - m_size.store(_v.second); - else - m_size = _v.second; + update_size(_v.second); return *this; } @@ -145,10 +168,7 @@ template void static_vector::clear() { - if constexpr(AtomicSizeV) - m_size.store(0); - else - m_size = 0; + update_size(0); } template @@ -160,8 +180,8 @@ static_vector::swap(this_type& _v) auto _t_size = m_size; auto _v_size = _v.m_size; std::swap(m_data, _v.m_data); - m_size.store(_v_size); - _v.m_size.store(_t_size); + update_size(_v_size); + _v.update_size(_t_size); } else { @@ -190,5 +210,14 @@ static_vector::emplace_back(Args&&... 
_v) return m_data[_idx]; } +template +void +static_vector::update_size(size_t _n) +{ + if constexpr(AtomicSizeV) + m_size.store(_n); + else + m_size = _n; +} } // namespace container } // namespace omnitrace diff --git a/source/lib/core/debug.cpp b/source/lib/core/debug.cpp index 0dfe182355..0595007ead 100644 --- a/source/lib/core/debug.cpp +++ b/source/lib/core/debug.cpp @@ -22,6 +22,7 @@ #include "debug.hpp" #include "binary/address_range.hpp" +#include "locking.hpp" #include "state.hpp" #include @@ -87,7 +88,8 @@ set_source_location(source_location&& _v) } lock::lock() -: m_lk{ tim::type_mutex(), std::defer_lock } +: m_lk{ tim::type_mutex(), + std::defer_lock } { if(!m_lk.owns_lock() && !_protect_lock) { diff --git a/source/lib/core/debug.hpp b/source/lib/core/debug.hpp index 777d8593e3..f6f78b33df 100644 --- a/source/lib/core/debug.hpp +++ b/source/lib/core/debug.hpp @@ -24,6 +24,7 @@ #include "defines.hpp" #include "exception.hpp" +#include "locking.hpp" #include #include @@ -109,7 +110,7 @@ struct lock ~lock(); private: - tim::auto_lock_t m_lk; + locking::atomic_lock m_lk; }; // template @@ -220,6 +221,43 @@ as_hex(void*, size_t); //--------------------------------------------------------------------------------------// +#define OMNITRACE_CONDITIONAL_PRINT_COLOR(COLOR, COND, ...) \ + if((COND) && ::omnitrace::config::get_debug_tid() && \ + ::omnitrace::config::get_debug_pid()) \ + { \ + ::omnitrace::debug::flush(); \ + ::omnitrace::debug::lock _debug_lk{}; \ + OMNITRACE_FPRINTF_STDERR_COLOR(COLOR); \ + fprintf(::omnitrace::debug::get_file(), "[omnitrace][%i][%li]%s", \ + OMNITRACE_DEBUG_PROCESS_IDENTIFIER, OMNITRACE_DEBUG_THREAD_IDENTIFIER, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ + fprintf(::omnitrace::debug::get_file(), __VA_ARGS__); \ + ::omnitrace::debug::flush(); \ + } + +#define OMNITRACE_CONDITIONAL_PRINT_COLOR_F(COLOR, COND, ...) 
\ + if((COND) && ::omnitrace::config::get_debug_tid() && \ + ::omnitrace::config::get_debug_pid()) \ + { \ + ::omnitrace::debug::flush(); \ + ::omnitrace::debug::lock _debug_lk{}; \ + OMNITRACE_FPRINTF_STDERR_COLOR(COLOR); \ + fprintf(::omnitrace::debug::get_file(), "[omnitrace][%i][%li][%s]%s", \ + OMNITRACE_DEBUG_PROCESS_IDENTIFIER, OMNITRACE_DEBUG_THREAD_IDENTIFIER, \ + OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ + fprintf(::omnitrace::debug::get_file(), __VA_ARGS__); \ + ::omnitrace::debug::flush(); \ + } + +#define OMNITRACE_PRINT_COLOR(COLOR, ...) \ + OMNITRACE_CONDITIONAL_PRINT_COLOR(COLOR, true, __VA_ARGS__) + +#define OMNITRACE_PRINT_COLOR_F(COLOR, ...) \ + OMNITRACE_CONDITIONAL_PRINT_COLOR_F(COLOR, true, __VA_ARGS__) + +//--------------------------------------------------------------------------------------// + #define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \ if((COND) && ::omnitrace::config::get_debug_tid() && \ ::omnitrace::config::get_debug_pid()) \ diff --git a/source/lib/core/mproc.cpp b/source/lib/core/mproc.cpp index b6ed20277a..4fba4c0b5f 100644 --- a/source/lib/core/mproc.cpp +++ b/source/lib/core/mproc.cpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include namespace omnitrace @@ -70,5 +72,123 @@ get_process_index(int _pid, int _ppid) } return -1; } + +int +wait_pid(pid_t _pid, int _opts) +{ + int _status = 0; + pid_t _pid_v = -1; + _opts |= WUNTRACED; + do + { + if((_opts & WNOHANG) > 0) + { + std::this_thread::yield(); + std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); + } + _pid_v = waitpid(_pid, &_status, _opts); + } while(_pid_v <= 0); + return _status; +} + +int +diagnose_status(pid_t _pid, int _status, int _verbose) +{ + if(_verbose >= 3) + { + fflush(stderr); + fflush(stdout); + std::cout << std::flush; + std::cerr << std::flush; + } + + bool _normal_exit = (WIFEXITED(_status) > 0); + bool _unhandled_signal = (WIFSIGNALED(_status) > 0); + bool _core_dump = 
(WCOREDUMP(_status) > 0); + bool _stopped = (WIFSTOPPED(_status) > 0); + int _exit_status = WEXITSTATUS(_status); + int _stop_signal = (_stopped) ? WSTOPSIG(_status) : 0; + int _ec = (_unhandled_signal) ? WTERMSIG(_status) : 0; + + if(_verbose >= 4) + { + TIMEMORY_PRINTF_INFO( + stderr, + "diagnosing status for process %i :: status: %i... normal exit: %s, " + "unhandled signal: %s, core dump: %s, stopped: %s, exit status: %i, stop " + "signal: %i, exit code: %i\n", + _pid, _status, std::to_string(_normal_exit).c_str(), + std::to_string(_unhandled_signal).c_str(), std::to_string(_core_dump).c_str(), + std::to_string(_stopped).c_str(), _exit_status, _stop_signal, _ec); + } + else if(_verbose >= 3) + { + TIMEMORY_PRINTF_INFO(stderr, + "diagnosing status for process %i :: status: %i ...\n", _pid, + _status); + } + + if(!_normal_exit) + { + if(_ec == 0) _ec = EXIT_FAILURE; + if(_verbose >= 0) + { + TIMEMORY_PRINTF_FATAL( + stderr, "process %i terminated abnormally. exit code: %i\n", _pid, _ec); + } + } + + if(_stopped) + { + if(_verbose >= 0) + { + TIMEMORY_PRINTF_FATAL(stderr, + "process %i stopped with signal %i. exit code: %i\n", + _pid, _stop_signal, _ec); + } + } + + if(_core_dump) + { + if(_verbose >= 0) + { + TIMEMORY_PRINTF_FATAL( + stderr, "process %i terminated and produced a core dump. exit code: %i\n", + _pid, _ec); + } + } + + if(_unhandled_signal) + { + if(_verbose >= 0) + { + TIMEMORY_PRINTF_FATAL(stderr, + "process %i terminated because it received a signal " + "(%i) that was not handled. exit code: %i\n", + _pid, _ec, _ec); + } + } + + if(!_normal_exit && _exit_status > 0) + { + if(_verbose >= 0) + { + if(_exit_status == 127) + { + TIMEMORY_PRINTF_FATAL( + stderr, "execv in process %i failed. exit code: %i\n", _pid, _ec); + } + else + { + TIMEMORY_PRINTF_FATAL( + stderr, + "process %i terminated with a non-zero status. 
exit code: %i\n", _pid, + _ec); + } + } + } + + return _ec; +} } // namespace mproc } // namespace omnitrace diff --git a/source/lib/core/mproc.hpp b/source/lib/core/mproc.hpp index eb1534a4e0..b50c80341e 100644 --- a/source/lib/core/mproc.hpp +++ b/source/lib/core/mproc.hpp @@ -35,5 +35,11 @@ get_concurrent_processes(int _ppid = getppid()); int get_process_index(int _pid = getpid(), int _ppid = getppid()); + +int +wait_pid(pid_t _pid, int _opts = 0); + +int +diagnose_status(pid_t _pid, int _status, int _verbose = 0); } // namespace mproc } // namespace omnitrace diff --git a/source/lib/core/perf.cpp b/source/lib/core/perf.cpp new file mode 100644 index 0000000000..e457953794 --- /dev/null +++ b/source/lib/core/perf.cpp @@ -0,0 +1,244 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ #include "perf.hpp" +#include "debug.hpp" + +#include + +namespace omnitrace +{ +namespace perf +{ +namespace units = ::tim::units; + +std::vector +get_config_choices() +{ + namespace regex_const = ::std::regex_constants; + + auto _data = std::vector{}; + auto _papi_events = tim::papi::available_events_info(); + const auto _prefix = std::string_view{ "perf::" }; + auto _regex = + std::regex{ "^(perf::|)PERF_COUNT_(HW|SW|HW_CACHE)_([A-Z_]+)(|:[A-Z]+)$", + regex_const::optimize }; + + for(const auto& itr : _papi_events) + { + if(std::regex_match(itr.symbol(), _regex)) + { + auto _symbol = itr.symbol(); + auto _pos = _symbol.find(_prefix); + if(_pos == 0) _symbol = _symbol.substr(_prefix.length()); + _data.emplace_back(_symbol); + } + } + + std::sort(_data.begin(), _data.end()); + _data.erase(std::unique(_data.begin(), _data.end()), _data.end()); + + return _data; +} + +// Classify a perf event name as a software, hw-cache, or hardware event. +// Returns event_type::max when the name carries none of the known prefixes. +event_type +get_event_type(std::string_view _v) +{ + if(_v.find("PERF_COUNT_SW_") != std::string_view::npos) + return event_type::software; + // PERF_COUNT_HW_CACHE_MISSES and PERF_COUNT_HW_CACHE_REFERENCES are excluded here + // and fall through to the hardware branch; this test must precede the + // PERF_COUNT_HW_ test because every hw-cache name also contains that prefix + else if(_v.find("PERF_COUNT_HW_CACHE_") != std::string_view::npos && + !std::regex_search(_v.data(), + std::regex{ "PERF_COUNT_HW_CACHE_(MISSES|REFERENCES)$" })) + return event_type::hw_cache; + else if(_v.find("PERF_COUNT_HW_") != std::string_view::npos) + return event_type::hardware; + return event_type::max; +} + +hw_config +get_hw_config(std::string_view _v) +{ +#define HW_CONFIG_REGEX(KEY) std::regex_search(_v.data(), std::regex{ "(HW_" KEY ")$" }) + + if(HW_CONFIG_REGEX("CPU_CYCLES")) + return hw_config::cpu_cycles; + else if(HW_CONFIG_REGEX("INSTRUCTIONS")) + return hw_config::instructions; + else if(HW_CONFIG_REGEX("CACHE_REFERENCES")) + return hw_config::cache_references; + else if(HW_CONFIG_REGEX("CACHE_MISSES")) + return hw_config::cache_misses; + else if(HW_CONFIG_REGEX("BRANCH_INSTRUCTIONS")) + return hw_config::branch_instructions; + else if(HW_CONFIG_REGEX("BRANCH_MISSES")) + return hw_config::branch_misses; + else if(HW_CONFIG_REGEX("BUS_CYCLES")) +
return hw_config::bus_cycles; + else if(HW_CONFIG_REGEX("STALLED_CYCLES_FRONTEND")) + return hw_config::stalled_cycles_frontend; + else if(HW_CONFIG_REGEX("STALLED_CYCLES_BACKEND")) + return hw_config::stalled_cycles_backend; + else if(HW_CONFIG_REGEX("REF_CPU_CYCLES")) + return hw_config::reference_cpu_cycles; + else + { + OMNITRACE_THROW("Unknown perf hardware config: %s", _v.data()); + } + +#undef HW_CONFIG_REGEX + + return hw_config::max; +} + +// Translate the trailing SW_<NAME> portion of a perf event name into the matching +// sw_config value. Unrecognized names are reported through OMNITRACE_THROW. +sw_config +get_sw_config(std::string_view _v) +{ +#define SW_CONFIG_REGEX(KEY) std::regex_search(_v.data(), std::regex{ "(SW_" KEY ")$" }) + + if(SW_CONFIG_REGEX("CPU_CLOCK")) + return sw_config::cpu_clock; + else if(SW_CONFIG_REGEX("TASK_CLOCK")) + return sw_config::task_clock; + else if(SW_CONFIG_REGEX("PAGE_FAULTS")) + return sw_config::page_faults; + else if(SW_CONFIG_REGEX("CONTEXT_SWITCHES")) + return sw_config::context_switches; + else if(SW_CONFIG_REGEX("CPU_MIGRATIONS")) + return sw_config::cpu_migrations; + else if(SW_CONFIG_REGEX("PAGE_FAULTS_MIN")) + return sw_config::page_faults_minor; + else if(SW_CONFIG_REGEX("PAGE_FAULTS_MAJ")) + return sw_config::page_faults_major; + else if(SW_CONFIG_REGEX("ALIGNMENT_FAULTS")) + return sw_config::alignment_faults; + else if(SW_CONFIG_REGEX("EMULATION_FAULTS")) + return sw_config::emulation_faults; + else + { + OMNITRACE_THROW("Unknown perf software config: %s", _v.data()); + } + +#undef SW_CONFIG_REGEX + + return sw_config::max; +} + +int +get_hw_cache_config(std::string_view _v) +{ + int _value = 0; + +#define HW_CACHE_CONFIG_REGEX(KEY) \ + std::regex_search(_v.data(), std::regex{ "(HW_CACHE_" KEY ")" }) + + if(HW_CACHE_CONFIG_REGEX("L1D")) + _value |= static_cast(hw_cache_config::l1d); + else if(HW_CACHE_CONFIG_REGEX("L1I")) + _value |= static_cast(hw_cache_config::l1i); + else if(HW_CACHE_CONFIG_REGEX("LL")) + _value |= static_cast(hw_cache_config::ll); + else if(HW_CACHE_CONFIG_REGEX("DTLB")) + _value |= static_cast(hw_cache_config::dtlb); + else
if(HW_CACHE_CONFIG_REGEX("ITLB")) + _value |= static_cast(hw_cache_config::itlb); + else if(HW_CACHE_CONFIG_REGEX("BPU")) + _value |= static_cast(hw_cache_config::bpu); + else if(HW_CACHE_CONFIG_REGEX("NODE")) + _value |= static_cast(hw_cache_config::node); + else + OMNITRACE_THROW("Unknown perf hw cache config: %s", _v.data()); // message names the hw-cache lookup that actually failed + +#undef HW_CACHE_CONFIG_REGEX +#define HW_CACHE_OP_REGEX(KEY) \ + std::regex_search(_v.data(), std::regex{ "(HW_CACHE_([A-Z1]+):" KEY ")" }) + + if(HW_CACHE_OP_REGEX("READ")) // operation goes in bits 8 and up; read is the default + _value |= (static_cast(hw_cache_op::read) << 8); + else if(HW_CACHE_OP_REGEX("WRITE")) + _value |= (static_cast(hw_cache_op::write) << 8); + else if(HW_CACHE_OP_REGEX("PREFETCH")) + _value |= (static_cast(hw_cache_op::prefetch) << 8); + else + _value |= (static_cast(hw_cache_op::read) << 8); + +#undef HW_CACHE_OP_REGEX +#define HW_CACHE_OP_RESULT_REGEX(KEY) \ + std::regex_search(_v.data(), std::regex{ "(HW_CACHE_([A-Z1]+):" KEY ")" }) + + if(HW_CACHE_OP_RESULT_REGEX("ACCESS")) // result goes in bits 16 and up; keyed on the result token, not the op token + _value |= (static_cast(hw_cache_op_result::access) << 16); + else if(HW_CACHE_OP_RESULT_REGEX("MISS")) + _value |= (static_cast(hw_cache_op_result::miss) << 16); + else // no result suffix: count accesses + _value |= (static_cast(hw_cache_op_result::access) << 16); + +#undef HW_CACHE_OP_RESULT_REGEX + + return _value; +} + +void +config_overflow_sampling(struct perf_event_attr& _pe, std::string_view _event, + double _freq) +{ + auto _period = (1.0 / _freq) * units::sec; + + _pe.type = static_cast(perf::get_event_type(_event)); + switch(_pe.type) + { + case PERF_TYPE_HARDWARE: + { + _pe.config = static_cast(perf::get_hw_config(_event)); + break; + } + case PERF_TYPE_SOFTWARE: + { + _pe.config = static_cast(perf::get_sw_config(_event)); + break; + } + case PERF_TYPE_HW_CACHE: + { + _pe.config = static_cast(perf::get_hw_cache_config(_event)); + break; + } + case PERF_TYPE_BREAKPOINT: + case PERF_TYPE_TRACEPOINT: + case PERF_TYPE_RAW: + case PERF_TYPE_MAX: + default: + { + OMNITRACE_THROW("unsupported perf type"); + } + }; + +
if(_pe.type == PERF_TYPE_SOFTWARE && + (_pe.config == PERF_COUNT_SW_CPU_CLOCK || _pe.config == PERF_COUNT_SW_TASK_CLOCK)) + { + _pe.sample_period = static_cast(_period); + } + else + { + _pe.sample_period = static_cast(_freq); + } +} +} // namespace perf +} // namespace omnitrace diff --git a/source/lib/core/perf.hpp b/source/lib/core/perf.hpp new file mode 100644 index 0000000000..a91b96f1ab --- /dev/null +++ b/source/lib/core/perf.hpp @@ -0,0 +1,291 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "core/defines.hpp" + +#include + +#include +#include + +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30 +# include +# define gettid() syscall(SYS_gettid) +#endif + +// Workaround for missing hw_breakpoint.h include file: +// This include file just defines constants used to configure watchpoint registers. 
+// This will be constant across x86 systems. +enum +{ + HW_BREAKPOINT_X = 4 +}; + +namespace omnitrace +{ +namespace perf +{ +/// An enum class with all the available sampling data +enum class sample : uint64_t +{ + ip = PERF_SAMPLE_IP, + pid_tid = PERF_SAMPLE_TID, + time = PERF_SAMPLE_TIME, + addr = PERF_SAMPLE_ADDR, + id = PERF_SAMPLE_ID, + stream_id = PERF_SAMPLE_STREAM_ID, + cpu = PERF_SAMPLE_CPU, + period = PERF_SAMPLE_PERIOD, + +#if defined(PERF_SAMPLE_READ) + read = PERF_SAMPLE_READ, +#else + read = 0, +#endif + + callchain = PERF_SAMPLE_CALLCHAIN, + raw = PERF_SAMPLE_RAW, + +#if defined(PERF_SAMPLE_BRANCH_STACK) + branch_stack = PERF_SAMPLE_BRANCH_STACK, +#else + branch_stack = 0, +#endif + +#if defined(PERF_SAMPLE_REGS_USER) + regs = PERF_SAMPLE_REGS_USER, +#else + regs = 0, +#endif + +#if defined(PERF_SAMPLE_STACK_USER) + stack = PERF_SAMPLE_STACK_USER, +#else + stack = 0, +#endif + +#if defined(PERF_SAMPLE_WEIGHT) + weight = PERF_SAMPLE_WEIGHT, +#else + weight = 0, +#endif + +#if defined(PERF_SAMPLE_DATA_SRC) + data_src = PERF_SAMPLE_DATA_SRC, +#else + data_src = 0, +#endif + +#if defined(PERF_SAMPLE_IDENTIFIER) + identifier = PERF_SAMPLE_IDENTIFIER, +#else + identifier = 0, +#endif + +#if defined(PERF_SAMPLE_TRANSACTION) + transaction = PERF_SAMPLE_TRANSACTION, +#else + transaction = 0, +#endif + +#if defined(PERF_SAMPLE_REGS_INTR) + regs_intr = PERF_SAMPLE_REGS_INTR, +#else + regs_intr = 0, +#endif + +#if defined(PERF_SAMPLE_PHYS_ADDR) + phys_addr = PERF_SAMPLE_PHYS_ADDR, +#else + phys_addr = 0, +#endif + +#if defined(PERF_SAMPLE_CGROUP) + cgroup = PERF_SAMPLE_CGROUP, +#else + cgroup = 0, +#endif + + last = PERF_SAMPLE_MAX +}; + +enum class event_type : int +{ + hardware = PERF_TYPE_HARDWARE, + software = PERF_TYPE_SOFTWARE, + tracepoint = PERF_TYPE_TRACEPOINT, + hw_cache = PERF_TYPE_HW_CACHE, + raw = PERF_TYPE_RAW, + breakpoint = PERF_TYPE_BREAKPOINT, + max = PERF_TYPE_MAX, +}; + +enum class hw_config : int +{ + cpu_cycles = PERF_COUNT_HW_CPU_CYCLES, 
+ instructions = PERF_COUNT_HW_INSTRUCTIONS, + cache_references = PERF_COUNT_HW_CACHE_REFERENCES, + cache_misses = PERF_COUNT_HW_CACHE_MISSES, + branch_instructions = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + branch_misses = PERF_COUNT_HW_BRANCH_MISSES, + bus_cycles = PERF_COUNT_HW_BUS_CYCLES, + stalled_cycles_frontend = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, + stalled_cycles_backend = PERF_COUNT_HW_STALLED_CYCLES_BACKEND, + reference_cpu_cycles = PERF_COUNT_HW_REF_CPU_CYCLES, + max = PERF_COUNT_HW_MAX, +}; + +enum class sw_config : int +{ + cpu_clock = PERF_COUNT_SW_CPU_CLOCK, + task_clock = PERF_COUNT_SW_TASK_CLOCK, + page_faults = PERF_COUNT_SW_PAGE_FAULTS, + context_switches = PERF_COUNT_SW_CONTEXT_SWITCHES, + cpu_migrations = PERF_COUNT_SW_CPU_MIGRATIONS, + page_faults_minor = PERF_COUNT_SW_PAGE_FAULTS_MIN, + page_faults_major = PERF_COUNT_SW_PAGE_FAULTS_MAJ, + alignment_faults = PERF_COUNT_SW_ALIGNMENT_FAULTS, + emulation_faults = PERF_COUNT_SW_EMULATION_FAULTS, + max = PERF_COUNT_SW_MAX, +}; + +enum class hw_cache_config : int +{ + l1d = PERF_COUNT_HW_CACHE_L1D, + l1i = PERF_COUNT_HW_CACHE_L1I, + ll = PERF_COUNT_HW_CACHE_LL, + dtlb = PERF_COUNT_HW_CACHE_DTLB, + itlb = PERF_COUNT_HW_CACHE_ITLB, + bpu = PERF_COUNT_HW_CACHE_BPU, + node = PERF_COUNT_HW_CACHE_NODE, + max = PERF_COUNT_HW_CACHE_MAX, +}; + +enum class hw_cache_op : int +{ + read = PERF_COUNT_HW_CACHE_OP_READ, + write = PERF_COUNT_HW_CACHE_OP_WRITE, + prefetch = PERF_COUNT_HW_CACHE_OP_PREFETCH, + max = PERF_COUNT_HW_CACHE_OP_MAX, +}; + +enum class hw_cache_op_result : int +{ + access = PERF_COUNT_HW_CACHE_RESULT_ACCESS, + miss = PERF_COUNT_HW_CACHE_RESULT_MISS, + max = PERF_COUNT_HW_CACHE_RESULT_MAX, +}; + +/// An enum to distinguish types of records in the mmapped ring buffer +enum class record_type +{ + mmap = PERF_RECORD_MMAP, + lost = PERF_RECORD_LOST, + comm = PERF_RECORD_COMM, + exit = PERF_RECORD_EXIT, + throttle = PERF_RECORD_THROTTLE, + unthrottle = PERF_RECORD_UNTHROTTLE, + fork = 
PERF_RECORD_FORK, + read = PERF_RECORD_READ, + sample = PERF_RECORD_SAMPLE, + +#if defined(PERF_RECORD_MMAP2) + mmap2 = PERF_RECORD_MMAP2, +#else + mmap2 = 0, +#endif + +#if defined(PERF_RECORD_AUX) + aux = PERF_RECORD_AUX, +#else + aux = 0, +#endif + +#if defined(PERF_RECORD_ITRACE_START) + itrace_start = PERF_RECORD_ITRACE_START, +#else + itrace_start = 0, +#endif + +#if defined(PERF_RECORD_LOST_SAMPLES) + lost_samples = PERF_RECORD_LOST_SAMPLES, +#else + lost_samples = 0, +#endif + +#if defined(PERF_RECORD_SWITCH) + switch_record = PERF_RECORD_SWITCH, +#else + switch_record = 0, +#endif + +#if defined(PERF_RECORD_SWITCH_CPU_WIDE) + switch_cpu_wide = PERF_RECORD_SWITCH_CPU_WIDE, +#else + switch_cpu_wide = 0, +#endif + +#if defined(PERF_RECORD_NAMESPACES) + namespaces = PERF_RECORD_NAMESPACES, +#else + namespaces = 0, +#endif + +#if defined(PERF_RECORD_KSYMBOL) + ksymbol = PERF_RECORD_KSYMBOL, +#else + ksymbol = 0, +#endif + +#if defined(PERF_RECORD_BPF_EVENT) + bpf_event = PERF_RECORD_BPF_EVENT, +#else + bpf_event = 0, +#endif + +#if defined(PERF_RECORD_CGROUP) + cgroup = PERF_RECORD_CGROUP, +#else + cgroup = 0, +#endif + +#if defined(PERF_RECORD_TEXT_POKE) + text_poke = PERF_RECORD_TEXT_POKE, +#else + text_poke = 0, +#endif +}; + +std::vector +get_config_choices(); + +event_type get_event_type(std::string_view); +hw_config get_hw_config(std::string_view); +sw_config get_sw_config(std::string_view); +int get_hw_cache_config(std::string_view); + +void +config_overflow_sampling(struct perf_event_attr&, std::string_view, double); +} // namespace perf +} // namespace omnitrace diff --git a/source/lib/core/state.hpp b/source/lib/core/state.hpp index c7d41f5ddb..326fe094f1 100644 --- a/source/lib/core/state.hpp +++ b/source/lib/core/state.hpp @@ -57,6 +57,13 @@ enum class Mode : unsigned short Coverage }; +enum class CausalBackend : unsigned short +{ + Perf = 0, + Timer, + Auto, +}; + enum class CausalMode : unsigned short { Line = 0, diff --git 
a/source/lib/core/utility.cpp b/source/lib/core/utility.cpp new file mode 100644 index 0000000000..2a16c068be --- /dev/null +++ b/source/lib/core/utility.cpp @@ -0,0 +1,123 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "utility.hpp" +#include "debug.hpp" + +namespace omnitrace +{ +namespace utility +{ +namespace +{ +template +auto +emplace_impl(ContainerT& _targ, Arg&& _v, int) + -> decltype(_targ.emplace(std::forward(_v))) +{ + return _targ.emplace(std::forward(_v)); +} + +template +auto +emplace_impl(ContainerT& _targ, Arg&& _v, long) + -> decltype(_targ.emplace_back(std::forward(_v))) +{ + return _targ.emplace_back(std::forward(_v)); +} + +template +decltype(auto) +emplace(ContainerT& _targ, Arg&& _v) +{ + return emplace_impl(_targ, std::forward(_v), 0); +} +} // namespace + +template +ContainerT +parse_numeric_range(std::string _input_string, const std::string& _label, Up _incr) +{ + auto _get_value = [](const std::string& _inp) { + std::stringstream iss{ _inp }; + auto var = Tp{}; + iss >> var; + return var; + }; + + for(auto& itr : _input_string) + itr = tolower(itr); + auto _result = ContainerT{}; + for(auto _v : tim::delimit(_input_string, ",; \t\n\r")) + { + if(_v.find_first_not_of("0123456789-:") != std::string::npos) + { + OMNITRACE_BASIC_VERBOSE_F( + 0, + "Invalid %s specification. Only numerical values (e.g., 0), ranges " + "(e.g., 0-7), and ranges with increments (e.g. 20-40:10) are permitted. " + "Ignoring %s...", + _label.c_str(), _v.c_str()); + continue; + } + + auto _incr_v = _incr; + auto _incr_pos = _v.find(':'); + if(_incr_pos != std::string::npos) + { + auto _incr_str = _v.substr(_incr_pos + 1); + if(!_incr_str.empty()) _incr_v = static_cast(std::stoull(_incr_str)); + _v = _v.substr(0, _incr_pos); + } + + if(_v.find('-') != std::string::npos) + { + auto _vv = tim::delimit(_v, "-"); + OMNITRACE_CONDITIONAL_THROW( + _vv.size() != 2, + "Invalid %s range specification: %s. Required format N-M, e.g. 
0-4", + _label.c_str(), _v.c_str()); + Tp _vn = _get_value(_vv.at(0)); + Tp _vN = _get_value(_vv.at(1)); + do + { + emplace(_result, _vn); + _vn += _incr_v; + } while(_incr_v > 0 && _vn <= _vN); // a zero or negative increment (e.g. 0-4:0) would never reach the upper bound + } + else + { + emplace(_result, std::stoll(_v)); + } + } + return _result; +} + +template std::set +parse_numeric_range>(std::string, const std::string&, long); +template std::vector +parse_numeric_range>(std::string, const std::string&, long); +template std::unordered_set +parse_numeric_range>(std::string, const std::string&, + long); +} // namespace utility +} // namespace omnitrace diff --git a/source/lib/core/utility.hpp b/source/lib/core/utility.hpp index 87c03d885f..cde303a065 100644 --- a/source/lib/core/utility.hpp +++ b/source/lib/core/utility.hpp @@ -25,6 +25,7 @@ #include "concepts.hpp" #include +#include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -238,5 +240,17 @@ convert(std::string_view _inp) _iss >> _ret; return _ret; } + +template , typename Up = Tp> +ContainerT +parse_numeric_range(std::string _input_string, const std::string& _label, Up _incr); + +extern template std::set +parse_numeric_range>(std::string, const std::string&, long); +extern template std::vector +parse_numeric_range>(std::string, const std::string&, long); +extern template std::unordered_set +parse_numeric_range>(std::string, const std::string&, + long); } // namespace utility } // namespace omnitrace diff --git a/source/lib/omnitrace-dl/dl.cpp b/source/lib/omnitrace-dl/dl.cpp index 5f01a78b3a..0181df2d62 100644 --- a/source/lib/omnitrace-dl/dl.cpp +++ b/source/lib/omnitrace-dl/dl.cpp @@ -136,6 +136,8 @@ reset_omnitrace_preload() auto&& _preload_libs = get_env("LD_PRELOAD", std::string{}); if(_preload_libs.find("libomnitrace-dl.so") != std::string::npos) { + (void) get_omnitrace_is_preloaded(); + (void) get_omnitrace_preload(); auto _modified_preload = std::string{}; for(const auto& itr : delimit(_preload_libs, ":")) { @@ -1293,6 +1295,7 @@
verify_instrumented_preloaded() case dl::InstrumentMode::None: case dl::InstrumentMode::ProcessAttach: case dl::InstrumentMode::ProcessCreate: + case dl::InstrumentMode::PythonProfile: { return; } diff --git a/source/lib/omnitrace-user/omnitrace/categories.h b/source/lib/omnitrace-user/omnitrace/categories.h index 5d1c1275e8..480a828ce4 100644 --- a/source/lib/omnitrace-user/omnitrace/categories.h +++ b/source/lib/omnitrace-user/omnitrace/categories.h @@ -83,6 +83,8 @@ extern "C" OMNITRACE_CATEGORY_THREAD_HARDWARE_COUNTER, OMNITRACE_CATEGORY_KERNEL_HARDWARE_COUNTER, OMNITRACE_CATEGORY_NUMA, + OMNITRACE_CATEGORY_TIMER_SAMPLING, + OMNITRACE_CATEGORY_OVERFLOW_SAMPLING, OMNITRACE_CATEGORY_LAST // the value of below enum is used for iterating // over the enum in C++ templates. It MUST diff --git a/source/lib/omnitrace/library/CMakeLists.txt b/source/lib/omnitrace/library/CMakeLists.txt index 3c8b09801c..a691011934 100644 --- a/source/lib/omnitrace/library/CMakeLists.txt +++ b/source/lib/omnitrace/library/CMakeLists.txt @@ -5,6 +5,7 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/critical_trace.cpp ${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp ${CMAKE_CURRENT_LIST_DIR}/ompt.cpp + ${CMAKE_CURRENT_LIST_DIR}/perf.cpp ${CMAKE_CURRENT_LIST_DIR}/process_sampler.cpp ${CMAKE_CURRENT_LIST_DIR}/ptl.cpp ${CMAKE_CURRENT_LIST_DIR}/runtime.cpp @@ -19,6 +20,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/critical_trace.hpp ${CMAKE_CURRENT_LIST_DIR}/ompt.hpp ${CMAKE_CURRENT_LIST_DIR}/process_sampler.hpp + ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/ptl.hpp ${CMAKE_CURRENT_LIST_DIR}/rcclp.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm.hpp diff --git a/source/lib/omnitrace/library/causal/CMakeLists.txt b/source/lib/omnitrace/library/causal/CMakeLists.txt index b4ab0062fc..d00fe85fc6 100644 --- a/source/lib/omnitrace/library/causal/CMakeLists.txt +++ b/source/lib/omnitrace/library/causal/CMakeLists.txt @@ -1,21 +1,13 @@ # set(causal_sources - ${CMAKE_CURRENT_LIST_DIR}/data.cpp - 
${CMAKE_CURRENT_LIST_DIR}/delay.cpp - ${CMAKE_CURRENT_LIST_DIR}/experiment.cpp - # ${CMAKE_CURRENT_LIST_DIR}/perf.cpp - ${CMAKE_CURRENT_LIST_DIR}/sample_data.cpp - ${CMAKE_CURRENT_LIST_DIR}/sampling.cpp - ${CMAKE_CURRENT_LIST_DIR}/selected_entry.cpp) + ${CMAKE_CURRENT_LIST_DIR}/data.cpp ${CMAKE_CURRENT_LIST_DIR}/delay.cpp + ${CMAKE_CURRENT_LIST_DIR}/experiment.cpp ${CMAKE_CURRENT_LIST_DIR}/sample_data.cpp + ${CMAKE_CURRENT_LIST_DIR}/sampling.cpp ${CMAKE_CURRENT_LIST_DIR}/selected_entry.cpp) set(causal_headers - ${CMAKE_CURRENT_LIST_DIR}/data.hpp - ${CMAKE_CURRENT_LIST_DIR}/delay.hpp - ${CMAKE_CURRENT_LIST_DIR}/experiment.hpp - # ${CMAKE_CURRENT_LIST_DIR}/perf.hpp - ${CMAKE_CURRENT_LIST_DIR}/sample_data.hpp - ${CMAKE_CURRENT_LIST_DIR}/sampling.hpp - ${CMAKE_CURRENT_LIST_DIR}/selected_entry.hpp) + ${CMAKE_CURRENT_LIST_DIR}/data.hpp ${CMAKE_CURRENT_LIST_DIR}/delay.hpp + ${CMAKE_CURRENT_LIST_DIR}/experiment.hpp ${CMAKE_CURRENT_LIST_DIR}/sample_data.hpp + ${CMAKE_CURRENT_LIST_DIR}/sampling.hpp ${CMAKE_CURRENT_LIST_DIR}/selected_entry.hpp) target_sources(omnitrace-object-library PRIVATE ${causal_sources} ${causal_headers}) diff --git a/source/lib/omnitrace/library/causal/components/backtrace.cpp b/source/lib/omnitrace/library/causal/components/backtrace.cpp index f1d472a506..476d2b99ce 100644 --- a/source/lib/omnitrace/library/causal/components/backtrace.cpp +++ b/source/lib/omnitrace/library/causal/components/backtrace.cpp @@ -29,6 +29,7 @@ #include "library/causal/data.hpp" #include "library/causal/delay.hpp" #include "library/causal/experiment.hpp" +#include "library/perf.hpp" #include "library/runtime.hpp" #include "library/thread_data.hpp" #include "library/thread_info.hpp" @@ -45,6 +46,7 @@ #include #include +#include #include namespace omnitrace @@ -57,46 +59,85 @@ namespace { using ::tim::backtrace::get_unw_signal_frame_stack_raw; -auto& -get_delay_statistics() -{ - using thread_data_t = - thread_data>, category::sampling>; - - static_assert( - 
use_placement_new_when_generating_unique_ptr::value, - "delay statistics thread data should use placement new to allocate unique_ptr"); - - static auto& _v = thread_data_t::instance(construct_on_init{}); - return _v; -} -} // namespace +int realtime_signal = 0; +int cputime_signal = 0; +int overflow_signal = 0; void -backtrace::start() +generic_global_init() { // do not delete these lines. The thread data needs to be allocated // before it is called in sampler or else a deadlock will occur when // the sample interrupts a malloc call - (void) get_delay_statistics(); + if(realtime_signal + cputime_signal + overflow_signal == 0) + { + realtime_signal = get_sampling_realtime_signal(); + cputime_signal = get_sampling_cputime_signal(); + overflow_signal = get_sampling_overflow_signal(); + } +} +} // namespace + +void +overflow::global_init() +{ + // do not delete these lines. + generic_global_init(); } void -backtrace::stop() -{} +backtrace::global_init() +{ + // do not delete these lines. + generic_global_init(); +} void -sample_rate::sample(int _sig) +overflow::sample(int _sig) { - if(_sig != get_realtime_signal()) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - // update the last sample for backtrace signal(s) even when in use - static thread_local int64_t _last_sample = 0; + static thread_local const auto& _tinfo = thread_info::get(); + auto _tid = _tinfo->index_data->sequent_value; + auto& _perf_event = perf::get_instance(_tid); - auto _this_sample = tracing::now(); - auto& _period_stat = get_delay_statistics()->at(threading::get_id()); - if(_last_sample > 0) _period_stat += (_this_sample - _last_sample); - _last_sample = _this_sample; + if(!_perf_event) return; + + m_index = causal::experiment::get_index(); + + _perf_event->stop(); + + for(auto itr : *_perf_event) + { + if(itr.is_sample()) + { + auto _sample_ip = itr.get_ip(); + auto _data = callchain_t{}; + _data.emplace_back(_sample_ip); + for(auto ditr : itr.get_callchain()) + { + if(ditr != 
_sample_ip) _data.emplace_back(ditr); + if(_data.size() == _data.capacity()) break; + } + + if(causal::experiment::is_active() && causal::experiment::is_selected(_data)) + { + ++m_selected; + causal::experiment::add_selected(); + causal::delay::get_local() += causal::experiment::get_delay(); + } + else if(!causal::experiment::is_active()) + { + causal::set_current_selection(_data); + } + + m_stack.emplace_back(_data); + } + } + + _perf_event->start(); + + if(_sig == cputime_signal) causal::delay::process(); } void @@ -104,11 +145,16 @@ backtrace::sample(int _sig) { constexpr size_t depth = ::omnitrace::causal::unwind_depth; constexpr int64_t ignore_depth = ::omnitrace::causal::unwind_offset; + constexpr size_t select_init = std::numeric_limits::max(); + constexpr size_t select_ival = 5; // interval at which realtime signal contributes // update the last sample for backtrace signal(s) even when in use static thread_local size_t _protect_flag = 0; - // sampling_guard _guard{}; + // the select_count is initialized to max so that realtime signal does + // not initially set the current selection + static thread_local size_t _select_count = select_init; + static thread_local size_t _select_zeros = 0; if((_protect_flag & 1) == 1 || OMNITRACE_UNLIKELY(!trait::runtime_enabled::get())) @@ -122,33 +168,52 @@ backtrace::sample(int _sig) m_index = causal::experiment::get_index(); m_stack = get_unw_signal_frame_stack_raw(); - // the batch handler timer delivers a signal according to the thread CPU - // clock, ensuring that setting the current selection and processing the - // delays only happens when the thread is active - if(_sig == get_cputime_signal()) - { - if(!causal::experiment::is_active()) - causal::set_current_selection(m_stack); - else - causal::delay::process(); - } - else if(_sig == get_realtime_signal()) - { - static thread_local auto _tid = threading::get_id(); - auto& _period_stat = get_delay_statistics()->at(_tid); + auto _set_current_selection = [](auto _stack) 
{ + // save the former selection count + auto _former_count = _select_count; + // get the current selection count + _select_count = causal::set_current_selection(_stack); + // if the selection count was reduced, reset select zeros. + // this typically means that a new experiment was started + if(_former_count > _select_count) _select_zeros = 0; + // if no PCs were selected, increment the select zeros. + // if the cputime signal has not selected a PC in select_ival iterations, + // then the realtime signal will start contributing to the current + // selection. We generally want only the cputime signal to contribute + // because those PCs are in-use (since the thread CPU clock is increasing) + if(_select_count == 0) ++_select_zeros; + }; + // the batch handler timer delivers a signal according to the thread CPU + // clock, ensuring that setting the current selection is preferred when the thread + // is active and processing the delays happens only when the thread is active + if(_sig == cputime_signal) + { + if(causal::experiment::is_active()) + causal::delay::process(); + else + _set_current_selection(m_stack); + } + else if(_sig == realtime_signal) + { if(causal::experiment::is_active() && causal::experiment::is_selected(m_stack)) { m_selected = true; causal::experiment::add_selected(); - // compute the delay time based on the rate of taking samples, - // unless we have taken less than 10, in which case, we just - // use the pre-computed value. - auto _delay = - (_period_stat.get_count() < 10) - ? causal::experiment::get_delay() - : (_period_stat.get_mean() * causal::experiment::get_delay_scaling()); - causal::delay::get_local() += _delay; + causal::delay::get_local() += causal::experiment::get_delay(); + } + else if(!causal::experiment::is_active()) + { + // if no PCs have been selected after at least "select_ival" call-stacks via + // the cputime signal, then contribute the call-stack via the realtime signal.
+ // This can be particularly relevant in end-to-end runs targeting a particular + // line/function since it is possible that the line/function is situated such + // that the cputime signal is never delivered when executing the particular + // line/function... despite the line/function executing in between + // the cputime signals. This is rare but has been observed + // + if(_select_count == 0 && _select_zeros >= select_ival) + _set_current_selection(m_stack); } } else @@ -165,36 +230,10 @@ backtrace::get_period(uint64_t _units) { using cast_type = std::conditional_t::value, Tp, double>; - double _realtime_freq = - (get_use_sampling_realtime()) ? get_sampling_real_freq() : 0.0; - double _cputime_freq = (get_use_sampling_cputime()) ? get_sampling_cpu_freq() : 0.0; - - auto _freq = std::max(_realtime_freq, _cputime_freq); - double _period = 1.0 / _freq; + double _period = 1.0 / 1000.0; int64_t _period_nsec = static_cast(_period * units::sec) % units::sec; return static_cast(_period_nsec) / static_cast(_units); } - -tim::statistics -backtrace::get_period_stats() -{ - auto _data = tim::statistics{}; - if(!get_delay_statistics()) return _data; - for(auto itr : *get_delay_statistics()) - { - if(itr.get_count() > 1) _data += itr; - } - return _data; -} - -void -backtrace::reset_period_stats() -{ - for(auto& itr : *get_delay_statistics()) - { - itr.reset(); - } -} } // namespace component } // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/causal/components/backtrace.hpp b/source/lib/omnitrace/library/causal/components/backtrace.hpp index c9b78aa8ba..7af3f31002 100644 --- a/source/lib/omnitrace/library/causal/components/backtrace.hpp +++ b/source/lib/omnitrace/library/causal/components/backtrace.hpp @@ -27,7 +27,8 @@ #include "core/defines.hpp" #include "core/timemory.hpp" #include "library/causal/data.hpp" -#include "library/causal/sample_data.hpp" +#include "library/causal/fwd.hpp" +#include "library/perf.hpp" #include #include @@
-45,22 +46,36 @@ namespace causal { namespace component { -struct sample_rate : comp::empty_base +struct overflow : comp::empty_base { - using value_type = void; - static void sample(int = -1); + static constexpr auto alt_stack_size = perf::perf_event::max_batch_size; + + using value_type = void; + using callchain_t = container::static_vector; + using alt_stack_t = container::static_vector; + + static std::string label() { return "causal::overflow"; } + static void global_init(); + + void sample(int = -1); + + auto get_selected() const { return m_selected; } + auto get_index() const { return m_index; } + const auto& get_stack() const { return m_stack; } + +private: + int32_t m_selected = 0; + uint32_t m_index = 0; + alt_stack_t m_stack = {}; }; struct backtrace : comp::empty_base { - using value_type = void; - using sample_data_set_t = std::set; + using value_type = void; + using callchain_t = container::static_vector; static std::string label() { return "causal::backtrace"; } - static std::string description() - { - return "Causal profiling data collected in backtrace"; - } + static void global_init(); backtrace() = default; ~backtrace() = default; @@ -70,9 +85,6 @@ struct backtrace : comp::empty_base backtrace& operator=(const backtrace&) = default; backtrace& operator=(backtrace&&) noexcept = default; - static void start(); - static void stop(); - void sample(int = -1); auto get_selected() const { return m_selected; } @@ -82,9 +94,6 @@ struct backtrace : comp::empty_base template static Tp get_period(uint64_t _units = units::nsec); - static tim::statistics get_period_stats(); - static void reset_period_stats(); - private: bool m_selected = false; uint32_t m_index = 0; diff --git a/source/lib/omnitrace/library/causal/components/blocking_gotcha.cpp b/source/lib/omnitrace/library/causal/components/blocking_gotcha.cpp index f0e8982a9e..5c7f7e50fa 100644 --- a/source/lib/omnitrace/library/causal/components/blocking_gotcha.cpp +++ 
b/source/lib/omnitrace/library/causal/components/blocking_gotcha.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #pragma weak pthread_join #pragma weak pthread_mutex_lock @@ -141,9 +142,9 @@ blocking_gotcha::shutdown() blocking_gotcha_t::disable(); } -template -Ret -blocking_gotcha::operator()(const comp::gotcha_data& _data, Ret (*_func)(Args...), +template +std::enable_if_t<(Idx <= blocking_gotcha::indexes::maybe_post_block_max_idx), Ret> +blocking_gotcha::operator()(gotcha_index, Ret (*_func)(Args...), Args... _args) const noexcept { int64_t _delay_value = causal::delay::get_global().load(std::memory_order_relaxed); @@ -154,20 +155,26 @@ blocking_gotcha::operator()(const comp::gotcha_data& _data, Ret (*_func)(Args... if(get_thread_state() < ::omnitrace::ThreadState::Internal) { - if(_data.index <= 5) - causal::delay::postblock(_delay_value); - else if(_ret == 0 && _data.index >= 6 && _data.index <= 13) + if constexpr(Idx >= always_post_block_min_idx && Idx <= always_post_block_max_idx) + { causal::delay::postblock(_delay_value); + } + else if constexpr(Idx >= maybe_post_block_min_idx && + Idx <= maybe_post_block_max_idx) + { + if(_ret == 0) causal::delay::postblock(_delay_value); + } else - OMNITRACE_FAIL_F("Error! unexpected index %zu ('%s')\n", _data.index, - _data.tool_id.c_str()); + { + static_assert(Idx > maybe_post_block_max_idx, "Error! 
bad overload"); + } } return _ret; } int -blocking_gotcha::operator()(const comp::gotcha_data&, int (*)(const sigset_t*, int*), +blocking_gotcha::operator()(gotcha_index, int (*)(const sigset_t*, int*), const sigset_t* _set_v, int* _sig) const noexcept { auto _active = get_thread_state() < ::omnitrace::ThreadState::Internal; @@ -198,7 +205,7 @@ blocking_gotcha::operator()(const comp::gotcha_data&, int (*)(const sigset_t*, i } int -blocking_gotcha::operator()(const comp::gotcha_data&, +blocking_gotcha::operator()(gotcha_index, int (*_func)(const sigset_t*, siginfo_t*), const sigset_t* _set_v, siginfo_t* _info_v) const noexcept { @@ -224,7 +231,7 @@ blocking_gotcha::operator()(const comp::gotcha_data&, } int -blocking_gotcha::operator()(const comp::gotcha_data&, +blocking_gotcha::operator()(gotcha_index, int (*_func)(const sigset_t*, siginfo_t*, const struct timespec*), const sigset_t* _set_v, siginfo_t* _info_v, @@ -250,6 +257,20 @@ blocking_gotcha::operator()(const comp::gotcha_data&, return _ret; } + +int +blocking_gotcha::operator()(gotcha_index, int (*)(const sigset_t*), + const sigset_t* _set_v) const noexcept +{ + auto _old_set = sigset_t{}; + int _sig = 0; + ::sigprocmask(SIG_SETMASK, _set_v, &_old_set); + // sigwait is wrapped so no need to block/unblock signals + auto _ret = ::sigwait(_set_v, &_sig); + ::sigprocmask(SIG_SETMASK, &_old_set, nullptr); + + return _ret; +} } // namespace component } // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/causal/components/blocking_gotcha.hpp b/source/lib/omnitrace/library/causal/components/blocking_gotcha.hpp index 0b590471cb..5d2ede3f7c 100644 --- a/source/lib/omnitrace/library/causal/components/blocking_gotcha.hpp +++ b/source/lib/omnitrace/library/causal/components/blocking_gotcha.hpp @@ -44,6 +44,22 @@ struct blocking_gotcha : comp::base { static constexpr size_t gotcha_capacity = 19; + template + using gotcha_index = std::integral_constant; + + enum indexes + { + 
always_post_block_min_idx = 0, + always_post_block_max_idx = 5, + maybe_post_block_min_idx = 6, + maybe_post_block_max_idx = 13, + sigwait_idx = 14, + sigwaitinfo_idx = 15, + sigtimedwait_idx = 16, + sigsuspend_idx = 17, + indexes_max = gotcha_capacity - 1, + }; + OMNITRACE_DEFAULT_OBJECT(blocking_gotcha) // string id for component @@ -55,18 +71,22 @@ struct blocking_gotcha : comp::base static void configure(); static void shutdown(); - template - Ret operator()(const comp::gotcha_data&, Ret (*)(Args...), Args...) const noexcept; + template + std::enable_if_t<(Idx <= maybe_post_block_max_idx), Ret> operator()( + gotcha_index, Ret (*)(Args...), Args...) const noexcept; - int operator()(const comp::gotcha_data&, int (*)(const sigset_t*, int*), + int operator()(gotcha_index, int (*)(const sigset_t*, int*), const sigset_t*, int*) const noexcept; - int operator()(const comp::gotcha_data&, int (*)(const sigset_t*, siginfo_t*), + int operator()(gotcha_index, int (*)(const sigset_t*, siginfo_t*), const sigset_t*, siginfo_t*) const noexcept; - int operator()(const comp::gotcha_data&, + int operator()(gotcha_index, int (*)(const sigset_t*, siginfo_t*, const struct timespec*), const sigset_t*, siginfo_t*, const struct timespec*) const noexcept; + + int operator()(gotcha_index, int (*)(const sigset_t*), + const sigset_t*) const noexcept; }; using blocking_gotcha_t = diff --git a/source/lib/omnitrace/library/causal/components/causal_gotcha.cpp b/source/lib/omnitrace/library/causal/components/causal_gotcha.cpp index f931a6b087..911ee124e0 100644 --- a/source/lib/omnitrace/library/causal/components/causal_gotcha.cpp +++ b/source/lib/omnitrace/library/causal/components/causal_gotcha.cpp @@ -41,8 +41,6 @@ namespace component { namespace { -namespace signals = ::tim::signals; - using bundle_t = tim::lightweight_tuple; auto& @@ -101,18 +99,6 @@ causal_gotcha::stop() shutdown(); } -void -causal_gotcha::block_signals() -{ - signals::block_signals(sampling_signals(), 
signals::sigmask_scope::thread); -} - -void -causal_gotcha::unblock_signals() -{ - signals::unblock_signals(sampling_signals(), signals::sigmask_scope::thread); -} - void causal_gotcha::remove_signals(sigset_t* _set) { diff --git a/source/lib/omnitrace/library/causal/components/causal_gotcha.hpp b/source/lib/omnitrace/library/causal/components/causal_gotcha.hpp index 5353cb7efc..fb4e8d1931 100644 --- a/source/lib/omnitrace/library/causal/components/causal_gotcha.hpp +++ b/source/lib/omnitrace/library/causal/components/causal_gotcha.hpp @@ -49,8 +49,6 @@ struct causal_gotcha : tim::component::base static void start(); static void stop(); - static void block_signals(); - static void unblock_signals(); static void remove_signals(sigset_t*); }; } // namespace component diff --git a/source/lib/omnitrace/library/causal/components/unblocking_gotcha.cpp b/source/lib/omnitrace/library/causal/components/unblocking_gotcha.cpp index 437617d6ea..99c9db37f6 100644 --- a/source/lib/omnitrace/library/causal/components/unblocking_gotcha.cpp +++ b/source/lib/omnitrace/library/causal/components/unblocking_gotcha.cpp @@ -97,34 +97,35 @@ unblocking_gotcha::shutdown() unblocking_gotcha_t::disable(); } -template -Ret -unblocking_gotcha::operator()(const comp::gotcha_data& _data, Ret (*_func)(Args...), +template +std::enable_if_t<(Idx < unblocking_gotcha::indexes::kill_idx), Ret> +unblocking_gotcha::operator()(gotcha_index, Ret (*_func)(Args...), Args... _args) const noexcept { auto _active = get_thread_state() < ::omnitrace::ThreadState::Internal; - if(_active) causal::delay::process(); - - if(_active && _data.index == 7) + if(_active) { - int64_t _delay_value = (_active) ? causal::delay::get_global().load() : 0; + causal::delay::process(); - causal::sampling::block_backtrace_samples(); - auto _ret = (*_func)(_args...); - causal::sampling::unblock_backtrace_samples(); + if constexpr(Idx == pthread_barrier_wait_idx) + { + int64_t _delay_value = (_active) ? 
causal::delay::get_global().load() : 0; - causal::delay::postblock(_delay_value); - return _ret; - } - else - { - return (*_func)(_args...); + causal::sampling::block_backtrace_samples(); + auto _ret = (*_func)(_args...); + causal::sampling::unblock_backtrace_samples(); + + causal::delay::postblock(_delay_value); + return _ret; + } } + + return (*_func)(_args...); } int -unblocking_gotcha::operator()(const comp::gotcha_data&, int (*_func)(pid_t, int), +unblocking_gotcha::operator()(gotcha_index, int (*_func)(pid_t, int), pid_t _pid, int _sig) const noexcept { auto _active = get_thread_state() < ::omnitrace::ThreadState::Internal; diff --git a/source/lib/omnitrace/library/causal/components/unblocking_gotcha.hpp b/source/lib/omnitrace/library/causal/components/unblocking_gotcha.hpp index f13a87e63a..547573f6d9 100644 --- a/source/lib/omnitrace/library/causal/components/unblocking_gotcha.hpp +++ b/source/lib/omnitrace/library/causal/components/unblocking_gotcha.hpp @@ -43,6 +43,16 @@ struct unblocking_gotcha : comp::base { static constexpr size_t gotcha_capacity = 9; + enum indexes + { + pthread_barrier_wait_idx = 7, + kill_idx = 8, + indexes_max = gotcha_capacity, + }; + + template + using gotcha_index = std::integral_constant; + OMNITRACE_DEFAULT_OBJECT(unblocking_gotcha) // string id for component @@ -54,10 +64,12 @@ struct unblocking_gotcha : comp::base static void configure(); static void shutdown(); - template - Ret operator()(const comp::gotcha_data&, Ret (*)(Args...), Args...) const noexcept; + template + std::enable_if_t<(Idx < kill_idx), Ret> operator()(gotcha_index, + Ret (*)(Args...), + Args...) 
const noexcept; - int operator()(const comp::gotcha_data&, int (*)(pid_t, int), pid_t, + int operator()(gotcha_index, int (*)(pid_t, int), pid_t, int) const noexcept; }; diff --git a/source/lib/omnitrace/library/causal/data.cpp b/source/lib/omnitrace/library/causal/data.cpp index 17bc539cbc..cc53634a1f 100644 --- a/source/lib/omnitrace/library/causal/data.cpp +++ b/source/lib/omnitrace/library/causal/data.cpp @@ -28,11 +28,14 @@ #include "binary/scope_filter.hpp" #include "core/binary/fwd.hpp" #include "core/config.hpp" +#include "core/containers/c_array.hpp" #include "core/debug.hpp" #include "core/state.hpp" #include "core/utility.hpp" #include "library/causal/delay.hpp" #include "library/causal/experiment.hpp" +#include "library/causal/fwd.hpp" +#include "library/causal/sample_data.hpp" #include "library/causal/sampling.hpp" #include "library/causal/selected_entry.hpp" #include "library/ptl.hpp" @@ -48,6 +51,7 @@ #include #include #include +#include #include #include @@ -117,7 +121,7 @@ get_eligible_address_ranges() using sf = binary::scope_filter; auto -get_filters(std::set _scopes = { +get_filters(const std::set& _scopes = { sf::BINARY_FILTER, sf::SOURCE_FILTER, sf::FUNCTION_FILTER }) { auto _filters = std::vector{}; @@ -276,29 +280,15 @@ auto compute_eligible_lines_impl() { const auto& _binary_info = get_cached_binary_info().first; - auto& _filter_info = get_cached_binary_info().second; + auto& _scoped_info = get_cached_binary_info().second; auto _filters = get_filters(); - auto& _eligible_ar = get_eligible_address_ranges(); for(const auto& litr : _binary_info) { - for(const auto& ditr : litr.mappings) - { - _eligible_ar += - std::make_pair(binary::address_multirange::coarse{}, - address_range_t{ ditr.load_address, ditr.last_address }); - } - - for(const auto& ditr : litr.symbols) - { - _eligible_ar += ditr.address + ditr.load_address; - } - - auto& _filtered = _filter_info.emplace_back(); - _filtered.bfd = litr.bfd; - _filtered.mappings = litr.mappings; - 
_filtered.ranges = litr.ranges; - _filtered.sections = litr.sections; + auto& _scoped = _scoped_info.emplace_back(); + _scoped.bfd = litr.bfd; + _scoped.mappings = litr.mappings; + _scoped.sections = litr.sections; for(const auto& ditr : litr.symbols) { @@ -312,7 +302,8 @@ compute_eligible_lines_impl() if(ditr(_filters) || (_sym.inlines.size() + _sym.dwarf_info.size()) > 0) { - _filtered.symbols.emplace_back(_sym); + _scoped.ranges.emplace_back(_sym.ipaddr()); + _scoped.symbols.emplace_back(_sym); } } @@ -322,17 +313,31 @@ compute_eligible_lines_impl() sf::satisfies_filter(_filters, sf::SOURCE_FILTER, join(':', ditr.file, ditr.line))) { - _filtered.debug_info.emplace_back(ditr); + _scoped.debug_info.emplace_back(ditr); } } - _filtered.sort(); + _scoped.sort(); + } + + auto& _eligible_ar = get_eligible_address_ranges(); + for(const auto& litr : _scoped_info) + { + for(const auto& ditr : litr.symbols) + { + _eligible_ar += ditr.ipaddr(); + } + + for(auto ditr : litr.ranges) + { + _eligible_ar += ditr; + } } OMNITRACE_VERBOSE( 0, "[causal] eligible address ranges: %zu, coarse address range: %zu [%s]\n", _eligible_ar.size(), _eligible_ar.range_size(), - _eligible_ar.coarse_range.as_string().c_str()); + _eligible_ar.get_coarse_range().as_string().c_str()); if(_eligible_ar.empty()) { @@ -369,9 +374,10 @@ save_maps_info_impl(std::ostream& _ofs) void save_line_info_impl(std::ostream& _ofs, - const std::vector& _binary_data) + const std::vector& _binary_data, + const std::array& _info = { true, true, true }) { - auto _write_impl = [&_ofs](const binary::binary_info& _data) { + auto _write_impl = [&_ofs, &_info](const binary::binary_info& _data) { for(const auto& itr : _data.mappings) { _ofs << itr.pathname << " [" << as_hex(itr.load_address) << " - " @@ -389,28 +395,38 @@ save_line_info_impl(std::ostream& _ofs, if(!itr.func.empty()) _ofs << " [" << tim::demangle(itr.func) << "]"; _ofs << "\n"; - for(const auto& ditr : itr.inlines) + if(std::get<0>(_info)) { - _ofs << " " << 
ditr.file << ":" << ditr.line; - if(!ditr.func.empty()) _ofs << " [" << tim::demangle(ditr.func) << "]"; - _ofs << "\n"; + for(const auto& ditr : itr.inlines) + { + _ofs << " " << ditr.file << ":" << ditr.line; + if(!ditr.func.empty()) + _ofs << " [" << tim::demangle(ditr.func) << "]"; + _ofs << "\n"; + } } - for(const auto& ditr : itr.dwarf_info) + if(std::get<1>(_info)) { - _ofs << " " << as_hex(ditr.address) << " :: " << ditr.file << ":" - << ditr.line; - _ofs << "\n"; - _emitted_dwarf_addresses.emplace(ditr.address.low); + for(const auto& ditr : itr.dwarf_info) + { + _ofs << " " << as_hex(ditr.address) << " :: " << ditr.file + << ":" << ditr.line; + _ofs << "\n"; + _emitted_dwarf_addresses.emplace(ditr.address.low); + } } } - for(const auto& itr : _data.debug_info) + if(std::get<2>(_info)) { - if(_emitted_dwarf_addresses.count(itr.address.low) > 0) continue; - _ofs << " " << as_hex(itr.address) << " :: " << itr.file << ":" - << itr.line; - _ofs << "\n"; + for(const auto& itr : _data.debug_info) + { + if(_emitted_dwarf_addresses.count(itr.address.low) > 0) continue; + _ofs << " " << as_hex(itr.address) << " :: " << itr.file << ":" + << itr.line; + _ofs << "\n"; + } } _ofs << "\n" << std::flush; @@ -433,6 +449,10 @@ compute_eligible_lines() }); } +auto eligible_pc_history = std::map{}; +auto eligible_pc_idx = std::atomic{ 0 }; +auto eligible_pc_candidates = std::atomic{ 0 }; + void perform_experiment_impl(std::shared_ptr> _started) // NOLINT { @@ -455,19 +475,18 @@ perform_experiment_impl(std::shared_ptr> _started) // NOLINT // notify that thread has started if(_started) _started->set_value(); - // pause at least one second to determine sampling rate - // std::this_thread::sleep_for(std::chrono::seconds{ 1 }); - if(!config::get_causal_end_to_end()) { // wait for at least one progress point to start while(num_progress_points.load(std::memory_order_relaxed) == 0) { + std::this_thread::yield(); std::this_thread::sleep_for(std::chrono::milliseconds{ 1 }); } } // 
allow ~10 samples to be collected + std::this_thread::yield(); std::this_thread::sleep_for(std::chrono::milliseconds{ 10 }); double _delay_sec = @@ -481,6 +500,7 @@ perform_experiment_impl(std::shared_ptr> _started) // NOLINT OMNITRACE_VERBOSE(1, "[causal] delaying experimentation for %.2f seconds...\n", _delay_sec); uint64_t _delay_nsec = _delay_sec * units::sec; + std::this_thread::yield(); std::this_thread::sleep_for(std::chrono::nanoseconds{ _delay_nsec }); } @@ -515,58 +535,128 @@ perform_experiment_impl(std::shared_ptr> _started) // NOLINT { if(get_state() == State::Finalized) { - auto _memory = std::stringstream{}; - auto _binary = std::stringstream{}; - auto _scoped = std::stringstream{}; - auto _sample = std::stringstream{}; - save_maps_info_impl(_memory); - save_line_info_impl(_binary, get_cached_binary_info().first); - save_line_info_impl(_scoped, get_cached_binary_info().second); + if(_impl_no > 0) return; - auto _samples = std::map{}; + OMNITRACE_VERBOSE( + 0, + "[causal] experiment failed to start. 
Number of PC candidates: %zu\n", + eligible_pc_candidates.load()); + + auto _memory = std::stringstream{}; + auto _binary = std::stringstream{}; + auto _scoped = std::stringstream{}; + auto _sample = std::stringstream{}; + auto _eligible = std::stringstream{}; + save_maps_info_impl(_memory); + save_line_info_impl(_binary, get_cached_binary_info().first, + { true, true, false }); + save_line_info_impl(_scoped, get_cached_binary_info().second, + { true, true, false }); + + auto _samples_map = std::map{}; for(const auto& itr : get_samples()) { for(const auto& iitr : itr.second) { - _samples[iitr.address] += iitr.count; + _samples_map[iitr.address] += iitr.count; } } + auto _eligible_pc_hist = std::vector>{}; + for(const auto& itr : eligible_pc_history) + { + _eligible_pc_hist.emplace_back(std::make_pair(itr.first, itr.second)); + } + + std::sort( + _eligible_pc_hist.begin(), _eligible_pc_hist.end(), + [](auto&& _lhs, auto&& _rhs) { return _lhs.second > _rhs.second; }); + + for(const auto& itr : _eligible_pc_hist) + { + _eligible << " " << std::setw(8) << itr.second + << " :: " << as_hex(itr.first) << "\n"; + } + + auto _samples = std::vector>{}; + for(const auto& itr : _samples_map) + _samples.emplace_back(std::make_pair(itr.first, itr.second)); + + // sort by most samples + std::sort(_samples.begin(), _samples.end(), + [](const auto& _lhs, const auto& _rhs) { + return _lhs.second > _rhs.second; + }); + for(const auto& itr : _samples) { if(itr.second > 0) { - auto _linfo = get_line_info(itr.first, true); - // if(_linfo.size() > 1) _linfo.pop_front(); - for(const auto& iitr : _linfo) + auto _is_eligible = is_eligible_address(itr.first) && + !get_line_info(itr.first, false).empty(); + auto _linfo = binary::lookup_ipaddr_entry(itr.first); + if(_linfo) { _sample << " " << std::setw(8) << itr.second - << " :: " << as_hex(itr.first) << " [" << iitr.file - << ":" << iitr.line << "][" << demangle(iitr.func) - << "]\n"; - } - - if(_linfo.empty()) - { - _sample << " " << 
std::setw(8) << itr.second - << " :: " << as_hex(itr.first) << "\n"; + << " :: " << std::setw(5) << std::boolalpha + << _is_eligible << " :: " << as_hex(itr.first) << " " + << _linfo->location << ":" << _linfo->lineno << " [" + << demangle(_linfo->name) << "]\n"; + for(const auto& iitr : _linfo->lineinfo.lines) + { + _sample << " " << std::setw(8) << itr.second + << " :: " << std::setw(5) << std::boolalpha + << _is_eligible << " :: " << as_hex(itr.first) + << " " << iitr.location << ":" << iitr.line + << " [" << demangle(iitr.name) << "]\n"; + } } } } + OMNITRACE_PRINT_COLOR(fatal, "causal experiment never started\n"); + std::cerr << std::flush; auto _cerr = tim::log::warning_stream(std::cerr); - _cerr << "\nmaps:\n\n" << _memory.str() << "\n"; - _cerr << "\nbinary:\n\n" << _binary.str() << "\n"; - _cerr << "\nscoped:\n\n" << _scoped.str() << "\n"; - _cerr << "\nsample:\n\n" << _sample.str() << "\n"; + _cerr << "\npc samples:\n\n" << _sample.str() << "\n"; + _cerr << "\neligible pcs:\n\n" << _eligible.str() << "\n"; + _cerr << "\nscoped pcs:\n\n" << _scoped.str() << "\n"; + if(get_verbose() >= 1) + { + _cerr << "\nbinary pcs:\n\n" << _binary.str() << "\n"; + _cerr << "\nmaps:\n\n" << _memory.str() << "\n"; + } std::cerr << std::flush; - OMNITRACE_CONDITIONAL_THROW(_impl_no == 0, "experiment never started"); + // if launched via omnitrace-causal, allow end-to-end runs that do not + // start experiments + auto _omni_causal_launcher = + get_env("OMNITRACE_LAUNCHER", "", false) == + "omnitrace-causal"; + + if(!(get_causal_end_to_end() && _omni_causal_launcher)) + { + OMNITRACE_CONDITIONAL_THROW(_impl_no == 0, + "causal experiment never started"); + } + return; } + else + { + OMNITRACE_VERBOSE( + 1, + "[causal] experiment failed to start. Number of PC candidates: %zu\n", + eligible_pc_candidates.load()); + } } + OMNITRACE_VERBOSE(3, + "[causal] experiment started. 
Number of PC candidates: %zu\n", + eligible_pc_candidates.load()); + + reset_sample_selection(); + // wait for the experiment to complete if(config::get_causal_end_to_end()) { @@ -592,15 +682,14 @@ perform_experiment_impl(std::shared_ptr> _started) // NOLINT } } -// thread-safe read/write ring-buffer via atomics -using pc_ring_buffer_t = tim::data_storage::atomic_ring_buffer; // latest_eligible_pcs is an array of unwind_depth size -> samples will // use lowest indexes for most recent functions address in the call-stack auto latest_eligible_pc = []() { - auto _arr = std::array, unwind_depth>{}; + using atomic_uintptr_t = std::atomic; + constexpr size_t array_size = unwind_depth; + auto _arr = std::array, array_size>{}; for(auto& itr : _arr) - itr = std::make_unique(units::get_page_size() / - (sizeof(uintptr_t) + 1)); + itr = std::make_unique>(0); return _arr; }(); } // namespace @@ -610,20 +699,21 @@ auto latest_eligible_pc = []() { bool is_eligible_address(uintptr_t _v) { - return get_eligible_address_ranges().coarse_range.contains(_v); + return get_eligible_address_ranges().contains(_v); } void save_line_info(const settings::compose_filename_config& _cfg, int _verbose) { - auto _write = [_verbose](const std::string& ofname, const auto& _data) { + auto _write = [_verbose](const std::string& ofname, const auto& _data, + const std::array& _info) { auto _ofs = std::ofstream{}; if(tim::filepath::open(_ofs, ofname)) { if(_verbose >= 0) operation::file_output_message{}( ofname, std::string{ "causal_symbol_info" }); - save_line_info_impl(_ofs, _data); + save_line_info_impl(_ofs, _data, _info); save_maps_info_impl(_ofs); } else @@ -634,27 +724,54 @@ save_line_info(const settings::compose_filename_config& _cfg, int _verbose) _write(tim::settings::compose_output_filename( join('-', config::get_causal_output_filename(), "binary"), "txt", _cfg), - get_cached_binary_info().first); + get_cached_binary_info().first, { true, true, true }); 
_write(tim::settings::compose_output_filename( join('-', config::get_causal_output_filename(), "scoped"), "txt", _cfg), - get_cached_binary_info().second); + get_cached_binary_info().second, { true, true, false }); +} + +size_t +set_current_selection(unwind_addr_t _stack) +{ + for(auto itr : _stack) + { + if(itr == 0) continue; + ++eligible_pc_candidates; + if(is_eligible_address(itr)) + { + auto _idx = eligible_pc_idx++ % latest_eligible_pc.size(); + latest_eligible_pc.at(_idx)->store(itr); + } + } + + return eligible_pc_idx.load(std::memory_order_relaxed); +} + +size_t +set_current_selection(container::c_array _stack) +{ + for(auto itr : _stack) + { + if(itr == 0) continue; + ++eligible_pc_candidates; + if(is_eligible_address(itr)) + { + auto _idx = eligible_pc_idx++ % latest_eligible_pc.size(); + latest_eligible_pc.at(_idx)->store(itr); + } + } + + return eligible_pc_idx.load(std::memory_order_relaxed); } void -set_current_selection(unwind_addr_t _stack) +reset_sample_selection() { - if(experiment::is_active()) return; - - size_t _n = 0; - for(auto itr : _stack) + eligible_pc_idx.store(0); + eligible_pc_candidates.store(0); + for(auto& itr : latest_eligible_pc) { - auto& _pcs = latest_eligible_pc.at(_n); - if(_pcs && is_eligible_address(itr)) - { - _pcs->write(&itr); - // increment after valid found -> first valid pc for call-stack - ++_n; - } + if(itr) itr->store(0); } } @@ -663,8 +780,6 @@ sample_selection(size_t _nitr, size_t _wait_ns) { OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - size_t _n = 0; - auto _select_address = [&](auto& _address_vec) { // this isn't necessary bc of check before calling this lambda but // kept because of size() - 1 in distribution range @@ -688,6 +803,8 @@ sample_selection(size_t _nitr, size_t _wait_ns) _address_vec.erase(_address_vec.begin() + _idx); + eligible_pc_history[_addr] += 1; + if(get_causal_mode() == CausalMode::Function) _sym_addr = (_dl_info.symbol) ? 
_dl_info.symbol.address() : _addr; @@ -725,45 +842,37 @@ sample_selection(size_t _nitr, size_t _wait_ns) ? linfo.front() : linfo.back(); return selected_entry{ _addr, _sym_addr, _linfo_v }; - // return selected_entry{ address_range_t{ _addr }, - // address_range_t{ _sym_addr }, - // { _linfo_v.second } }; } return selected_entry{}; }; - while(_n++ < _nitr) + while(eligible_pc_idx.load(std::memory_order_relaxed) == 0) + { + if(get_state() >= State::Finalized) return selected_entry{}; + std::this_thread::yield(); + std::this_thread::sleep_for(std::chrono::nanoseconds{ _wait_ns }); + } + + for(size_t _n = 0; _n < _nitr; ++_n) { auto _addresses = std::deque{}; for(auto& aitr : latest_eligible_pc) { if(OMNITRACE_UNLIKELY(!aitr)) { - OMNITRACE_WARNING(0, "invalid ring buffer...\n"); + OMNITRACE_WARNING(0, "invalid atomic pc...\n"); continue; } - auto _naddrs = aitr->count(); - if(_naddrs == 0) continue; - - for(size_t i = 0; i < _naddrs; ++i) - { - uintptr_t _addr = 0; - if(!aitr->is_empty() && aitr->read(&_addr) != nullptr) - { - if(_addr > 0) _addresses.emplace_back(_addr); - } - } - - if(!_addresses.empty()) - { - auto _selection = _select_address(_addresses); - if(_selection) return _selection; - } + uintptr_t _addr = aitr->load(); + if(_addr > 0) _addresses.emplace_back(_addr); } - std::this_thread::yield(); - std::this_thread::sleep_for(std::chrono::nanoseconds{ _wait_ns }); + if(!_addresses.empty()) + { + auto _selection = _select_address(_addresses); + if(_selection) return _selection; + } } return selected_entry{}; @@ -781,12 +890,32 @@ get_line_info(uintptr_t _addr, bool _include_discarded) { auto _local_data = std::deque{}; + // make sure the address is in the coarse grained mapped regions + // before performing an exhaustive search + bool _is_mapped = std::find_if(litr.mappings.begin(), litr.mappings.end(), + [_addr](const auto& mitr) { + return address_range_t{ mitr.load_address, + mitr.last_address } + .contains(_addr); + }) != litr.mappings.end(); + + 
if(!_is_mapped) return; + for(const auto& ditr : litr.symbols) { + // skip if load address is greater than address + if(_addr < ditr.load_address) continue; + // compute the symbols ip address range auto _ipaddr = ditr.ipaddr(); + // if the lower bound of the ip address range is greater than the address, + // all following symbols are not worth searching since they are at higher + // addresses than this symbol (sorted by address) + // if(_ipaddr.low > _addr) break; + if(!_ipaddr.contains(_addr)) continue; - if(config::get_causal_mode() == CausalMode::Function) + if(_include_discarded || + config::get_causal_mode() == CausalMode::Function) { // check if the primary symbol satisfy the constraints if(ditr(_filters)) _local_data.emplace_back(ditr); @@ -795,28 +924,28 @@ get_line_info(uintptr_t _addr, bool _include_discarded) // functions may utility::combine(_local_data, ditr.get_inline_symbols(_filters)); } - else if(config::get_causal_mode() == CausalMode::Line) + + if(_include_discarded || config::get_causal_mode() == CausalMode::Line) { auto _debug_data = std::deque{}; for(const auto& itr : ditr.get_debug_line_info(_filters)) { + if(!_ipaddr.contains(itr.ipaddr())) + OMNITRACE_THROW( + "Error! 
debug line info ipaddr (%s) is not contained in " + "symbol ipaddr (%s)", + as_hex(itr.ipaddr()).c_str(), as_hex(_ipaddr).c_str()); if(itr.ipaddr().contains(_addr)) _debug_data.emplace_back(itr); } utility::combine(_local_data, _debug_data); } - else - { - throw exception( - join(" ", "Causal mode not supported:", - std::to_string(config::get_causal_mode()))); - } } if(!_local_data.empty()) { // combine and only allow first match utility::combine(_data, _local_data); - break; + if(!_include_discarded) break; } } }; diff --git a/source/lib/omnitrace/library/causal/data.hpp b/source/lib/omnitrace/library/causal/data.hpp index 8fdba31656..3648fcddf0 100644 --- a/source/lib/omnitrace/library/causal/data.hpp +++ b/source/lib/omnitrace/library/causal/data.hpp @@ -52,10 +52,14 @@ get_line_info(uintptr_t _addr, bool include_discarded = true); bool is_eligible_address(uintptr_t); -void set_current_selection(unwind_addr_t); +size_t set_current_selection(unwind_addr_t); +size_t set_current_selection(container::c_array); + +void +reset_sample_selection(); selected_entry -sample_selection(size_t _nitr = 1000, size_t _wait_ns = 10000); +sample_selection(size_t _nitr = 1000, size_t _wait_ns = 100000); void push_progress_point(std::string_view); diff --git a/source/lib/omnitrace/library/causal/delay.cpp b/source/lib/omnitrace/library/causal/delay.cpp index b254f23ea3..0568a9bb0d 100644 --- a/source/lib/omnitrace/library/causal/delay.cpp +++ b/source/lib/omnitrace/library/causal/delay.cpp @@ -25,6 +25,7 @@ #include "core/utility.hpp" #include "library/causal/components/causal_gotcha.hpp" #include "library/causal/experiment.hpp" +#include "library/causal/sampling.hpp" #include "library/runtime.hpp" #include "library/thread_data.hpp" #include "library/thread_info.hpp" @@ -108,17 +109,16 @@ delay::process() { if(get_global() < get_local()) { - auto _diff = (get_local() - get_global()); - if(_diff > sleep_for_overhead) get_global() += _diff; + get_global() += (get_local() - 
get_global()); } else if(get_global() > get_local()) { - ::omnitrace::causal::component::causal_gotcha::block_signals(); + ::omnitrace::causal::sampling::pause(); auto _beg = tracing::now(); std::this_thread::sleep_for( std::chrono::nanoseconds{ get_global() - get_local() }); get_local() += (tracing::now() - _beg); - ::omnitrace::causal::component::causal_gotcha::unblock_signals(); + ::omnitrace::causal::sampling::resume(); } } else diff --git a/source/lib/omnitrace/library/causal/experiment.cpp b/source/lib/omnitrace/library/causal/experiment.cpp index e0f0e67a13..89dff0480a 100644 --- a/source/lib/omnitrace/library/causal/experiment.cpp +++ b/source/lib/omnitrace/library/causal/experiment.cpp @@ -21,6 +21,9 @@ // SOFTWARE. #include "library/causal/experiment.hpp" +#include "binary/analysis.hpp" +#include "binary/dwarf_entry.hpp" +#include "binary/symbol.hpp" #include "common/defines.h" #include "core/config.hpp" #include "core/debug.hpp" @@ -29,11 +32,11 @@ #include "library/causal/components/progress_point.hpp" #include "library/causal/data.hpp" #include "library/causal/delay.hpp" +#include "library/causal/sample_data.hpp" #include "library/thread_data.hpp" #include "library/thread_info.hpp" #include "library/tracing.hpp" -#include #include #include #include @@ -42,10 +45,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -68,26 +73,31 @@ bool use_exp_speedup_scaling = get_env("OMNITRACE_CAUSAL_SCALE_EXPERIMENT_TIME_BY_SPEEDUP", false); } // namespace +experiment::sample::sample(const base_type& _b, uint64_t _c) +: base_type{ _b } +, count{ _c } +{ + if(lineinfo) + { + for(const auto& itr : lineinfo.lines) + { + if(itr.inlined) + inlines.emplace_back( + binary::inlined_symbol{ itr.line, itr.location, itr.name }); + } + } +} + bool experiment::sample::operator==(const sample& _v) const { - return std::tie(address, info.line, info.file, info.func, location) == - std::tie(_v.address, _v.info.line, _v.info.file, 
_v.info.func, _v.location); + return base_type::operator==(_v); } bool experiment::sample::operator<(const sample& _v) const { - if(info.line > 0 && _v.info.line > 0) - { - return std::tie(info.line, info.file) == std::tie(_v.info.line, _v.info.file); - } - else if((info.line + _v.info.line) > 0) - { - return std::tie(info.file, location, info.line) < - std::tie(_v.info.file, _v.location, _v.info.line); - } - return (location < _v.location); + return base_type::operator<(_v); } const auto& @@ -102,8 +112,35 @@ void experiment::sample::serialize(ArchiveT& ar, const unsigned) { namespace cereal = ::tim::cereal; - ar(cereal::make_nvp("location", location), cereal::make_nvp("count", count), - cereal::make_nvp("info", info)); + using cereal::make_nvp; + + ar(cereal::make_nvp("count", count)); + if constexpr(concepts::is_output_archive::value) + { + ar(cereal::make_nvp("location", get_identifier())); + } + + ar.setNextName("info"); + ar.startNode(); + ar(make_nvp("address", address), make_nvp("line", lineno), make_nvp("file", location), + make_nvp("func", name)); + + if constexpr(concepts::is_output_archive::value) + { + ar(cereal::make_nvp("dfunc", demangle(name)), + cereal::make_nvp("dwarf_info", std::vector{})); + } + ar(cereal::make_nvp("inlines", inlines)); + ar.finishNode(); + + ar(cereal::make_nvp("dlinfo", info)); +} + +std::string +experiment::sample::get_identifier() const +{ + return (lineno > 0 && !location.empty()) ? 
join(":", location, lineno) + : demangle(name); } template @@ -119,7 +156,7 @@ experiment::record::serialize(ArchiveT& ar, const unsigned) { ar(cereal::make_nvp("samples", _samples)); for(auto& itr : _samples) - samples.emplace(std::move(itr)); + samples.emplace_back(std::move(itr)); } else { @@ -171,8 +208,6 @@ experiment::serialize(ArchiveT& ar, const unsigned) } ar(cereal::make_nvp("progress_points", _ppts)); } - - ar(cereal::make_nvp("period_stats", period_stats)); } std::string @@ -203,9 +238,6 @@ experiment::start() // sampling period in nanoseconds sampling_period = backtrace_causal::get_period(units::nsec); - // adjust for the real sampling period - period_stats = causal::component::backtrace::get_period_stats(); - if(period_stats.get_count() > 10) sampling_period = period_stats.get_mean(); // experiment time is scaled up for longer speedups index = experiment_history.size() + 1; @@ -222,10 +254,14 @@ experiment::start() OMNITRACE_VERBOSE(0, "Starting causal experiment #%-3u: %s\n", index, as_string().c_str()); - current_experiment_value = *this; - current_selected_count.store(0); - current_experiment.store(this); - return true; + if(get_state() < State::Finalized) + { + current_experiment_value = *this; + current_selected_count.store(0); + current_experiment.store(this); + return true; + } + return false; } bool @@ -258,34 +294,46 @@ experiment::stop() total_delay = (global_delay - total_delay); duration = (experiment_time > total_delay) ? (experiment_time - total_delay) : 0; fini_progress = component::progress_point::get_progress_points(); - period_stats = causal::component::backtrace::get_period_stats(); // sync data delay::sync(); - // for larger speedups, we increased the experiment time, so we want to artificially - // increase num by the same factor. E.g. 10 throughput points at speedup 50 should - // really look like 15 - double _scale_num = 1.0 + ((use_exp_speedup_scaling) ? 
delay_scaling : 0.0); - auto _prog_stats = tim::statistics{}; + auto _prog_stats = tim::statistics{}; + auto _prog_vals = std::vector{}; + _prog_vals.reserve(fini_progress.size()); for(auto fitr : fini_progress) { auto _pt = fitr.second - init_progress[fitr.first]; int64_t _num = std::max({ _pt.get_laps(), _pt.get_arrival(), _pt.get_departure() }); - if(_num > 0) _prog_stats += (_num * _scale_num); + if(_num > 0) _prog_vals.emplace_back(_num); } + std::sort(_prog_vals.begin(), _prog_vals.end()); + for(auto itr : _prog_vals) + _prog_stats += itr; - auto _mean = (_prog_stats.get_count() > 0) ? _prog_stats.get_mean() : 0; - auto _high = (_prog_stats.get_count() > 0) ? _prog_stats.get_max() : 0; - if(_high < 5) + auto _nvals = _prog_vals.size(); + auto _medi = (_nvals > 2) ? _prog_vals.at(_nvals / 2) : _prog_vals.front(); + auto _mean = (_nvals > 0) ? _prog_stats.get_mean() : 0; + auto _high = (_nvals > 0) ? _prog_stats.get_max() : 0; + auto _lowv = (_nvals > 0) ? _prog_stats.get_min() : 0; + + if(_lowv <= 3 && (_mean < 5 || _medi < 5)) { + OMNITRACE_VERBOSE(2, + "[progress points] increasing experiment time :: low: %6.3f, " + "high: %6.3f, mean: %6.3f, median: %zi\n", + _lowv, _high, _mean, _medi); global_scaling *= 2; ++global_scaling_increments; // keep track of how many successive increments have // been performed } - else if(_mean > 10 && global_scaling > 1) + else if(_mean > 10 && _lowv >= 8 && global_scaling > 1) { + OMNITRACE_VERBOSE(2, + "[progress points] decreasing experiment time :: low: %6.3f, " + "high: %6.3f, mean: %6.3f, median: %zi\n", + _lowv, _high, _mean, _medi); global_scaling /= 2; global_scaling_increments = 0; } @@ -304,7 +352,9 @@ experiment::stop() if(_high > 0) experiment_history.emplace_back(*this); - std::this_thread::sleep_for(std::chrono::nanoseconds{ sampling_period * batch_size }); + std::this_thread::sleep_for( + std::chrono::nanoseconds{ 5 * sampling_period * batch_size }); + return true; } @@ -320,7 +370,6 @@ experiment::as_string() 
const _ss << ", duration: " << std::setw(5) << std::fixed << std::setprecision(3) << _dur << " sec"; _ss << " :: experiment: " << as_hex(selection.address) << " "; - //_ss << " [" << selection.info.ipaddr().as_string() << "]"; if(selection.symbol_address > 0 && selection.address != selection.symbol_address) _ss << "(symbol@" << as_hex(selection.symbol_address) << ") "; if(!selection.symbol.file.empty() && selection.symbol.line > 0) @@ -375,13 +424,30 @@ experiment::is_active() return (current_experiment.load(std::memory_order_relaxed) != nullptr); } +bool +experiment::is_selected(uint64_t _addr) +{ + return (is_active() && current_experiment_value.selection.contains(_addr)); +} + bool experiment::is_selected(unwind_addr_t _stack) { if(is_active()) { for(auto itr : _stack) - if(current_experiment_value.selection.contains(itr)) return true; + if(itr > 0 && current_experiment_value.selection.contains(itr)) return true; + } + return false; +} + +bool +experiment::is_selected(container::c_array _stack) +{ + if(is_active()) + { + for(auto itr : _stack) + if(itr > 0 && current_experiment_value.selection.contains(itr)) return true; } return false; } @@ -413,9 +479,6 @@ experiment::save_experiments(std::string _fname_base, const filename_config_t& _ { const auto& _info0 = thread_info::get(0, InternalTID); - // if(experiment_history.size() > 1) - // experiment_history.erase(experiment_history.begin()); - auto current_record = record{}; current_record.startup = _info0->lifetime.first; @@ -446,11 +509,7 @@ experiment::save_experiments(std::string _fname_base, const filename_config_t& _ // update sample data { auto _add_sample = [¤t_record](sample&& _v) { - auto fitr = current_record.samples.find(_v); - if(fitr != current_record.samples.end()) - *fitr += _v; - else - current_record.samples.emplace(std::move(_v)); + current_record.samples.emplace_back(std::move(_v)); }; auto _total_samples = std::map{}; @@ -462,41 +521,24 @@ experiment::save_experiments(std::string _fname_base, 
const filename_config_t& _ } } + OMNITRACE_VERBOSE_F(1, "Processing line info for %zu sampled addresses...\n", + _total_samples.size()); + + for(const auto& itr : _total_samples) + { + auto _entry = binary::lookup_ipaddr_entry(itr.first); + if(_entry) _add_sample(sample{ *_entry, itr.second }); + } + auto _binfo_cfg = settings::compose_filename_config{}; _binfo_cfg.subdirectory = "causal/binary-info"; _binfo_cfg.use_suffix = config::get_use_pid(); save_line_info(_binfo_cfg, config::get_verbose()); - - for(const auto& itr : _total_samples) - { - auto _addr = itr.first; - auto _count = itr.second; - if(_count > 0) - { - auto _linfo = get_line_info(_addr, true); - for(const auto& iitr : _linfo) - { - auto _name = (iitr.line > 0) ? join(":", iitr.file, iitr.line) - : demangle(iitr.func); - - _name = join(" :: ", as_hex(_addr), _name); - _add_sample(sample{ _count, _addr, _name, iitr }); - } - - if(_linfo.empty() && config::get_debug()) - { - _add_sample( - sample{ _count, _addr, as_hex(_addr), sample::line_info{} }); - } - } - } } bool _causal_output_reset = config::get_setting_value("OMNITRACE_CAUSAL_FILE_RESET").value_or(false); - // if(current_record.experiments.empty()) return; - { auto _saved_experiments = (_causal_output_reset) ? 
std::vector{} @@ -615,7 +657,8 @@ experiment::save_experiments(std::string _fname_base, const filename_config_t& _ for(const auto& itr : current_record.samples) { - ofs << "samples\tlocation=" << itr.location << "\tcount=" << itr.count; + ofs << "samples\tlocation=" << itr.get_identifier() + << "\tcount=" << itr.count; if(config::get_debug()) ofs << "\taddress=" << as_hex(itr.address); ofs << "\n"; } diff --git a/source/lib/omnitrace/library/causal/experiment.hpp b/source/lib/omnitrace/library/causal/experiment.hpp index c383022337..7663acd5ff 100644 --- a/source/lib/omnitrace/library/causal/experiment.hpp +++ b/source/lib/omnitrace/library/causal/experiment.hpp @@ -30,13 +30,13 @@ #include "library/causal/components/backtrace.hpp" #include "library/causal/components/progress_point.hpp" #include "library/causal/data.hpp" -#include "library/causal/sample_data.hpp" #include "library/causal/selected_entry.hpp" #include #include #include #include +#include #include #include @@ -55,17 +55,17 @@ struct experiment std::unordered_map; using experiments_t = std::vector; using filename_config_t = settings::compose_filename_config; - using sample_dataset_t = std::set; using period_stats_t = tim::statistics; - struct sample + struct sample : unwind::processed_entry { - using line_info = binary::symbol; + using base_type = unwind::processed_entry; - mutable uint64_t count = 0; - uintptr_t address = 0; - std::string location = {}; - line_info info = {}; + sample() = default; + sample(const base_type&, uint64_t); + + mutable uint64_t count = 0; + std::vector inlines = {}; bool operator==(const sample&) const; bool operator<(const sample&) const; @@ -73,6 +73,8 @@ struct experiment template void serialize(ArchiveT& ar, const unsigned); + + std::string get_identifier() const; }; struct record @@ -80,7 +82,7 @@ struct experiment int64_t startup = 0; uint64_t runtime = 0; std::vector experiments = {}; - std::set samples = {}; + std::vector samples = {}; template void 
serialize(ArchiveT& ar, const unsigned); @@ -105,10 +107,18 @@ struct experiment static double get_delay_scaling(); static uint32_t get_index(); static bool is_active(); + static bool is_selected(uint64_t); static bool is_selected(unwind_addr_t); + static bool is_selected(container::c_array); static void add_selected(); static experiments_t get_experiments(); + template + static bool is_selected(std::array _v) + { + return is_selected(container::c_array{ _v.data(), _v.size() }); + } + static void save_experiments(); static void save_experiments(std::string, const filename_config_t&); static std::vector load_experiments(bool _throw_on_err = true); @@ -116,24 +126,23 @@ struct experiment bool = true); bool running = false; - uint16_t virtual_speedup = 0; /// 0-100 in multiples of 5 - uint32_t index = 0; /// experiment number - uint64_t sampling_period = 0; /// period b/t samples [nsec] - uint64_t start_time = 0; /// start of experiment [nsec] - uint64_t end_time = 0; /// end of experiment [nsec] - uint64_t experiment_time = 0; /// how long the experiment ran [nsec] - uint64_t duration = 0; /// runtime - delays [nsec] - uint64_t batch_size = 10; /// batch factor for experiment/cooloff - uint64_t scaling_factor = 50; /// scaling factor for experiment time - uint64_t sample_delay = 0; /// how long to delay [nsec] - uint64_t total_delay = 0; /// total delays [nsec] - uint64_t selected = 0; /// num times selected line sampled + uint16_t virtual_speedup = 0; /// 0-100 in multiples of 5 + uint32_t index = 0; /// experiment number + uint64_t sampling_period = 0; /// period b/t samples [nsec] + uint64_t start_time = 0; /// start of experiment [nsec] + uint64_t end_time = 0; /// end of experiment [nsec] + uint64_t experiment_time = 0; /// how long the experiment ran [nsec] + uint64_t duration = 0; /// runtime - delays [nsec] + uint64_t batch_size = 10; /// batch factor for experiment/cooloff + uint64_t scaling_factor = 100; /// scaling factor for experiment time + uint64_t 
sample_delay = 0; /// how long to delay [nsec] + uint64_t total_delay = 0; /// total delays [nsec] + uint64_t selected = 0; /// num times selected line sampled uint64_t global_delay = 0; double delay_scaling = 0.0; /// virtual_speedup / 100. selected_entry selection = {}; /// which line was selected progress_points_t init_progress = {}; /// progress points at start progress_points_t fini_progress = {}; /// progress points at end - period_stats_t period_stats = {}; /// stats for sampling period }; } // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/causal/fwd.hpp b/source/lib/omnitrace/library/causal/fwd.hpp index 183ac2ec2d..31db91a9e5 100644 --- a/source/lib/omnitrace/library/causal/fwd.hpp +++ b/source/lib/omnitrace/library/causal/fwd.hpp @@ -22,6 +22,7 @@ #pragma once +#include "common/defines.h" #include "core/binary/fwd.hpp" #include "core/containers/static_vector.hpp" #include "core/defines.hpp" @@ -41,7 +42,7 @@ namespace unwind = ::tim::unwind; namespace causal { -static constexpr size_t unwind_depth = 8; +static constexpr size_t unwind_depth = OMNITRACE_MAX_UNWIND_DEPTH; static constexpr size_t unwind_offset = 0; using unwind_stack_t = unwind::stack; using unwind_addr_t = container::static_vector; diff --git a/source/lib/omnitrace/library/causal/sample_data.cpp b/source/lib/omnitrace/library/causal/sample_data.cpp index cbd310634f..4cde123341 100644 --- a/source/lib/omnitrace/library/causal/sample_data.cpp +++ b/source/lib/omnitrace/library/causal/sample_data.cpp @@ -33,32 +33,38 @@ namespace causal { namespace { -auto samples = std::map>{}; +auto samples = std::map>{}; } -std::set +std::vector get_samples(uint32_t _index) { - return samples[_index]; + auto _data = std::vector{}; + _data.reserve(samples.at(_index).size()); + for(const auto& itr : samples.at(_index)) + { + _data.emplace_back(sample_data{ itr.first, itr.second }); + } + return _data; } -std::map> +std::map> get_samples() { - return samples; + auto _data = 
std::map>{}; + + for(const auto& itr : samples) + { + _data[itr.first] = get_samples(itr.first); + } + + return _data; } void -add_sample(uint32_t _index, uintptr_t _v) +add_sample(uint32_t _index, uintptr_t _addr, uint64_t _count) { - auto& _samples = samples[_index]; - auto _value = sample_data{ _v }; - _value.count = 1; - auto itr = _samples.find(_value); - if(itr == _samples.end()) - _samples.emplace(_value); - else - itr->count += 1; + samples[_index][_addr] += _count; } void @@ -67,5 +73,12 @@ add_samples(uint32_t _index, const std::vector& _v) for(const auto& itr : _v) add_sample(_index, itr); } + +void +add_samples(uint32_t _index, const std::map& _v) +{ + for(const auto& itr : _v) + add_sample(_index, itr.first, itr.second); +} } // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/causal/sample_data.hpp b/source/lib/omnitrace/library/causal/sample_data.hpp index 1a6e21ace3..a6898e8a9a 100644 --- a/source/lib/omnitrace/library/causal/sample_data.hpp +++ b/source/lib/omnitrace/library/causal/sample_data.hpp @@ -51,14 +51,17 @@ struct sample_data } }; -std::map> +std::map> get_samples(); void add_samples(uint32_t, const std::vector&); -std::set get_samples(uint32_t); +std::vector get_samples(uint32_t); -void add_sample(uint32_t, uintptr_t); +void add_sample(uint32_t, uintptr_t, uint64_t = 1); + +void +add_samples(uint32_t, const std::map&); } // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/causal/sampling.cpp b/source/lib/omnitrace/library/causal/sampling.cpp index 10452b6115..90573c9673 100644 --- a/source/lib/omnitrace/library/causal/sampling.cpp +++ b/source/lib/omnitrace/library/causal/sampling.cpp @@ -21,6 +21,7 @@ // SOFTWARE. 
#include "library/causal/sampling.hpp" +#include "binary/analysis.hpp" #include "core/common.hpp" #include "core/concepts.hpp" #include "core/config.hpp" @@ -30,6 +31,8 @@ #include "core/utility.hpp" #include "library/causal/components/backtrace.hpp" #include "library/causal/data.hpp" +#include "library/causal/sample_data.hpp" +#include "library/perf.hpp" #include "library/ptl.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" @@ -37,8 +40,11 @@ #include "library/thread_info.hpp" #include +#include #include +#include #include +#include #include #include #include @@ -46,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -58,11 +65,14 @@ namespace causal namespace sampling { using ::tim::sampling::dynamic; +using ::tim::sampling::overflow; using ::tim::sampling::timer; using causal_bundle_t = - tim::lightweight_tuple; -using causal_sampler_t = tim::sampling::sampler; + tim::lightweight_tuple; +using causal_sampler_t = tim::sampling::sampler; +using backtrace_enabled = trait::runtime_enabled; +using overflow_enabled = trait::runtime_enabled; } // namespace sampling } // namespace causal } // namespace omnitrace @@ -156,17 +166,34 @@ void causal_offload_buffer(int64_t, causal_sampler_buffer_t&& _buf) { auto _data = std::move(_buf); - auto _processed = std::map>{}; + auto _processed = std::map>{}; while(!_data.is_empty()) { auto _bundle = causal_sampler_bundle_t{}; _data.read(&_bundle); - auto* _bt_causal = _bundle.get(); + + const auto* _bt_causal = _bundle.get(); if(_bt_causal) { - for(auto&& itr : _bt_causal->get_stack()) + auto _stack = _bt_causal->get_stack(); + + for(auto itr : _stack) { - if(itr > 0) _processed[_bt_causal->get_index()].emplace_back(itr); + if(itr > 0) _processed[_bt_causal->get_index()][itr] += 1; + } + } + + const auto* _of_causal = _bundle.get(); + if(_of_causal) + { + const auto& _stack = _of_causal->get_stack(); + + for(const auto& ditr : _stack) + { + for(auto aitr : ditr) + { + if(aitr > 0) 
_processed[_of_causal->get_index()][aitr] += 1; + } } } } @@ -177,7 +204,9 @@ causal_offload_buffer(int64_t, causal_sampler_buffer_t&& _buf) static auto _mutex = locking::atomic_mutex{}; auto _lk = locking::atomic_lock{ _mutex }; for(const auto& itr : _processed) + { add_samples(itr.first, itr.second); + } } } @@ -186,6 +215,7 @@ configure(bool _setup, int64_t _tid) { const auto& _info = thread_info::get(_tid, SequentTID); auto& _causal = get_causal_sampler(_tid); + auto& _causal_perf = perf::get_instance(_tid); auto& _running = get_causal_sampler_running(_tid); auto& _signal_types = get_causal_sampler_signals(_tid); @@ -197,6 +227,19 @@ configure(bool _setup, int64_t _tid) if(_setup && _signal_types.empty()) _signal_types = get_sampling_signals(_tid); + // initialize + if(_setup) + { + using global_init_mode = operation::mode_constant; + using thread_init_mode = operation::mode_constant; + // initialize backtrace + operation::init{}(global_init_mode{}); + operation::init{}(thread_init_mode{}); + // initialize overflow + operation::init{}(global_init_mode{}); + operation::init{}(thread_init_mode{}); + } + if(_setup && !_causal && !_running && !_signal_types.empty()) { auto _verbose = std::min(get_verbose() - 2, 2); @@ -218,21 +261,95 @@ configure(bool _setup, int64_t _tid) _causal = std::make_unique(_causal_alloc, "omnitrace", _tid, _verbose); + auto _activate_perf_backend = [&_causal, &_causal_perf, &_info, &_tid]() { + _causal_perf = std::make_unique(); + auto _open_error = + _causal_perf->open(1000.0, 10, _info->index_data->system_value); + if(_open_error) + { + _causal_perf.reset(); + } + else + { + overflow_enabled::set(true); + overflow_enabled::set(scope::thread_scope{}, true); + backtrace_enabled::set(false); + backtrace_enabled::set(scope::thread_scope{}, false); + _causal->configure(overflow{ get_sampling_overflow_signal(), + [](int, pid_t, long, int64_t) { + // perf::get_instance(_idx)->set_ready_signal(_sig); + return true; + }, + [](int, pid_t, long, 
int64_t _idx) { + return perf::get_instance(_idx)->start(); + }, + [](int, pid_t, long, int64_t _idx) { + return perf::get_instance(_idx)->stop(); + }, + _tid, threading::get_sys_tid() }); + if(_tid == 0) OMNITRACE_VERBOSE(1, "causal profiling backend: perf\n"); + } + + return _open_error; + }; + + auto _activate_timer_backend = [&_causal, &_tid]() { + backtrace_enabled::set(true); + backtrace_enabled::set(scope::thread_scope{}, true); + overflow_enabled::set(false); + overflow_enabled::set(scope::thread_scope{}, false); + _causal->configure(timer{ get_sampling_realtime_signal(), CLOCK_REALTIME, + SIGEV_THREAD_ID, 1000.0, 1.0e-6, _tid, + threading::get_sys_tid() }); + if(_tid == 0) OMNITRACE_VERBOSE(1, "causal profiling backend: timer\n"); + return true; + }; + TIMEMORY_REQUIRE(_causal) << "nullptr to causal profiling instance"; _causal->set_flags(SA_RESTART); _causal->set_verbose(_verbose); _causal->set_offload(&causal_offload_buffer); - _causal->configure(timer{ get_realtime_signal(), CLOCK_REALTIME, SIGEV_THREAD_ID, - 1000.0, 1.0e-6, _tid, threading::get_sys_tid() }); + if(get_causal_backend() == CausalBackend::Perf) + { + auto _perf_error = _activate_perf_backend(); + OMNITRACE_REQUIRE(!_perf_error) + << "perf backend for causal profiling failed to activate: " + << *_perf_error << "\n"; + } + else if(get_causal_backend() == CausalBackend::Timer) + { + OMNITRACE_REQUIRE(_activate_timer_backend()) + << "timer backend for causal profiling failed to activate\n"; + } + else if(get_causal_backend() == CausalBackend::Auto) + { + auto _perf_error = _activate_perf_backend(); + if(!_perf_error) + { + config::set_setting_value("OMNITRACE_CAUSAL_BACKEND", + std::string{ "perf" }); + } + else + { + OMNITRACE_WARNING_F( + 0, "perf backend for causal profiling failed to activate: %s\n", + _perf_error->c_str()); - _causal->configure(timer{ get_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, + OMNITRACE_REQUIRE(_activate_timer_backend()) + << "timer backend for causal profiling 
failed to activate\n"; + + config::set_setting_value("OMNITRACE_CAUSAL_BACKEND", + std::string{ "timer" }); + } + } + + _causal->configure(timer{ get_sampling_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, SIGEV_THREAD_ID, 1000.0, 1.0e-6, _tid, threading::get_sys_tid() }); _running = true; - if(_tid == 0) causal::component::backtrace::start(); _causal->start(); } else if(!_setup && _causal && _running) @@ -257,12 +374,22 @@ configure(bool _setup, int64_t _tid) get_causal_sampler(i)->stop(); get_causal_sampler(i)->reset(); } + + if(perf::get_instance(i)) + { + perf::get_instance(i).reset(); + } } } _causal->stop(); _causal->reset(); + if(_causal_perf) + { + _causal_perf.reset(); + } + OMNITRACE_DEBUG("Causal sampler destroyed for thread %lu\n", _tid); } @@ -311,17 +438,113 @@ unblock_samples() void block_backtrace_samples() { - trait::runtime_enabled::set(scope::thread_scope{}, - false); + pause(scope::thread_scope{}); } void unblock_backtrace_samples() { - trait::runtime_enabled::set(scope::thread_scope{}, - true); + resume(scope::thread_scope{}); } +namespace +{ +std::optional _process_paused = {}; +thread_local std::optional _thread_paused = {}; +namespace signals = ::tim::signals; + +const auto& +sampling_signals() +{ + static thread_local auto _v = get_signal_types(threading::get_id()); + return _v; +} +} // namespace + +template +void pause(ScopeT) +{ + static_assert( + tim::is_one_of>::value, + "Unsupported scope"); + + if constexpr(std::is_same::value) + { + if(!_thread_paused) _thread_paused = false; + + bool _paused_v = *_thread_paused; + if(!_paused_v) + { + auto& _causal_perf = perf::get_instance(threading::get_id()); + if(_causal_perf) _causal_perf->stop(); + signals::block_signals(sampling_signals(), signals::sigmask_scope::thread); + _thread_paused = true; + } + } + else + { + if(!_process_paused) _process_paused = false; + + bool _paused_v = *_process_paused; + if(!_paused_v) + { + for(auto i = 0; i < OMNITRACE_MAX_THREADS; ++i) + { + auto& _causal_perf 
= perf::get_instance(i); + if(_causal_perf) _causal_perf->stop(); + } + signals::block_signals(sampling_signals(), signals::sigmask_scope::process); + _process_paused = true; + } + } +} + +template +void resume(ScopeT) +{ + static_assert( + tim::is_one_of>::value, + "Unsupported scope"); + + if constexpr(std::is_same::value) + { + if(!_thread_paused) _thread_paused = true; + + bool _paused_v = *_thread_paused; + if(_paused_v) + { + auto& _causal_perf = perf::get_instance(threading::get_id()); + if(_causal_perf) _causal_perf->start(); + signals::unblock_signals(sampling_signals(), signals::sigmask_scope::thread); + _thread_paused = false; + } + } + else + { + if(!_process_paused) _process_paused = true; + + bool _paused_v = *_process_paused; + if(_paused_v) + { + for(auto i = 0; i < OMNITRACE_MAX_THREADS; ++i) + { + auto& _causal_perf = perf::get_instance(i); + if(_causal_perf) _causal_perf->start(); + } + signals::unblock_signals(sampling_signals(), signals::sigmask_scope::process); + _process_paused = false; + } + } +} + +template void pause(scope::thread_scope); +template void pause(scope::process_scope); + +template void resume(scope::thread_scope); +template void resume(scope::process_scope); + void block_signals(std::set _signals) { @@ -354,10 +577,15 @@ post_process() { auto& _causal = get_causal_sampler(i); if(_causal) _causal->stop(); + auto& _causal_perf = perf::get_instance(i); + if(_causal_perf) _causal_perf->stop(); } configure(false, 0); + auto _allocator = get_causal_sampler_allocator(false); + if(_allocator) _allocator->flush(); + for(size_t i = 0; i < max_supported_threads; ++i) { auto& _causal = get_causal_sampler(i); @@ -370,12 +598,15 @@ post_process() for(size_t i = 0; i < max_supported_threads; ++i) { get_causal_sampler(i).reset(); + + auto& _causal_perf = perf::get_instance(i); + if(_causal_perf) + { + _causal_perf.reset(); + } } - if(get_causal_sampler_allocator(false)) - { - get_causal_sampler_allocator(false).reset(); - } + if(_allocator) 
_allocator.reset(); } namespace @@ -386,9 +617,27 @@ post_process_causal(int64_t, const std::vector& _data) for(const auto& itr : _data) { const auto* _bt_causal = itr.get(); - for(auto&& ditr : _bt_causal->get_stack()) + if(_bt_causal) { - if(ditr > 0) add_sample(_bt_causal->get_index(), ditr); + auto _stack = _bt_causal->get_stack(); + for(auto&& ditr : _stack) + { + if(ditr > 0) add_sample(_bt_causal->get_index(), ditr); + } + } + + const auto* _of_causal = itr.get(); + if(_of_causal) + { + const auto& _stack = _of_causal->get_stack(); + + for(const auto& ditr : _stack) + { + for(auto aitr : ditr) + { + if(aitr > 0) add_sample(_of_causal->get_index(), aitr); + } + } } } } diff --git a/source/lib/omnitrace/library/causal/sampling.hpp b/source/lib/omnitrace/library/causal/sampling.hpp index 5cc465bef5..c02a6c0513 100644 --- a/source/lib/omnitrace/library/causal/sampling.hpp +++ b/source/lib/omnitrace/library/causal/sampling.hpp @@ -51,6 +51,12 @@ block_backtrace_samples(); void unblock_backtrace_samples(); +template +void pause(Tp = {}); + +template +void resume(Tp = {}); + void block_signals(std::set = {}); void unblock_signals(std::set = {}); diff --git a/source/lib/omnitrace/library/components/CMakeLists.txt b/source/lib/omnitrace/library/components/CMakeLists.txt index 942d53378c..efc8915e8b 100644 --- a/source/lib/omnitrace/library/components/CMakeLists.txt +++ b/source/lib/omnitrace/library/components/CMakeLists.txt @@ -3,6 +3,7 @@ set(component_sources ${CMAKE_CURRENT_LIST_DIR}/backtrace.cpp ${CMAKE_CURRENT_LIST_DIR}/backtrace_metrics.cpp ${CMAKE_CURRENT_LIST_DIR}/backtrace_timestamp.cpp + ${CMAKE_CURRENT_LIST_DIR}/callchain.cpp ${CMAKE_CURRENT_LIST_DIR}/comm_data.cpp ${CMAKE_CURRENT_LIST_DIR}/cpu_freq.cpp ${CMAKE_CURRENT_LIST_DIR}/exit_gotcha.cpp @@ -17,6 +18,7 @@ set(component_headers ${CMAKE_CURRENT_LIST_DIR}/backtrace.hpp ${CMAKE_CURRENT_LIST_DIR}/backtrace_metrics.hpp ${CMAKE_CURRENT_LIST_DIR}/backtrace_timestamp.hpp + 
${CMAKE_CURRENT_LIST_DIR}/callchain.hpp ${CMAKE_CURRENT_LIST_DIR}/category_region.hpp ${CMAKE_CURRENT_LIST_DIR}/comm_data.hpp ${CMAKE_CURRENT_LIST_DIR}/cpu_freq.hpp diff --git a/source/lib/omnitrace/library/components/backtrace.cpp b/source/lib/omnitrace/library/components/backtrace.cpp index 2df0f10f4a..f2ee3c5525 100644 --- a/source/lib/omnitrace/library/components/backtrace.cpp +++ b/source/lib/omnitrace/library/components/backtrace.cpp @@ -132,12 +132,12 @@ backtrace::filter_and_patch(const std::vector& _data) return 1; }; - bool _keep_suffix = tim::get_env("OMNITRACE_SAMPLING_KEEP_DYNINST_SUFFIX", - get_debug_sampling()); + static bool _keep_suffix = tim::get_env( + "OMNITRACE_SAMPLING_KEEP_DYNINST_SUFFIX", get_debug_sampling()); // in the dyninst binary rewrite runtime, instrumented functions are appended with // "_dyninst", i.e. "main" will show up as "main_dyninst" in the backtrace. - auto _patch_label = [_keep_suffix](std::string_view _lbl) -> std::string { + auto _patch_label = [](std::string_view _lbl) -> std::string { // debugging feature if(_keep_suffix) return std::string{ _lbl }; const std::string _dyninst{ "_dyninst" }; @@ -183,8 +183,10 @@ backtrace::size() const } void -backtrace::sample(int) +backtrace::sample(int signo) { + if(signo == get_sampling_overflow_signal()) return; + // on RedHat, the unw_step within get_unw_stack involves a mutex lock OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); diff --git a/source/lib/omnitrace/library/components/backtrace_metrics.cpp b/source/lib/omnitrace/library/components/backtrace_metrics.cpp index 386b39f553..5f639cfc18 100644 --- a/source/lib/omnitrace/library/components/backtrace_metrics.cpp +++ b/source/lib/omnitrace/library/components/backtrace_metrics.cpp @@ -317,6 +317,34 @@ backtrace_metrics::fini_perfetto(int64_t _tid, valid_array_t _valid) } } +backtrace_metrics& +backtrace_metrics::operator-=(const backtrace_metrics& _rhs) +{ + auto& _lhs = *this; + if(_lhs(category::thread_peak_memory{})) + 
{ + _lhs.m_mem_peak -= _rhs.m_mem_peak; + } + + if(_lhs(category::thread_context_switch{})) + { + _lhs.m_ctx_swch -= _rhs.m_ctx_swch; + } + + if(_lhs(category::thread_page_fault{})) + { + _lhs.m_page_flt -= _rhs.m_page_flt; + } + + if(_lhs(type_list{}) && _lhs(category::thread_hardware_counter{})) + { + for(size_t i = 0; i < _lhs.m_hw_counter.size(); ++i) + _lhs.m_hw_counter.at(i) -= _rhs.m_hw_counter.at(i); + } + + return _lhs; +} + void backtrace_metrics::post_process_perfetto(int64_t _tid, uint64_t _ts) const { @@ -340,6 +368,7 @@ backtrace_metrics::post_process_perfetto(int64_t _tid, uint64_t _ts) const perfetto_counter_track::at(_tid, 2), _ts, m_page_flt); } + if((*this)(type_list{}) && (*this)(category::thread_hardware_counter{})) { for(size_t i = 0; i < perfetto_counter_track::size(_tid); ++i) diff --git a/source/lib/omnitrace/library/components/backtrace_metrics.hpp b/source/lib/omnitrace/library/components/backtrace_metrics.hpp index 223b74c820..512007a4aa 100644 --- a/source/lib/omnitrace/library/components/backtrace_metrics.hpp +++ b/source/lib/omnitrace/library/components/backtrace_metrics.hpp @@ -114,6 +114,14 @@ struct backtrace_metrics : comp::empty_base void post_process_perfetto(int64_t _tid, uint64_t _ts) const; + backtrace_metrics& operator-=(const backtrace_metrics&); + + friend backtrace_metrics operator-(backtrace_metrics _lhs, + const backtrace_metrics& _rhs) + { + return (_lhs -= _rhs); + } + private: valid_array_t m_valid = {}; int64_t m_cpu = 0; diff --git a/source/lib/omnitrace/library/components/callchain.cpp b/source/lib/omnitrace/library/components/callchain.cpp new file mode 100644 index 0000000000..7ead6b498a --- /dev/null +++ b/source/lib/omnitrace/library/components/callchain.cpp @@ -0,0 +1,210 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "callchain.hpp" +#include "binary/analysis.hpp" +#include "core/common.hpp" +#include "core/components/fwd.hpp" +#include "core/config.hpp" +#include "core/debug.hpp" +#include "core/perfetto.hpp" +#include "core/state.hpp" +#include "library/components/ensure_storage.hpp" +#include "library/perf.hpp" +#include "library/ptl.hpp" +#include "library/runtime.hpp" +#include "library/sampling.hpp" +#include "library/thread_info.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace omnitrace +{ +namespace component +{ +bool +callchain::record::operator<(const record& rhs) const +{ + return timestamp < rhs.timestamp; +} + +std::vector +callchain::get() const +{ + std::vector _v = {}; + if(size() == 0) return _v; + + _v.reserve(size()); + auto _data = m_data; + std::sort(_data.begin(), _data.end()); + for(const auto& itr : _data) + { + auto _v2 = ts_entry_vec_t{ itr.timestamp, {} }; + for(auto iitr : itr.data) + { + auto _entry = binary::lookup_ipaddr_entry(iitr); + if(_entry) _v2.second.emplace_back(*_entry); + } + + if(!_v2.second.empty()) + { + // put the bottom of the call-stack on top + std::reverse(_v2.second.begin(), _v2.second.end()); + _v.emplace_back(std::move(_v2)); + } + } + + auto _known_excludes = + std::set{ "funlockfile", "killpg", "__restore_rt" }; + // remove some known functions which are by-products of interrupts + for(auto& itr : _v) + { + while(!itr.second.empty() && + _known_excludes.find(itr.second.back().name) != _known_excludes.end()) + itr.second.pop_back(); + } + + return _v; +} + +std::string +callchain::label() +{ + return "callchain"; +} + +std::string +callchain::description() +{ + return "Records 
callchain data"; +} + +std::vector +callchain::filter_and_patch(const std::vector& _data) +{ + auto _ret = std::vector{}; + _ret.reserve(_data.size()); + for(const auto& itr : _data) + { + auto _v = backtrace::filter_and_patch(itr.second); + if(!_v.empty()) _ret.emplace_back(ts_entry_vec_t{ itr.first, std::move(_v) }); + } + + return _ret; +} + +void +callchain::start() +{} + +void +callchain::stop() +{} + +bool +callchain::empty() const +{ + return (size() == 0); +} + +size_t +callchain::size() const +{ + return m_data.size(); +} + +void +callchain::sample(int signo) +{ + if(signo != get_sampling_overflow_signal()) return; + + // on RedHat, the unw_step within get_unw_stack involves a mutex lock + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + + static thread_local const auto& _tinfo = thread_info::get(); + auto _tid = _tinfo->index_data->sequent_value; + auto& _perf_event = perf::get_instance(_tid); + + if(!_perf_event) return; + + _perf_event->stop(); + + for(auto itr : *_perf_event) + { + if(itr.is_sample()) + { + auto _ip = itr.get_ip(); + auto _data = record{}; + _data.timestamp = itr.get_time(); + _data.data.emplace_back(_ip); + for(auto ditr : itr.get_callchain()) + { + if(ditr != _ip) _data.data.emplace_back(ditr); + if(_data.data.size() == _data.data.capacity()) break; + } + if(!_data.data.empty()) m_data.emplace_back(_data); + } + } + + _perf_event->start(); +} +} // namespace component +} // namespace omnitrace + +TIMEMORY_INITIALIZE_STORAGE(omnitrace::component::callchain) diff --git a/source/lib/omnitrace/library/components/callchain.hpp b/source/lib/omnitrace/library/components/callchain.hpp new file mode 100644 index 0000000000..681e61f3cf --- /dev/null +++ b/source/lib/omnitrace/library/components/callchain.hpp @@ -0,0 +1,95 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "core/common.hpp" +#include "core/components/fwd.hpp" +#include "core/containers/static_vector.hpp" +#include "core/defines.hpp" +#include "core/timemory.hpp" +#include "library/thread_data.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace omnitrace +{ +namespace component +{ +struct callchain : comp::empty_base +{ + static constexpr size_t stack_depth = OMNITRACE_MAX_UNWIND_DEPTH; + + struct record + { + uint64_t timestamp = 0; + container::static_vector data = {}; + + bool operator<(const record& rhs) const; + }; + + using cache_type = tim::unwind::cache; + using entry_type = tim::unwind::processed_entry; + using value_type = void; + using data_t = container::static_vector; + using entry_vec_t = std::vector; + using ts_entry_vec_t = std::pair; + + static std::string label(); + static std::string description(); + + callchain() = default; + ~callchain() = default; + callchain(const callchain&) = default; + callchain(callchain&&) noexcept = default; + + callchain& operator=(const callchain&) = default; + callchain& operator=(callchain&&) noexcept = default; + + static std::vector filter_and_patch( + const std::vector&); + + static void start(); + static void stop(); + + void sample(int = -1); + bool empty() const; + size_t size() const; + std::vector get() const; + data_t get_data() const { return m_data; } + +private: + data_t m_data = {}; +}; +} // namespace component +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/components/exit_gotcha.cpp b/source/lib/omnitrace/library/components/exit_gotcha.cpp index ce171f2814..540da5d4f9 100644 --- a/source/lib/omnitrace/library/components/exit_gotcha.cpp +++ b/source/lib/omnitrace/library/components/exit_gotcha.cpp @@ -89,6 +89,12 @@ invoke_exit_gotcha(const exit_gotcha::gotcha_data& _data, FuncT _func, Args... 
_ JOIN(", ", _args...).c_str(), get_exe_name().c_str()); } + if(_exit_info.is_known && _exit_info.exit_code != 0) + { + OMNITRACE_BASIC_VERBOSE(0, "%s exiting with non-zero exit code: %i...\n", + get_exe_name().c_str(), _exit_info.exit_code); + } + (*_func)(_args...); } } // namespace @@ -106,6 +112,7 @@ exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) co void exit_gotcha::operator()(const gotcha_data& _data, abort_func_t _func) const { + _exit_info = { true, false, SIGABRT }; invoke_exit_gotcha(_data, _func); } diff --git a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp index ada298833a..279e4b4c18 100644 --- a/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp +++ b/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -98,7 +98,8 @@ void pthread_mutex_gotcha::configure() { pthread_mutex_gotcha_t::get_initializer() = []() { - if(!tim::settings::enabled()) return; + if(!tim::settings::enabled() || get_use_causal()) return; + if(config::get_trace_thread_locks()) { pthread_mutex_gotcha_t::configure( @@ -155,7 +156,7 @@ pthread_mutex_gotcha::configure() "pthread_spin_unlock" }); } - if(config::get_trace_thread_join() && !get_use_causal()) + if(config::get_trace_thread_join()) { pthread_mutex_gotcha_t::configure( comp::gotcha_config<12, int, pthread_t, void**>{ "pthread_join" }); diff --git a/source/lib/omnitrace/library/critical_trace.cpp b/source/lib/omnitrace/library/critical_trace.cpp index 2aea98e518..13284f89ba 100644 --- a/source/lib/omnitrace/library/critical_trace.cpp +++ b/source/lib/omnitrace/library/critical_trace.cpp @@ -228,100 +228,6 @@ entry::get_cost() const return 0; } -int64_t -entry::get_overlap(const entry& rhs) const -{ - if(begin_ns >= rhs.end_ns || end_ns >= rhs.begin_ns) // no overlap - return 0; - else if(begin_ns >= rhs.begin_ns && end_ns <= rhs.end_ns) // inclusive to rhs - return get_cost(); - else 
if(begin_ns <= rhs.begin_ns && end_ns >= rhs.end_ns) // rhs is inclusive - return rhs.get_cost(); - else if(begin_ns <= rhs.begin_ns && end_ns <= rhs.end_ns) // at beginning - return (end_ns - rhs.begin_ns); - else if(begin_ns >= rhs.begin_ns && end_ns >= rhs.end_ns) // at end - return (rhs.end_ns - begin_ns); - else - { - OMNITRACE_PRINT("Warning! entry::get_overlap(entry, tid) " - "could not determine the overlap :: %s\n", - JOIN("", *this).c_str()); - } - return 0; -} - -int64_t -entry::get_independent(const entry& rhs) const -{ - if(begin_ns >= rhs.end_ns || end_ns >= rhs.begin_ns) // no overlap - return get_cost(); - else if(begin_ns >= rhs.begin_ns && end_ns <= rhs.end_ns) // inclusive to rhs - return 0; - else if(begin_ns <= rhs.begin_ns && end_ns >= rhs.end_ns) // rhs is inclusive - return get_cost() - rhs.get_cost(); - else if(begin_ns <= rhs.begin_ns && end_ns <= rhs.end_ns) // at beginning - return (rhs.begin_ns - begin_ns); - else if(begin_ns >= rhs.begin_ns && end_ns >= rhs.end_ns) // at end - return (end_ns - rhs.end_ns); - else - { - OMNITRACE_PRINT("Warning! 
entry::get_independent(entry, tid) " - "could not determine the overlap :: %s\n", - JOIN("", *this).c_str()); - } - return 0; -} - -int64_t -entry::get_overlap(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const -{ - if(_devid != this->devid || _pid != this->pid) // different device or process id - return 0; - - if(!is_delta(*this, __FUNCTION__)) return 0; - if(!is_delta(rhs, __FUNCTION__)) return 0; - - if(_tid < 0 || (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread - return get_overlap(rhs); - - return 0; -} - -int64_t -entry::get_independent(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const -{ - if(!is_delta(*this, __FUNCTION__)) return 0; - if(!is_delta(rhs, __FUNCTION__)) return 0; - - if(_devid != this->devid || _pid != this->pid) // different device or process id - return get_independent(rhs); - else if(_tid < 0 || - (this->tid == _tid && rhs.tid == _tid)) // all threads or same thread - return get_independent(rhs); - else if(this->tid == _tid && rhs.tid != _tid) // rhs is on different thread - return get_cost(); - return 0; -} - -bool -entry::is_bounded(const entry& rhs) const -{ - // ignores thread - return !(begin_ns < rhs.begin_ns || end_ns > rhs.end_ns); -} - -bool -entry::is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const -{ - if(_devid != this->devid || _pid != this->pid) // different device or process id - return false; - - if(tid == _tid && rhs.tid == _tid) // all threads or same thread - return !(begin_ns < rhs.begin_ns || end_ns > rhs.end_ns); - - return false; -} - void entry::write(std::ostream& _os) const { @@ -354,19 +260,6 @@ entry::write(std::ostream& _os) const _os << ", hash: " << hash << " :: " << tim::demangle(tim::get_hash_identifier(hash)); } -bool -entry::is_delta(const entry& _v, const std::string_view& _ctx) -{ - if(_v.phase != Phase::DELTA) - { - OMNITRACE_CT_DEBUG( - "Warning! Invalid phase for entry. 
entry::%s requires Phase::DELTA :: %s\n", - _ctx.data(), JOIN("", _v).c_str()); - return true; - } - return false; -} - //--------------------------------------------------------------------------------------// // // CALL CHAIN @@ -382,16 +275,6 @@ call_chain::operator==(const call_chain& rhs) const return true; } -size_t -call_chain::get_hash() const -{ - if(empty()) return 0; - int64_t _hash = this->at(0).get_hash(); - for(size_t i = 1; i < this->size(); ++i) - _hash = get_combined_hash(_hash, at(i).get_hash()); - return _hash; -} - int64_t call_chain::get_cost(int64_t _tid) const { @@ -411,35 +294,6 @@ call_chain::get_cost(int64_t _tid) const return _cost; } -int64_t -call_chain::get_overlap(int32_t _devid, int32_t _pid, int64_t _tid) const -{ - int64_t _cost = 0; - auto itr = this->begin(); - auto nitr = ++this->begin(); - for(; nitr != this->end(); ++nitr, ++itr) - _cost += nitr->get_overlap(*itr, _devid, _pid, _tid); - return _cost; -} - -int64_t -call_chain::get_independent(int32_t _devid, int32_t _pid, int64_t _tid) const -{ - int64_t _cost = 0; - auto itr = this->begin(); - auto nitr = ++this->begin(); - for(; nitr != this->end(); ++nitr, ++itr) - _cost += itr->get_independent(*nitr, _devid, _pid, _tid); - return _cost; -} - -std::vector& -call_chain::get_top_chains() -{ - static std::vector _v{}; - return _v; -} - template void call_chain::generate_perfetto(::perfetto::Track _track, std::set& _used) const diff --git a/source/lib/omnitrace/library/critical_trace.hpp b/source/lib/omnitrace/library/critical_trace.hpp index 6f8b0e332b..b0f72d6f4b 100644 --- a/source/lib/omnitrace/library/critical_trace.hpp +++ b/source/lib/omnitrace/library/critical_trace.hpp @@ -102,20 +102,8 @@ struct OMNITRACE_ATTRIBUTE(packed) entry int64_t get_cost() const; - bool is_bounded(const entry& rhs) const; - int64_t get_overlap(const entry& rhs) const; - int64_t get_independent(const entry& rhs) const; - - int64_t get_overlap(const entry& rhs, int32_t _devid, int32_t _pid, - 
int64_t _tid) const; - int64_t get_independent(const entry& rhs, int32_t _devid, int32_t _pid, - int64_t _tid) const; - bool is_bounded(const entry& rhs, int32_t _devid, int32_t _pid, int64_t _tid) const; - void write(std::ostream& _os) const; - static bool is_delta(const entry&, const std::string_view&); - friend std::ostream& operator<<(std::ostream& _os, const entry& _v) { _v.write(_os); @@ -222,11 +210,7 @@ struct call_chain : private std::vector using base_type::reserve; using base_type::size; - size_t get_hash() const; int64_t get_cost(int64_t _tid = -1) const; - int64_t get_overlap(int32_t _devid, int32_t _pid, int64_t _tid = -1) const; - int64_t get_independent(int32_t _devid, int32_t _pid, int64_t _tid = -1) const; - static std::vector& get_top_chains(); bool operator==(const call_chain& rhs) const; bool operator!=(const call_chain& rhs) const { return !(*this == rhs); } diff --git a/source/lib/omnitrace/library/causal/perf.cpp b/source/lib/omnitrace/library/perf.cpp similarity index 62% rename from source/lib/omnitrace/library/causal/perf.cpp rename to source/lib/omnitrace/library/perf.cpp index a4908da386..8d71440b6d 100644 --- a/source/lib/omnitrace/library/causal/perf.cpp +++ b/source/lib/omnitrace/library/perf.cpp @@ -20,18 +20,25 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "library/causal/perf.hpp" +#include "library/perf.hpp" +#include "core/debug.hpp" +#include "core/locking.hpp" +#include "core/state.hpp" #include "core/timemory.hpp" #include "core/utility.hpp" +#include "library/thread_data.hpp" #include #include #include #include +#include #include #include +#include #include +#include #include #include #include @@ -42,10 +49,26 @@ #include #include +#if !defined(OMNITRACE_RETURN_ERROR_MSG) +# define OMNITRACE_RETURN_ERROR_MSG(COND, ...) 
\ + if((COND)) \ + { \ + auto _msg_ss = std::stringstream{}; \ + _msg_ss << __VA_ARGS__; \ + return std::optional{ _msg_ss.str() }; \ + } +#endif + +#if !defined(OMNITRACE_FATAL) +# define OMNITRACE_FATAL TIMEMORY_FATAL +#endif + +#if !defined(OMNITRACE_ASSERT) +# define OMNITRACE_ASSERT(COND) (COND) ? ::tim::log::base() : TIMEMORY_FATAL +#endif + namespace omnitrace { -namespace causal -{ namespace perf { namespace @@ -75,7 +98,7 @@ perf_event::perf_event(perf_event&& rhs) noexcept if(m_fd != -1 && m_fd != rhs.m_fd) { ::close(m_fd); - TIMEMORY_INFO << "Closed perf event fd " << m_fd; + OMNITRACE_VERBOSE(1, "Closed perf event fd %li\n", m_fd); } if(m_mapping != nullptr && m_mapping != rhs.m_mapping) munmap(m_mapping, sizes.mmap); @@ -100,6 +123,7 @@ perf_event::~perf_event() { close(); } perf_event& perf_event::operator=(perf_event&& rhs) noexcept { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); if(&rhs == this) return *this; // Release resources if the current perf_event is initialized and not equal to this @@ -123,11 +147,13 @@ perf_event::operator=(perf_event&& rhs) noexcept } // Open a perf_event file and map it (if sampling is enabled) -bool +std::optional perf_event::open(struct perf_event_attr& _pe, pid_t _pid, int _cpu) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); m_sample_type = _pe.sample_type; m_read_format = _pe.read_format; + m_batch_size = _pe.wakeup_events; // Set some mandatory fields _pe.size = sizeof(struct perf_event_attr); @@ -139,27 +165,22 @@ perf_event::open(struct perf_event_attr& _pe, pid_t _pid, int _cpu) { std::string path = "/proc/sys/kernel/perf_event_paranoid"; - FILE* file = fopen(path.c_str(), "r"); - OMNITRACE_PREFER(file != nullptr) - << "Failed to open " << path << ": " << strerror(errno); + auto file = std::ifstream{ path.c_str() }; - if(file == nullptr) return false; + OMNITRACE_RETURN_ERROR_MSG(!file, + "Failed to open " << path << ": " << strerror(errno)); - char value_str[3]; - int res = fread(value_str, 
sizeof(value_str), 1, file); - TIMEMORY_REQUIRE(res != -1) - << "Failed to read from " << path << ": " << strerror(errno); + int value = 4; + file >> value; - if(res == -1) return false; + OMNITRACE_RETURN_ERROR_MSG(file.bad(), "Failed to read from " << path << ": " + << strerror(errno)); - value_str[2] = '\0'; - int value = atoi(value_str); - - TIMEMORY_WARNING << "Failed to open perf event. " - << "Consider tweaking " << path << " to 2 or less " - << "(current value is " << value << "), " - << "or run omnitrace as a privileged user (with CAP_SYS_ADMIN)."; - return false; + OMNITRACE_RETURN_ERROR_MSG( + true, "Failed to open perf event. Consider tweaking " + << path << " to 2 or less " + << "(current value is " << value << "), " + << "or run omnitrace as a privileged user (with CAP_SYS_ADMIN)."); } // If sampling, map the perf event file @@ -168,78 +189,107 @@ perf_event::open(struct perf_event_attr& _pe, pid_t _pid, int _cpu) void* ring_buffer = mmap(nullptr, sizes.mmap, PROT_READ | PROT_WRITE, MAP_SHARED, m_fd, 0); - OMNITRACE_PREFER(ring_buffer != MAP_FAILED) - << "Mapping perf_event ring buffer failed. " - << "Make sure the current user has permission " - "to invoke the perf tool, and that " - << "the program being profiled does not use " - "an excessive number of threads (>1000).\n"; - - if(ring_buffer == MAP_FAILED) return false; + OMNITRACE_RETURN_ERROR_MSG( + ring_buffer == MAP_FAILED, + "Mapping perf_event ring buffer failed. 
Make sure the current user has " + "permission to invoke the perf tool, and that the program being profiled " + "does not use an excessive number of threads (>1000)"); m_mapping = reinterpret_cast(ring_buffer); } - return true; + return std::optional{}; } -bool +std::optional perf_event::open(double _freq, uint32_t _batch_size, pid_t _pid, int _cpu) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); uint64_t _period = (1.0 / _freq) * units::sec; struct perf_event_attr _pe; + if(_batch_size > 0) + m_batch_size = _batch_size; + else + _batch_size = m_batch_size; + memset(&_pe, 0, sizeof(_pe)); _pe.type = PERF_TYPE_SOFTWARE; _pe.config = PERF_COUNT_SW_TASK_CLOCK; _pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN; _pe.sample_period = _period; _pe.wakeup_events = _batch_size; - _pe.sample_period = _period; - _pe.wakeup_events = _batch_size; // This is ignored on linux 3.13 (why?) _pe.exclude_idle = 1; _pe.exclude_kernel = 1; - _pe.precise_ip = 0; _pe.disabled = 1; + // potential additions + _pe.inherit = 0; + _pe.exclude_hv = 1; + _pe.exclude_callchain_kernel = 1; + _pe.use_clockid = 1; + _pe.clockid = CLOCK_REALTIME; + // _pe.precise_ip = 0; + // _pe.exclusive = 1; + // _pe.pinned = 1; return open(_pe, _pid, _cpu); } +/// Read event count +long +perf_event::get_fileno() const +{ + return m_fd; +} + /// Read event count uint64_t perf_event::get_count() const { uint64_t count; - TIMEMORY_REQUIRE(read(m_fd, &count, sizeof(uint64_t)) == sizeof(uint64_t)) + OMNITRACE_REQUIRE(read(m_fd, &count, sizeof(uint64_t)) == sizeof(uint64_t)) << "Failed to read event count from perf_event file"; return count; } /// Start counting events -void +bool perf_event::start() const { if(m_fd != -1) { - TIMEMORY_REQUIRE(ioctl(m_fd, PERF_EVENT_IOC_ENABLE, 0) != -1) + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + OMNITRACE_REQUIRE(ioctl(m_fd, PERF_EVENT_IOC_ENABLE, 0) != -1) << "Failed to start perf event: " << strerror(errno); } + return (m_fd != -1); } /// Stop counting 
events -void +bool perf_event::stop() const { if(m_fd != -1) { - TIMEMORY_REQUIRE(ioctl(m_fd, PERF_EVENT_IOC_DISABLE, 0) != -1) + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + OMNITRACE_REQUIRE(ioctl(m_fd, PERF_EVENT_IOC_DISABLE, 0) != -1) << "Failed to stop perf event: " << strerror(errno) << " (" << m_fd << ")"; } + return (m_fd != -1); +} + +bool +perf_event::is_open() const +{ + return (m_fd != -1); } void perf_event::close() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + stop(); + if(m_fd != -1) { ::close(m_fd); @@ -256,22 +306,25 @@ perf_event::close() void perf_event::set_ready_signal(int sig) const { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); // Set the perf_event file to async - TIMEMORY_REQUIRE(fcntl(m_fd, F_SETFL, fcntl(m_fd, F_GETFL, 0) | O_ASYNC) != -1) + OMNITRACE_REQUIRE(fcntl(m_fd, F_SETFL, fcntl(m_fd, F_GETFL, 0) | O_ASYNC) != -1) << "failed to set perf_event file to async mode"; // Set the notification signal for the perf file - TIMEMORY_REQUIRE(fcntl(m_fd, F_SETSIG, sig) != -1) + OMNITRACE_REQUIRE(fcntl(m_fd, F_SETSIG, sig) != -1) << "failed to set perf_event file signal"; // Set the current thread as the owner of the file (to target signal delivery) - TIMEMORY_REQUIRE(fcntl(m_fd, F_SETOWN, gettid()) != -1) + OMNITRACE_REQUIRE(fcntl(m_fd, F_SETOWN, gettid()) != -1) << "failed to set the owner of the perf_event file"; } void perf_event::iterator::next() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + struct perf_event_header _hdr; // Copy out the record header @@ -322,6 +375,8 @@ perf_event::iterator::operator!=(const iterator& other) const perf_event::record perf_event::iterator::get() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + // Copy out the record header perf_event::copy_from_ring_buffer(m_mapping, m_index, _buf, sizeof(struct perf_event_header)); @@ -332,7 +387,7 @@ perf_event::iterator::get() // Copy out the entire record perf_event::copy_from_ring_buffer(m_mapping, m_index, 
_buf, header->size); - return perf_event::record(m_source, header); + return perf_event::record(&m_source, header); } bool @@ -367,6 +422,8 @@ void perf_event::copy_from_ring_buffer(struct perf_event_mmap_page* _mapping, ptrdiff_t _index, void* _dest, size_t _nbytes) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + uintptr_t _base = reinterpret_cast(_mapping) + sizes.page; size_t _beg_idx = _index % sizes.data; size_t _end_idx = _beg_idx + _nbytes; @@ -391,53 +448,74 @@ perf_event::copy_from_ring_buffer(struct perf_event_mmap_page* _mapping, ptrdiff uint64_t perf_event::record::get_ip() const { - TIMEMORY_ASSERT(is_sample() && m_source.is_sampling(sample::ip)) - << "Record does not have an ip field"; + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::ip)) + << "Record does not have an ip field (" << is_sample() << "|" << m_source << ")"; return *locate_field(); } uint64_t perf_event::record::get_pid() const { - TIMEMORY_ASSERT(is_sample() && m_source.is_sampling(sample::pid_tid)) - << "Record does not have a `pid` field"; + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::pid_tid)) + << "Record does not have a `pid` field (" << is_sample() << "|" << m_source + << ")"; return locate_field()[0]; } uint64_t perf_event::record::get_tid() const { - TIMEMORY_ASSERT(is_sample() && m_source.is_sampling(sample::pid_tid)) - << "Record does not have a `tid` field"; + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::pid_tid)) + << "Record does not have a `tid` field (" << is_sample() << "|" << m_source + << ")"; return locate_field()[1]; } uint64_t perf_event::record::get_time() const { - TIMEMORY_ASSERT(is_sample() && m_source.is_sampling(sample::time)) - << "Record does not have a 'time' field"; + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::time)) + << "Record does not have a 'time' field (" << is_sample() << 
"|" << m_source + << ")"; return *locate_field(); } +uint64_t +perf_event::record::get_period() const +{ + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::period)) + << "Record does not have a 'period' field (" << is_sample() << "|" << m_source + << ")"; + return *locate_field(); +} + uint32_t perf_event::record::get_cpu() const { - TIMEMORY_ASSERT(is_sample() && m_source.is_sampling(sample::cpu)) - << "Record does not have a 'cpu' field"; + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::cpu)) + << "Record does not have a 'cpu' field (" << is_sample() << "|" << m_source + << ")"; return *locate_field(); } container::c_array perf_event::record::get_callchain() const { - TIMEMORY_ASSERT(is_sample() && m_source.is_sampling(sample::callchain)) - << "Record does not have a callchain field"; + OMNITRACE_ASSERT(is_sample() && m_source != nullptr && + m_source->is_sampling(sample::callchain)) + << "Record does not have a callchain field (" << is_sample() << "|" << m_source + << ")"; uint64_t* _base = locate_field(); uint64_t _size = *_base; // Advance the callchain array pointer past the size - _base++; + ++_base; return container::wrap_c_array(_base, _size); } @@ -445,6 +523,8 @@ template Tp perf_event::record::locate_field() const { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + uintptr_t p = reinterpret_cast(m_header) + sizeof(struct perf_event_header); @@ -454,41 +534,45 @@ perf_event::record::locate_field() const // ip if constexpr(SampleT == sample::ip) return reinterpret_cast(p); - if(m_source.is_sampling(sample::ip)) p += sizeof(uint64_t); + if(m_source != nullptr && m_source->is_sampling(sample::ip)) p += sizeof(uint64_t); // pid, tid if constexpr(SampleT == sample::pid_tid) return reinterpret_cast(p); - if(m_source.is_sampling(sample::pid_tid)) p += sizeof(uint32_t) + sizeof(uint32_t); + if(m_source != nullptr && m_source->is_sampling(sample::pid_tid)) + p += 
sizeof(uint32_t) + sizeof(uint32_t); // time if constexpr(SampleT == sample::time) return reinterpret_cast(p); - if(m_source.is_sampling(sample::time)) p += sizeof(uint64_t); + if(m_source != nullptr && m_source->is_sampling(sample::time)) p += sizeof(uint64_t); // addr if constexpr(SampleT == sample::addr) return reinterpret_cast(p); - if(m_source.is_sampling(sample::addr)) p += sizeof(uint64_t); + if(m_source != nullptr && m_source->is_sampling(sample::addr)) p += sizeof(uint64_t); // id if constexpr(SampleT == sample::id) return reinterpret_cast(p); - if(m_source.is_sampling(sample::id)) p += sizeof(uint64_t); + if(m_source != nullptr && m_source->is_sampling(sample::id)) p += sizeof(uint64_t); // stream_id if constexpr(SampleT == sample::stream_id) return reinterpret_cast(p); - if(m_source.is_sampling(sample::stream_id)) p += sizeof(uint64_t); + if(m_source != nullptr && m_source->is_sampling(sample::stream_id)) + p += sizeof(uint64_t); // cpu if constexpr(SampleT == sample::cpu) return reinterpret_cast(p); - if(m_source.is_sampling(sample::cpu)) p += sizeof(uint32_t) + sizeof(uint32_t); + if(m_source != nullptr && m_source->is_sampling(sample::cpu)) + p += sizeof(uint32_t) + sizeof(uint32_t); // period if constexpr(SampleT == sample::period) return reinterpret_cast(p); - if(m_source.is_sampling(sample::period)) p += sizeof(uint64_t); + if(m_source != nullptr && m_source->is_sampling(sample::period)) + p += sizeof(uint64_t); // value if constexpr(SampleT == sample::read) return reinterpret_cast(p); - if(m_source.is_sampling(sample::read)) + if(m_source != nullptr && m_source->is_sampling(sample::read)) { - uint64_t read_format = m_source.get_read_format(); + uint64_t read_format = m_source->get_read_format(); if(read_format & PERF_FORMAT_GROUP) { // Get the number of values in the read format structure @@ -516,15 +600,15 @@ perf_event::record::locate_field() const // callchain if constexpr(SampleT == sample::callchain) return reinterpret_cast(p); - 
if(m_source.is_sampling(sample::callchain)) + if(m_source != nullptr && m_source->is_sampling(sample::callchain)) { uint64_t nr = *reinterpret_cast(p); - p += sizeof(uint64_t) + nr * sizeof(uint64_t); + p += sizeof(uint64_t) + (nr * sizeof(uint64_t)); } // raw if constexpr(SampleT == sample::raw) return reinterpret_cast(p); - if(m_source.is_sampling(sample::raw)) + if(m_source != nullptr && m_source->is_sampling(sample::raw)) { uint32_t raw_size = *reinterpret_cast(p); p += sizeof(uint32_t) + raw_size; @@ -532,24 +616,46 @@ perf_event::record::locate_field() const // branch_stack if constexpr(SampleT == sample::branch_stack) return reinterpret_cast(p); - if(m_source.is_sampling(sample::branch_stack)) - TIMEMORY_FATAL << "Branch stack sampling is not supported"; + if(m_source != nullptr && m_source->is_sampling(sample::branch_stack)) + OMNITRACE_FATAL << "Branch stack sampling is not supported"; // regs if constexpr(SampleT == sample::regs) return reinterpret_cast(p); - if(m_source.is_sampling(sample::regs)) - TIMEMORY_FATAL << "Register sampling is not supported"; + if(m_source != nullptr && m_source->is_sampling(sample::regs)) + OMNITRACE_FATAL << "Register sampling is not supported"; // stack if constexpr(SampleT == sample::stack) return reinterpret_cast(p); - if(m_source.is_sampling(sample::stack)) - TIMEMORY_FATAL << "Stack sampling is not supported"; + if(m_source != nullptr && m_source->is_sampling(sample::stack)) + OMNITRACE_FATAL << "Stack sampling is not supported"; // end if constexpr(SampleT == sample::last) return reinterpret_cast(p); - TIMEMORY_FATAL << "Unsupported sample field requested!"; + OMNITRACE_FATAL << "Unsupported sample field requested!"; +} + +namespace +{ +inline auto& +get_instances() +{ + using thread_data_t = thread_data>, perf_event>; + static auto& _v = thread_data_t::instance(construct_on_init{}); + return _v; +} +} // namespace + +std::unique_ptr& +get_instance(int64_t _tid) +{ + auto& _data = get_instances(); + 
if(static_cast(_tid) >= _data->size()) + { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + _data->resize(_tid + 1); + } + return _data->at(_tid); } } // namespace perf -} // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/causal/perf.hpp b/source/lib/omnitrace/library/perf.hpp similarity index 71% rename from source/lib/omnitrace/library/causal/perf.hpp rename to source/lib/omnitrace/library/perf.hpp index d815737dbc..132ff4ce29 100644 --- a/source/lib/omnitrace/library/causal/perf.hpp +++ b/source/lib/omnitrace/library/perf.hpp @@ -24,79 +24,31 @@ #include "core/containers/c_array.hpp" #include "core/defines.hpp" +#include "core/locking.hpp" +#include "core/perf.hpp" + +#include #include #include #include #include +#include +#include +#include #include -#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30 -# include -# define gettid() syscall(SYS_gettid) -#endif - -// Workaround for missing hw_breakpoint.h include file: -// This include file just defines constants used to configure watchpoint registers. -// This will be constant across x86 systems. 
-enum -{ - HW_BREAKPOINT_X = 4 -}; - namespace omnitrace { -namespace causal -{ namespace perf { -/// An enum class with all the available sampling data -enum class sample : uint64_t -{ - ip = PERF_SAMPLE_IP, - pid_tid = PERF_SAMPLE_TID, - time = PERF_SAMPLE_TIME, - addr = PERF_SAMPLE_ADDR, - id = PERF_SAMPLE_ID, - stream_id = PERF_SAMPLE_STREAM_ID, - cpu = PERF_SAMPLE_CPU, - period = PERF_SAMPLE_PERIOD, - -#if defined(PREF_SAMPLE_READ) - read = PERF_SAMPLE_READ, -#else - read = 0, -#endif - - callchain = PERF_SAMPLE_CALLCHAIN, - raw = PERF_SAMPLE_RAW, - -#if defined(PERF_SAMPLE_BRANCH_STACK) - branch_stack = PERF_SAMPLE_BRANCH_STACK, -#else - branch_stack = 0, -#endif - -#if defined(PERF_SAMPLE_REGS_USER) - regs = PERF_SAMPLE_REGS_USER, -#else - regs = 0, -#endif - -#if defined(PERF_SAMPLE_STACK_USER) - stack = PERF_SAMPLE_STACK_USER, -#else - stack = 0, -#endif - - last = PERF_SAMPLE_MAX -}; - struct perf_event { - enum class record_type; + static constexpr uint32_t max_batch_size = 32; + struct record; struct sample_record; + class iterator; /// Default constructor perf_event() = default; @@ -113,17 +65,27 @@ struct perf_event perf_event& operator=(const perf_event&) = delete; /// Open a perf_event file using the given options structure - bool open(struct perf_event_attr& pe, pid_t pid = 0, int cpu = -1); - bool open(double, uint32_t, pid_t pid = 0, int cpu = -1); + std::optional open(struct perf_event_attr& pe, pid_t pid = 0, + int cpu = -1); + std::optional open(double, uint32_t = 0, pid_t pid = 0, int cpu = -1); + + /// Return file descriptor + long get_fileno() const; /// Read event count uint64_t get_count() const; + /// Get the batch size + uint32_t get_batch_size() const { return m_batch_size; } + /// Start counting events and collecting samples - void start() const; + bool start() const; /// Stop counting events - void stop() const; + bool stop() const; + + /// Check if counting events and collecting samples + bool is_open() const; /// Close the 
perf_event file and unmap the ring buffer void close(); @@ -141,33 +103,21 @@ struct perf_event /// Get the configuration for this perf_event's read format inline uint64_t get_read_format() const { return m_read_format; } - /// An enum to distinguish types of records in the mmapped ring buffer - enum class record_type - { - mmap = PERF_RECORD_MMAP, - lost = PERF_RECORD_LOST, - comm = PERF_RECORD_COMM, - exit = PERF_RECORD_EXIT, - throttle = PERF_RECORD_THROTTLE, - unthrottle = PERF_RECORD_UNTHROTTLE, - fork = PERF_RECORD_FORK, - read = PERF_RECORD_READ, - sample = PERF_RECORD_SAMPLE, - -#if defined(PERF_RECORD_MMAP2) - mmap2 = PERF_RECORD_MMAP2 -#else - mmap2 = 0 -#endif - }; - - class iterator; - /// A generic record type struct record { friend class perf_event::iterator; + record() = default; + ~record() = default; + record(const record&) = default; + record(record&&) noexcept = default; + record& operator=(const record&) = default; + record& operator=(record&&) noexcept = default; + + bool is_valid() const { return (m_source != nullptr && m_header != nullptr); } + operator bool() const { return is_valid(); } + record_type get_type() const { return static_cast(m_header->type); } inline bool is_mmap() const { return get_type() == record_type::mmap; } @@ -188,11 +138,12 @@ struct perf_event uint64_t get_pid() const; uint64_t get_tid() const; uint64_t get_time() const; + uint64_t get_period() const; uint32_t get_cpu() const; container::c_array get_callchain() const; private: - record(const perf_event& source, struct perf_event_header* header) + record(const perf_event* source, struct perf_event_header* header) : m_source(source) , m_header(header) {} @@ -200,8 +151,8 @@ struct perf_event template Tp locate_field() const; - const perf_event& m_source; - struct perf_event_header* m_header; + const perf_event* m_source = nullptr; + struct perf_event_header* m_header = nullptr; }; class iterator @@ -232,7 +183,7 @@ struct perf_event /// Get an iterator to the beginning 
of the memory mapped ring buffer iterator begin() { return iterator(*this, m_mapping); } - // Get an iterator to the end of the memory mapped ring buffer + /// Get an iterator to the end of the memory mapped ring buffer iterator end() { return iterator(*this, nullptr); } private: @@ -240,6 +191,8 @@ private: static void copy_from_ring_buffer(struct perf_event_mmap_page* mapping, ptrdiff_t index, void* dest, size_t bytes); + uint32_t m_batch_size = 10; + /// File descriptor for the perf event long m_fd = -1; @@ -251,6 +204,9 @@ private: /// The read format from this perf event's configuration uint64_t m_read_format = 0; }; + +/// provides thread-local instance of perf_event +std::unique_ptr& +get_instance(int64_t _tid); } // namespace perf -} // namespace causal } // namespace omnitrace diff --git a/source/lib/omnitrace/library/sampling.cpp b/source/lib/omnitrace/library/sampling.cpp index 7432fc217e..7996d55a87 100644 --- a/source/lib/omnitrace/library/sampling.cpp +++ b/source/lib/omnitrace/library/sampling.cpp @@ -26,11 +26,14 @@ #include "core/config.hpp" #include "core/debug.hpp" #include "core/locking.hpp" +#include "core/perf.hpp" #include "core/state.hpp" #include "core/utility.hpp" #include "library/components/backtrace.hpp" #include "library/components/backtrace_metrics.hpp" #include "library/components/backtrace_timestamp.hpp" +#include "library/components/callchain.hpp" +#include "library/perf.hpp" #include "library/ptl.hpp" #include "library/runtime.hpp" #include "library/thread_data.hpp" @@ -54,9 +57,12 @@ #include #include #include +#include #include +#include #include #include +#include #include #include #include @@ -98,17 +104,33 @@ namespace omnitrace namespace sampling { using ::tim::sampling::dynamic; -using tim::sampling::timer; +using ::tim::sampling::overflow; +using ::tim::sampling::timer; using hw_counters = typename component::backtrace_metrics::hw_counters; using signal_type_instances = thread_data, category::sampling>; using 
sampler_running_instances = thread_data; using bundle_t = tim::lightweight_tuple; + component::backtrace_metrics, component::callchain>; using sampler_t = tim::sampling::sampler; using sampler_instances = thread_data; using sampler_init_instances = thread_data; + +using component::backtrace; +using component::backtrace_cpu_clock; // NOLINT +using component::backtrace_fraction; // NOLINT +using component::backtrace_metrics; +using component::backtrace_timestamp; +using component::backtrace_wall_clock; // NOLINT +using component::callchain; +using component::sampling_cpu_clock; +using component::sampling_gpu_busy; +using component::sampling_gpu_memory; +using component::sampling_gpu_power; +using component::sampling_gpu_temp; +using component::sampling_percent; +using component::sampling_wall_clock; } // namespace sampling } // namespace omnitrace @@ -117,7 +139,7 @@ OMNITRACE_DEFINE_CONCRETE_TRAIT(prevent_reentry, sampling::sampler_t, std::true_ OMNITRACE_DEFINE_CONCRETE_TRAIT(provide_backtrace, sampling::sampler_t, std::false_type) OMNITRACE_DEFINE_CONCRETE_TRAIT(buffer_size, sampling::sampler_t, - TIMEMORY_ESC(std::integral_constant)) + TIMEMORY_ESC(std::integral_constant)) namespace omnitrace { @@ -393,6 +415,9 @@ get_offload_mutex() using sampler_bundle_t = typename sampler_t::bundle_type; using sampler_buffer_t = tim::data_storage::ring_buffer; +using pos_type = typename std::fstream::pos_type; + +auto offload_seq_data = std::unordered_map>{}; void offload_buffer(int64_t _seq, sampler_buffer_t&& _buf) @@ -407,16 +432,18 @@ offload_buffer(int64_t _seq, sampler_buffer_t&& _buf) auto& _file = get_offload_file(); OMNITRACE_REQUIRE(_file) - << "Error! sampling allocator tried to offload buffer of samples but the " - "offload file does not exist\n"; + << "Error! 
sampling allocator tried to offload buffer of samples for thread " + << _seq << " but the offload file does not exist\n"; OMNITRACE_VERBOSE_F(2, "Offloading %zu samples for thread %li to %s...\n", _buf.count(), _seq, _file->filename.c_str()); auto& _fs = _file->stream; - OMNITRACE_REQUIRE(_fs.good()) - << "Error! temporary file for offloading buffer is in an invalid state\n"; + OMNITRACE_REQUIRE(_fs.good()) << "Error! temporary file for offloading buffer is in " + "an invalid state during offload for thread " + << _seq << "\n"; + offload_seq_data[_seq].emplace(_fs.tellg()); _fs.write(reinterpret_cast(&_seq), sizeof(_seq)); auto _data = std::move(_buf); _data.save(_fs); @@ -425,9 +452,9 @@ offload_buffer(int64_t _seq, sampler_buffer_t&& _buf) } auto -load_offload_buffer() +load_offload_buffer(int64_t _thread_idx) { - auto _data = std::map>{}; + auto _data = std::vector{}; if(!get_use_tmp_files()) { OMNITRACE_WARNING_F( @@ -448,30 +475,44 @@ load_offload_buffer() auto& _fs = _file->stream; - _fs.close(); - _file->open(std::ios::binary | std::ios::in); + if(_fs.is_open()) _fs.close(); - if(!_fs) + if(!_file->open(std::ios::binary | std::ios::in)) { OMNITRACE_WARNING_F(0, "[sampling] %s failed to open", _file->filename.c_str()); + return _data; } - while(!_fs.eof()) + if(offload_seq_data.count(_thread_idx) == 0) return _data; + + size_t _count = 0; + for(auto itr : offload_seq_data.at(_thread_idx)) { + _fs.seekg(itr); // set to the absolute position + int64_t _seq = 0; _fs.read(reinterpret_cast(&_seq), sizeof(_seq)); - if(_fs.eof()) - { - OMNITRACE_VERBOSE_F(2, "[sampling] No more samples found in file...\n"); - break; - } + if(_fs.eof()) break; + sampler_buffer_t _buffer{}; _buffer.load(_fs); - OMNITRACE_VERBOSE_F(2, "[sampling] Loading %zu samples for thread %li...\n", - _buffer.count(), _seq); - _data[_seq].emplace_back(std::move(_buffer)); + + if(_seq != _thread_idx) + { + OMNITRACE_WARNING_F( + 0, + "[sampling] file position %zu returned %zi instead of (expected) 
%zi\n", + static_cast(itr), _seq, _thread_idx); + continue; + } + _count += _buffer.count(); + _data.emplace_back(std::move(_buffer)); } - _file.reset(); + + OMNITRACE_VERBOSE_F(2, "[sampling] Loaded %zu samples for thread %li...\n", _count, + _thread_idx); + + _file->close(); return _data; } @@ -481,6 +522,7 @@ configure(bool _setup, int64_t _tid) { const auto& _info = thread_info::get(_tid, SequentTID); auto& _sampler = sampling::get_sampler(_tid); + auto& _perf_sampler = perf::get_instance(_tid); auto& _running = get_sampler_running(_tid); bool _is_running = (!_running) ? false : *_running; auto& _signal_types = sampling::get_signal_types(_tid); @@ -492,8 +534,9 @@ configure(bool _setup, int64_t _tid) OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false); OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - auto&& _cpu_tids = get_sampling_cpu_tids(); - auto&& _real_tids = get_sampling_real_tids(); + auto&& _cputime_tids = get_sampling_cputime_tids(); + auto&& _realtime_tids = get_sampling_realtime_tids(); + auto&& _overflow_tids = get_sampling_overflow_tids(); auto _erase_tid_signal = [_tid, &_signal_types](auto& _tids, int _signum) { if(!_tids.empty()) @@ -506,8 +549,9 @@ configure(bool _setup, int64_t _tid) } }; - _erase_tid_signal(_cpu_tids, get_cputime_signal()); - _erase_tid_signal(_real_tids, get_realtime_signal()); + _erase_tid_signal(_cputime_tids, get_sampling_cputime_signal()); + _erase_tid_signal(_realtime_tids, get_sampling_realtime_signal()); + _erase_tid_signal(_overflow_tids, get_sampling_overflow_signal()); if(_setup && !_sampler && !_is_running && !_signal_types->empty()) { @@ -544,20 +588,79 @@ configure(bool _setup, int64_t _tid) _sampler->set_flags(SA_RESTART); _sampler->set_verbose(_verbose); - if(_signal_types->count(get_realtime_signal()) > 0) + if(_signal_types->count(get_sampling_realtime_signal()) > 0) { - _sampler->configure(timer{ get_realtime_signal(), CLOCK_REALTIME, - SIGEV_THREAD_ID, get_sampling_real_freq(), - 
get_sampling_real_delay(), _tid, + _sampler->configure(timer{ get_sampling_realtime_signal(), CLOCK_REALTIME, + SIGEV_THREAD_ID, get_sampling_realtime_freq(), + get_sampling_realtime_delay(), _tid, threading::get_sys_tid() }); } - if(_signal_types->count(get_cputime_signal()) > 0) + if(_signal_types->count(get_sampling_cputime_signal()) > 0) { - _sampler->configure(timer{ get_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, - SIGEV_THREAD_ID, get_sampling_cpu_freq(), - get_sampling_cpu_delay(), _tid, - threading::get_sys_tid() }); + _sampler->configure( + timer{ get_sampling_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, + SIGEV_THREAD_ID, get_sampling_cputime_freq(), + get_sampling_cputime_delay(), _tid, threading::get_sys_tid() }); + } + + if(_signal_types->count(get_sampling_overflow_signal()) > 0) + { + if(_signal_types->size() == 1) + trait::runtime_enabled::set(false); + + _perf_sampler = std::make_unique(); + + struct perf_event_attr _pe; + memset(&_pe, 0, sizeof(_pe)); + + auto _freq = get_sampling_overflow_freq(); + auto _overflow_event = + get_setting_value("OMNITRACE_SAMPLING_OVERFLOW_EVENT") + .value_or("perf::PERF_COUNT_HW_CACHE_REFERENCES"); + + perf::config_overflow_sampling(_pe, _overflow_event, _freq); + + _pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN; + + _pe.wakeup_events = 10; + _pe.exclude_idle = 1; + _pe.exclude_kernel = 1; + _pe.exclude_hv = 1; + _pe.exclude_callchain_kernel = 1; + _pe.disabled = 1; + _pe.inherit = 0; + + if(_pe.type == PERF_TYPE_SOFTWARE) + { + _pe.use_clockid = 1; + _pe.clockid = CLOCK_REALTIME; + } + + auto _perf_open_error = + _perf_sampler->open(_pe, _info->index_data->system_value); + + OMNITRACE_REQUIRE(!_perf_open_error) + << "perf backend for overflow failed to activate: " << *_perf_open_error; + + _perf_sampler->set_ready_signal(get_sampling_overflow_signal()); + _sampler->configure(overflow{ + get_sampling_overflow_signal(), + [](int _sig, pid_t, long, int64_t _idx) { + 
perf::get_instance(_idx)->set_ready_signal(_sig); + return true; + }, + [](int, pid_t, long, int64_t _idx) { + return perf::get_instance(_idx)->start(); + }, + [](int, pid_t, long, int64_t _idx) { + if(!perf::get_instance(_idx) || !perf::get_instance(_idx)->is_open()) + return true; + auto _stopped = perf::get_instance(_idx)->stop(); + if(_stopped) perf::get_instance(_idx)->close(); + return _stopped; + }, + _tid, threading::get_sys_tid() }); } if(get_use_tmp_files()) @@ -584,16 +687,32 @@ configure(bool _setup, int64_t _tid) for(auto itr : *_signal_types) { - const char* _type = (itr == get_realtime_signal()) ? "wall" : "CPU"; - const auto* _timer = _sampler->get_timer(itr); - if(_timer) + if(itr == get_sampling_overflow_signal()) { - OMNITRACE_VERBOSE( - 2, - "[SIG%i] Sampler for thread %lu will be triggered %.1fx per " - "second of %s-time (every %.3e milliseconds)...\n", - itr, _tid, _timer->get_frequency(units::sec), _type, - _timer->get_period(units::msec)); + auto _freq = get_sampling_overflow_freq(); + auto _overflow_event = + get_setting_value("OMNITRACE_SAMPLING_OVERFLOW_EVENT") + .value_or("perf::PERF_COUNT_HW_CACHE_REFERENCES"); + OMNITRACE_VERBOSE(2, + "[SIG%i] Sampler for thread %lu will be triggered " + "every %.1f %s events...\n", + itr, _tid, _freq, _overflow_event.c_str()); + } + else + { + const char* _type = + (itr == get_sampling_realtime_signal()) ? 
"wall" : "CPU"; + const auto* _timer = + dynamic_cast(_sampler->get_trigger(itr)); + if(_timer) + { + OMNITRACE_VERBOSE( + 2, + "[SIG%i] Sampler for thread %lu will be triggered %.1fx per " + "second of %s-time (every %.3e milliseconds)...\n", + itr, _tid, _timer->get_frequency(units::sec), _type, + _timer->get_period(units::msec)); + } } } @@ -604,7 +723,7 @@ configure(bool _setup, int64_t _tid) } else if(!_setup && _sampler && _is_running) { - OMNITRACE_DEBUG("Destroying sampler for thread %lu...\n", _tid); + OMNITRACE_DEBUG("Stopping sampler for thread %lu...\n", _tid); *_running = false; if(_tid == threading::get_id() && !_signal_types->empty()) @@ -616,22 +735,22 @@ configure(bool _setup, int64_t _tid) if(_tid == 0) { - block_samples(); - // this propagates to all threads + block_samples(); _sampler->ignore(*_signal_types); + } - // wait for the samples to finish - auto _freq = - std::max(get_sampling_cpu_freq(), get_sampling_real_freq()); - auto _period = (1.0 / _freq) * units::sec; - _period = std::min(_period, 1.0e9); // max of 1 second - std::this_thread::sleep_for( - std::chrono::nanoseconds{ static_cast(_period) }); + _sampler->stop(); + _sampler->reset(); + *_running = false; + if(_perf_sampler) _perf_sampler->stop(); + if(_tid == 0) + { for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i) { if(sampling::get_sampler(i)) sampling::get_sampler(i)->stop(); + if(perf::get_instance(i)) perf::get_instance(i)->stop(); } for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i) @@ -643,11 +762,13 @@ configure(bool _setup, int64_t _tid) } } + // wait for the samples to finish + for(auto& itr : get_sampler_allocators()) + if(itr) itr->flush(); + stop_duration_thread(); } - _sampler->stop(); - if(trait::runtime_enabled::get()) backtrace_metrics::configure(_setup, _tid); @@ -657,12 +778,39 @@ configure(bool _setup, int64_t _tid) return (_signal_types) ? 
*_signal_types : std::set{}; } +struct timer_sampling_data +{ + int64_t m_tid = -1; + uint64_t m_beg = 0; + uint64_t m_end = 0; + std::vector m_stack = {}; + backtrace_metrics m_metrics = {}; +}; + +struct overflow_sampling_data +{ + int64_t m_tid = -1; + uint64_t m_beg = 0; + uint64_t m_end = 0; + std::vector m_stack = {}; +}; + +std::vector +post_process_timer_data(int64_t, const bundle_t*, const std::vector&); + +std::vector +post_process_overflow_data(int64_t, const bundle_t*, const std::vector&); + void -post_process_perfetto(int64_t _tid, const bundle_t* _init, - const std::vector& _data); +post_process_perfetto(int64_t, const std::vector&, + const std::vector&); + void -post_process_timemory(int64_t _tid, const bundle_t* _init, - const std::vector& _data); +post_process_timemory(int64_t, const std::vector&, + const std::vector&); + +auto static_strings = std::set{}; + } // namespace unique_ptr_t>& @@ -753,19 +901,13 @@ post_process() auto _internal_samples = std::atomic{ 0 }; OMNITRACE_VERBOSE(2 || get_debug_sampling(), "Stopping sampling components...\n"); - omnitrace::component::backtrace::stop(); - for(size_t i = 0; i < max_supported_threads; ++i) - { - backtrace_metrics::configure(false, i); - auto& _sampler = get_sampler(i); - if(_sampler) - { - _sampler->set_offload(nullptr); - _sampler->stop(); - } - } - auto _loaded_data = load_offload_buffer(); + omnitrace::component::backtrace::stop(); + configure(false, 0); + + for(auto& itr : get_sampler_allocators()) + if(itr) itr->flush(); + for(size_t i = 0; i < max_supported_threads; ++i) { auto& _sampler = get_sampler(i); @@ -796,8 +938,9 @@ post_process() OMNITRACE_VERBOSE(3 || get_debug_sampling(), "Getting sampler data for thread %lu...\n", i); - auto _raw_data = _sampler->get_data(); - for(auto litr : _loaded_data[i]) + auto _raw_data = _sampler->get_data(); + auto _loaded_data = load_offload_buffer(i); + for(auto litr : _loaded_data) { while(!litr.is_empty()) { @@ -823,9 +966,10 @@ post_process() 
for(auto& itr : _raw_data) { auto* _bt = itr.get(); + auto* _cc = itr.get(); auto* _ts = itr.get(); - if(_thread_info && _bt && !_bt->empty() && _ts && - _thread_info->is_valid_time(_ts->get_timestamp())) + if(_thread_info && ((_bt && !_bt->empty()) || (_cc && !_cc->empty())) && + _ts && _thread_info->is_valid_time(_ts->get_timestamp())) { _data.emplace_back(&itr); } @@ -840,8 +984,11 @@ post_process() "Sampler data for thread %lu has %zu valid entries...\n", i, _data.size()); - if(get_use_perfetto()) post_process_perfetto(i, _init, _data); - if(get_use_timemory()) post_process_timemory(i, _init, _data); + auto _timer_data = post_process_timer_data(i, _init, _data); + auto _overflow_data = post_process_overflow_data(i, _init, _data); + + if(get_use_perfetto()) post_process_perfetto(i, _timer_data, _overflow_data); + if(get_use_timemory()) post_process_timemory(i, _timer_data, _overflow_data); } else { @@ -855,6 +1002,8 @@ post_process() OMNITRACE_VERBOSE(3 || get_debug_sampling(), "Destroying samplers and allocators...\n"); + get_offload_file().reset(); // remove the temporary file + for(size_t i = 0; i < max_supported_threads; ++i) get_sampler(i).reset(); @@ -878,16 +1027,103 @@ post_process() namespace { +std::vector +post_process_timer_data(int64_t _tid, const bundle_t* _init, + const std::vector& _data) +{ + auto _results = std::vector{}; + + const auto* _last = _init; + for(const auto& itr : _data) + { + auto* _bt_data = itr->get(); + auto* _bt_time = itr->get(); + auto* _bt_metrics = itr->get(); + const auto* _last_metrics = _last->get(); + + if(!_bt_data || !_bt_time || _bt_data->empty() || _bt_time->get_tid() != _tid) + continue; + + auto _ret = timer_sampling_data{}; + _ret.m_tid = _bt_time->get_tid(); + _ret.m_beg = _last->get()->get_timestamp(); + _ret.m_end = _bt_time->get_timestamp(); + _ret.m_stack = backtrace::filter_and_patch(_bt_data->get()); + if constexpr(tim::trait::is_available::value) + { + auto _hw_counters_enabled = [](const auto* _bt_v) { + 
return (_bt_v != nullptr) && + (*_bt_v)(type_list{}) && + (*_bt_v)(category::thread_hardware_counter{}); + }; + + if(_bt_metrics && _last_metrics && _hw_counters_enabled(_bt_metrics) && + _hw_counters_enabled(_last_metrics)) + { + _ret.m_metrics = (*_bt_metrics) - (*_last_metrics); + } + } + + _results.emplace_back(std::move(_ret)); + _last = itr; + } + + std::sort(_results.begin(), _results.end(), + [](const auto& _lhs, const auto& _rhs) { return _lhs.m_beg < _rhs.m_beg; }); + + return _results; +} + +std::vector +post_process_overflow_data(int64_t _tid, const bundle_t*, + const std::vector& _data) +{ + auto _results = std::vector{}; + + uint64_t _last_call_ts = 0; + uint64_t _perf_ts_offset = 0; + for(const auto& itr : _data) + { + auto* _bt_call = itr->get(); + auto* _bt_time = itr->get(); + + if(!_bt_call || !_bt_time || _bt_call->empty() || _bt_time->get_tid() != _tid) + continue; + + for(const auto& pitr : callchain::filter_and_patch(_bt_call->get())) + { + if(_last_call_ts == 0) + { + _last_call_ts = pitr.first; + _perf_ts_offset = (_bt_time->get_timestamp() - pitr.first); + continue; + } + + auto _ret = overflow_sampling_data{}; + _ret.m_tid = _bt_time->get_tid(); + _ret.m_beg = _last_call_ts + _perf_ts_offset; + _ret.m_end = pitr.first + _perf_ts_offset; + _ret.m_stack = pitr.second; + _last_call_ts = pitr.first; + _results.emplace_back(std::move(_ret)); + } + } + + std::sort(_results.begin(), _results.end(), + [](const auto& _lhs, const auto& _rhs) { return _lhs.m_beg < _rhs.m_beg; }); + + return _results; +} + void -post_process_perfetto(int64_t _tid, const bundle_t* _init, - const std::vector& _data) +post_process_perfetto(int64_t _tid, const std::vector& _timer_data, + const std::vector& _overflow_data) { auto _valid_metrics = backtrace_metrics::valid_array_t{}; - for(const auto& itr : _data) + for(const auto& itr : _timer_data) { - const auto* _bt_mt = itr->get(); - if(_bt_mt) _valid_metrics |= _bt_mt->get_valid(); + _valid_metrics |= 
itr.m_metrics.get_valid(); } if(trait::runtime_enabled::get()) @@ -895,14 +1131,8 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init, OMNITRACE_VERBOSE(3 || get_debug_sampling(), "[%li] Post-processing metrics for perfetto...\n", _tid); backtrace_metrics::init_perfetto(_tid, _valid_metrics); - for(const auto& itr : _data) - { - const auto* _bt_metrics = itr->get(); - const auto* _bt_time = itr->get(); - if(!_bt_metrics || !_bt_time) continue; - if(_bt_time->get_tid() != _tid) continue; - _bt_metrics->post_process_perfetto(_tid, _bt_time->get_timestamp()); - } + for(const auto& itr : _timer_data) + itr.m_metrics.post_process_perfetto(_tid, 0.5 * (itr.m_beg + itr.m_end)); backtrace_metrics::fini_perfetto(_tid, _valid_metrics); } @@ -916,132 +1146,60 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init, uint64_t _beg_ns = _thread_info->get_start(); uint64_t _end_ns = _thread_info->get_stop(); - uint64_t _last_ts = - std::max(_init->get()->get_timestamp(), _beg_ns); - auto _track = tracing::get_perfetto_track( - category::sampling{}, - [](auto _seq_id, auto _sys_id) { - return TIMEMORY_JOIN(" ", "Thread", _seq_id, "(S)", _sys_id); - }, - _thread_info->index_data->sequent_value, _thread_info->index_data->system_value); + auto _overflow_event = + get_setting_value("OMNITRACE_SAMPLING_OVERFLOW_EVENT").value_or(""); - tracing::push_perfetto_track(category::sampling{}, "samples [omnitrace]", _track, - _beg_ns, [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", - _beg_ns); - } - }); - - auto _as_hex = [](auto _v) { return JOIN("", "0x", std::hex, _v); }; - - auto _labels = backtrace_metrics::get_hw_counter_labels(_tid); - for(const auto& itr : _data) + if(!_overflow_event.empty()) { - const auto* _bt_ts = itr->get(); - const auto* _bt_cs = itr->get(); - const auto* _bt_mt = itr->get(); + const auto _overflow_prefix = std::string_view{ "PERF_COUNT_" }; + const auto 
_overflow_pos = _overflow_event.find(_overflow_prefix); + if(_overflow_pos != std::string::npos) + _overflow_event = + _overflow_event.substr(_overflow_pos + _overflow_prefix.length()); - if(!_bt_ts || !_bt_cs || !_bt_mt) continue; - if(_bt_ts->get_tid() != _tid) continue; + const auto* _main_name = + static_strings.emplace(join(" ", _overflow_event, "samples [omnitrace]")) + .first->c_str(); - static std::set _static_strings{}; - const auto* _last = _init; - auto _patched_data = backtrace::filter_and_patch(_bt_cs->get()); - size_t _ncount = 0; - for(const auto& iitr : _patched_data) + auto _track = tracing::get_perfetto_track( + category::overflow_sampling{}, + [](auto _seq_id, auto _sys_id) { + return TIMEMORY_JOIN(" ", "Thread", _seq_id, "Overflow", "(S)", _sys_id); + }, + _thread_info->index_data->sequent_value, + _thread_info->index_data->system_value); + + tracing::push_perfetto_track(category::overflow_sampling{}, _main_name, _track, + _beg_ns, [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation( + ctx, "begin_ns", _beg_ns); + } + }); + + for(const auto& itr : _overflow_data) { - uint64_t _beg = _last_ts; - uint64_t _end = _bt_ts->get_timestamp(); + auto _beg = itr.m_beg; + auto _end = itr.m_end; + if(!_thread_info->is_valid_lifetime({ _beg, _end })) continue; - auto _ncur = _ncount++; - // the begin/end + HW counters will be same for entire call-stack so only - // annotate - // the top and the bottom functons to keep the data consumption low - bool _include_common = (_ncur == 0 || _ncur + 1 == _patched_data.size()); - - // Only annotate HW counters when: - // 1. when we can compute a difference from the last sample - // 2. 
when the number of HW counters b/t this sample and last are the same - bool _include_hw = - _include_common && (_last != nullptr) && - _bt_mt->get_hw_counters().size() == - _last->get()->get_hw_counters().size(); - - auto _hw_counters_enabled = [](const auto* _bt_v) { - return (_bt_v != nullptr) && - (*_bt_v)(type_list{}) && - (*_bt_v)(category::thread_hardware_counter{}); - }; - - // annotations common to both modes - auto _common_annotate = [&](::perfetto::EventContext& ctx, bool _is_last) { - if(_include_common && _is_last) - { - tracing::add_perfetto_annotation(ctx, "begin_ns", _beg); - tracing::add_perfetto_annotation(ctx, "end_ns", _end); - } - if(_include_hw && _is_last && _last && - _hw_counters_enabled(_last->get()) && - _hw_counters_enabled(_bt_mt)) - { - // current values when read - auto _hw_cnt_vals = _bt_mt->get_hw_counters(); - // compute difference from last sample to provide the HW counters for - // this sample - tim::math::minus(_hw_cnt_vals, - _last->get()->get_hw_counters()); - for(size_t i = 0; i < _labels.size(); ++i) - tracing::add_perfetto_annotation(ctx, _labels.at(i), - _hw_cnt_vals.at(i)); - } - }; - - if(get_sampling_include_inlines() && iitr.lineinfo) + for(const auto& iitr : itr.m_stack) { - auto _lines = iitr.lineinfo.lines; - std::reverse(_lines.begin(), _lines.end()); - size_t _n = 0; - for(const auto& litr : _lines) - { - const auto* _name = - _static_strings.emplace(demangle(litr.name)).first->c_str(); - auto _info = JOIN(':', litr.location, litr.line); - tracing::push_perfetto_track( - category::sampling{}, _name, _track, _beg, - [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) - { - _common_annotate(ctx, (_n == 0 && _ncur == 0) || - (_n + 1 == _lines.size())); - tracing::add_perfetto_annotation(ctx, "file", - iitr.location); - tracing::add_perfetto_annotation(ctx, "lineinfo", _info); - tracing::add_perfetto_annotation(ctx, "inlined", - (_n++ > 0)); - } - }); - 
tracing::pop_perfetto_track(category::sampling{}, _name, _track, - _end); - } - } - else - { - const auto* _name = _static_strings.emplace(iitr.name).first->c_str(); + const auto* _name = + static_strings.emplace(demangle(iitr.name)).first->c_str(); tracing::push_perfetto_track( - category::sampling{}, _name, _track, _beg, + category::overflow_sampling{}, _name, _track, _beg, [&](::perfetto::EventContext ctx) { if(config::get_perfetto_annotations()) { - _common_annotate(ctx, true); tracing::add_perfetto_annotation(ctx, "file", iitr.location); tracing::add_perfetto_annotation(ctx, "pc", - _as_hex(iitr.address)); + as_hex(iitr.address)); tracing::add_perfetto_annotation(ctx, "line_address", - _as_hex(iitr.line_address)); + as_hex(iitr.line_address)); if(iitr.lineinfo) { auto _lines = iitr.lineinfo.lines; @@ -1058,67 +1216,230 @@ post_process_perfetto(int64_t _tid, const bundle_t* _init, } } }); - - tracing::pop_perfetto_track(category::sampling{}, _name, _track, _end); + tracing::pop_perfetto_track(category::overflow_sampling{}, _name, _track, + _end); } } - _last_ts = _bt_ts->get_timestamp(); - _last = itr; + + tracing::pop_perfetto_track(category::overflow_sampling{}, _main_name, _track, + _end_ns, [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation( + ctx, "end_ns", _end_ns); + } + }); } - tracing::pop_perfetto_track(category::sampling{}, "samples [omnitrace]", _track, - _end_ns, [&](::perfetto::EventContext ctx) { - if(config::get_perfetto_annotations()) + if(!_timer_data.empty()) + { + auto _track = tracing::get_perfetto_track( + category::timer_sampling{}, + [](auto _seq_id, auto _sys_id) { + return TIMEMORY_JOIN(" ", "Thread", _seq_id, "(S)", _sys_id); + }, + _thread_info->index_data->sequent_value, + _thread_info->index_data->system_value); + + tracing::push_perfetto_track(category::timer_sampling{}, "samples [omnitrace]", + _track, _beg_ns, [&](::perfetto::EventContext ctx) { + 
if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation( + ctx, "begin_ns", _beg_ns); + } + }); + + auto _labels = backtrace_metrics::get_hw_counter_labels(_tid); + for(const auto& itr : _timer_data) + { + size_t _ncount = 0; + uint64_t _beg = itr.m_beg; + uint64_t _end = itr.m_end; + if(!_thread_info->is_valid_lifetime({ _beg, _end })) continue; + + for(const auto& iitr : itr.m_stack) + { + auto _ncur = _ncount++; + // the begin/end + HW counters will be same for entire call-stack so only + // annotate the top and the bottom functons to keep the data consumption + // low + bool _include_common = (_ncur == 0 || _ncur + 1 == itr.m_stack.size()); + + // Only annotate HW counters when first or last and HW counters are not + // empty + bool _include_hw = + _include_common && !itr.m_metrics.get_hw_counters().empty(); + + // annotations common to both modes + auto _common_annotate = [&](::perfetto::EventContext& ctx, + bool _is_last) { + if(_include_common && _is_last) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg); + tracing::add_perfetto_annotation(ctx, "end_ns", _end); + } + + if(_include_hw) + { + // current values when read + auto _hw_cnt_vals = itr.m_metrics.get_hw_counters(); + for(size_t i = 0; i < _labels.size(); ++i) + tracing::add_perfetto_annotation(ctx, _labels.at(i), + _hw_cnt_vals.at(i)); + } + }; + + if(get_sampling_include_inlines() && iitr.lineinfo) + { + auto _lines = iitr.lineinfo.lines; + std::reverse(_lines.begin(), _lines.end()); + size_t _n = 0; + for(const auto& litr : _lines) + { + const auto* _name = + static_strings.emplace(demangle(litr.name)).first->c_str(); + auto _info = JOIN(':', litr.location, litr.line); + tracing::push_perfetto_track( + category::timer_sampling{}, _name, _track, _beg, + [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + _common_annotate(ctx, (_n == 0 && _ncur == 0) || + (_n + 1 == _lines.size())); + tracing::add_perfetto_annotation(ctx, "file", + 
iitr.location); + tracing::add_perfetto_annotation(ctx, "lineinfo", + _info); + tracing::add_perfetto_annotation(ctx, "inlined", + (_n++ > 0)); + } + }); + tracing::pop_perfetto_track(category::timer_sampling{}, _name, + _track, _end); + } + } + else + { + const auto* _name = static_strings.emplace(iitr.name).first->c_str(); + tracing::push_perfetto_track( + category::timer_sampling{}, _name, _track, _beg, + [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + _common_annotate(ctx, true); + tracing::add_perfetto_annotation(ctx, "file", + iitr.location); + tracing::add_perfetto_annotation(ctx, "pc", + as_hex(iitr.address)); + tracing::add_perfetto_annotation( + ctx, "line_address", as_hex(iitr.line_address)); + if(iitr.lineinfo) + { + auto _lines = iitr.lineinfo.lines; + std::reverse(_lines.begin(), _lines.end()); + size_t _n = 0; + for(const auto& litr : _lines) { - tracing::add_perfetto_annotation(ctx, "end_ns", - _end_ns); + auto _label = JOIN('-', "lineinfo", _n++); + tracing::add_perfetto_annotation( + ctx, _label.c_str(), + JOIN('@', demangle(litr.name), + JOIN(':', litr.location, litr.line))); } - }); + } + } + }); + + tracing::pop_perfetto_track(category::timer_sampling{}, _name, _track, + _end); + } + } + } + + tracing::pop_perfetto_track(category::timer_sampling{}, "samples [omnitrace]", + _track, _end_ns, [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation( + ctx, "end_ns", _end_ns); + } + }); + } } void -post_process_timemory(int64_t _tid, const bundle_t* _init, - const std::vector& _data) +post_process_timemory(int64_t _tid, const std::vector& _timer_data, + const std::vector& _overflow_data) { - auto _depth_sum = std::map>{}; - OMNITRACE_VERBOSE(3 || get_debug_sampling(), "[%li] Post-processing data for timemory...\n", _tid); - const auto* _last = _init; - for(const auto& itr : _data) + // compute the total number of entries + int64_t _sum = 0; + for(const 
auto& itr : _overflow_data) + _sum += itr.m_stack.size(); + for(const auto& itr : _timer_data) + _sum += itr.m_stack.size(); + + for(const auto& itr : _overflow_data) + { + using bundle_t = tim::lightweight_tuple; + + auto _data = std::vector{}; + _data.reserve(itr.m_stack.size()); + + for(const auto& iitr : itr.m_stack) + { + _data.emplace_back(tim::string_view_t{ iitr.name }); + _data.back().push(itr.m_tid); + _data.back().start(); + } + + // stop the instances and update the values as needed + for(size_t i = 0; i < _data.size(); ++i) + { + auto& iitr = _data.at(_data.size() - i - 1); + iitr.stop(); + if constexpr(tim::trait::is_available::value) + { + auto* _sc = iitr.get(); + if(_sc) + { + auto _value = static_cast(itr.m_end - itr.m_beg) / + sampling_wall_clock::get_unit(); + _sc->set_value(_value); + _sc->set_accum(_value); + } + } + iitr.pop(); + } + } + + for(const auto& itr : _timer_data) { using bundle_t = tim::lightweight_tuple; - auto* _bt_data = itr->get(); - auto* _bt_time = itr->get(); - auto* _bt_metrics = itr->get(); - const auto* _last_metrics = _last->get(); + double _elapsed_wc = (itr.m_end - itr.m_beg); - if(!_bt_data || !_bt_time) continue; - - double _elapsed_wc = (_bt_time->get_timestamp() - - _last->get()->get_timestamp()); - - std::vector _tc{}; - _tc.reserve(_bt_data->size()); + auto _data = std::vector{}; + _data.reserve(itr.m_stack.size()); // generate the instances of the tuple of components and start them - for(const auto& iitr : backtrace::filter_and_patch(_bt_data->get())) + for(const auto& iitr : itr.m_stack) { - _tc.emplace_back(tim::string_view_t{ iitr.name }); - _tc.back().push(_bt_time->get_tid()); - _tc.back().start(); + _data.emplace_back(tim::string_view_t{ iitr.name }); + _data.back().push(itr.m_tid); + _data.back().start(); } // stop the instances and update the values as needed - for(size_t i = 0; i < _tc.size(); ++i) + for(size_t i = 0; i < _data.size(); ++i) { - auto& iitr = _tc.at(_tc.size() - i - 1); - size_t _depth = 
0; - _depth_sum[_bt_time->get_tid()][_depth] += 1; + auto& iitr = _data.at(_data.size() - i - 1); iitr.stop(); + if constexpr(tim::trait::is_available::value) { auto* _sc = iitr.get(); @@ -1129,83 +1450,86 @@ post_process_timemory(int64_t _tid, const bundle_t* _init, _sc->set_accum(_value); } } + + const auto& _metrics = itr.m_metrics; if constexpr(tim::trait::is_available::value) { auto* _cc = iitr.get(); - if(_cc && _bt_metrics && _last_metrics && - (*_bt_metrics)(category::thread_cpu_time{}) && - (*_last_metrics)(category::thread_cpu_time{})) + if(_cc && _metrics && _metrics(category::thread_cpu_time{})) { - double _elapsed_cc = (_bt_metrics->get_cpu_timestamp() - - _last_metrics->get_cpu_timestamp()); + double _elapsed_cc = _metrics.get_cpu_timestamp(); _cc->set_value(_elapsed_cc / sampling_cpu_clock::get_unit()); _cc->set_accum(_elapsed_cc / sampling_cpu_clock::get_unit()); } } + if constexpr(tim::trait::is_available::value) { - auto _hw_counters_enabled = [](const auto* _bt_v) { - return (_bt_v != nullptr) && - (*_bt_v)(type_list{}) && - (*_bt_v)(category::thread_hardware_counter{}); - }; + auto* _hw_counter = iitr.get(); - if(_bt_metrics && _last_metrics && _hw_counters_enabled(_bt_metrics) && - _hw_counters_enabled(_last_metrics)) + if(_hw_counter && _metrics && + _metrics(type_list{}) && + _metrics(category::thread_hardware_counter{})) { - auto _hw_cnt_vals = _bt_metrics->get_hw_counters(); - if(_bt_metrics->get_hw_counters().size() == - _last_metrics->get_hw_counters().size()) - { - for(size_t k = 0; k < _bt_metrics->get_hw_counters().size(); ++k) - { - if(_last_metrics->get_hw_counters()[k] > _hw_cnt_vals[k]) - _hw_cnt_vals[k] -= _last_metrics->get_hw_counters()[k]; - } - } - auto* _hw_counter = iitr.get(); - if(_hw_counter) - { - _hw_counter->set_value(_hw_cnt_vals); - _hw_counter->set_accum(_hw_cnt_vals); - } + _hw_counter->set_value(_metrics.get_hw_counters()); + _hw_counter->set_accum(_metrics.get_hw_counters()); } } + iitr.pop(); } - _last = itr; } 
- for(auto&& itr : _data) + for(auto&& itr : _overflow_data) { using bundle_t = - tim::lightweight_tuple>; + tim::lightweight_tuple>; - auto* _bt_data = itr->get(); - auto* _bt_time = itr->get(); - - if(!_bt_time || !_bt_data) continue; - if(_depth_sum.find(_bt_time->get_tid()) == _depth_sum.end()) continue; - - std::vector _tc{}; - _tc.reserve(_bt_data->size()); + auto _data = std::vector{}; + _data.reserve(itr.m_stack.size()); // generate the instances of the tuple of components and start them - for(const auto& iitr : backtrace::filter_and_patch(_bt_data->get())) + for(const auto& iitr : itr.m_stack) { - _tc.emplace_back(tim::string_view_t{ iitr.name }); - _tc.back().push(_bt_time->get_tid()); - _tc.back().start(); + _data.emplace_back(tim::string_view_t{ iitr.name }); + _data.back().push(itr.m_tid); + _data.back().start(); } // stop the instances and update the values as needed - for(size_t i = 0; i < _tc.size(); ++i) + for(size_t i = 0; i < _data.size(); ++i) { - auto& iitr = _tc.at(_tc.size() - i - 1); - size_t _depth = 0; - double _value = (1.0 / _depth_sum[_bt_time->get_tid()][_depth]) * 100.0; + auto& iitr = _data.at(_data.size() - i - 1); + double _value = (1.0 / _sum) * 100.0; + iitr.store(std::plus{}, _value); + iitr.stop(); + iitr.pop(); + } + } + + for(auto&& itr : _timer_data) + { + using bundle_t = + tim::lightweight_tuple>; + + auto _data = std::vector{}; + _data.reserve(itr.m_stack.size()); + + // generate the instances of the tuple of components and start them + for(const auto& iitr : itr.m_stack) + { + _data.emplace_back(tim::string_view_t{ iitr.name }); + _data.back().push(itr.m_tid); + _data.back().start(); + } + + // stop the instances and update the values as needed + for(size_t i = 0; i < _data.size(); ++i) + { + auto& iitr = _data.at(_data.size() - i - 1); + double _value = (1.0 / _sum) * 100.0; iitr.store(std::plus{}, _value); iitr.stop(); iitr.pop(); @@ -1225,6 +1549,7 @@ struct sampling_initialization sampling_percent::label() = 
"sampling_percent"; sampling_percent::description() = "Percentage of samples"; + sampling_percent::set_precision(3); sampling_gpu_busy::label() = "sampling_gpu_busy_percent"; sampling_gpu_busy::description() = "Utilization of GPU(s)"; diff --git a/source/lib/omnitrace/library/sampling.hpp b/source/lib/omnitrace/library/sampling.hpp index 0c0b748b56..62234f73d9 100644 --- a/source/lib/omnitrace/library/sampling.hpp +++ b/source/lib/omnitrace/library/sampling.hpp @@ -29,6 +29,7 @@ #include "library/components/backtrace.hpp" #include "library/components/backtrace_metrics.hpp" #include "library/components/backtrace_timestamp.hpp" +#include "library/components/callchain.hpp" #include "library/thread_data.hpp" #include @@ -43,20 +44,6 @@ namespace omnitrace { namespace sampling { -using component::backtrace; // NOLINT -using component::backtrace_cpu_clock; // NOLINT -using component::backtrace_fraction; // NOLINT -using component::backtrace_metrics; // NOLINT -using component::backtrace_timestamp; // NOLINT -using component::backtrace_wall_clock; // NOLINT -using component::sampling_cpu_clock; -using component::sampling_gpu_busy; -using component::sampling_gpu_memory; -using component::sampling_gpu_power; -using component::sampling_gpu_temp; -using component::sampling_percent; -using component::sampling_wall_clock; - unique_ptr_t>& get_signal_types(int64_t _tid); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7aa15cc1d1..452a6136f9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,1423 +5,20 @@ include_guard(GLOBAL) include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-testing.cmake) -# -------------------------------------------------------------------------------------- # -# -# general config file tests -# -# -------------------------------------------------------------------------------------- # - -file( - WRITE ${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg - " -OMNITRACE_CONFIG_FILE = -FOOBAR = ON -") - -if(TARGET parallel-overhead) - 
set(_CONFIG_TEST_EXE $) -else() - set(_CONFIG_TEST_EXE ls) -endif() - -add_test( - NAME omnitrace-invalid-config - COMMAND $ -- ${_CONFIG_TEST_EXE} - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - -set_tests_properties( - omnitrace-invalid-config - PROPERTIES ENVIRONMENT - "OMNITRACE_CONFIG_FILE=${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg" TIMEOUT - 120 LABELS "config" WILL_FAIL ON) - -add_test( - NAME omnitrace-missing-config - COMMAND $ -- ${_CONFIG_TEST_EXE} - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - -set_tests_properties( - omnitrace-missing-config - PROPERTIES ENVIRONMENT - "OMNITRACE_CONFIG_FILE=${CMAKE_CURRENT_BINARY_DIR}/missing.cfg" TIMEOUT - 120 LABELS "config" WILL_FAIL ON) - -# -------------------------------------------------------------------------------------- # -# -# binary-rewrite and runtime-instrumentation tests -# -# -------------------------------------------------------------------------------------- # - -omnitrace_add_test( - NAME transpose - TARGET transpose - MPI ${TRANSPOSE_USE_MPI} - GPU ON - NUM_PROCS ${NUM_PROCS} - REWRITE_ARGS -e -v 2 --print-instructions -E uniform_int_distribution - RUNTIME_ARGS - -e - -v - 1 - --label - file - line - return - args - -E - uniform_int_distribution - ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON") - -omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_RUNTIME - NAME transpose-loops - TARGET transpose - LABELS "loops" - MPI ${TRANSPOSE_USE_MPI} - GPU ON - NUM_PROCS ${NUM_PROCS} - REWRITE_ARGS - -e - -v - 2 - --label - return - args - -l - -i - 8 - -E - uniform_int_distribution - RUN_ARGS 2 100 50 - ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF" - REWRITE_FAIL_REGEX "0 instrumented loops in procedure transpose") - -omnitrace_add_test( - SKIP_PRELOAD SKIP_RUNTIME SKIP_SAMPLING - NAME rewrite-caller - TARGET rewrite-caller - LABELS "caller-include" - REWRITE_ARGS - -e - -i - 256 - --caller-include - "^inner" - -v - 2 - --print-instrumented - functions - RUN_ARGS 17 - 
ENVIRONMENT "${_base_environment};OMNITRACE_COUT_OUTPUT=ON" - BASELINE_PASS_REGEX "number of calls made = 17" - REWRITE_PASS_REGEX "\\[function\\]\\[Forcing\\] caller-include-regex :: 'outer'" - REWRITE_RUN_PASS_REGEX ">>> ._outer ([ \\|]+) 17") - -set(OMNITRACE_ROCM_EVENTS_TEST - "GRBM_COUNT,GPUBusy,SQ_WAVES,SQ_INSTS_VALU,VALUInsts,TCC_HIT_sum,TA_TA_BUSY[0]:device=0,TA_TA_BUSY[11]:device=0" - ) - -if(OMNITRACE_USE_ROCPROFILER) - omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_RUNTIME - NAME transpose-rocprofiler - TARGET transpose - LABELS "rocprofiler" - MPI ${TRANSPOSE_USE_MPI} - GPU ON - NUM_PROCS ${NUM_PROCS} - REWRITE_ARGS -e -v 2 -E uniform_int_distribution - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" - REWRITE_RUN_PASS_REGEX - "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" - ) - - omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_RUNTIME - NAME transpose-rocprofiler-no-roctracer - TARGET transpose - LABELS "rocprofiler" - MPI ${TRANSPOSE_USE_MPI} - GPU ON - NUM_PROCS ${NUM_PROCS} - REWRITE_ARGS -e -v 2 -E uniform_int_distribution - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_ROCTRACER=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" - REWRITE_RUN_PASS_REGEX - "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" - REWRITE_RUN_FAIL_REGEX "roctracer.txt") -endif() - -omnitrace_add_test( - NAME parallel-overhead - TARGET parallel-overhead - REWRITE_ARGS -e -v 2 
--min-instructions=8 - RUNTIME_ARGS - -e - -v - 1 - --min-instructions=8 - --label - file - line - return - args - RUN_ARGS 10 ${NUM_THREADS} 1000 - ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF") - -omnitrace_add_test( - NAME parallel-overhead-locks - TARGET parallel-overhead-locks - LABELS "locks" - REWRITE_ARGS -e -i 256 - RUNTIME_ARGS -e -i 256 - RUN_ARGS 30 4 1000 - ENVIRONMENT - "${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=ON;OMNITRACE_COLLAPSE_THREADS=OFF;OMNITRACE_SAMPLING_REALTIME=ON;OMNITRACE_SAMPLING_REALTIME_FREQ=10;OMNITRACE_SAMPLING_REALTIME_TIDS=0;OMNITRACE_SAMPLING_KEEP_INTERNAL=OFF" - REWRITE_RUN_PASS_REGEX - "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" - RUNTIME_PASS_REGEX - "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" - ) - -omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME parallel-overhead-locks-timemory - TARGET parallel-overhead-locks - LABELS "locks" - REWRITE_ARGS -e -v 2 --min-instructions=32 --dyninst-options InstrStackFrames SaveFPR - TrampRecursive - RUN_ARGS 10 4 1000 - ENVIRONMENT - "${_lock_environment};OMNITRACE_FLAT_PROFILE=ON;OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF;OMNITRACE_SAMPLING_KEEP_INTERNAL=OFF" - REWRITE_RUN_PASS_REGEX - "start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000" - ) - -omnitrace_add_test( - SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING - NAME parallel-overhead-locks-perfetto - TARGET 
parallel-overhead-locks - LABELS "locks" - REWRITE_ARGS -e -v 2 --min-instructions=8 - RUN_ARGS 10 4 1000 - ENVIRONMENT - "${_lock_environment};OMNITRACE_FLAT_PROFILE=ON;OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON;OMNITRACE_SAMPLING_KEEP_INTERNAL=OFF" - ) - -omnitrace_add_test( - NAME user-api - TARGET user-api - LABELS "loops" - REWRITE_ARGS -e -v 2 -l --min-instructions=8 -E custom_push_region - RUNTIME_ARGS - -e - -v - 1 - -l - --min-instructions=8 - -E - custom_push_region - --label - file - line - return - args - RUN_ARGS 10 ${NUM_THREADS} 1000 - ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF" - REWRITE_RUN_PASS_REGEX "Pushing custom region :: run.10. x 1000" - RUNTIME_PASS_REGEX "Pushing custom region :: run.10. x 1000" - PRELOAD_PASS_REGEX "Pushing custom region :: run.10. x 1000" - BASELINE_FAIL_REGEX "Pushing custom region" - REWRITE_FAIL_REGEX "0 instrumented loops in procedure") - -if(OMNITRACE_USE_MPI OR OMNITRACE_USE_MPI_HEADERS) - omnitrace_add_test( - SKIP_RUNTIME - NAME "mpi" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - REWRITE_ARGS - -e - -v - 2 - --label - file - line - return - args - --min-instructions - 0 - ENVIRONMENT "${_base_environment};OMNITRACE_VERBOSE=1" - REWRITE_RUN_PASS_REGEX - "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" - REWRITE_RUN_FAIL_REGEX - "(perfetto-trace|trip_count|sampling_percent|sampling_cpu_clock|sampling_wall_clock|wall_clock)-[0-9][0-9]+.(json|txt|proto)" - ) - - omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME "mpi-flat-mpip" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - LABELS "mpip" - REWRITE_ARGS - -e - -v - 2 - --label - file - line - args - --min-instructions - 0 - ENVIRONMENT - "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON" - REWRITE_RUN_PASS_REGEX - ">>> mpi-flat-mpip.inst(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> pthread_create(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> 
MPI_Barrier(.*\n.*)>>> MPI_Alltoall" - ) - - omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING - NAME "mpi-flat" - TARGET mpi-example - MPI ON - NUM_PROCS 4 - LABELS "mpip" - REWRITE_ARGS - -e - -v - 2 - --label - file - line - args - --min-instructions - 0 - ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF" - REWRITE_RUN_PASS_REGEX - ">>> mpi-flat.inst(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> pthread_create(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall" - ) - - set(_mpip_environment - "OMNITRACE_USE_PERFETTO=ON" - "OMNITRACE_USE_TIMEMORY=ON" - "OMNITRACE_USE_SAMPLING=OFF" - "OMNITRACE_USE_PROCESS_SAMPLING=OFF" - "OMNITRACE_TIME_OUTPUT=OFF" - "OMNITRACE_FILE_OUTPUT=ON" - "OMNITRACE_USE_MPIP=ON" - "OMNITRACE_DEBUG=OFF" - "OMNITRACE_VERBOSE=2" - "OMNITRACE_DL_VERBOSE=2" - "${_test_openmp_env}" - "${_test_library_path}") - - set(_mpip_all2all_environment - "OMNITRACE_USE_PERFETTO=ON" - "OMNITRACE_USE_TIMEMORY=ON" - "OMNITRACE_USE_SAMPLING=OFF" - "OMNITRACE_USE_PROCESS_SAMPLING=OFF" - "OMNITRACE_TIME_OUTPUT=OFF" - "OMNITRACE_FILE_OUTPUT=ON" - "OMNITRACE_USE_MPIP=ON" - "OMNITRACE_DEBUG=ON" - "OMNITRACE_VERBOSE=3" - "OMNITRACE_DL_VERBOSE=3" - "${_test_openmp_env}" - "${_test_library_path}") - - foreach(_EXAMPLE all2all allgather allreduce bcast reduce scatter-gather send-recv) - if("${_mpip_${_EXAMPLE}_environment}" STREQUAL "") - set(_mpip_${_EXAMPLE}_environment "${_mpip_environment}") - endif() - omnitrace_add_test( - SKIP_RUNTIME SKIP_SAMPLING SKIP_PRELOAD - NAME "mpi-${_EXAMPLE}" - TARGET mpi-${_EXAMPLE} - MPI ON - NUM_PROCS 2 - LABELS "mpip" - REWRITE_ARGS -e -v 2 --label file line --min-instructions 0 - RUN_ARGS 30 - ENVIRONMENT "${_mpip_${_EXAMPLE}_environment}") - endforeach() -endif() - -omnitrace_add_test( - NAME lulesh - TARGET lulesh - MPI ${LULESH_USE_MPI} - GPU ${LULESH_USE_GPU} - NUM_PROCS 8 - LABELS "kokkos" - REWRITE_ARGS -e -v 2 --label file line return args - RUNTIME_ARGS - -e - -v - 1 - 
--label - file - line - return - args - -ME - [==[lib(gomp|m-)]==] - LABELS "kokkos;kokkos-profile-library" - RUN_ARGS -i 25 -s 20 -p - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" - REWRITE_RUN_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]" - RUNTIME_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]") - -omnitrace_add_test( - SKIP_RUNTIME SKIP_REWRITE - NAME lulesh-baseline-kokkosp-libomnitrace - TARGET lulesh - MPI ${LULESH_USE_MPI} - GPU ${LULESH_USE_GPU} - NUM_PROCS 8 - LABELS "kokkos;kokkos-profile-library" - RUN_ARGS -i 10 -s 20 -p - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace.so" - BASELINE_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]") - -omnitrace_add_test( - SKIP_RUNTIME SKIP_REWRITE - NAME lulesh-baseline-kokkosp-libomnitrace-dl - TARGET lulesh - MPI ${LULESH_USE_MPI} - GPU ${LULESH_USE_GPU} - NUM_PROCS 8 - LABELS "kokkos;kokkos-profile-library" - RUN_ARGS -i 10 -s 20 -p - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" - BASELINE_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]") - -omnitrace_add_test( - SKIP_BASELINE - NAME lulesh-kokkosp - TARGET lulesh - MPI ${LULESH_USE_MPI} - GPU ${LULESH_USE_GPU} - NUM_PROCS 8 - LABELS "kokkos" - REWRITE_ARGS -e -v 2 - RUNTIME_ARGS - -e - -v - 1 - --label - file - line - return - args - -ME - [==[lib(gomp|m-)]==] - RUN_ARGS -i 10 -s 20 -p - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON") - -omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING - NAME lulesh-perfetto - TARGET lulesh - MPI ${LULESH_USE_MPI} - 
GPU ${LULESH_USE_GPU} - NUM_PROCS 8 - LABELS "kokkos;loops" - REWRITE_ARGS -e -v 2 - RUNTIME_ARGS - -e - -v - 1 - -l - --dynamic-callsites - --traps - --allow-overlapping - -ME - [==[libgomp]==] - RUN_ARGS -i 10 -s 20 -p - ENVIRONMENT - "${_perfetto_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF") - -omnitrace_add_test( - SKIP_SAMPLING - NAME lulesh-timemory - TARGET lulesh - MPI ${LULESH_USE_MPI} - GPU ${LULESH_USE_GPU} - NUM_PROCS 8 - LABELS "kokkos;loops" - REWRITE_ARGS -e -v 2 -l --dynamic-callsites --traps --allow-overlapping - RUNTIME_ARGS - -e - -v - 1 - -l - --dynamic-callsites - -ME - [==[libgomp]==] - -d - wall_clock - peak_rss - RUN_ARGS -i 10 -s 20 -p - ENVIRONMENT - "${_timemory_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF" - REWRITE_FAIL_REGEX "0 instrumented loops in procedure") - -if(OMNITRACE_OPENMP_USING_LIBOMP_LIBRARY AND OMNITRACE_USE_OMPT) - set(_OMPT_PASS_REGEX "\\|_ompt_") -else() - set(_OMPT_PASS_REGEX "") -endif() - -omnitrace_add_test( - SKIP_SAMPLING - NAME openmp-cg - TARGET openmp-cg - LABELS "openmp" - REWRITE_ARGS -e -v 2 --instrument-loops - RUNTIME_ARGS -e -v 1 --label return args - REWRITE_TIMEOUT 180 - RUNTIME_TIMEOUT 360 - ENVIRONMENT "${_ompt_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_COUT_OUTPUT=ON" - REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" - RUNTIME_PASS_REGEX "${_OMPT_PASS_REGEX}" - REWRITE_FAIL_REGEX "0 instrumented loops in procedure") - -omnitrace_add_test( - SKIP_RUNTIME - NAME openmp-lu - TARGET openmp-lu - LABELS "openmp" - REWRITE_ARGS -e -v 2 --instrument-loops - RUNTIME_ARGS -e -v 1 --label return args -E ^GOMP - REWRITE_TIMEOUT 180 - RUNTIME_TIMEOUT 360 - ENVIRONMENT - "${_ompt_environment};OMNITRACE_USE_SAMPLING=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_COUT_OUTPUT=ON" - REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" - REWRITE_FAIL_REGEX "0 instrumented loops in procedure") - -set(_ompt_preload_environ - "${_ompt_environment}" - "OMNITRACE_VERBOSE=2" - 
"OMNITRACE_USE_OMPT=OFF" - "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=OFF" - "OMNITRACE_SAMPLING_FREQ=100" - "OMNITRACE_SAMPLING_DELAY=0.1" - "OMNITRACE_SAMPLING_DURATION=0.25" - "OMNITRACE_SAMPLING_CPUTIME=ON" - "OMNITRACE_SAMPLING_REALTIME=ON" - "OMNITRACE_SAMPLING_CPUTIME_FREQ=1000" - "OMNITRACE_SAMPLING_REALTIME_FREQ=500" - "OMNITRACE_MONOCHROME=ON") - -set(_ompt_sample_no_tmpfiles_environ - "${_ompt_environment}" - "OMNITRACE_VERBOSE=2" - "OMNITRACE_USE_OMPT=OFF" - "OMNITRACE_USE_SAMPLING=ON" - "OMNITRACE_USE_PROCESS_SAMPLING=OFF" - "OMNITRACE_SAMPLING_CPUTIME=ON" - "OMNITRACE_SAMPLING_REALTIME=OFF" - "OMNITRACE_SAMPLING_CPUTIME_FREQ=700" - "OMNITRACE_USE_TEMPORARY_FILES=OFF" - "OMNITRACE_MONOCHROME=ON") - -set(_ompt_preload_samp_regex - "Sampler for thread 0 will be triggered 1000.0x per second of CPU-time(.*)Sampler for thread 0 will be triggered 500.0x per second of wall-time(.*)Sampling will be disabled after 0.250000 seconds(.*)Sampling duration of 0.250000 seconds has elapsed. 
Shutting down sampling" - ) -set(_ompt_preload_file_regex - "sampling-duration-preload/sampling_percent.(json|txt)(.*)sampling-duration-preload/sampling_cpu_clock.(json|txt)(.*)sampling-duration-preload/sampling_wall_clock.(json|txt)" - ) -set(_notmp_preload_file_regex - "sampling-no-tmp-files-preload/sampling_percent.(json|txt)(.*)sampling-no-tmp-files-preload/sampling_cpu_clock.(json|txt)(.*)sampling-no-tmp-files-preload/sampling_wall_clock.(json|txt)" - ) - -omnitrace_add_test( - SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE SKIP_SAMPLING - NAME openmp-cg-sampling-duration - TARGET openmp-cg - LABELS "openmp;sampling-duration" - ENVIRONMENT "${_ompt_preload_environ}" - PRELOAD_PASS_REGEX "${_ompt_preload_samp_regex}(.*)${_ompt_preload_file_regex}") - -omnitrace_add_test( - SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE SKIP_SAMPLING - NAME openmp-lu-sampling-duration - TARGET openmp-lu - LABELS "openmp;sampling-duration" - ENVIRONMENT "${_ompt_preload_environ}" - PRELOAD_PASS_REGEX "${_ompt_preload_samp_regex}(.*)${_ompt_preload_file_regex}") - -omnitrace_add_test( - SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE SKIP_SAMPLING - NAME openmp-cg-sampling-no-tmp-files - TARGET openmp-cg - LABELS "openmp;no-tmp-files" - ENVIRONMENT "${_ompt_sample_no_tmpfiles_environ}" - PRELOAD_PASS_REGEX "${_notmp_preload_file_regex}") - -omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_PRELOAD - NAME code-coverage - TARGET code-coverage - REWRITE_ARGS - -e - -v - 2 - --min-instructions=4 - -E - ^std:: - -M - coverage - --coverage - function - RUNTIME_ARGS - -e - -v - 1 - --min-instructions=4 - -E - ^std:: - --label - file - line - return - args - -M - coverage - --coverage - function - --module-restrict - code.coverage - LABELS "coverage;function-coverage" - RUN_ARGS 10 ${NUM_THREADS} 1000 - ENVIRONMENT "${_base_environment}" - RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%" - REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%") - -omnitrace_add_test( - 
SKIP_BASELINE SKIP_SAMPLING SKIP_PRELOAD - NAME code-coverage-hybrid - TARGET code-coverage - REWRITE_ARGS -e -v 2 --min-instructions=4 -E ^std:: --coverage function - RUNTIME_ARGS - -e - -v - 1 - --min-instructions=4 - -E - ^std:: - --label - file - line - return - args - --coverage - function - --module-restrict - code.coverage - LABELS "coverage;function-coverage;hybrid-coverage" - RUN_ARGS 10 ${NUM_THREADS} 1000 - ENVIRONMENT "${_base_environment}" - RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%" - REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%") - -omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_PRELOAD - NAME code-coverage-basic-blocks - TARGET code-coverage - REWRITE_ARGS - -e - -v - 2 - --min-instructions=4 - -E - ^std:: - -M - coverage - --coverage - basic_block - RUNTIME_ARGS - -e - -v - 1 - --min-instructions=4 - -E - ^std:: - --label - file - line - return - args - -M - coverage - --coverage - basic_block - --module-restrict - code.coverage - LABELS "coverage;bb-coverage" - RUN_ARGS 10 ${NUM_THREADS} 1000 - ENVIRONMENT "${_base_environment}" - RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%" - REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%") - -omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_PRELOAD - NAME code-coverage-basic-blocks-hybrid - TARGET code-coverage - REWRITE_ARGS -e -v 2 --min-instructions=4 -E ^std:: --coverage basic_block - RUNTIME_ARGS - -e - -v - 1 - --min-instructions=4 - -E - ^std:: - --label - file - line - return - args - --coverage - basic_block - --module-restrict - code.coverage - LABELS "coverage;bb-coverage;hybrid-coverage" - RUN_ARGS 10 ${NUM_THREADS} 1000 - ENVIRONMENT "${_base_environment}" - RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%" - REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%") - -if(_OS_RELEASE STREQUAL "ubuntu-18.04") - set(_TRACE_WINDOW_SKIP SKIP_RUNTIME) -endif() - 
-omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_PRELOAD ${_TRACE_WINDOW_SKIP} - NAME trace-time-window - TARGET trace-time-window - REWRITE_ARGS -e -v 2 --caller-include inner -i 4096 - RUNTIME_ARGS -e -v 1 --caller-include inner -i 4096 - LABELS "time-window" - ENVIRONMENT "${_window_environment};OMNITRACE_TRACE_DURATION=1.25") - -omnitrace_add_validation_test( - NAME trace-time-window-binary-rewrite - TIMEMORY_METRIC "wall_clock" - TIMEMORY_FILE "wall_clock.json" - PERFETTO_METRIC "host" - PERFETTO_FILE "perfetto-trace.proto" - LABELS "time-window" - FAIL_REGEX "outer_d" - ARGS -l - trace-time-window.inst - outer_a - outer_b - outer_c - -c - 1 - 1 - 1 - 1 - -d - 0 - 1 - 1 - 1 - -p) - -omnitrace_add_validation_test( - NAME trace-time-window-runtime-instrument - TIMEMORY_METRIC "wall_clock" - TIMEMORY_FILE "wall_clock.json" - PERFETTO_METRIC "host" - PERFETTO_FILE "perfetto-trace.proto" - LABELS "time-window" - FAIL_REGEX "outer_d" - ARGS -l - trace-time-window - outer_a - outer_b - outer_c - -c - 1 - 1 - 1 - 1 - -d - 0 - 1 - 1 - 1 - -p) - -omnitrace_add_test( - SKIP_BASELINE SKIP_SAMPLING SKIP_PRELOAD ${_TRACE_WINDOW_SKIP} - NAME trace-time-window-delay - TARGET trace-time-window - REWRITE_ARGS -e -v 2 --caller-include inner -i 4096 - RUNTIME_ARGS -e -v 1 --caller-include inner -i 4096 - LABELS "time-window" - ENVIRONMENT - "${_window_environment};OMNITRACE_TRACE_DELAY=0.75;OMNITRACE_TRACE_DURATION=0.75") - -omnitrace_add_validation_test( - NAME trace-time-window-delay-binary-rewrite - TIMEMORY_METRIC "wall_clock" - TIMEMORY_FILE "wall_clock.json" - PERFETTO_METRIC "host" - PERFETTO_FILE "perfetto-trace.proto" - LABELS "time-window" - ARGS -l - outer_c - outer_d - -c - 1 - 1 - -d - 0 - 0 - -p) - -omnitrace_add_validation_test( - NAME trace-time-window-delay-runtime-instrument - TIMEMORY_METRIC "wall_clock" - TIMEMORY_FILE "wall_clock.json" - PERFETTO_METRIC "host" - PERFETTO_FILE "perfetto-trace.proto" - LABELS "time-window" - ARGS -l - outer_c - outer_d - 
-c - 1 - 1 - -d - 0 - 0 - -p) - -omnitrace_add_test( - NAME fork - TARGET fork-example - REWRITE_ARGS -e -v 2 --print-instrumented modules -i 16 - RUNTIME_ARGS -e -v 1 --label file -i 16 - ENVIRONMENT - "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON;OMNITRACE_SAMPLING_FREQ=250;OMNITRACE_SAMPLING_REALTIME=ON" - PRELOAD_PASS_REGEX "fork.. called on PID" - RUNTIME_PASS_REGEX "fork.. called on PID" - REWRITE_RUN_PASS_REGEX "fork.. called on PID" - PRELOAD_FAIL_REGEX - "(terminate called after throwing an instance|calling abort.. in |Exit code: [1-9])" - RUNTIME_FAIL_REGEX - "(terminate called after throwing an instance|calling abort.. in |Exit code: [1-9])" - REWRITE_RUN_FAIL_REGEX - "(terminate called after throwing an instance|calling abort.. in |Exit code: [1-9])" - ) - -# -------------------------------------------------------------------------------------- # -# -# critical-trace tests -# -# -------------------------------------------------------------------------------------- # - -omnitrace_add_test( - SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING SKIP_PRELOAD - NAME parallel-overhead-critical-trace - TARGET parallel-overhead - LABELS "critical-trace" - REWRITE_ARGS - -e - -i - 8 - -E - "^fib" - -v - 2 - --print-instrumented - functions - RUN_ARGS 10 4 100 - ENVIRONMENT "${_critical_trace_environment}") - -add_test( - NAME parallel-overhead-process-critical-trace - COMMAND - $ - ${PROJECT_BINARY_DIR}/omnitrace-tests-output/parallel-overhead-critical-trace-binary-rewrite/call-chain.json - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - -set(_parallel_overhead_critical_trace_environ - "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" - "OMNITRACE_OUTPUT_PREFIX=parallel-overhead-critical-trace/" - "OMNITRACE_CRITICAL_TRACE_DEBUG=ON" - "OMNITRACE_VERBOSE=4" - "OMNITRACE_USE_PID=OFF" - "OMNITRACE_TIME_OUTPUT=OFF") - -set_tests_properties( - parallel-overhead-process-critical-trace - PROPERTIES - ENVIRONMENT - "${_parallel_overhead_critical_trace_environ}" - TIMEOUT - 300 - 
LABELS - "parallel-overhead;critical-trace" - PASS_REGULAR_EXPRESSION - "Outputting.*(critical-trace-cpu.json).*Outputting.*(critical-trace-any.json)" - DEPENDS - parallel-overhead-critical-trace-binary-rewrite-run) - -# -------------------------------------------------------------------------------------- # -# -# attach tests -# -# -------------------------------------------------------------------------------------- # - -set(_VALID_PTRACE_SCOPE OFF) -if(EXISTS "/proc/sys/kernel/yama/ptrace_scope") - file(READ "/proc/sys/kernel/yama/ptrace_scope" _PTRACE_SCOPE LIMIT 1) - if("${_PTRACE_SCOPE}" EQUAL 0) - set(_VALID_PTRACE_SCOPE ON) - endif() -else() - omnitrace_message( - AUTHOR_WARNING - "Disabling attach tests. Run 'echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope' to enable attaching to process" - ) -endif() - -if(TARGET parallel-overhead AND _VALID_PTRACE_SCOPE) - add_test( - NAME parallel-overhead-attach - COMMAND - ${CMAKE_CURRENT_LIST_DIR}/run-omnitrace-pid.sh - $ -ME "\.c$" -E fib -e -v 1 --label return - args file -l -- $ 30 8 1000 - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - - set(_parallel_overhead_attach_environ - "${_attach_environment}" "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" - "OMNITRACE_OUTPUT_PREFIX=parallel-overhead-attach/") - - set_tests_properties( - parallel-overhead-attach - PROPERTIES ENVIRONMENT - "${_parallel_overhead_attach_environ}" - TIMEOUT - 300 - LABELS - "parallel-overhead;attach" - PASS_REGULAR_EXPRESSION - "Outputting.*(perfetto-trace.proto).*Outputting.*(wall_clock.txt)" - FAIL_REGULAR_EXPRESSION - "Dyninst was unable to attach to the specified process") -endif() - -# -------------------------------------------------------------------------------------- # -# -# rccl tests -# -# -------------------------------------------------------------------------------------- # - -foreach(_TARGET ${RCCL_TEST_TARGETS}) - string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}") - string(REPLACE "_" "-" _NAME "${_NAME}") - 
omnitrace_add_test( - SKIP_SAMPLING - NAME rccl-test-${_NAME} - TARGET ${_TARGET} - LABELS "rccl-tests;rcclp" - MPI ON - GPU ON - NUM_PROCS 1 - REWRITE_ARGS - -e - -v - 2 - -i - 8 - --label - file - line - return - args - RUNTIME_ARGS - -e - -v - 1 - -i - 8 - --label - file - line - return - args - -ME - sysdeps - --log-file - rccl-test-${_NAME}.log - RUN_ARGS -t - 1 - -g - 1 - -i - 10 - -w - 2 - -m - 2 - -p - -c - 1 - -z - -s - 1 - ENVIRONMENT "${_rccl_environment}") -endforeach() - -# -------------------------------------------------------------------------------------- # -# -# causal profiling tests -# -# -------------------------------------------------------------------------------------- # - -omnitrace_add_causal_test( - NAME cpu-omni-func - TARGET causal-cpu-omni - RUN_ARGS 70 10 432525 1000000000 - CAUSAL_MODE "function" - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME cpu-omni-func-ndebug - TARGET causal-cpu-omni-ndebug - RUN_ARGS 70 10 432525 1000000000 - CAUSAL_MODE "function" - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME cpu-omni-line - TARGET causal-cpu-omni - RUN_ARGS 70 10 432525 1000000000 - CAUSAL_MODE "line" - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -omnitrace_add_causal_test( - NAME both-omni-func - TARGET causal-both-omni - RUN_ARGS 70 10 432525 400000000 - CAUSAL_MODE "function" - CAUSAL_ARGS - -w - 1 - -d - 3 - --monochrome - -g - ${CMAKE_BINARY_DIR}/omnitrace-tests-config/causal-both-omni-func - -l - causal-both-omni - -v - 3 - ENVIRONMENT "OMNITRACE_STRICT_CONFIG=OFF" - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -omnitrace_add_causal_test( - NAME lulesh-func - TARGET 
lulesh-omni - RUN_ARGS -i 35 -s 50 -p - CAUSAL_MODE "function" - CAUSAL_ARGS -s 0,10,25,50,75 - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME lulesh-func-ndebug - TARGET lulesh-omni-ndebug - RUN_ARGS -i 35 -s 50 -p - CAUSAL_MODE "function" - CAUSAL_ARGS -s 0,10,25,50,75 - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME lulesh-line - TARGET lulesh-omni - RUN_ARGS -i 35 -s 50 -p - CAUSAL_MODE "line" - CAUSAL_ARGS -s 0,10,25,50,75 -S lulesh.cc - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - ) - -set(_causal_common_args - "-n 5 -e -s 0 10 20 30 -B $") - -macro( - causal_e2e_args_and_validation - _NAME - _TEST - _MODE - _EXPER - _V10 # expected value for virtual speedup of 10 - _V20 - _V30 - _TOL # tolerance for virtual speedup - ) - # arguments to omnitrace-causal - set(${_NAME}_args "${_causal_common_args} ${_MODE} ${_EXPER}") - - # arguments to validate-causal-json.py - set(${_NAME}_valid - "-n 0 -i omnitrace-tests-output/causal-cpu-omni-${_TEST}-e2e/causal/experiments.json -v ${_EXPER} $ 10 ${_V10} ${_TOL} ${_EXPER} $ 20 ${_V20} ${_TOL} ${_EXPER} $ 30 ${_V30} ${_TOL}" - ) - # patch string for command-line - string(REPLACE " " ";" ${_NAME}_args "${${_NAME}_args}") - string(REPLACE " " ";" ${_NAME}_valid "${${_NAME}_valid}") -endmacro() - -causal_e2e_args_and_validation(_causal_slow_func slow-func "-F" "cpu_slow_func" 10 20 20 - 5) -causal_e2e_args_and_validation(_causal_fast_func fast-func "-F" "cpu_fast_func" 0 0 0 5) -causal_e2e_args_and_validation(_causal_line_155 line-155 "-S" "causal.cpp:155" 10 20 20 5) -causal_e2e_args_and_validation(_causal_line_165 line-165 "-S" "causal.cpp:165" 0 0 0 5) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME cpu-omni-slow-func-e2e - TARGET 
causal-cpu-omni - RUN_ARGS 80 12 432525 500000000 - CAUSAL_MODE "func" - CAUSAL_ARGS ${_causal_slow_func_args} - CAUSAL_VALIDATE_ARGS ${_causal_slow_func_valid} - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY ON) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME cpu-omni-fast-func-e2e - TARGET causal-cpu-omni - RUN_ARGS 80 12 432525 500000000 - CAUSAL_MODE "func" - CAUSAL_ARGS ${_causal_fast_func_args} - CAUSAL_VALIDATE_ARGS ${_causal_fast_func_valid} - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY OFF) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME cpu-omni-line-155-e2e - TARGET causal-cpu-omni - RUN_ARGS 80 12 432525 500000000 - CAUSAL_MODE "line" - CAUSAL_ARGS ${_causal_line_155_args} - CAUSAL_VALIDATE_ARGS ${_causal_line_155_valid} - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY ON) - -omnitrace_add_causal_test( - SKIP_BASELINE - NAME cpu-omni-line-165-e2e - TARGET causal-cpu-omni - RUN_ARGS 80 12 432525 500000000 - CAUSAL_MODE "line" - CAUSAL_ARGS ${_causal_line_165_args} - CAUSAL_VALIDATE_ARGS ${_causal_line_165_valid} - CAUSAL_PASS_REGEX - "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" - PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY OFF) - -# -------------------------------------------------------------------------------------- # -# -# python tests -# -# -------------------------------------------------------------------------------------- # - -set(_INDEX 0) -foreach(_VERSION ${OMNITRACE_PYTHON_VERSIONS}) - if(NOT OMNITRACE_USE_PYTHON) - continue() - endif() - - list(GET OMNITRACE_PYTHON_ROOT_DIRS ${_INDEX} _PYTHON_ROOT_DIR) - - omnitrace_find_python( - _PYTHON - ROOT_DIR "${_PYTHON_ROOT_DIR}" - COMPONENTS 
Interpreter) - - # ---------------------------------------------------------------------------------- # - # python tests - # ---------------------------------------------------------------------------------- # - omnitrace_add_python_test( - NAME python-external - PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} - PYTHON_VERSION ${_VERSION} - FILE ${CMAKE_SOURCE_DIR}/examples/python/external.py - PROFILE_ARGS "--label" "file" - RUN_ARGS -v 10 -n 5 - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME python-external-exclude-inefficient - PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} - PYTHON_VERSION ${_VERSION} - FILE ${CMAKE_SOURCE_DIR}/examples/python/external.py - PROFILE_ARGS -E "^inefficient$" - RUN_ARGS -v 10 -n 5 - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME python-builtin - PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} - PYTHON_VERSION ${_VERSION} - FILE ${CMAKE_SOURCE_DIR}/examples/python/builtin.py - PROFILE_ARGS "-b" "--label" "file" "line" - RUN_ARGS -v 10 -n 5 - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME python-builtin-noprofile - PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} - PYTHON_VERSION ${_VERSION} - FILE ${CMAKE_SOURCE_DIR}/examples/python/noprofile.py - PROFILE_ARGS "-b" "--label" "file" - RUN_ARGS -v 15 -n 5 - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - STANDALONE - NAME python-source - PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} - PYTHON_VERSION ${_VERSION} - FILE ${CMAKE_SOURCE_DIR}/examples/python/source.py - RUN_ARGS -v 5 -n 5 -s 3 - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - STANDALONE - NAME python-code-coverage - PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} - PYTHON_VERSION ${_VERSION} - FILE ${CMAKE_SOURCE_DIR}/examples/code-coverage/code-coverage.py - RUN_ARGS - -i - ${PROJECT_BINARY_DIR}/omnitrace-tests-output/code-coverage-basic-blocks-binary-rewrite/coverage.json - 
${PROJECT_BINARY_DIR}/omnitrace-tests-output/code-coverage-basic-blocks-hybrid-runtime-instrument/coverage.json - -o - ${PROJECT_BINARY_DIR}/omnitrace-tests-output/code-coverage-basic-blocks-summary/coverage.json - DEPENDS code-coverage-basic-blocks-binary-rewrite - code-coverage-basic-blocks-binary-rewrite-run - code-coverage-basic-blocks-hybrid-runtime-instrument - LABELS "code-coverage" - ENVIRONMENT "${_python_environment}") - - # ---------------------------------------------------------------------------------- # - # python output tests - # ---------------------------------------------------------------------------------- # - - if(CMAKE_VERSION VERSION_LESS "3.18.0") - find_program( - OMNITRACE_CAT_EXE - NAMES cat - PATH_SUFFIXES bin) - if(OMNITRACE_CAT_EXE) - set(OMNITRACE_CAT_COMMAND ${OMNITRACE_CAT_EXE}) - endif() - else() - set(OMNITRACE_CAT_COMMAND ${CMAKE_COMMAND} -E cat) - endif() - - if(OMNITRACE_CAT_COMMAND) - omnitrace_add_python_test( - NAME python-external-check - COMMAND ${OMNITRACE_CAT_COMMAND} - PYTHON_VERSION ${_VERSION} - FILE omnitrace-tests-output/python-external/${_VERSION}/trip_count.txt - PASS_REGEX - "(\\\[compile\\\]).*(\\\| \\\|0>>> \\\[run\\\]\\\[external.py\\\]).*(\\\| \\\|0>>> \\\|_\\\[fib\\\]\\\[external.py\\\]).*(\\\| \\\|0>>> \\\|_\\\[inefficient\\\]\\\[external.py\\\])" - DEPENDS python-external-${_VERSION} - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME python-external-exclude-inefficient-check - COMMAND ${OMNITRACE_CAT_COMMAND} - PYTHON_VERSION ${_VERSION} - FILE omnitrace-tests-output/python-external-exclude-inefficient/${_VERSION}/trip_count.txt - FAIL_REGEX "(\\\|_inefficient).*(\\\|_sum)" - DEPENDS python-external-exclude-inefficient-${_VERSION} - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME python-builtin-check - COMMAND ${OMNITRACE_CAT_COMMAND} - PYTHON_VERSION ${_VERSION} - FILE omnitrace-tests-output/python-builtin/${_VERSION}/trip_count.txt - PASS_REGEX 
"\\\[inefficient\\\]\\\[builtin.py:14\\\]" - DEPENDS python-builtin-${_VERSION} - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME python-builtin-noprofile-check - COMMAND ${OMNITRACE_CAT_COMMAND} - PYTHON_VERSION ${_VERSION} - FILE omnitrace-tests-output/python-builtin-noprofile/${_VERSION}/trip_count.txt - PASS_REGEX ".(run)..(noprofile.py)." - FAIL_REGEX ".(fib|inefficient)..(noprofile.py)." - DEPENDS python-builtin-noprofile-${_VERSION} - ENVIRONMENT "${_python_environment}") - else() - omnitrace_message( - WARNING - "Neither 'cat' nor 'cmake -E cat' are available. Python source checks are disabled" - ) - endif() - - function(OMNITRACE_ADD_PYTHON_VALIDATION_TEST) - cmake_parse_arguments( - TEST "" "NAME;TIMEMORY_METRIC;TIMEMORY_FILE;PERFETTO_METRIC;PERFETTO_FILE" - "ARGS" ${ARGN}) - - omnitrace_add_python_test( - NAME ${TEST_NAME}-validate-timemory - COMMAND - ${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-timemory-json.py - -m ${TEST_TIMEMORY_METRIC} ${TEST_ARGS} -i - PYTHON_VERSION ${_VERSION} - FILE omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_TIMEMORY_FILE} - DEPENDS ${TEST_NAME}-${_VERSION} - PASS_REGEX - "omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_TIMEMORY_FILE} validated" - ENVIRONMENT "${_python_environment}") - - omnitrace_add_python_test( - NAME ${TEST_NAME}-validate-perfetto - COMMAND - ${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-perfetto-proto.py - -m ${TEST_PERFETTO_METRIC} ${TEST_ARGS} -p -i - PYTHON_VERSION ${_VERSION} - FILE omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_PERFETTO_FILE} - DEPENDS ${TEST_NAME}-${_VERSION} - PASS_REGEX - "omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_PERFETTO_FILE} validated" - ENVIRONMENT "${_python_environment}") - endfunction() - - set(python_source_labels - main_loop - run - fib - fib - fib - fib - fib - inefficient - _sum) - set(python_source_count - 5 - 3 - 3 - 6 - 12 - 18 - 6 - 3 - 3) - set(python_source_depth - 0 - 1 - 
2 - 3 - 4 - 5 - 6 - 2 - 3) - - omnitrace_add_python_validation_test( - NAME python-source - TIMEMORY_METRIC "trip_count" - TIMEMORY_FILE "trip_count.json" - PERFETTO_METRIC "host;user" - PERFETTO_FILE "perfetto-trace.proto" - ARGS -l ${python_source_labels} -c ${python_source_count} -d - ${python_source_depth}) - - set(python_builtin_labels - [run][builtin.py:28] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [fib][builtin.py:10] - [inefficient][builtin.py:14]) - set(python_builtin_count - 5 - 5 - 10 - 20 - 40 - 80 - 160 - 260 - 220 - 80 - 10 - 5) - set(python_builtin_depth - 0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 1) - - omnitrace_add_python_validation_test( - NAME python-builtin - TIMEMORY_METRIC "trip_count" - TIMEMORY_FILE "trip_count.json" - PERFETTO_METRIC "host;user" - PERFETTO_FILE "perfetto-trace.proto" - ARGS -l ${python_builtin_labels} -c ${python_builtin_count} -d - ${python_builtin_depth}) - math(EXPR _INDEX "${_INDEX} + 1") -endforeach() +# test groups +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-config-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-instrument-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-pthread-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-rocm-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-user-api-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-mpi-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-kokkos-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-openmp-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-code-coverage-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-fork-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-time-window-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-critical-trace-tests.cmake) 
+include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-attach-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-rccl-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-causal-tests.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/omnitrace-python-tests.cmake) diff --git a/tests/omnitrace-attach-tests.cmake b/tests/omnitrace-attach-tests.cmake new file mode 100644 index 0000000000..3c2e820869 --- /dev/null +++ b/tests/omnitrace-attach-tests.cmake @@ -0,0 +1,53 @@ +# -------------------------------------------------------------------------------------- # +# +# attach tests +# +# -------------------------------------------------------------------------------------- # + +set(_VALID_PTRACE_SCOPE OFF) + +if(EXISTS "/proc/sys/kernel/yama/ptrace_scope") + file(READ "/proc/sys/kernel/yama/ptrace_scope" _PTRACE_SCOPE LIMIT 1) + + if("${_PTRACE_SCOPE}" EQUAL 0) + set(_VALID_PTRACE_SCOPE ON) + endif() +else() + omnitrace_message( + AUTHOR_WARNING + "Disabling attach tests. Run 'echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope' to enable attaching to process" + ) +endif() + +if(NOT _VALID_PTRACE_SCOPE) + return() +endif() + +if(NOT TARGET parallel-overhead) + return() +endif() + +add_test( + NAME parallel-overhead-attach + COMMAND + ${CMAKE_CURRENT_LIST_DIR}/run-omnitrace-pid.sh $ + -ME "\.c$" -E fib -e -v 1 --label return args file -l -- + $ 30 8 1000 + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + +set(_parallel_overhead_attach_environ + "${_attach_environment}" "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" + "OMNITRACE_OUTPUT_PREFIX=parallel-overhead-attach/") + +set_tests_properties( + parallel-overhead-attach + PROPERTIES ENVIRONMENT + "${_parallel_overhead_attach_environ}" + TIMEOUT + 300 + LABELS + "parallel-overhead;attach" + PASS_REGULAR_EXPRESSION + "Outputting.*(perfetto-trace.proto).*Outputting.*(wall_clock.txt)" + FAIL_REGULAR_EXPRESSION + "Dyninst was unable to attach to the specified process") diff --git a/tests/omnitrace-causal-tests.cmake 
b/tests/omnitrace-causal-tests.cmake new file mode 100644 index 0000000000..62a52ceced --- /dev/null +++ b/tests/omnitrace-causal-tests.cmake @@ -0,0 +1,180 @@ +# -------------------------------------------------------------------------------------- # +# +# causal profiling tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_causal_test( + NAME cpu-omni-func + TARGET causal-cpu-omni + RUN_ARGS 70 10 432525 1000000000 + CAUSAL_MODE "function" + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME cpu-omni-func-ndebug + TARGET causal-cpu-omni-ndebug + RUN_ARGS 70 10 432525 1000000000 + CAUSAL_MODE "function" + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME cpu-omni-line + TARGET causal-cpu-omni + RUN_ARGS 70 10 432525 1000000000 + CAUSAL_MODE "line" + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +omnitrace_add_causal_test( + NAME both-omni-func + TARGET causal-both-omni + RUN_ARGS 70 10 432525 400000000 + CAUSAL_MODE "function" + CAUSAL_ARGS + -n + 2 + -w + 1 + -d + 3 + --monochrome + -g + ${CMAKE_BINARY_DIR}/omnitrace-tests-config/causal-both-omni-func + -l + causal-both-omni + -v + 3 + -b + timer + ENVIRONMENT "OMNITRACE_STRICT_CONFIG=OFF" + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +omnitrace_add_causal_test( + NAME lulesh-func + TARGET lulesh-omni + RUN_ARGS -i 35 -s 50 -p + CAUSAL_MODE "function" + CAUSAL_ARGS -s 0,10,25,50,75 + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME lulesh-func-ndebug + TARGET lulesh-omni-ndebug + RUN_ARGS 
-i 35 -s 50 -p + CAUSAL_MODE "function" + CAUSAL_ARGS -s 0,10,25,50,75 + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME lulesh-line + TARGET lulesh-omni + RUN_ARGS -i 35 -s 50 -p + CAUSAL_MODE "line" + CAUSAL_ARGS -s 0,10,25,50,75 -S lulesh.cc + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + ) + +# set(_causal_e2e_exe_args 80 100 432525 100000000) set(_causal_e2e_exe_args 80 12 432525 +# 500000000) +set(_causal_e2e_exe_args 80 50 432525 100000000) +set(_causal_common_args + "-n 5 -e -s 0 10 20 30 -B $") + +macro( + causal_e2e_args_and_validation + _NAME + _TEST + _MODE + _EXPER + _V10 # expected value for virtual speedup of 10 + _V20 + _V30 + _TOL # tolerance for virtual speedup + ) + # arguments to omnitrace-causal + set(${_NAME}_args "${_causal_common_args} ${_MODE} ${_EXPER}") + + # arguments to validate-causal-json.py + set(${_NAME}_valid + "-n 0 -i omnitrace-tests-output/causal-cpu-omni-${_TEST}-e2e/causal/experiments.json -v ${_EXPER} $ 10 ${_V10} ${_TOL} ${_EXPER} $ 20 ${_V20} ${_TOL} ${_EXPER} $ 30 ${_V30} ${_TOL}" + ) + + # patch string for command-line + string(REPLACE " " ";" ${_NAME}_args "${${_NAME}_args}") + string(REPLACE " " ";" ${_NAME}_valid "${${_NAME}_valid}") +endmacro() + +causal_e2e_args_and_validation(_causal_slow_func slow-func "-F" "cpu_slow_func" 10 20 20 + 5) +causal_e2e_args_and_validation(_causal_fast_func fast-func "-F" "cpu_fast_func" 0 0 0 5) +causal_e2e_args_and_validation(_causal_line_100 line-100 "-S" "causal.cpp:100" 10 20 20 5) +causal_e2e_args_and_validation(_causal_line_110 line-110 "-S" "causal.cpp:110" 0 0 0 5) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME cpu-omni-slow-func-e2e + TARGET causal-cpu-omni + LABELS "causal-e2e" + RUN_ARGS ${_causal_e2e_exe_args} + CAUSAL_MODE "func" + CAUSAL_ARGS ${_causal_slow_func_args} + CAUSAL_VALIDATE_ARGS 
${_causal_slow_func_valid} + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY OFF) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME cpu-omni-fast-func-e2e + TARGET causal-cpu-omni + LABELS "causal-e2e" + RUN_ARGS ${_causal_e2e_exe_args} + CAUSAL_MODE "func" + CAUSAL_ARGS ${_causal_fast_func_args} + CAUSAL_VALIDATE_ARGS ${_causal_fast_func_valid} + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY OFF) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME cpu-omni-line-100-e2e + TARGET causal-cpu-omni + LABELS "causal-e2e" + RUN_ARGS ${_causal_e2e_exe_args} + CAUSAL_MODE "line" + CAUSAL_ARGS ${_causal_line_100_args} + CAUSAL_VALIDATE_ARGS ${_causal_line_100_valid} + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY OFF) + +omnitrace_add_causal_test( + SKIP_BASELINE + NAME cpu-omni-line-110-e2e + TARGET causal-cpu-omni + LABELS "causal-e2e" + RUN_ARGS ${_causal_e2e_exe_args} + CAUSAL_MODE "line" + CAUSAL_ARGS ${_causal_line_110_args} + CAUSAL_VALIDATE_ARGS ${_causal_line_110_valid} + CAUSAL_PASS_REGEX + "Starting causal experiment #1(.*)causal/experiments.json(.*)causal/experiments.coz" + PROPERTIES PROCESSORS 2 PROCESSOR_AFFINITY OFF) diff --git a/tests/omnitrace-code-coverage-tests.cmake b/tests/omnitrace-code-coverage-tests.cmake new file mode 100644 index 0000000000..c7da89d6e9 --- /dev/null +++ b/tests/omnitrace-code-coverage-tests.cmake @@ -0,0 +1,137 @@ +# -------------------------------------------------------------------------------------- # +# +# code-coverage tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + SKIP_BASELINE SKIP_SAMPLING + NAME code-coverage + TARGET code-coverage + 
REWRITE_ARGS + -e + -v + 2 + --min-instructions=4 + -E + ^std:: + -M + coverage + --coverage + function + RUNTIME_ARGS + -e + -v + 1 + --min-instructions=4 + -E + ^std:: + --label + file + line + return + args + -M + coverage + --coverage + function + --module-restrict + code.coverage + LABELS "coverage;function-coverage" + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment}" + RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%" + REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%") + +omnitrace_add_test( + SKIP_BASELINE SKIP_SAMPLING + NAME code-coverage-hybrid + TARGET code-coverage + REWRITE_ARGS -e -v 2 --min-instructions=4 -E ^std:: --coverage function + RUNTIME_ARGS + -e + -v + 1 + --min-instructions=4 + -E + ^std:: + --label + file + line + return + args + --coverage + function + --module-restrict + code.coverage + LABELS "coverage;function-coverage;hybrid-coverage" + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment}" + RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%" + REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) code coverage :: 66.67%") + +omnitrace_add_test( + SKIP_BASELINE SKIP_SAMPLING + NAME code-coverage-basic-blocks + TARGET code-coverage + REWRITE_ARGS + -e + -v + 2 + --min-instructions=4 + -E + ^std:: + -M + coverage + --coverage + basic_block + RUNTIME_ARGS + -e + -v + 1 + --min-instructions=4 + -E + ^std:: + --label + file + line + return + args + -M + coverage + --coverage + basic_block + --module-restrict + code.coverage + LABELS "coverage;bb-coverage" + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment}" + RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%" + REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%") + +omnitrace_add_test( + SKIP_BASELINE SKIP_SAMPLING + NAME code-coverage-basic-blocks-hybrid + TARGET code-coverage + REWRITE_ARGS -e -v 2 --min-instructions=4 -E ^std:: --coverage basic_block + RUNTIME_ARGS + -e + -v + 
1 + --min-instructions=4 + -E + ^std:: + --label + file + line + return + args + --coverage + basic_block + --module-restrict + code.coverage + LABELS "coverage;bb-coverage;hybrid-coverage" + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment}" + RUNTIME_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%" + REWRITE_RUN_PASS_REGEX "(\\\[[0-9]+\\\]) function coverage :: 66.67%") diff --git a/tests/omnitrace-config-tests.cmake b/tests/omnitrace-config-tests.cmake new file mode 100644 index 0000000000..72b8706f19 --- /dev/null +++ b/tests/omnitrace-config-tests.cmake @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------- # +# +# general config file tests +# +# -------------------------------------------------------------------------------------- # + +file( + WRITE ${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg + " +OMNITRACE_CONFIG_FILE = +FOOBAR = ON +") + +if(TARGET parallel-overhead) + set(_CONFIG_TEST_EXE $) +else() + set(_CONFIG_TEST_EXE ls) +endif() + +add_test( + NAME omnitrace-invalid-config + COMMAND $ -- ${_CONFIG_TEST_EXE} + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + +set_tests_properties( + omnitrace-invalid-config + PROPERTIES ENVIRONMENT + "OMNITRACE_CONFIG_FILE=${CMAKE_CURRENT_BINARY_DIR}/invalid.cfg" TIMEOUT + 120 LABELS "config" WILL_FAIL ON) + +add_test( + NAME omnitrace-missing-config + COMMAND $ -- ${_CONFIG_TEST_EXE} + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + +set_tests_properties( + omnitrace-missing-config + PROPERTIES ENVIRONMENT + "OMNITRACE_CONFIG_FILE=${CMAKE_CURRENT_BINARY_DIR}/missing.cfg" TIMEOUT + 120 LABELS "config" WILL_FAIL ON) diff --git a/tests/omnitrace-critical-trace-tests.cmake b/tests/omnitrace-critical-trace-tests.cmake new file mode 100644 index 0000000000..d1a9633d38 --- /dev/null +++ b/tests/omnitrace-critical-trace-tests.cmake @@ -0,0 +1,52 @@ +# -------------------------------------------------------------------------------------- # +# +# critical-trace 
tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING + NAME parallel-overhead-critical-trace + TARGET parallel-overhead + LABELS "critical-trace" + REWRITE_ARGS + -e + -i + 8 + -E + "^fib" + -v + 2 + --print-instrumented + functions + RUN_ARGS 10 4 100 + ENVIRONMENT "${_critical_trace_environment}") + +add_test( + NAME parallel-overhead-process-critical-trace + COMMAND + $ + ${PROJECT_BINARY_DIR}/omnitrace-tests-output/parallel-overhead-critical-trace-binary-rewrite/call-chain.json + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + +set(_parallel_overhead_critical_trace_environ + "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" + "OMNITRACE_OUTPUT_PREFIX=parallel-overhead-critical-trace/" + "OMNITRACE_CRITICAL_TRACE_DEBUG=ON" + "OMNITRACE_VERBOSE=4" + "OMNITRACE_USE_PID=OFF" + "OMNITRACE_TIME_OUTPUT=OFF") + +set_tests_properties( + parallel-overhead-process-critical-trace + PROPERTIES + ENVIRONMENT + "${_parallel_overhead_critical_trace_environ}" + TIMEOUT + 300 + LABELS + "parallel-overhead;critical-trace" + PASS_REGULAR_EXPRESSION + "Outputting.*(critical-trace-cpu.json).*Outputting.*(critical-trace-any.json)" + DEPENDS + parallel-overhead-critical-trace-binary-rewrite-run) diff --git a/tests/omnitrace-fork-tests.cmake b/tests/omnitrace-fork-tests.cmake new file mode 100644 index 0000000000..4a074a7b91 --- /dev/null +++ b/tests/omnitrace-fork-tests.cmake @@ -0,0 +1,23 @@ +# -------------------------------------------------------------------------------------- # +# +# fork tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + NAME fork + TARGET fork-example + REWRITE_ARGS -e -v 2 --print-instrumented modules -i 16 + RUNTIME_ARGS -e -v 1 --label file -i 16 + ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON;OMNITRACE_SAMPLING_FREQ=250;OMNITRACE_SAMPLING_REALTIME=ON" + 
SAMPLING_PASS_REGEX "fork.. called on PID" + RUNTIME_PASS_REGEX "fork.. called on PID" + REWRITE_RUN_PASS_REGEX "fork.. called on PID" + SAMPLING_FAIL_REGEX + "(terminate called after throwing an instance|calling abort.. in |Exit code: [1-9])" + RUNTIME_FAIL_REGEX + "(terminate called after throwing an instance|calling abort.. in |Exit code: [1-9])" + REWRITE_RUN_FAIL_REGEX + "(terminate called after throwing an instance|calling abort.. in |Exit code: [1-9])" + ) diff --git a/tests/omnitrace-instrument-tests.cmake b/tests/omnitrace-instrument-tests.cmake new file mode 100644 index 0000000000..cb3a787016 --- /dev/null +++ b/tests/omnitrace-instrument-tests.cmake @@ -0,0 +1,54 @@ +# -------------------------------------------------------------------------------------- # +# +# binary-rewrite and runtime-instrumentation tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + SKIP_SAMPLING SKIP_RUNTIME + NAME rewrite-caller + TARGET rewrite-caller + LABELS "caller-include" + REWRITE_ARGS + -e + -i + 256 + --caller-include + "^inner" + -v + 2 + --print-instrumented + functions + RUN_ARGS 17 + ENVIRONMENT "${_base_environment};OMNITRACE_COUT_OUTPUT=ON" + BASELINE_PASS_REGEX "number of calls made = 17" + REWRITE_PASS_REGEX "\\[function\\]\\[Forcing\\] caller-include-regex :: 'outer'" + REWRITE_RUN_PASS_REGEX ">>> ._outer ([ \\|]+) 17") + +omnitrace_add_test( + NAME parallel-overhead + TARGET parallel-overhead + REWRITE_ARGS -e -v 2 --min-instructions=8 + RUNTIME_ARGS + -e + -v + 1 + --min-instructions=8 + --label + file + line + return + args + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF") + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME + NAME parallel-overhead-locks-perfetto + TARGET parallel-overhead-locks + LABELS "locks" + REWRITE_ARGS -e -v 2 --min-instructions=8 + RUN_ARGS 10 4 1000 + ENVIRONMENT + 
"${_lock_environment};OMNITRACE_FLAT_PROFILE=ON;OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON;OMNITRACE_SAMPLING_KEEP_INTERNAL=OFF" + ) diff --git a/tests/omnitrace-kokkos-tests.cmake b/tests/omnitrace-kokkos-tests.cmake new file mode 100644 index 0000000000..c78c2fbc63 --- /dev/null +++ b/tests/omnitrace-kokkos-tests.cmake @@ -0,0 +1,128 @@ +# -------------------------------------------------------------------------------------- # +# +# kokkos (lulesh) tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + NAME lulesh + TARGET lulesh + MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} + NUM_PROCS 8 + LABELS "kokkos" + REWRITE_ARGS -e -v 2 --label file line return args + RUNTIME_ARGS + -e + -v + 1 + --label + file + line + return + args + -ME + [==[lib(gomp|m-)]==] + LABELS "kokkos;kokkos-profile-library" + RUN_ARGS -i 25 -s 20 -p + ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" + REWRITE_RUN_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]" + RUNTIME_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]") + +omnitrace_add_test( + SKIP_RUNTIME SKIP_REWRITE + NAME lulesh-baseline-kokkosp-libomnitrace + TARGET lulesh + MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} + NUM_PROCS 8 + LABELS "kokkos;kokkos-profile-library" + RUN_ARGS -i 10 -s 20 -p + ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace.so" + BASELINE_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]") + +omnitrace_add_test( + SKIP_RUNTIME SKIP_REWRITE + NAME lulesh-baseline-kokkosp-libomnitrace-dl + TARGET lulesh + MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} + NUM_PROCS 8 + LABELS "kokkos;kokkos-profile-library" + RUN_ARGS -i 10 -s 20 -p + 
ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON;OMNITRACE_COUT_OUTPUT=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_KOKKOSP_PREFIX=[kokkos];KOKKOS_PROFILE_LIBRARY=libomnitrace-dl.so" + BASELINE_PASS_REGEX "\\|_\\[kokkos\\] [a-zA-Z]") + +omnitrace_add_test( + SKIP_BASELINE + NAME lulesh-kokkosp + TARGET lulesh + MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} + NUM_PROCS 8 + LABELS "kokkos" + REWRITE_ARGS -e -v 2 + RUNTIME_ARGS + -e + -v + 1 + --label + file + line + return + args + -ME + [==[lib(gomp|m-)]==] + RUN_ARGS -i 10 -s 20 -p + ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=ON") + +omnitrace_add_test( + SKIP_BASELINE + NAME lulesh-perfetto + TARGET lulesh + MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} + NUM_PROCS 8 + LABELS "kokkos;loops" + REWRITE_ARGS -e -v 2 + RUNTIME_ARGS + -e + -v + 1 + -l + --dynamic-callsites + --traps + --allow-overlapping + -ME + [==[libgomp]==] + RUN_ARGS -i 10 -s 20 -p + ENVIRONMENT + "${_perfetto_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF") + +omnitrace_add_test( + NAME lulesh-timemory + TARGET lulesh + MPI ${LULESH_USE_MPI} + GPU ${LULESH_USE_GPU} + NUM_PROCS 8 + LABELS "kokkos;loops" + REWRITE_ARGS -e -v 2 -l --dynamic-callsites --traps --allow-overlapping + RUNTIME_ARGS + -e + -v + 1 + -l + --dynamic-callsites + -ME + [==[libgomp]==] + -d + wall_clock + peak_rss + RUN_ARGS -i 10 -s 20 -p + ENVIRONMENT + "${_timemory_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_KOKKOSP=OFF" + REWRITE_FAIL_REGEX "0 instrumented loops in procedure") diff --git a/tests/omnitrace-mpi-tests.cmake b/tests/omnitrace-mpi-tests.cmake new file mode 100644 index 0000000000..93c0f485ae --- /dev/null +++ b/tests/omnitrace-mpi-tests.cmake @@ -0,0 +1,122 @@ +# -------------------------------------------------------------------------------------- # +# +# MPI tests +# +# 
-------------------------------------------------------------------------------------- # + +if(NOT OMNITRACE_USE_MPI AND NOT OMNITRACE_USE_MPI_HEADERS) + return() +endif() + +omnitrace_add_test( + SKIP_RUNTIME + NAME "mpi" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + REWRITE_ARGS + -e + -v + 2 + --label + file + line + return + args + --min-instructions + 0 + ENVIRONMENT "${_base_environment};OMNITRACE_VERBOSE=1" + REWRITE_RUN_PASS_REGEX + "(/[A-Za-z-]+/perfetto-trace-0.proto).*(/[A-Za-z-]+/wall_clock-0.txt')" + REWRITE_RUN_FAIL_REGEX + "(perfetto-trace|trip_count|sampling_percent|sampling_cpu_clock|sampling_wall_clock|wall_clock)-[0-9][0-9]+.(json|txt|proto)" + ) + +omnitrace_add_test( + SKIP_RUNTIME + NAME "mpi-flat-mpip" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + LABELS "mpip" + REWRITE_ARGS + -e + -v + 2 + --label + file + line + args + --min-instructions + 0 + ENVIRONMENT + "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_STRICT_CONFIG=OFF;OMNITRACE_USE_MPIP=ON" + REWRITE_RUN_PASS_REGEX + ">>> mpi-flat-mpip.inst(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> pthread_create(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall" + ) + +omnitrace_add_test( + SKIP_RUNTIME + NAME "mpi-flat" + TARGET mpi-example + MPI ON + NUM_PROCS 4 + LABELS "mpip" + REWRITE_ARGS + -e + -v + 2 + --label + file + line + args + --min-instructions + 0 + ENVIRONMENT "${_flat_environment};OMNITRACE_USE_SAMPLING=OFF" + REWRITE_RUN_PASS_REGEX + ">>> mpi-flat.inst(.*\n.*)>>> MPI_Init_thread(.*\n.*)>>> pthread_create(.*\n.*)>>> MPI_Comm_size(.*\n.*)>>> MPI_Comm_rank(.*\n.*)>>> MPI_Barrier(.*\n.*)>>> MPI_Alltoall" + ) + +set(_mpip_environment + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=OFF" + "OMNITRACE_USE_PROCESS_SAMPLING=OFF" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_FILE_OUTPUT=ON" + "OMNITRACE_USE_MPIP=ON" + "OMNITRACE_DEBUG=OFF" + "OMNITRACE_VERBOSE=2" + "OMNITRACE_DL_VERBOSE=2" + 
"${_test_openmp_env}" + "${_test_library_path}") + +set(_mpip_all2all_environment + "OMNITRACE_USE_PERFETTO=ON" + "OMNITRACE_USE_TIMEMORY=ON" + "OMNITRACE_USE_SAMPLING=OFF" + "OMNITRACE_USE_PROCESS_SAMPLING=OFF" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_FILE_OUTPUT=ON" + "OMNITRACE_USE_MPIP=ON" + "OMNITRACE_DEBUG=ON" + "OMNITRACE_VERBOSE=3" + "OMNITRACE_DL_VERBOSE=3" + "${_test_openmp_env}" + "${_test_library_path}") + +foreach(_EXAMPLE all2all allgather allreduce bcast reduce scatter-gather send-recv) + if("${_mpip_${_EXAMPLE}_environment}" STREQUAL "") + set(_mpip_${_EXAMPLE}_environment "${_mpip_environment}") + endif() + omnitrace_add_test( + SKIP_RUNTIME SKIP_SAMPLING + NAME "mpi-${_EXAMPLE}" + TARGET mpi-${_EXAMPLE} + MPI ON + NUM_PROCS 2 + LABELS "mpip" + REWRITE_ARGS -e -v 2 --label file line --min-instructions 0 + RUN_ARGS 30 + ENVIRONMENT "${_mpip_${_EXAMPLE}_environment}") +endforeach() diff --git a/tests/omnitrace-openmp-tests.cmake b/tests/omnitrace-openmp-tests.cmake new file mode 100644 index 0000000000..f0293fbf13 --- /dev/null +++ b/tests/omnitrace-openmp-tests.cmake @@ -0,0 +1,99 @@ +# -------------------------------------------------------------------------------------- # +# +# openmp tests +# +# -------------------------------------------------------------------------------------- # + +if(OMNITRACE_OPENMP_USING_LIBOMP_LIBRARY AND OMNITRACE_USE_OMPT) + set(_OMPT_PASS_REGEX "\\|_ompt_") +else() + set(_OMPT_PASS_REGEX "") +endif() + +omnitrace_add_test( + NAME openmp-cg + TARGET openmp-cg + LABELS "openmp" + REWRITE_ARGS -e -v 2 --instrument-loops + RUNTIME_ARGS -e -v 1 --label return args + REWRITE_TIMEOUT 180 + RUNTIME_TIMEOUT 360 + ENVIRONMENT "${_ompt_environment};OMNITRACE_USE_SAMPLING=OFF;OMNITRACE_COUT_OUTPUT=ON" + REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" + RUNTIME_PASS_REGEX "${_OMPT_PASS_REGEX}" + REWRITE_FAIL_REGEX "0 instrumented loops in procedure") + +omnitrace_add_test( + SKIP_RUNTIME + NAME openmp-lu + TARGET openmp-lu + LABELS 
"openmp" + REWRITE_ARGS -e -v 2 --instrument-loops + RUNTIME_ARGS -e -v 1 --label return args -E ^GOMP + REWRITE_TIMEOUT 180 + RUNTIME_TIMEOUT 360 + ENVIRONMENT + "${_ompt_environment};OMNITRACE_USE_SAMPLING=ON;OMNITRACE_SAMPLING_FREQ=50;OMNITRACE_COUT_OUTPUT=ON" + REWRITE_RUN_PASS_REGEX "${_OMPT_PASS_REGEX}" + REWRITE_FAIL_REGEX "0 instrumented loops in procedure") + +set(_ompt_sampling_environ + "${_ompt_environment}" + "OMNITRACE_VERBOSE=2" + "OMNITRACE_USE_OMPT=OFF" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=OFF" + "OMNITRACE_SAMPLING_FREQ=100" + "OMNITRACE_SAMPLING_DELAY=0.1" + "OMNITRACE_SAMPLING_DURATION=0.25" + "OMNITRACE_SAMPLING_CPUTIME=ON" + "OMNITRACE_SAMPLING_REALTIME=ON" + "OMNITRACE_SAMPLING_CPUTIME_FREQ=1000" + "OMNITRACE_SAMPLING_REALTIME_FREQ=500" + "OMNITRACE_MONOCHROME=ON") + +set(_ompt_sample_no_tmpfiles_environ + "${_ompt_environment}" + "OMNITRACE_VERBOSE=2" + "OMNITRACE_USE_OMPT=OFF" + "OMNITRACE_USE_SAMPLING=ON" + "OMNITRACE_USE_PROCESS_SAMPLING=OFF" + "OMNITRACE_SAMPLING_CPUTIME=ON" + "OMNITRACE_SAMPLING_REALTIME=OFF" + "OMNITRACE_SAMPLING_CPUTIME_FREQ=700" + "OMNITRACE_USE_TEMPORARY_FILES=OFF" + "OMNITRACE_MONOCHROME=ON") + +set(_ompt_sampling_samp_regex + "Sampler for thread 0 will be triggered 1000.0x per second of CPU-time(.*)Sampler for thread 0 will be triggered 500.0x per second of wall-time(.*)Sampling will be disabled after 0.250000 seconds(.*)Sampling duration of 0.250000 seconds has elapsed. 
Shutting down sampling" + ) +set(_ompt_sampling_file_regex + "sampling-duration-sampling/sampling_percent.(json|txt)(.*)sampling-duration-sampling/sampling_cpu_clock.(json|txt)(.*)sampling-duration-sampling/sampling_wall_clock.(json|txt)" + ) +set(_notmp_sampling_file_regex + "sampling-no-tmp-files-sampling/sampling_percent.(json|txt)(.*)sampling-no-tmp-files-sampling/sampling_cpu_clock.(json|txt)(.*)sampling-no-tmp-files-sampling/sampling_wall_clock.(json|txt)" + ) + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE + NAME openmp-cg-sampling-duration + TARGET openmp-cg + LABELS "openmp;sampling-duration" + ENVIRONMENT "${_ompt_sampling_environ}" + SAMPLING_PASS_REGEX "${_ompt_sampling_samp_regex}(.*)${_ompt_sampling_file_regex}") + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE + NAME openmp-lu-sampling-duration + TARGET openmp-lu + LABELS "openmp;sampling-duration" + ENVIRONMENT "${_ompt_sampling_environ}" + SAMPLING_PASS_REGEX "${_ompt_sampling_samp_regex}(.*)${_ompt_sampling_file_regex}") + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_REWRITE + NAME openmp-cg-sampling-no-tmp-files + TARGET openmp-cg + LABELS "openmp;no-tmp-files" + ENVIRONMENT "${_ompt_sample_no_tmpfiles_environ}" + SAMPLING_PASS_REGEX "${_notmp_sampling_file_regex}") diff --git a/tests/omnitrace-pthread-tests.cmake b/tests/omnitrace-pthread-tests.cmake new file mode 100644 index 0000000000..63d43e1007 --- /dev/null +++ b/tests/omnitrace-pthread-tests.cmake @@ -0,0 +1,34 @@ +# -------------------------------------------------------------------------------------- # +# +# binary-rewrite and runtime-instrumentation tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + NAME parallel-overhead-locks + TARGET parallel-overhead-locks + LABELS "locks" + REWRITE_ARGS -e -i 256 + RUNTIME_ARGS -e -i 256 + RUN_ARGS 30 4 1000 + ENVIRONMENT + 
"${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=ON;OMNITRACE_COLLAPSE_THREADS=OFF;OMNITRACE_SAMPLING_REALTIME=ON;OMNITRACE_SAMPLING_REALTIME_FREQ=10;OMNITRACE_SAMPLING_REALTIME_TIDS=0;OMNITRACE_SAMPLING_KEEP_INTERNAL=OFF" + REWRITE_RUN_PASS_REGEX + "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" + RUNTIME_PASS_REGEX + "wall_clock .*\\|_pthread_create .* 4 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000 .*\\|_pthread_mutex_lock .* 1000 .*\\|_pthread_mutex_unlock .* 1000" + ) + +omnitrace_add_test( + SKIP_RUNTIME + NAME parallel-overhead-locks-timemory + TARGET parallel-overhead-locks + LABELS "locks" + REWRITE_ARGS -e -v 2 --min-instructions=32 --dyninst-options InstrStackFrames SaveFPR + TrampRecursive + RUN_ARGS 10 4 1000 + ENVIRONMENT + "${_lock_environment};OMNITRACE_FLAT_PROFILE=ON;OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF;OMNITRACE_SAMPLING_KEEP_INTERNAL=OFF" + REWRITE_RUN_PASS_REGEX + "start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000" + ) diff --git a/tests/omnitrace-python-tests.cmake b/tests/omnitrace-python-tests.cmake new file mode 100644 index 0000000000..0458489627 --- /dev/null +++ b/tests/omnitrace-python-tests.cmake @@ -0,0 +1,267 @@ +# -------------------------------------------------------------------------------------- # +# +# python tests +# +# -------------------------------------------------------------------------------------- # + +set(_INDEX 0) + +foreach(_VERSION ${OMNITRACE_PYTHON_VERSIONS}) + if(NOT OMNITRACE_USE_PYTHON) + continue() + endif() + + 
list(GET OMNITRACE_PYTHON_ROOT_DIRS ${_INDEX} _PYTHON_ROOT_DIR) + + omnitrace_find_python( + _PYTHON + ROOT_DIR "${_PYTHON_ROOT_DIR}" + COMPONENTS Interpreter) + + # ---------------------------------------------------------------------------------- # + # python tests + # ---------------------------------------------------------------------------------- # + omnitrace_add_python_test( + NAME python-external + PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} + PYTHON_VERSION ${_VERSION} + FILE ${CMAKE_SOURCE_DIR}/examples/python/external.py + PROFILE_ARGS "--label" "file" + RUN_ARGS -v 10 -n 5 + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME python-external-exclude-inefficient + PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} + PYTHON_VERSION ${_VERSION} + FILE ${CMAKE_SOURCE_DIR}/examples/python/external.py + PROFILE_ARGS -E "^inefficient$" + RUN_ARGS -v 10 -n 5 + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME python-builtin + PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} + PYTHON_VERSION ${_VERSION} + FILE ${CMAKE_SOURCE_DIR}/examples/python/builtin.py + PROFILE_ARGS "-b" "--label" "file" "line" + RUN_ARGS -v 10 -n 5 + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME python-builtin-noprofile + PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} + PYTHON_VERSION ${_VERSION} + FILE ${CMAKE_SOURCE_DIR}/examples/python/noprofile.py + PROFILE_ARGS "-b" "--label" "file" + RUN_ARGS -v 15 -n 5 + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + STANDALONE + NAME python-source + PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} + PYTHON_VERSION ${_VERSION} + FILE ${CMAKE_SOURCE_DIR}/examples/python/source.py + RUN_ARGS -v 5 -n 5 -s 3 + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + STANDALONE + NAME python-code-coverage + PYTHON_EXECUTABLE ${_PYTHON_EXECUTABLE} + PYTHON_VERSION ${_VERSION} + FILE ${CMAKE_SOURCE_DIR}/examples/code-coverage/code-coverage.py + RUN_ARGS + -i + 
${PROJECT_BINARY_DIR}/omnitrace-tests-output/code-coverage-basic-blocks-binary-rewrite/coverage.json + ${PROJECT_BINARY_DIR}/omnitrace-tests-output/code-coverage-basic-blocks-hybrid-runtime-instrument/coverage.json + -o + ${PROJECT_BINARY_DIR}/omnitrace-tests-output/code-coverage-basic-blocks-summary/coverage.json + DEPENDS code-coverage-basic-blocks-binary-rewrite + code-coverage-basic-blocks-binary-rewrite-run + code-coverage-basic-blocks-hybrid-runtime-instrument + LABELS "code-coverage" + ENVIRONMENT "${_python_environment}") + + # ---------------------------------------------------------------------------------- # + # python output tests + # ---------------------------------------------------------------------------------- # + if(CMAKE_VERSION VERSION_LESS "3.18.0") + find_program( + OMNITRACE_CAT_EXE + NAMES cat + PATH_SUFFIXES bin) + + if(OMNITRACE_CAT_EXE) + set(OMNITRACE_CAT_COMMAND ${OMNITRACE_CAT_EXE}) + endif() + else() + set(OMNITRACE_CAT_COMMAND ${CMAKE_COMMAND} -E cat) + endif() + + if(OMNITRACE_CAT_COMMAND) + omnitrace_add_python_test( + NAME python-external-check + COMMAND ${OMNITRACE_CAT_COMMAND} + PYTHON_VERSION ${_VERSION} + FILE omnitrace-tests-output/python-external/${_VERSION}/trip_count.txt + PASS_REGEX + "(\\\[compile\\\]).*(\\\| \\\|0>>> \\\[run\\\]\\\[external.py\\\]).*(\\\| \\\|0>>> \\\|_\\\[fib\\\]\\\[external.py\\\]).*(\\\| \\\|0>>> \\\|_\\\[inefficient\\\]\\\[external.py\\\])" + DEPENDS python-external-${_VERSION} + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME python-external-exclude-inefficient-check + COMMAND ${OMNITRACE_CAT_COMMAND} + PYTHON_VERSION ${_VERSION} + FILE omnitrace-tests-output/python-external-exclude-inefficient/${_VERSION}/trip_count.txt + FAIL_REGEX "(\\\|_inefficient).*(\\\|_sum)" + DEPENDS python-external-exclude-inefficient-${_VERSION} + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME python-builtin-check + COMMAND ${OMNITRACE_CAT_COMMAND} + 
PYTHON_VERSION ${_VERSION} + FILE omnitrace-tests-output/python-builtin/${_VERSION}/trip_count.txt + PASS_REGEX "\\\[inefficient\\\]\\\[builtin.py:14\\\]" + DEPENDS python-builtin-${_VERSION} + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME python-builtin-noprofile-check + COMMAND ${OMNITRACE_CAT_COMMAND} + PYTHON_VERSION ${_VERSION} + FILE omnitrace-tests-output/python-builtin-noprofile/${_VERSION}/trip_count.txt + PASS_REGEX ".(run)..(noprofile.py)." + FAIL_REGEX ".(fib|inefficient)..(noprofile.py)." + DEPENDS python-builtin-noprofile-${_VERSION} + ENVIRONMENT "${_python_environment}") + else() + omnitrace_message( + WARNING + "Neither 'cat' nor 'cmake -E cat' are available. Python source checks are disabled" + ) + endif() + + function(OMNITRACE_ADD_PYTHON_VALIDATION_TEST) + cmake_parse_arguments( + TEST "" "NAME;TIMEMORY_METRIC;TIMEMORY_FILE;PERFETTO_METRIC;PERFETTO_FILE" + "ARGS" ${ARGN}) + + omnitrace_add_python_test( + NAME ${TEST_NAME}-validate-timemory + COMMAND + ${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-timemory-json.py + -m ${TEST_TIMEMORY_METRIC} ${TEST_ARGS} -i + PYTHON_VERSION ${_VERSION} + FILE omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_TIMEMORY_FILE} + DEPENDS ${TEST_NAME}-${_VERSION} + PASS_REGEX + "omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_TIMEMORY_FILE} validated" + ENVIRONMENT "${_python_environment}") + + omnitrace_add_python_test( + NAME ${TEST_NAME}-validate-perfetto + COMMAND + ${_PYTHON_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/validate-perfetto-proto.py + -m ${TEST_PERFETTO_METRIC} ${TEST_ARGS} -p -i + PYTHON_VERSION ${_VERSION} + FILE omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_PERFETTO_FILE} + DEPENDS ${TEST_NAME}-${_VERSION} + PASS_REGEX + "omnitrace-tests-output/${TEST_NAME}/${_VERSION}/${TEST_PERFETTO_FILE} validated" + ENVIRONMENT "${_python_environment}") + endfunction() + + set(python_source_labels + main_loop + run + fib + fib + fib + fib + fib + 
inefficient + _sum) + set(python_source_count + 5 + 3 + 3 + 6 + 12 + 18 + 6 + 3 + 3) + set(python_source_depth + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 2 + 3) + + omnitrace_add_python_validation_test( + NAME python-source + TIMEMORY_METRIC "trip_count" + TIMEMORY_FILE "trip_count.json" + PERFETTO_METRIC "host;user" + PERFETTO_FILE "perfetto-trace.proto" + ARGS -l ${python_source_labels} -c ${python_source_count} -d + ${python_source_depth}) + + set(python_builtin_labels + [run][builtin.py:28] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [fib][builtin.py:10] + [inefficient][builtin.py:14]) + set(python_builtin_count + 5 + 5 + 10 + 20 + 40 + 80 + 160 + 260 + 220 + 80 + 10 + 5) + set(python_builtin_depth + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 1) + + omnitrace_add_python_validation_test( + NAME python-builtin + TIMEMORY_METRIC "trip_count" + TIMEMORY_FILE "trip_count.json" + PERFETTO_METRIC "host;user" + PERFETTO_FILE "perfetto-trace.proto" + ARGS -l ${python_builtin_labels} -c ${python_builtin_count} -d + ${python_builtin_depth}) + math(EXPR _INDEX "${_INDEX} + 1") +endforeach() diff --git a/tests/omnitrace-rccl-tests.cmake b/tests/omnitrace-rccl-tests.cmake new file mode 100644 index 0000000000..42f38bf23a --- /dev/null +++ b/tests/omnitrace-rccl-tests.cmake @@ -0,0 +1,60 @@ +# -------------------------------------------------------------------------------------- # +# +# rccl tests +# +# -------------------------------------------------------------------------------------- # + +foreach(_TARGET ${RCCL_TEST_TARGETS}) + string(REPLACE "rccl-tests::" "" _NAME "${_TARGET}") + string(REPLACE "_" "-" _NAME "${_NAME}") + omnitrace_add_test( + NAME rccl-test-${_NAME} + TARGET ${_TARGET} + LABELS "rccl-tests;rcclp" + MPI ON + GPU ON + NUM_PROCS 1 + REWRITE_ARGS + -e + -v + 2 + -i + 8 + --label + file + line + return + 
args + RUNTIME_ARGS + -e + -v + 1 + -i + 8 + --label + file + line + return + args + -ME + sysdeps + --log-file + rccl-test-${_NAME}.log + RUN_ARGS -t + 1 + -g + 1 + -i + 10 + -w + 2 + -m + 2 + -p + -c + 1 + -z + -s + 1 + ENVIRONMENT "${_rccl_environment}") +endforeach() diff --git a/tests/omnitrace-rocm-tests.cmake b/tests/omnitrace-rocm-tests.cmake new file mode 100644 index 0000000000..907d4fa66c --- /dev/null +++ b/tests/omnitrace-rocm-tests.cmake @@ -0,0 +1,85 @@ +# -------------------------------------------------------------------------------------- # +# +# ROCm tests +# +# -------------------------------------------------------------------------------------- # + +set(OMNITRACE_ROCM_EVENTS_TEST + "GRBM_COUNT,GPUBusy,SQ_WAVES,SQ_INSTS_VALU,VALUInsts,TCC_HIT_sum,TA_TA_BUSY[0]:device=0,TA_TA_BUSY[11]:device=0" + ) + +omnitrace_add_test( + NAME transpose + TARGET transpose + MPI ${TRANSPOSE_USE_MPI} + GPU ON + NUM_PROCS ${NUM_PROCS} + REWRITE_ARGS -e -v 2 --print-instructions -E uniform_int_distribution + RUNTIME_ARGS + -e + -v + 1 + --label + file + line + return + args + -E + uniform_int_distribution + ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON") + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME + NAME transpose-loops + TARGET transpose + LABELS "loops" + MPI ${TRANSPOSE_USE_MPI} + GPU ON + NUM_PROCS ${NUM_PROCS} + REWRITE_ARGS + -e + -v + 2 + --label + return + args + -l + -i + 8 + -E + uniform_int_distribution + RUN_ARGS 2 100 50 + ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF" + REWRITE_FAIL_REGEX "0 instrumented loops in procedure transpose") + +if(OMNITRACE_USE_ROCPROFILER) + omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME + NAME transpose-rocprofiler + TARGET transpose + LABELS "rocprofiler" + MPI ${TRANSPOSE_USE_MPI} + GPU ON + NUM_PROCS ${NUM_PROCS} + REWRITE_ARGS -e -v 2 -E uniform_int_distribution + ENVIRONMENT + 
"${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" + REWRITE_RUN_PASS_REGEX + "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" + ) + + omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME + NAME transpose-rocprofiler-no-roctracer + TARGET transpose + LABELS "rocprofiler" + MPI ${TRANSPOSE_USE_MPI} + GPU ON + NUM_PROCS ${NUM_PROCS} + REWRITE_ARGS -e -v 2 -E uniform_int_distribution + ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_USE_ROCTRACER=OFF;OMNITRACE_ROCM_EVENTS=${OMNITRACE_ROCM_EVENTS_TEST}" + REWRITE_RUN_PASS_REGEX + "rocprof-device-0-GRBM_COUNT.txt(.*)rocprof-device-0-GPUBusy.txt(.*)rocprof-device-0-SQ_WAVES.txt(.*)rocprof-device-0-SQ_INSTS_VALU.txt(.*)rocprof-device-0-VALUInsts.txt(.*)rocprof-device-0-TCC_HIT_sum.txt(.*)rocprof-device-0-TA_TA_BUSY_0.txt(.*)rocprof-device-0-TA_TA_BUSY_11.txt" + REWRITE_RUN_FAIL_REGEX "roctracer.txt") +endif() diff --git a/tests/omnitrace-testing.cmake b/tests/omnitrace-testing.cmake index a4f745d22e..41e86e5636 100644 --- a/tests/omnitrace-testing.cmake +++ b/tests/omnitrace-testing.cmake @@ -262,7 +262,11 @@ function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV) set(_FILE_CONTENTS) set(_ENV_CONTENTS) + set(_DEBUG_SETTINGS ON) foreach(_VAL ${${_ENV}}) + if("${_VAL}" MATCHES "^OMNITRACE_DEBUG_SETTINGS=") + set(_DEBUG_SETTINGS OFF) + endif() if("${_VAL}" MATCHES "^OMNITRACE_" AND NOT "${_VAL}" MATCHES "${_ENV_ONLY}") set(_FILE_CONTENTS "${_FILE_CONTENTS}${_VAL}\n") else() @@ -290,7 +294,9 @@ OMNITRACE_ROCTRACER_HSA_ACTIVITY = ON ${_FILE_CONTENTS} ") list(APPEND _ENV_CONTENTS "OMNITRACE_CONFIG_FILE=${_CONFIG_FILE}") - list(APPEND _ENV_CONTENTS "OMNITRACE_DEBUG_SETTINGS=1") + if(_DEBUG_SETTINGS) + list(APPEND _ENV_CONTENTS 
"OMNITRACE_DEBUG_SETTINGS=1") + endif() set(${_ENV} "${_ENV_CONTENTS}" PARENT_SCOPE) @@ -336,25 +342,24 @@ endmacro() # -------------------------------------------------------------------------------------- # function(OMNITRACE_ADD_TEST) - foreach(_PREFIX PRELOAD RUNTIME REWRITE REWRITE_RUN BASELINE) + foreach(_PREFIX SAMPLING RUNTIME REWRITE REWRITE_RUN BASELINE) foreach(_TYPE PASS FAIL SKIP) list(APPEND _REGEX_OPTS "${_PREFIX}_${_TYPE}_REGEX") endforeach() endforeach() - set(_KWARGS REWRITE_ARGS RUNTIME_ARGS RUN_ARGS ENVIRONMENT LABELS PROPERTIES - ${_REGEX_OPTS}) + set(_KWARGS REWRITE_ARGS RUNTIME_ARGS SAMPLING_ARGS RUN_ARGS ENVIRONMENT LABELS + PROPERTIES ${_REGEX_OPTS}) cmake_parse_arguments( - TEST - "SKIP_BASELINE;SKIP_PRELOAD;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SAMPLING;FORCE_SAMPLING" - "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;PRELOAD" - "${_KWARGS}" + TEST "SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME" + "NAME;TARGET;MPI;GPU;NUM_PROCS;REWRITE_TIMEOUT;RUNTIME_TIMEOUT" "${_KWARGS}" ${ARGN}) - foreach(_PREFIX PRELOAD RUNTIME REWRITE REWRITE_RUN BASELINE) + foreach(_PREFIX SAMPLING RUNTIME REWRITE REWRITE_RUN BASELINE) if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "") set(${_PREFIX}_FAIL_REGEX - "(### ERROR ###|address of faulting memory reference)") + "(### ERROR ###|address of faulting memory reference|exiting with non-zero exit code)" + ) endif() endforeach() @@ -387,8 +392,8 @@ function(OMNITRACE_ADD_TEST) set(TEST_RUNTIME_TIMEOUT 300) endif() - if(NOT TEST_PRELOAD_TIMEOUT) - set(TEST_PRELOAD_TIMEOUT 120) + if(NOT TEST_SAMPLING_TIMEOUT) + set(TEST_SAMPLING_TIMEOUT 120) endif() if(NOT DEFINED TEST_ENVIRONMENT OR "${TEST_ENVIRONMENT}" STREQUAL "") @@ -448,11 +453,12 @@ function(OMNITRACE_ADD_TEST) WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() - if(NOT TEST_SKIP_PRELOAD) + if(NOT TEST_SKIP_SAMPLING) add_test( - NAME ${TEST_NAME}-preload - COMMAND ${COMMAND_PREFIX} $ -- - $ ${TEST_RUN_ARGS} + NAME ${TEST_NAME}-sampling + COMMAND + 
${COMMAND_PREFIX} $ ${TEST_SAMPLING_ARGS} + -- $ ${TEST_RUN_ARGS} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() @@ -473,23 +479,6 @@ function(OMNITRACE_ADD_TEST) WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() - if(TEST_FORCE_SAMPLING OR (NOT TEST_SKIP_REWRITE AND NOT TEST_SKIP_SAMPLING)) - add_test( - NAME ${TEST_NAME}-binary-rewrite-sampling - COMMAND - $ -o - $/${TEST_NAME}.samp -M sampling - ${TEST_REWRITE_ARGS} -- $ - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - - add_test( - NAME ${TEST_NAME}-binary-rewrite-sampling-run - COMMAND - ${COMMAND_PREFIX} $ -- - $/${TEST_NAME}.samp ${TEST_RUN_ARGS} - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - endif() - if(NOT TEST_SKIP_RUNTIME AND NOT OMNITRACE_USE_SANITIZER) add_test( NAME ${TEST_NAME}-runtime-instrument @@ -498,34 +487,16 @@ function(OMNITRACE_ADD_TEST) WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif() - if((TEST_FORCE_SAMPLING OR (NOT TEST_SKIP_RUNTIME AND NOT TEST_SKIP_SAMPLING)) - AND NOT OMNITRACE_USE_SANITIZER) - add_test( - NAME ${TEST_NAME}-runtime-instrument-sampling - COMMAND - $ -M sampling ${TEST_RUNTIME_ARGS} - -- $ ${TEST_RUN_ARGS} - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - endif() - if(TEST ${TEST_NAME}-binary-rewrite-run) set_tests_properties(${TEST_NAME}-binary-rewrite-run PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite) endif() - if(TEST ${TEST_NAME}-binary-rewrite-sampling-run) - set_tests_properties(${TEST_NAME}-binary-rewrite-sampling-run - PROPERTIES DEPENDS ${TEST_NAME}-binary-rewrite-sampling) - endif() - - foreach( - _TEST - baseline preload binary-rewrite binary-rewrite-run binary-rewrite-sampling - binary-rewrite-sampling-run runtime-instrument runtime-instrument-sampling) + foreach(_TEST baseline sampling binary-rewrite binary-rewrite-run + runtime-instrument) string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/") set(_labels "${_TEST}") string(REPLACE "-run" "" _labels "${_TEST}") - string(REPLACE "-sampling" ";sampling" _labels "${_labels}") if(TEST_TARGET) +
list(APPEND _labels "${TEST_TARGET}") endif() @@ -539,14 +510,14 @@ function(OMNITRACE_ADD_TEST) "OMNITRACE_OUTPUT_PREFIX=${_prefix}") set(_timeout ${TEST_REWRITE_TIMEOUT}) - if("${_TEST}" MATCHES "preload") - set(_timeout ${TEST_PRELOAD_TIMEOUT}) + if("${_TEST}" MATCHES "sampling") + set(_timeout ${TEST_SAMPLING_TIMEOUT}) elseif("${_TEST}" MATCHES "runtime-instrument") set(_timeout ${TEST_RUNTIME_TIMEOUT}) endif() set(_props) - if("${_TEST}" MATCHES "run|preload|baseline") + if("${_TEST}" MATCHES "run|sampling|baseline") set(_props ${TEST_PROPERTIES}) if(NOT "RUN_SERIAL" IN_LIST _props) list(APPEND _props RUN_SERIAL ON) @@ -561,13 +532,13 @@ function(OMNITRACE_ADD_TEST) set(_REGEX_VAR REWRITE) elseif("${_TEST}" MATCHES "baseline") set(_REGEX_VAR BASELINE) - elseif("${_TEST}" MATCHES "preload") - set(_REGEX_VAR PRELOAD) + elseif("${_TEST}" MATCHES "sampling") + set(_REGEX_VAR SAMPLING) else() set(_REGEX_VAR) endif() - if("${_TEST}" MATCHES "binary-rewrite-run|runtime-instrument|preload") + if("${_TEST}" MATCHES "binary-rewrite-run|runtime-instrument|sampling") omnitrace_patch_sanitizer_environment(_environ) endif() @@ -632,6 +603,12 @@ function(OMNITRACE_ADD_CAUSAL_TEST) set(TEST_CAUSAL_VALIDATE_TIMEOUT 60) endif() + if("${TEST_CAUSAL_FAIL_REGEX}" STREQUAL "") + set(TEST_CAUSAL_FAIL_REGEX + "(### ERROR ###|address of faulting memory reference|exiting with non-zero exit code)" + ) + endif() + if(TARGET ${TEST_TARGET}) set(COMMAND_PREFIX $ --reset -m ${TEST_CAUSAL_MODE} ${TEST_CAUSAL_ARGS} --) @@ -692,7 +669,10 @@ function(OMNITRACE_ADD_CAUSAL_TEST) "OMNITRACE_OUTPUT_PREFIX=${_prefix}" "OMNITRACE_CI=ON" "OMNITRACE_USE_PID=OFF" - "OMNITRACE_THREAD_POOL_SIZE=1" + "OMNITRACE_THREAD_POOL_SIZE=0" + "OMNITRACE_VERBOSE=1" + "OMNITRACE_DL_VERBOSE=0" + "OMNITRACE_DEBUG_SETTINGS=0" "${TEST_ENVIRONMENT}") set(_timeout ${TEST_CAUSAL_TIMEOUT}) @@ -954,6 +934,9 @@ function(OMNITRACE_ADD_VALIDATION_TEST) endforeach() list(APPEND TEST_DEPENDS "${TEST_NAME}") + if("${TEST_NAME}" 
MATCHES "-binary-rewrite") + list(APPEND TEST_DEPENDS "${TEST_NAME}-run") + endif() if(NOT TEST_PASS_REGEX) set(TEST_PASS_REGEX diff --git a/tests/omnitrace-time-window-tests.cmake b/tests/omnitrace-time-window-tests.cmake new file mode 100644 index 0000000000..040474a146 --- /dev/null +++ b/tests/omnitrace-time-window-tests.cmake @@ -0,0 +1,114 @@ +# -------------------------------------------------------------------------------------- # +# +# time-window tests +# +# -------------------------------------------------------------------------------------- # + +if(_OS_RELEASE STREQUAL "ubuntu-18.04") + set(_TRACE_WINDOW_SKIP SKIP_RUNTIME) +endif() + +omnitrace_add_test( + SKIP_BASELINE SKIP_SAMPLING ${_TRACE_WINDOW_SKIP} + NAME trace-time-window + TARGET trace-time-window + REWRITE_ARGS -e -v 2 --caller-include inner -i 4096 + RUNTIME_ARGS -e -v 1 --caller-include inner -i 4096 + LABELS "time-window" + ENVIRONMENT "${_window_environment};OMNITRACE_TRACE_DURATION=1.25") + +omnitrace_add_validation_test( + NAME trace-time-window-binary-rewrite + TIMEMORY_METRIC "wall_clock" + TIMEMORY_FILE "wall_clock.json" + PERFETTO_METRIC "host" + PERFETTO_FILE "perfetto-trace.proto" + LABELS "time-window" + FAIL_REGEX "outer_d" + ARGS -l + trace-time-window.inst + outer_a + outer_b + outer_c + -c + 1 + 1 + 1 + 1 + -d + 0 + 1 + 1 + 1 + -p) + +omnitrace_add_validation_test( + NAME trace-time-window-runtime-instrument + TIMEMORY_METRIC "wall_clock" + TIMEMORY_FILE "wall_clock.json" + PERFETTO_METRIC "host" + PERFETTO_FILE "perfetto-trace.proto" + LABELS "time-window" + FAIL_REGEX "outer_d" + ARGS -l + trace-time-window + outer_a + outer_b + outer_c + -c + 1 + 1 + 1 + 1 + -d + 0 + 1 + 1 + 1 + -p) + +omnitrace_add_test( + SKIP_BASELINE SKIP_SAMPLING ${_TRACE_WINDOW_SKIP} + NAME trace-time-window-delay + TARGET trace-time-window + REWRITE_ARGS -e -v 2 --caller-include inner -i 4096 + RUNTIME_ARGS -e -v 1 --caller-include inner -i 4096 + LABELS "time-window" + ENVIRONMENT + 
"${_window_environment};OMNITRACE_TRACE_DELAY=0.75;OMNITRACE_TRACE_DURATION=0.75") + +omnitrace_add_validation_test( + NAME trace-time-window-delay-binary-rewrite + TIMEMORY_METRIC "wall_clock" + TIMEMORY_FILE "wall_clock.json" + PERFETTO_METRIC "host" + PERFETTO_FILE "perfetto-trace.proto" + LABELS "time-window" + ARGS -l + outer_c + outer_d + -c + 1 + 1 + -d + 0 + 0 + -p) + +omnitrace_add_validation_test( + NAME trace-time-window-delay-runtime-instrument + TIMEMORY_METRIC "wall_clock" + TIMEMORY_FILE "wall_clock.json" + PERFETTO_METRIC "host" + PERFETTO_FILE "perfetto-trace.proto" + LABELS "time-window" + ARGS -l + outer_c + outer_d + -c + 1 + 1 + -d + 0 + 0 + -p) diff --git a/tests/omnitrace-user-api-tests.cmake b/tests/omnitrace-user-api-tests.cmake new file mode 100644 index 0000000000..d26625841e --- /dev/null +++ b/tests/omnitrace-user-api-tests.cmake @@ -0,0 +1,31 @@ +# -------------------------------------------------------------------------------------- # +# +# User API tests +# +# -------------------------------------------------------------------------------------- # + +omnitrace_add_test( + NAME user-api + TARGET user-api + LABELS "loops" + REWRITE_ARGS -e -v 2 -l --min-instructions=8 -E custom_push_region + RUNTIME_ARGS + -e + -v + 1 + -l + --min-instructions=8 + -E + custom_push_region + --label + file + line + return + args + RUN_ARGS 10 ${NUM_THREADS} 1000 + ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF" + REWRITE_RUN_PASS_REGEX "Pushing custom region :: run.10. x 1000" + RUNTIME_PASS_REGEX "Pushing custom region :: run.10. x 1000" + SAMPLING_PASS_REGEX "Pushing custom region :: run.10. 
x 1000" + BASELINE_FAIL_REGEX "Pushing custom region" + REWRITE_FAIL_REGEX "0 instrumented loops in procedure") diff --git a/tests/validate-causal-json.py b/tests/validate-causal-json.py index 06a06f6214..4101bfb05b 100755 --- a/tests/validate-causal-json.py +++ b/tests/validate-causal-json.py @@ -9,7 +9,7 @@ import argparse from collections import OrderedDict -num_stddev = 1 +num_stddev = 1.0 def mean(_data): @@ -21,7 +21,7 @@ def stddev(_data): return 0.0 _mean = mean(_data) _variance = sum([((x - _mean) ** 2) for x in _data]) / float(len(_data)) - return _variance**0.5 + return float(num_stddev) * math.sqrt(_variance) def simpsons_rule(a, b, fa, fb): @@ -66,21 +66,28 @@ class validation(object): return None _tolerance = self.tolerance - - if _ci is True and _virt_speedup > 10: - """On GitHub Action servers, you typically only get one core with two hyperthreads. - The hyperthreading causes the speedup potential to drop off at higher virtual speedups - so we consider + _reason = "[unspecified reason]" + if _ci is True: + """On GitHub Action servers, you typically only get two CPUs, which may be one + core with two hyperthreads. The hyperthreading can cause the speedup potential + to drop. Furthermore, these are typically shared resources so the runtime may + vary significantly. Thus, always account for stddev to prevent failures due to + these causes """ _tolerance += max([_base_speedup_stddev, _prog_speedup_stddev]) + _reason = "results obtained on a shared CI system...
potentially artificially deflating speedup predictions" elif _base_speedup_stddev > self.tolerance: _tolerance += math.sqrt(_base_speedup_stddev) + _reason = ( + f"large standard deviation of the baseline ({_base_speedup_stddev:.3f})" + ) elif _prog_speedup_stddev > 1.0: _tolerance += math.sqrt(_prog_speedup_stddev) + _reason = f"large standard deviation of the program speedup ({_prog_speedup_stddev:.3f})" if _tolerance > self.tolerance: sys.stderr.write( - f" [{_exp_name}][{_pp_name}][{_virt_speedup}] Tolerance adjusted due to stddev or to account for hyperthreading on CI systems ({self.tolerance:.3f} increased to {_tolerance:.3f})...\n" + f" [{_exp_name}][{_pp_name}][{_virt_speedup}] Tolerance increased: {_reason} ({self.tolerance:.3f} increased to {_tolerance:.3f})...\n" ) def _compute(_speedup_v, _tolerance_v): @@ -195,9 +202,7 @@ class line_speedup(object): if self.data is None or self.base is None: return f"{self.name}" _line_speedup = self.compute_speedup() - _line_stddev = ( - float(num_stddev) * self.compute_speedup_stddev() - ) # 3 stddev == 99.87% + _line_stddev = self.compute_speedup_stddev() # 3 stddev == 99.87% _name = self.get_name() return f"[{_name}][{self.prog}][{self.data.speedup:3}] speedup: {_line_speedup:6.1f} +/- {_line_stddev:6.2f} %" @@ -345,7 +350,6 @@ def compute_speedups(_data, args): for selected, pitr in _data.items(): for progpt, ditr in pitr.items(): if 0 not in ditr.keys(): - # print(f"missing baseline data for {progpt} in {selected}...") continue _baseline = ditr[0].mean() for speedup, itr in ditr.items(): @@ -353,8 +357,9 @@ def compute_speedups(_data, args): continue if speedup != itr.speedup: raise ValueError(f"in {selected}: {speedup} != {itr.speedup}") - _val = line_speedup(selected, progpt, itr, ditr[0]) - ret.append(_val) + if len(itr) >= args.min_experiments: + _val = line_speedup(selected, progpt, itr, ditr[0]) + ret.append(_val) ret.sort() _last_name = None @@ -400,6 +405,8 @@ def get_validations(args): def main(): import 
argparse + global num_stddev + parser = argparse.ArgumentParser() parser.add_argument( "-e", "--experiments", type=str, help="Regex for experiments", default=".*" @@ -414,6 +421,13 @@ def main(): parser.add_argument( "-n", "--num-points", type=int, help="Minimum number of data points", default=5 ) + parser.add_argument( + "-m", + "--min-experiments", + type=int, + help="Minimum number of experiments per speedup (e.g. do not display speedups when there are fewer than X experiments at this speedup)", + default=2, + ) parser.add_argument( "-i", "--input", type=str, nargs="*", help="Input file(s)", required=True ) @@ -428,9 +442,9 @@ def main(): parser.add_argument( "-d", "--stddev", - type=int, + type=float, help="Number of standard deviations to report", - default=1, + default=1.0, ) parser.add_argument( "-v", @@ -440,6 +454,12 @@ def main(): help="Validate speedup: {experiment regex} {progress-point regex} {virtual-speedup} {expected-speedup} {tolerance}", default=[], ) + parser.add_argument( + "--samples", + type=float, + help="Report samples within this percentage of the peak (0.0, 100.0] (default: 95 percent)", + default=95.0, + ) parser.add_argument( "--ci", action="store_true", @@ -454,6 +474,13 @@ def main(): num_stddev = args.stddev num_speedups = len(args.speedups) + percent_samples = args.samples + if not 0.0 < percent_samples <= 100.0: + raise ValueError( + f"Invalid samples value: {percent_samples}.
Supported range: 0.0 < x <= 100.0" + ) + percent_samples = 1.0 - (percent_samples / 100.0) + if num_speedups > 0 and args.num_points > num_speedups: args.num_points = num_speedups @@ -466,9 +493,11 @@ def main(): samp = process_samples(samp, inp_data) print("Samples:") - width = max([len(x) for x in samp.keys()]) - for name, count in sorted(samp.items()): - print(f" {name:{width}} :: {count}") + width = max([int(math.log10(x) + 1) for _, x in samp.items()]) + samp_peak = max([count for _, count in samp.items()]) + for name, count in sorted(samp.items(), key=lambda x: x[1], reverse=True): + if count >= samp_peak * percent_samples: + print(f" {count:{width}} :: {name}") results = compute_speedups(data, args) print("")