Files
rocm-systems/source/docs/user_api.md
T
Jonathan R. Madsen a142b2029d Fix the main library stop routine for timemory (#39)
* Fix the main library stop routine for timemory

- the main pop_timemory function was popping too many calls
- this primarily affected recursive calls

* Lengthen the timeout for the Configure CMake step

* Fix python tests

- new validate-timemory-json.py script

* Documentation update

- Call-counts in timemory output examples in documentation were affected by the changes

* Fix the per-thread metrics during finalization

- pthread_create_mutex starts/stops the per-thread data
- removed unintentional continue statement

* Docs tweaks

* Fix lap counter on per-thread metrics
2022-06-13 15:57:44 -05:00

17 KiB

User API

.. doxygenfile:: omnitrace/user.h

By default, when omnitrace detects any omnitrace_user_start_* or omnitrace_user_stop_* function, instrumentation is disabled at start-up -- thus, omnitrace_user_stop_trace() is not required at the beginning of main. This is can be manually controlled via the OMNITRACE_INIT_ENABLED environment variable. User-defined regions are always recorded, regardless of whether whether omnitrace_user_start_* or omnitrace_user_stop_* has been called.

Example

User API Implementation

#include <omnitrace/user.h>

#include <atomic>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <thread>
#include <vector>

std::atomic<long> total{ 0 };

long
fib(long n) __attribute__((noinline));

void
run(size_t nitr, long) __attribute__((noinline));

int
custom_push_region(const char* name);

namespace
{
int (*omnitrace_push_region_f)(const char*) = nullptr;
}

int
main(int argc, char** argv)
{
    // get the internal callback to start a user-defined region
    omnitrace_user_get_callbacks(OMNITRACE_USER_REGION, (void**) &omnitrace_push_region_f,
                                 nullptr);
    // assign the custom callback to start a user-defined region
    if(omnitrace_push_region_f)
        omnitrace_user_configure(OMNITRACE_USER_REGION, (void*) &custom_push_region,
                                 nullptr);

    omnitrace_user_push_region(argv[0]);
    omnitrace_user_push_region("initialization");
    size_t nthread = std::min<size_t>(16, std::thread::hardware_concurrency());
    size_t nitr    = 50000;
    long   nfib    = 10;
    if(argc > 1) nfib = atol(argv[1]);
    if(argc > 2) nthread = atol(argv[2]);
    if(argc > 3) nitr = atol(argv[3]);
    omnitrace_user_pop_region("initialization");

    printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", argv[0],
           nthread, argv[0], nitr, argv[0], nfib);

    omnitrace_user_push_region("thread_creation");
    std::vector<std::thread> threads{};
    threads.reserve(nthread);
    // disable instrumentation for child threads
    omnitrace_user_stop_thread_trace();
    for(size_t i = 0; i < nthread; ++i)
    {
        size_t _nitr = ((i % 2) == 1) ? (nitr - (0.1 * nitr)) : (nitr + (0.1 * nitr));
        long   _nfib = ((i % 2) == 1) ? (nfib - (0.1 * nfib)) : (nfib + (0.1 * nfib));
        threads.emplace_back(&run, _nitr, _nfib);
    }
    // re-enable instrumentation
    omnitrace_user_start_thread_trace();
    omnitrace_user_pop_region("thread_creation");

    omnitrace_user_push_region("thread_wait");
    for(auto& itr : threads)
        itr.join();
    omnitrace_user_pop_region("thread_wait");

    run(nitr, nfib);

    printf("[%s] fibonacci(%li) x %lu = %li\n", argv[0], nfib, nthread, total.load());
    omnitrace_user_pop_region(argv[0]);

    return 0;
}

long
fib(long n)
{
    return (n < 2) ? n : fib(n - 1) + fib(n - 2);
}

#define RUN_LABEL                                                                        \
    std::string{ std::string{ __FUNCTION__ } + "(" + std::to_string(n) + ") x " +        \
                 std::to_string(nitr) }                                                  \
        .c_str()

void
run(size_t nitr, long n)
{
    omnitrace_user_push_region(RUN_LABEL);
    long local = 0;
    for(size_t i = 0; i < nitr; ++i)
        local += fib(n);
    total += local;
    omnitrace_user_pop_region(RUN_LABEL);
}

int
custom_push_region(const char* name)
{
    printf("Pushing custom region :: %s\n", name);
    return (*omnitrace_push_region_f)(name);
}

User API Output

$ omnitrace -l --min-instructions=8 -E custom_push_region -o -- ./user-api
...
$ export OMNITRACE_USE_TIMEMORY=ON
$ export OMNITRACE_USE_PID=OFF
$ export OMNITRACE_TIME_OUTPUT=OFF
$ export OMNITRACE_OUTPUT_PATH=omnitrace-example-output
$ ./user-api.inst 20 4 100
Pushing custom region :: ./user-api.inst
[omnitrace][omnitrace_init_tooling] Instrumentation mode: Trace


      ______   .___  ___. .__   __.  __  .___________..______          ___       ______  _______
     /  __  \  |   \/   | |  \ |  | |  | |           ||   _  \        /   \     /      ||   ____|
    |  |  |  | |  \  /  | |   \|  | |  | `---|  |----`|  |_)  |      /  ^  \   |  ,----'|  |__
    |  |  |  | |  |\/|  | |  . `  | |  |     |  |     |      /      /  /_\  \  |  |     |   __|
    |  `--'  | |  |  |  | |  |\   | |  |     |  |     |  |\  \----./  _____  \ |  `----.|  |____
     \______/  |__|  |__| |__| \__| |__|     |__|     | _| `._____/__/     \__\ \______||_______|



Pushing custom region :: initialization
[./user-api.inst] Threads: 4
[./user-api.inst] Iterations: 100
[./user-api.inst] fibonacci(20)...
Pushing custom region :: thread_creation
Pushing custom region :: thread_wait
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
Pushing custom region :: run(20) x 100
[./user-api.inst] fibonacci(20) x 4 = 3382500
[omnitrace][86267][0][omnitrace_finalize] finalizing...


[omnitrace][86267][0] omnitrace : 5.190895 sec wall_clock,    2.748 mb peak_rss, 6.330000 sec cpu_clock,  121.9 % cpu_util [laps: 1]
[omnitrace][86267][0] user-api.inst/thread-0 : 5.078713 sec wall_clock, 4.722415 sec thread_cpu_clock,   93.0 % thread_cpu_util,    1.276 mb peak_rss [laps: 1]
[omnitrace][86267][0] user-api.inst/thread-1 : 0.322248 sec wall_clock, 0.322191 sec thread_cpu_clock,  100.0 % thread_cpu_util,    1.000 mb peak_rss [laps: 1]
[omnitrace][86267][0] user-api.inst/thread-2 : 0.323255 sec wall_clock, 0.323194 sec thread_cpu_clock,  100.0 % thread_cpu_util,    0.000 mb peak_rss [laps: 1]
[omnitrace][86267][0] user-api.inst/thread-3 : 0.323569 sec wall_clock, 0.323484 sec thread_cpu_clock,  100.0 % thread_cpu_util,    1.092 mb peak_rss [laps: 1]
[omnitrace][86267][0] user-api.inst/thread-4 : 0.324178 sec wall_clock, 0.324057 sec thread_cpu_clock,  100.0 % thread_cpu_util,    1.184 mb peak_rss [laps: 1]
[omnitrace][86267][0] Post-processing 51 cpu frequency and memory usage entries...

[omnitrace][wall_clock]|0> Outputting 'omnitrace-user-api.inst-output/wall_clock.json'...
[omnitrace][wall_clock]|0> Outputting 'omnitrace-user-api.inst-output/wall_clock.tree.json'...
[omnitrace][wall_clock]|0> Outputting 'omnitrace-user-api.inst-output/wall_clock.txt'...

[omnitrace][manager::finalize][metadata]> Outputting 'omnitrace-user-api.inst-output/metadata.json' and 'omnitrace-user-api.inst-output/functions.json'...
[omnitrace][86267][0][omnitrace_finalize] Finalized
$ cat omnitrace-example-output/wall_clock.txt
|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|                                                                              REAL-CLOCK TIMER (I.E. WALL-CLOCK TIMER)                                                                              |
|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|                                     LABEL                                       | COUNT  | DEPTH  |   METRIC   | UNITS  |   SUM    |   MEAN   |   MIN    |   MAX    |   VAR    | STDDEV   | % SELF |
|---------------------------------------------------------------------------------|--------|--------|------------|--------|----------|----------|----------|----------|----------|----------|--------|
| |0>>> ./user-api.inst                                                           |      1 |      0 | wall_clock | sec    | 5.078521 | 5.078521 | 5.078521 | 5.078521 | 0.000000 | 0.000000 |    0.0 |
| |0>>> |_initialization                                                          |      1 |      1 | wall_clock | sec    | 0.000004 | 0.000004 | 0.000004 | 0.000004 | 0.000000 | 0.000000 |  100.0 |
| |0>>> |_thread_creation                                                         |      1 |      1 | wall_clock | sec    | 0.000159 | 0.000159 | 0.000159 | 0.000159 | 0.000000 | 0.000000 |  100.0 |
| |0>>> |_thread_wait                                                             |      1 |      1 | wall_clock | sec    | 0.355307 | 0.355307 | 0.355307 | 0.355307 | 0.000000 | 0.000000 |    0.0 |
| |0>>>   |_std::vector<std::thread, std::allocator<std::thread> >::begin         |      1 |      2 | wall_clock | sec    | 0.000001 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>>   |_std::vector<std::thread, std::allocator<std::thread> >::end           |      1 |      2 | wall_clock | sec    | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
| |0>>>   |_pthread_join                                                          |      4 |      2 | wall_clock | sec    | 0.355257 | 0.088814 | 0.000001 | 0.333144 | 0.026559 | 0.162970 |  100.0 |
| |2>>>     |_start_thread                                                        |      1 |      3 | wall_clock | sec    | 0.000032 | 0.000032 | 0.000032 | 0.000032 | 0.000000 | 0.000000 |  100.0 |
| |1>>>     |_start_thread                                                        |      1 |      3 | wall_clock | sec    | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000000 | 0.000000 |  100.0 |
| |3>>>     |_start_thread                                                        |      1 |      3 | wall_clock | sec    | 0.000034 | 0.000034 | 0.000034 | 0.000034 | 0.000000 | 0.000000 |  100.0 |
| |4>>>     |_start_thread                                                        |      1 |      3 | wall_clock | sec    | 0.000039 | 0.000039 | 0.000039 | 0.000039 | 0.000000 | 0.000000 |  100.0 |
| |0>>> |_run                                                                     |      1 |      1 | wall_clock | sec    | 4.722993 | 4.722993 | 4.722993 | 4.722993 | 0.000000 | 0.000000 |    0.0 |
| |0>>>   |_std::char_traits<char>::length                                        |      1 |      2 | wall_clock | sec    | 0.000001 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>>   |_std::distance<char const*>                                            |      1 |      2 | wall_clock | sec    | 0.000001 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>>   |_std::operator+<char, std::char_traits<char>, std::allocator<char> >   |      2 |      2 | wall_clock | sec    | 0.000002 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>>   |_run(20) x 100                                                         |      1 |      2 | wall_clock | sec    | 4.722951 | 4.722951 | 4.722951 | 4.722951 | 0.000000 | 0.000000 |    0.0 |
| |0>>>     |_run [{94,25}-{96,25}]                                               |      1 |      3 | wall_clock | sec    | 4.722925 | 4.722925 | 4.722925 | 4.722925 | 0.000000 | 0.000000 |    0.0 |
| |0>>>       |_fib                                                               |    100 |      4 | wall_clock | sec    | 4.722718 | 0.047227 | 0.046713 | 0.051987 | 0.000000 | 0.000625 |    0.0 |
| |0>>>         |_fib                                                             |    200 |      5 | wall_clock | sec    | 4.722302 | 0.023612 | 0.017827 | 0.034091 | 0.000032 | 0.005627 |    0.0 |
| |0>>>           |_fib                                                           |    400 |      6 | wall_clock | sec    | 4.721485 | 0.011804 | 0.006790 | 0.023003 | 0.000016 | 0.004024 |    0.0 |
| |0>>>             |_fib                                                         |    800 |      7 | wall_clock | sec    | 4.719858 | 0.005900 | 0.002564 | 0.016078 | 0.000006 | 0.002498 |    0.1 |
| |0>>>               |_fib                                                       |   1600 |      8 | wall_clock | sec    | 4.716572 | 0.002948 | 0.000977 | 0.011849 | 0.000002 | 0.001465 |    0.1 |
| |0>>>                 |_fib                                                     |   3200 |      9 | wall_clock | sec    | 4.709918 | 0.001472 | 0.000371 | 0.008246 | 0.000001 | 0.000831 |    0.3 |
| |0>>>                   |_fib                                                   |   6400 |     10 | wall_clock | sec    | 4.696775 | 0.000734 | 0.000140 | 0.005111 | 0.000000 | 0.000461 |    0.6 |
| |0>>>                     |_fib                                                 |  12800 |     11 | wall_clock | sec    | 4.670093 | 0.000365 | 0.000050 | 0.003166 | 0.000000 | 0.000253 |    1.1 |
| |0>>>                       |_fib                                               |  25600 |     12 | wall_clock | sec    | 4.617496 | 0.000180 | 0.000017 | 0.001959 | 0.000000 | 0.000137 |    2.3 |
| |0>>>                         |_fib                                             |  51200 |     13 | wall_clock | sec    | 4.512671 | 0.000088 | 0.000004 | 0.001212 | 0.000000 | 0.000074 |    4.6 |
| |0>>>                           |_fib                                           | 102400 |     14 | wall_clock | sec    | 4.304142 | 0.000042 | 0.000000 | 0.000752 | 0.000000 | 0.000039 |    9.6 |
| |0>>>                             |_fib                                         | 202600 |     15 | wall_clock | sec    | 3.892580 | 0.000019 | 0.000000 | 0.000469 | 0.000000 | 0.000021 |   19.0 |
| |0>>>                               |_fib                                       | 363200 |     16 | wall_clock | sec    | 3.151143 | 0.000009 | 0.000000 | 0.000293 | 0.000000 | 0.000011 |   33.2 |
| |0>>>                                 |_fib                                     | 502000 |     17 | wall_clock | sec    | 2.105217 | 0.000004 | 0.000000 | 0.000183 | 0.000000 | 0.000006 |   49.1 |
| |0>>>                                   |_fib                                   | 476000 |     18 | wall_clock | sec    | 1.071652 | 0.000002 | 0.000000 | 0.000114 | 0.000000 | 0.000004 |   63.6 |
| |0>>>                                     |_fib                                 | 294200 |     19 | wall_clock | sec    | 0.390193 | 0.000001 | 0.000000 | 0.000071 | 0.000000 | 0.000003 |   75.3 |
| |0>>>                                       |_fib                               | 115200 |     20 | wall_clock | sec    | 0.096190 | 0.000001 | 0.000000 | 0.000043 | 0.000000 | 0.000002 |   84.4 |
| |0>>>                                         |_fib                             |  27400 |     21 | wall_clock | sec    | 0.015020 | 0.000001 | 0.000000 | 0.000025 | 0.000000 | 0.000001 |   91.1 |
| |0>>>                                           |_fib                           |   3600 |     22 | wall_clock | sec    | 0.001336 | 0.000000 | 0.000000 | 0.000013 | 0.000000 | 0.000001 |   96.3 |
| |0>>>                                             |_fib                         |    200 |     23 | wall_clock | sec    | 0.000050 | 0.000000 | 0.000000 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>>     |_std::char_traits<char>::length                                      |      1 |      3 | wall_clock | sec    | 0.000001 | 0.000001 | 0.000001 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>>     |_std::distance<char const*>                                          |      1 |      3 | wall_clock | sec    | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
| |0>>>     |_std::operator+<char, std::char_traits<char>, std::allocator<char> > |      2 |      3 | wall_clock | sec    | 0.000001 | 0.000001 | 0.000000 | 0.000001 | 0.000000 | 0.000000 |  100.0 |
| |0>>> |_std::operator&                                                          |      1 |      1 | wall_clock | sec    | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
| |0>>> std::vector<std::thread, std::allocator<std::thread> >::~vector           |      1 |      0 | wall_clock | sec    | 0.000045 | 0.000045 | 0.000045 | 0.000045 | 0.000000 | 0.000000 |   32.7 |
| |0>>> |_std::thread::~thread                                                    |      4 |      1 | wall_clock | sec    | 0.000030 | 0.000007 | 0.000007 | 0.000009 | 0.000000 | 0.000001 |   31.2 |
| |0>>>   |_std::thread::joinable                                                 |      4 |      2 | wall_clock | sec    | 0.000021 | 0.000005 | 0.000005 | 0.000006 | 0.000000 | 0.000001 |   89.4 |
| |0>>>     |_std::thread::id::id                                                 |      4 |      3 | wall_clock | sec    | 0.000001 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
| |0>>>     |_std::operator==                                                     |      4 |      3 | wall_clock | sec    | 0.000001 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
| |0>>> |_std::allocator_traits<std::allocator<std::thread> >::deallocate         |      1 |      1 | wall_clock | sec    | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
| |0>>> |_std::allocator<std::thread>::~allocator                                 |      1 |      1 | wall_clock | sec    | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |  100.0 |
|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|