diff --git a/projects/rocprofiler-systems/README.md b/projects/rocprofiler-systems/README.md index 55cea660a0..aca42693ab 100755 --- a/projects/rocprofiler-systems/README.md +++ b/projects/rocprofiler-systems/README.md @@ -135,22 +135,29 @@ omnitrace-merge.jl results.json omnitrace-app.inst-output/2021-09-02_01.03_PM/*. ## Use Perfetto tracing with System Backend -In a separate window run: +Enable `traced` and `perfetto` in the background: ```shell pkill traced traced --background -perfetto --out ./htrace.out --txt -c ${OMNITRACE_ROOT}/share/omnitrace.cfg +perfetto --out ./omnitrace-perfetto.proto --txt -c ${OMNITRACE_ROOT}/share/omnitrace.cfg --background ``` -then in the window running the application, configure the omnitrace instrumentation to use the system backend: +Configure omnitrace to use the perfetto system backend: ```shell export OMNITRACE_PERFETTO_BACKEND=system ``` -for the merge use the `htrace.out`: +Finally, execute your instrumented application, either as a binary-rewritten executable: ```shell -omnitrace-merge.jl results.json htrace.out +omnitrace -o ./myapp.inst -- ./myapp +./myapp.inst +``` + +Or with runtime instrumentation: + +```shell +omnitrace -- ./myapp ``` diff --git a/projects/rocprofiler-systems/cmake/Modules/FindMPI-Headers.cmake b/projects/rocprofiler-systems/cmake/Modules/FindMPI-Headers.cmake index 3f59245f25..e3d595e447 100644 --- a/projects/rocprofiler-systems/cmake/Modules/FindMPI-Headers.cmake +++ b/projects/rocprofiler-systems/cmake/Modules/FindMPI-Headers.cmake @@ -10,7 +10,7 @@ set(MPI_HEADERS_VENDOR_INTERNAL "OpenMPI" CACHE STRING "Distribution type of internal mpi.h") set(MPI_HEADERS_INCLUDE_DIR_INTERNAL - "${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls" + "${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/mpi" CACHE PATH "Path to internal ${MPI_HEADERS_VENDOR_INTERNAL} mpi.h") mark_as_advanced(MPI_HEADERS_VENDOR_INTERNAL) mark_as_advanced(MPI_HEADERS_INCLUDE_DIR_INTERNAL) diff --git 
a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp index 87a012dd19..5ecc33b402 100644 --- a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp +++ b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -63,12 +64,20 @@ main(int argc, char** argv) printf("\n[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", _name.c_str(), nthread, _name.c_str(), nitr, _name.c_str(), nfib); + pthread_barrier_t _barrier; + pthread_barrier_init(&_barrier, nullptr, nthread); + + auto _run = [&_barrier](size_t nitr, long n) { + pthread_barrier_wait(&_barrier); + run(nitr, n); + }; + std::vector threads{}; for(size_t i = 0; i < nthread; ++i) { size_t _nitr = ((i % 2) == 1) ? (nitr - (0.1 * nitr)) : (nitr + (0.1 * nitr)); _nitr = std::max(_nitr, 1); - threads.emplace_back(&run, _nitr, nfib); + threads.emplace_back(_run, _nitr, nfib); } #if !defined(USE_LOCKS) @@ -79,6 +88,8 @@ main(int argc, char** argv) for(auto& itr : threads) itr.join(); + pthread_barrier_destroy(&_barrier); + printf("[%s] fibonacci(%li) x %lu = %li\n", _name.c_str(), nfib, nthread, static_cast(total)); diff --git a/projects/rocprofiler-systems/source/docs/installation.md b/projects/rocprofiler-systems/source/docs/installation.md index e147687935..13e8e63b31 100644 --- a/projects/rocprofiler-systems/source/docs/installation.md +++ b/projects/rocprofiler-systems/source/docs/installation.md @@ -91,9 +91,9 @@ The Clang compiler may be used in lieu of the GCC compiler if Dyninst is already > ***If the system installed cmake is too old, installing a new version of cmake can be done through several methods.*** > ***One of the easiest options is to use PyPi (i.e. 
python's pip):*** > -> ```python +> ```shell > pip install --user 'cmake==3.18.4' -> export PATH=${HOME}/.local/bin:${PATH}` +> export PATH=${HOME}/.local/bin:${PATH} > ``` ### Required Third-Party Packages @@ -147,14 +147,14 @@ and Dyninst requires TBB), and the CMake option to build the package alongside o The easiest way to install Dyninst is to configure omnitrace with `OMNITRACE_BUILD_DYNINST=ON`. Depending on the version of Ubuntu, the apt package manager may have current enough versions of Dyninst's Boost, TBB, and LibIberty dependencies (i.e. `apt-get install libtbb-dev libiberty-dev libboost-dev`); however, it is possible to request Dyninst to install -it's dependencies via `Dyninst_BUILD_=ON`, e.g.: +its dependencies via `DYNINST_BUILD_=ON`, e.g.: ```shell git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source -cmake -B omnitrace-build -DOMNITRACE_BUILD_DYNINST=ON -DDyninst_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON omnitrace-source +cmake -B omnitrace-build -DOMNITRACE_BUILD_DYNINST=ON -DDYNINST_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON omnitrace-source ``` -where `-DDyninst_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON` is expanded by the shell to `-DDyninst_BUILD_TBB=ON -DDyninst_BUILD_BOOST=ON ...` +where `-DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON` is expanded by the shell to `-DDYNINST_BUILD_TBB=ON -DDYNINST_BUILD_BOOST=ON ...` #### Installing Dyninst via Spack @@ -180,7 +180,6 @@ into omnitrace's perfetto support, e.g. `OMNITRACE_USE_PAPI=` forces `TIMEM is passed along to perfetto and will be displayed when the `.proto` file is visualized in [ui.perfetto.dev](https://ui.perfetto.dev). 
```shell -OMNITRACE_ROOT=/opt/omnitrace git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source cmake \ -B omnitrace-build \ @@ -208,7 +207,10 @@ source /opt/omnitrace/share/omnitrace/setup-env.sh [Omnitrace](https://github.com/AMDResearch/omnitrace) can have full (`OMNITRACE_USE_MPI=ON`) or partial (`OMNITRACE_USE_MPI_HEADERS=ON`) MPI support. The only difference between these two modes is whether or not the results collected via timemory and/or perfetto can be aggregated into a single -output file during finalization. The primary benefits of partial or full MPI support are the automatic wrapping of MPI functions and the ability +output file during finalization. When full MPI support is enabled, combining the timemory results always occurs whereas combining the perfetto +results is configurable via the `OMNITRACE_PERFETTO_COMBINE_TRACES` setting. + +The primary benefits of partial or full MPI support are the automatic wrapping of MPI functions and the ability to label output with suffixes which correspond to the `MPI_COMM_WORLD` rank ID instead of using the system process identifier (i.e. PID). In general, it is recommended to use partial MPI support with the OpenMPI headers as this is the most portable configuration. 
If full MPI support is selected, make sure your target application is built against the same MPI distribution as omnitrace, diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt index 8209e561bc..33b82d3df6 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt @@ -57,6 +57,7 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.cpp ${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.cpp ${CMAKE_CURRENT_LIST_DIR}/library/debug.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.cpp ${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp ${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp ${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp @@ -86,6 +87,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.hpp ${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.hpp ${CMAKE_CURRENT_LIST_DIR}/library/debug.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.hpp ${CMAKE_CURRENT_LIST_DIR}/library/gpu.hpp ${CMAKE_CURRENT_LIST_DIR}/library/ompt.hpp ${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp index fcf383237b..eba85694cd 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp @@ -122,7 +122,7 @@ add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, tim::consume_parameters(_lock); } - tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val, - _queue, _hash, _depth, _prio, num_mutexes); + tim::consume_parameters(_pid, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, + _ts_val, _devid, _queue, _hash, _depth, _prio, num_mutexes); } } // namespace omnitrace diff --git 
a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp index a8d35432a5..420437cdf0 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -35,7 +35,9 @@ #include #include +#include #include +#include namespace omnitrace { @@ -51,8 +53,9 @@ pthread_mutex_gotcha::get_hashes() // we could see weird results. static auto _v = []() { const auto& _data = pthread_mutex_gotcha_t::get_gotcha_data(); - hash_array_t _init{}; - for(size_t i = 0; i < gotcha_capacity; ++i) + hash_array_t _init = {}; + size_t i0 = (config::get_trace_thread_locks()) ? 0 : 3; + for(size_t i = i0; i < gotcha_capacity; ++i) { auto&& _id = _data.at(i).tool_id; if(!_id.empty()) @@ -90,12 +93,44 @@ pthread_mutex_gotcha::configure() pthread_mutex_gotcha_t::configure( comp::gotcha_config<2, int, pthread_mutex_t*>{ "pthread_mutex_trylock" }); } + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<3, int, pthread_barrier_t*>{ "pthread_barrier_wait" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<4, int, pthread_rwlock_t*>{ "pthread_rwlock_rdlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<5, int, pthread_rwlock_t*>{ "pthread_rwlock_tryrdlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<6, int, pthread_rwlock_t*>{ "pthread_rwlock_trywrlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<7, int, pthread_rwlock_t*>{ "pthread_rwlock_unlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<8, int, pthread_rwlock_t*>{ "pthread_rwlock_wrlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<9, int, pthread_spinlock_t*>{ "pthread_spin_lock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<10, int, 
pthread_spinlock_t*>{ "pthread_spin_trylock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<11, int, pthread_spinlock_t*>{ "pthread_spin_unlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<12, int, pthread_t, void**>{ "pthread_join" }); }; } void pthread_mutex_gotcha::shutdown() -{} +{ + pthread_mutex_gotcha_t::disable(); +} void pthread_mutex_gotcha::validate() @@ -122,10 +157,10 @@ pthread_mutex_gotcha::validate() } } -int -pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, - int (*_callee)(pthread_mutex_t*), - pthread_mutex_t* _mutex) +template +auto +pthread_mutex_gotcha::operator()(uintptr_t&& _id, const comp::gotcha_data& _data, + int (*_callee)(Args...), Args... _args) const { if(is_disabled()) { @@ -134,7 +169,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, OMNITRACE_PRINT("Warning! nullptr to %s\n", _data.tool_id.c_str()); return EINVAL; } - return (*_callee)(_mutex); + return (*_callee)(_args...); } uint64_t _cid = 0; @@ -144,24 +179,66 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); - if(get_use_critical_trace()) + if(_id < std::numeric_limits::max() && get_use_critical_trace()) { std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); _ts = comp::wall_clock::record(); } omnitrace_push_region(_data.tool_id.c_str()); - auto _ret = (*_callee)(_mutex); + auto _ret = (*_callee)(_args...); omnitrace_pop_region(_data.tool_id.c_str()); - if(get_use_critical_trace()) + if(_id < std::numeric_limits::max() && get_use_critical_trace()) { add_critical_trace( threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0, - reinterpret_cast(_mutex), get_hashes().at(_data.index), _depth); + _id, get_hashes().at(_data.index), _depth); } return _ret; + tim::consume_parameters(_id, _cid, _parent_cid, _depth, _ts); +} + +int +pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, + int 
(*_callee)(pthread_mutex_t*), + pthread_mutex_t* _mutex) const +{ + return (*this)(reinterpret_cast(_mutex), _data, _callee, _mutex); +} + +int +pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, + int (*_callee)(pthread_spinlock_t*), + pthread_spinlock_t* _lock) const +{ + return (*this)(reinterpret_cast(_lock), _data, _callee, _lock); +} + +int +pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, + int (*_callee)(pthread_rwlock_t*), + pthread_rwlock_t* _lock) const +{ + return (*this)(reinterpret_cast(_lock), _data, _callee, _lock); +} + +int +pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, + int (*_callee)(pthread_barrier_t*), + pthread_barrier_t* _barrier) const +{ + return (*this)(reinterpret_cast(_barrier), _data, _callee, _barrier); +} + +int +pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, + int (*_callee)(pthread_t, void**), pthread_t _thr, + void** _tinfo) const +{ + return (*this)(static_cast(threading::get_id()), _data, _callee, _thr, + _tinfo); } bool diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp index f6d8328953..facbd3715d 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp @@ -35,7 +35,7 @@ namespace omnitrace // this is used to wrap pthread_mutex() struct pthread_mutex_gotcha : comp::base { - static constexpr size_t gotcha_capacity = 3; + static constexpr size_t gotcha_capacity = 13; using hash_array_t = std::array; using gotcha_data_t = comp::gotcha_data; @@ -49,11 +49,23 @@ struct pthread_mutex_gotcha : comp::base static void shutdown(); static void validate(); - int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*), pthread_mutex_t*); + int operator()(const gotcha_data_t&, int 
(*)(pthread_mutex_t*), + pthread_mutex_t*) const; + int operator()(const gotcha_data_t&, int (*)(pthread_spinlock_t*), + pthread_spinlock_t*) const; + int operator()(const gotcha_data_t&, int (*)(pthread_rwlock_t*), + pthread_rwlock_t*) const; + int operator()(const gotcha_data_t&, int (*)(pthread_barrier_t*), + pthread_barrier_t*) const; + int operator()(const gotcha_data_t&, int (*)(pthread_t, void**), pthread_t, + void**) const; private: static bool is_disabled(); static hash_array_t& get_hashes(); + + template + auto operator()(uintptr_t&&, const gotcha_data_t&, int (*)(Args...), Args...) const; }; using pthread_mutex_gotcha_t = comp::gotcha + +namespace omnitrace +{ +dynamic_library::dynamic_library(const char* _env, const char* _fname, int _flags, + bool _store) +: envname{ _env } +, filename{ tim::get_env(_env, _fname, _store) } +, flags{ _flags } +{ + open(); +} + +dynamic_library::~dynamic_library() { close(); } + +bool +dynamic_library::open() +{ + if(!filename.empty()) + { + handle = dlopen(filename.c_str(), flags); + if(!handle) + { + OMNITRACE_VERBOSE(2, "[dynamic_library][%s][%s] %s\n", envname.c_str(), + filename.c_str(), dlerror()); + } + dlerror(); // Clear any existing error + } + return (handle != nullptr); +} + +int +dynamic_library::close() const +{ + if(handle) return dlclose(handle); + return -1; +} +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/dynamic_library.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/dynamic_library.hpp index 648ace7f6d..f27220c94a 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/dynamic_library.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/dynamic_library.hpp @@ -22,11 +22,8 @@ #pragma once -#include "library/debug.hpp" #include "library/defines.hpp" -#include - #include #include @@ -41,26 +38,12 @@ struct dynamic_library dynamic_library& operator=(dynamic_library&&) noexcept = default; dynamic_library(const 
char* _env, const char* _fname, - int _flags = (RTLD_NOW | RTLD_GLOBAL), bool _store = false) - : envname{ _env } - , filename{ tim::get_env(_env, _fname, _store) } - , flags{ _flags } - { - if(!filename.empty()) - { - handle = dlopen(filename.c_str(), flags); - if(!handle) - { - OMNITRACE_DEBUG("%s\n", dlerror()); - } - dlerror(); // Clear any existing error - } - } + int _flags = (RTLD_LAZY | RTLD_GLOBAL), bool _store = false); - ~dynamic_library() - { - if(handle) dlclose(handle); - } + ~dynamic_library(); + + bool open(); + int close() const; std::string envname = {}; std::string filename = {}; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/kokkosp.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/kokkosp.cpp index 3959b55099..d0714eeef5 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/kokkosp.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/kokkosp.cpp @@ -28,6 +28,7 @@ #include "library/components/user_region.hpp" #include "library/config.hpp" #include "library/debug.hpp" +#include "library/runtime.hpp" #include @@ -83,6 +84,7 @@ extern "C" void kokkosp_parse_args(int argc, char** argv) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); if(!omnitrace::config::settings_are_configured() && omnitrace::get_state() < omnitrace::State::Active) { @@ -102,12 +104,14 @@ extern "C" void kokkosp_declare_metadata(const char* key, const char* value) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); tim::manager::add_metadata(key, value); } void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, const uint32_t devInfoCount, void* deviceInfo) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); tim::consume_parameters(devInfoCount, deviceInfo); if(_standalone_initialized || (!omnitrace::config::settings_are_configured() && @@ -138,6 +142,7 @@ extern "C" void kokkosp_finalize_library() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); 
if(_standalone_initialized) { omnitrace_pop_trace("kokkos_main"); @@ -156,6 +161,7 @@ extern "C" void kokkosp_begin_parallel_for(const char* name, uint32_t devid, uint64_t* kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? TIMEMORY_JOIN(" ", "[kokkos]", name) @@ -168,6 +174,7 @@ extern "C" void kokkosp_end_parallel_for(uint64_t kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); @@ -177,6 +184,7 @@ extern "C" void kokkosp_begin_parallel_reduce(const char* name, uint32_t devid, uint64_t* kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? TIMEMORY_JOIN(" ", "[kokkos]", name) @@ -189,6 +197,7 @@ extern "C" void kokkosp_end_parallel_reduce(uint64_t kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); @@ -198,6 +207,7 @@ extern "C" void kokkosp_begin_parallel_scan(const char* name, uint32_t devid, uint64_t* kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? TIMEMORY_JOIN(" ", "[kokkos]", name) @@ -210,6 +220,7 @@ extern "C" void kokkosp_end_parallel_scan(uint64_t kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); @@ -219,6 +230,7 @@ extern "C" void kokkosp_begin_fence(const char* name, uint32_t devid, uint64_t* kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); auto pname = (devid > std::numeric_limits::max()) // junk device number ? 
TIMEMORY_JOIN(" ", "[kokkos]", name) @@ -231,6 +243,7 @@ extern "C" void kokkosp_end_fence(uint64_t kernid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid); kokkosp::stop_profiler(kernid); kokkosp::destroy_profiler(kernid); @@ -240,6 +253,7 @@ extern "C" void kokkosp_push_profile_region(const char* name) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, name); kokkosp::get_profiler_stack().push_back( kokkosp::profiler_t(name)); @@ -248,6 +262,7 @@ extern "C" void kokkosp_pop_profile_region() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__); if(kokkosp::get_profiler_stack().empty()) return; @@ -259,6 +274,7 @@ extern "C" void kokkosp_create_profile_section(const char* name, uint32_t* secid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); *secid = kokkosp::get_unique_id(); auto pname = TIMEMORY_JOIN(" ", "[kokkos]", name); kokkosp::create_profiler(pname, *secid); @@ -266,6 +282,7 @@ extern "C" void kokkosp_destroy_profile_section(uint32_t secid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::destroy_profiler(secid); } @@ -273,12 +290,14 @@ extern "C" void kokkosp_start_profile_section(uint32_t secid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, secid); kokkosp::start_profiler(secid); } void kokkosp_stop_profile_section(uint32_t secid) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__, secid); kokkosp::start_profiler(secid); } @@ -288,6 +307,7 @@ extern "C" void kokkosp_allocate_data(const SpaceHandle space, const char* label, const void* const ptr, const uint64_t size) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(0, __FUNCTION__, space.name, label, TIMEMORY_JOIN("", '[', ptr, ']'), size); kokkosp::profiler_alloc_t<>{ TIMEMORY_JOIN(" ", 
"[kokkos][allocate]", space.name, @@ -298,6 +318,7 @@ extern "C" void kokkosp_deallocate_data(const SpaceHandle space, const char* label, const void* const ptr, const uint64_t size) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(0, __FUNCTION__, space.name, label, TIMEMORY_JOIN("", '[', ptr, ']'), size); kokkosp::profiler_alloc_t<>{ TIMEMORY_JOIN(" ", "[kokkos][deallocate]", @@ -311,6 +332,7 @@ extern "C" const void* dst_ptr, SpaceHandle src_handle, const char* src_name, const void* src_ptr, uint64_t size) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(1, __FUNCTION__, dst_handle.name, dst_name, TIMEMORY_JOIN("", '[', dst_ptr, ']'), src_handle.name, src_name, TIMEMORY_JOIN("", '[', src_ptr, ']'), size); @@ -329,6 +351,7 @@ extern "C" void kokkosp_end_deep_copy() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::logger_t{}.mark(-1, __FUNCTION__); auto& _data = kokkosp::get_profiler_stack(); if(_data.empty()) return; @@ -341,6 +364,7 @@ extern "C" void kokkosp_profile_event(const char* name) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); kokkosp::profiler_t{}.mark(name); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi.h b/projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi/mpi.h similarity index 100% rename from projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi.h rename to projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi/mpi.h diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi_portable_platform.h b/projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi/mpi_portable_platform.h similarity index 100% rename from projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi_portable_platform.h rename to projects/rocprofiler-systems/source/lib/omnitrace/library/tpls/mpi/mpi_portable_platform.h