MPI headers + mutex gotcha + roctracer + kokkosp (#11)
* MPI headers, mutex gotcha + roctracer + kokkosp
- relocate internal MPI headers
- pthread_barrier in parallel-overhead
- doc fixes to DYNINST options
- minor tweaks to dynamic_library
- dlopen libamdhip64.so
- scoped thread state in kokkos
- extended pthread_mutex_gotcha
* Fix for unused-but-set-variables
[ROCm/rocprofiler-systems commit: 424a3593e7]
Αυτή η υποβολή περιλαμβάνεται σε:
υποβλήθηκε από
GitHub
γονέας
938aaef082
υποβολή
8cc87ca6b8
@@ -135,22 +135,29 @@ omnitrace-merge.jl results.json omnitrace-app.inst-output/2021-09-02_01.03_PM/*.
|
||||
|
||||
## Use Perfetto tracing with System Backend
|
||||
|
||||
In a separate window run:
|
||||
Enable `traced` and `perfetto` in the background:
|
||||
|
||||
```shell
|
||||
pkill traced
|
||||
traced --background
|
||||
perfetto --out ./htrace.out --txt -c ${OMNITRACE_ROOT}/share/omnitrace.cfg
|
||||
perfetto --out ./omnitrace-perfetto.proto --txt -c ${OMNITRACE_ROOT}/share/omnitrace.cfg --background
|
||||
```
|
||||
|
||||
then in the window running the application, configure the omnitrace instrumentation to use the system backend:
|
||||
Configure omnitrace to use the perfetto system backend:
|
||||
|
||||
```shell
|
||||
export OMNITRACE_PERFETTO_BACKEND=system
|
||||
```
|
||||
|
||||
for the merge use the `htrace.out`:
|
||||
And finally, execute your instrumented application. Either the binary rewritten application:
|
||||
|
||||
```shell
|
||||
omnitrace-merge.jl results.json htrace.out
|
||||
omnitrace -o ./myapp.inst -- ./myapp
|
||||
./myapp.inst
|
||||
```
|
||||
|
||||
Or with runtime instrumentation:
|
||||
|
||||
```shell
|
||||
omnitrace -- ./myapp
|
||||
```
|
||||
|
||||
@@ -10,7 +10,7 @@ set(MPI_HEADERS_VENDOR_INTERNAL
|
||||
"OpenMPI"
|
||||
CACHE STRING "Distribution type of internal mpi.h")
|
||||
set(MPI_HEADERS_INCLUDE_DIR_INTERNAL
|
||||
"${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls"
|
||||
"${PROJECT_SOURCE_DIR}/source/lib/omnitrace/library/tpls/mpi"
|
||||
CACHE PATH "Path to internal ${MPI_HEADERS_VENDOR_INTERNAL} mpi.h")
|
||||
mark_as_advanced(MPI_HEADERS_VENDOR_INTERNAL)
|
||||
mark_as_advanced(MPI_HEADERS_INCLUDE_DIR_INTERNAL)
|
||||
|
||||
+12
-1
@@ -1,6 +1,7 @@
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <pthread.h>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
@@ -63,12 +64,20 @@ main(int argc, char** argv)
|
||||
printf("\n[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n",
|
||||
_name.c_str(), nthread, _name.c_str(), nitr, _name.c_str(), nfib);
|
||||
|
||||
pthread_barrier_t _barrier;
|
||||
pthread_barrier_init(&_barrier, nullptr, nthread);
|
||||
|
||||
auto _run = [&_barrier](size_t nitr, long n) {
|
||||
pthread_barrier_wait(&_barrier);
|
||||
run(nitr, n);
|
||||
};
|
||||
|
||||
std::vector<std::thread> threads{};
|
||||
for(size_t i = 0; i < nthread; ++i)
|
||||
{
|
||||
size_t _nitr = ((i % 2) == 1) ? (nitr - (0.1 * nitr)) : (nitr + (0.1 * nitr));
|
||||
_nitr = std::max<size_t>(_nitr, 1);
|
||||
threads.emplace_back(&run, _nitr, nfib);
|
||||
threads.emplace_back(_run, _nitr, nfib);
|
||||
}
|
||||
|
||||
#if !defined(USE_LOCKS)
|
||||
@@ -79,6 +88,8 @@ main(int argc, char** argv)
|
||||
for(auto& itr : threads)
|
||||
itr.join();
|
||||
|
||||
pthread_barrier_destroy(&_barrier);
|
||||
|
||||
printf("[%s] fibonacci(%li) x %lu = %li\n", _name.c_str(), nfib, nthread,
|
||||
static_cast<long>(total));
|
||||
|
||||
|
||||
@@ -91,9 +91,9 @@ The Clang compiler may be used in lieu of the GCC compiler if Dyninst is already
|
||||
> ***If the system installed cmake is too old, installing a new version of cmake can be done through several methods.***
|
||||
> ***One of the easiest options is to use PyPI (i.e. Python's pip):***
|
||||
>
|
||||
> ```python
|
||||
> ```shell
|
||||
> pip install --user 'cmake==3.18.4'
|
||||
> export PATH=${HOME}/.local/bin:${PATH}`
|
||||
> export PATH=${HOME}/.local/bin:${PATH}
|
||||
> ```
|
||||
|
||||
### Required Third-Party Packages
|
||||
@@ -147,14 +147,14 @@ and Dyninst requires TBB), and the CMake option to build the package alongside o
|
||||
|
||||
The easiest way to install Dyninst is to configure omnitrace with `OMNITRACE_BUILD_DYNINST=ON`. Depending on the version of Ubuntu, the apt package manager may have current enough
|
||||
versions of Dyninst's Boost, TBB, and LibIberty dependencies (i.e. `apt-get install libtbb-dev libiberty-dev libboost-dev`); however, it is possible to request Dyninst to install
|
||||
its dependencies via `Dyninst_BUILD_<DEP>=ON`, e.g.:
|
||||
its dependencies via `DYNINST_BUILD_<DEP>=ON`, e.g.:
|
||||
|
||||
```shell
|
||||
git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source
|
||||
cmake -B omnitrace-build -DOMNITRACE_BUILD_DYNINST=ON -DDyninst_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON omnitrace-source
|
||||
cmake -B omnitrace-build -DOMNITRACE_BUILD_DYNINST=ON -DDYNINST_BUILD_{TBB,ELFUTILS,BOOST,LIBIBERTY}=ON omnitrace-source
|
||||
```
|
||||
|
||||
where `-DDyninst_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON` is expanded by the shell to `-DDyninst_BUILD_TBB=ON -DDyninst_BUILD_BOOST=ON ...`
|
||||
where `-DDYNINST_BUILD_{TBB,BOOST,ELFUTILS,LIBIBERTY}=ON` is expanded by the shell to `-DDYNINST_BUILD_TBB=ON -DDYNINST_BUILD_BOOST=ON ...`
|
||||
|
||||
#### Installing Dyninst via Spack
|
||||
|
||||
@@ -180,7 +180,6 @@ into omnitrace's perfetto support, e.g. `OMNITRACE_USE_PAPI=<VAL>` forces `TIMEM
|
||||
is passed along to perfetto and will be displayed when the `.proto` file is visualized in [ui.perfetto.dev](https://ui.perfetto.dev).
|
||||
|
||||
```shell
|
||||
OMNITRACE_ROOT=/opt/omnitrace
|
||||
git clone https://github.com/AMDResearch/omnitrace.git omnitrace-source
|
||||
cmake \
|
||||
-B omnitrace-build \
|
||||
@@ -208,7 +207,10 @@ source /opt/omnitrace/share/omnitrace/setup-env.sh
|
||||
|
||||
[Omnitrace](https://github.com/AMDResearch/omnitrace) can have full (`OMNITRACE_USE_MPI=ON`) or partial (`OMNITRACE_USE_MPI_HEADERS=ON`) MPI support.
|
||||
The only difference between these two modes is whether or not the results collected via timemory and/or perfetto can be aggregated into a single
|
||||
output file during finalization. The primary benefits of partial or full MPI support are the automatic wrapping of MPI functions and the ability
|
||||
output file during finalization. When full MPI support is enabled, combining the timemory results always occurs whereas combining the perfetto
|
||||
results is configurable via the `OMNITRACE_PERFETTO_COMBINE_TRACES` setting.
|
||||
|
||||
The primary benefits of partial or full MPI support are the automatic wrapping of MPI functions and the ability
|
||||
to label output with suffixes which correspond to the `MPI_COMM_WORLD` rank ID instead of using the system process identifier (i.e. PID).
|
||||
In general, it is recommended to use partial MPI support with the OpenMPI headers as this is the most portable configuration.
|
||||
If full MPI support is selected, make sure your target application is built against the same MPI distribution as omnitrace,
|
||||
|
||||
@@ -57,6 +57,7 @@ set(library_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/debug.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp
|
||||
@@ -86,6 +87,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/debug.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/gpu.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ompt.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp
|
||||
|
||||
@@ -122,7 +122,7 @@ add_critical_trace(int32_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
tim::consume_parameters(_lock);
|
||||
}
|
||||
|
||||
tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val,
|
||||
_queue, _hash, _depth, _prio, num_mutexes);
|
||||
tim::consume_parameters(_pid, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg,
|
||||
_ts_val, _devid, _queue, _hash, _depth, _prio, num_mutexes);
|
||||
}
|
||||
} // namespace omnitrace
|
||||
|
||||
+89
-12
@@ -35,7 +35,9 @@
|
||||
#include <timemory/utility/signals.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
#include <pthread.h>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
@@ -51,8 +53,9 @@ pthread_mutex_gotcha::get_hashes()
|
||||
// we could see weird results.
|
||||
static auto _v = []() {
|
||||
const auto& _data = pthread_mutex_gotcha_t::get_gotcha_data();
|
||||
hash_array_t _init{};
|
||||
for(size_t i = 0; i < gotcha_capacity; ++i)
|
||||
hash_array_t _init = {};
|
||||
size_t i0 = (config::get_trace_thread_locks()) ? 0 : 3;
|
||||
for(size_t i = i0; i < gotcha_capacity; ++i)
|
||||
{
|
||||
auto&& _id = _data.at(i).tool_id;
|
||||
if(!_id.empty())
|
||||
@@ -90,12 +93,44 @@ pthread_mutex_gotcha::configure()
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<2, int, pthread_mutex_t*>{ "pthread_mutex_trylock" });
|
||||
}
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<3, int, pthread_barrier_t*>{ "pthread_barrier_wait" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<4, int, pthread_rwlock_t*>{ "pthread_rwlock_rdlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<5, int, pthread_rwlock_t*>{ "pthread_rwlock_tryrdlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<6, int, pthread_rwlock_t*>{ "pthread_rwlock_trywrlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<7, int, pthread_rwlock_t*>{ "pthread_rwlock_unlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<8, int, pthread_rwlock_t*>{ "pthread_rwlock_wrlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<9, int, pthread_spinlock_t*>{ "pthread_spin_lock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<10, int, pthread_spinlock_t*>{ "pthread_spin_trylock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<11, int, pthread_spinlock_t*>{ "pthread_spin_unlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<12, int, pthread_t, void**>{ "pthread_join" });
|
||||
};
|
||||
}
|
||||
|
||||
void
|
||||
pthread_mutex_gotcha::shutdown()
|
||||
{}
|
||||
{
|
||||
pthread_mutex_gotcha_t::disable();
|
||||
}
|
||||
|
||||
void
|
||||
pthread_mutex_gotcha::validate()
|
||||
@@ -122,10 +157,10 @@ pthread_mutex_gotcha::validate()
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_mutex_t*),
|
||||
pthread_mutex_t* _mutex)
|
||||
template <typename... Args>
|
||||
auto
|
||||
pthread_mutex_gotcha::operator()(uintptr_t&& _id, const comp::gotcha_data& _data,
|
||||
int (*_callee)(Args...), Args... _args) const
|
||||
{
|
||||
if(is_disabled())
|
||||
{
|
||||
@@ -134,7 +169,7 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
OMNITRACE_PRINT("Warning! nullptr to %s\n", _data.tool_id.c_str());
|
||||
return EINVAL;
|
||||
}
|
||||
return (*_callee)(_mutex);
|
||||
return (*_callee)(_args...);
|
||||
}
|
||||
|
||||
uint64_t _cid = 0;
|
||||
@@ -144,24 +179,66 @@ pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
if(get_use_critical_trace())
|
||||
if(_id < std::numeric_limits<uintptr_t>::max() && get_use_critical_trace())
|
||||
{
|
||||
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
|
||||
_ts = comp::wall_clock::record();
|
||||
}
|
||||
|
||||
omnitrace_push_region(_data.tool_id.c_str());
|
||||
auto _ret = (*_callee)(_mutex);
|
||||
auto _ret = (*_callee)(_args...);
|
||||
omnitrace_pop_region(_data.tool_id.c_str());
|
||||
|
||||
if(get_use_critical_trace())
|
||||
if(_id < std::numeric_limits<uintptr_t>::max() && get_use_critical_trace())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::DELTA>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), 0,
|
||||
reinterpret_cast<uintptr_t>(_mutex), get_hashes().at(_data.index), _depth);
|
||||
_id, get_hashes().at(_data.index), _depth);
|
||||
}
|
||||
|
||||
return _ret;
|
||||
tim::consume_parameters(_id, _cid, _parent_cid, _depth, _ts);
|
||||
}
|
||||
|
||||
// Wrapper for intercepted functions with the signature int(pthread_mutex_t*)
// (e.g. "pthread_mutex_trylock" registered in configure()).
// Forwards to the variadic operator() overload, passing the mutex address
// (converted to uintptr_t) as the identifier argument.
// Returns whatever the variadic overload returns.
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_mutex_t*),
|
||||
pthread_mutex_t* _mutex) const
|
||||
{
|
||||
return (*this)(reinterpret_cast<uintptr_t>(_mutex), _data, _callee, _mutex);
|
||||
}
|
||||
|
||||
// Wrapper for intercepted functions with the signature int(pthread_spinlock_t*)
// (configure() registers "pthread_spin_lock", "pthread_spin_trylock" and
// "pthread_spin_unlock" with this argument type).
// Forwards to the variadic operator() overload, passing the spinlock address
// (converted to uintptr_t) as the identifier argument.
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_spinlock_t*),
|
||||
pthread_spinlock_t* _lock) const
|
||||
{
|
||||
return (*this)(reinterpret_cast<uintptr_t>(_lock), _data, _callee, _lock);
|
||||
}
|
||||
|
||||
// Wrapper for intercepted functions with the signature int(pthread_rwlock_t*)
// (configure() registers the pthread_rwlock_{rdlock,tryrdlock,trywrlock,unlock,wrlock}
// functions with this argument type).
// Forwards to the variadic operator() overload, passing the rwlock address
// (converted to uintptr_t) as the identifier argument.
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_rwlock_t*),
|
||||
pthread_rwlock_t* _lock) const
|
||||
{
|
||||
return (*this)(reinterpret_cast<uintptr_t>(_lock), _data, _callee, _lock);
|
||||
}
|
||||
|
||||
// Wrapper for intercepted functions with the signature int(pthread_barrier_t*)
// (configure() registers "pthread_barrier_wait" with this argument type).
// Forwards to the variadic operator() overload, passing the barrier address
// (converted to uintptr_t) as the identifier argument.
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_barrier_t*),
|
||||
pthread_barrier_t* _barrier) const
|
||||
{
|
||||
return (*this)(reinterpret_cast<uintptr_t>(_barrier), _data, _callee, _barrier);
|
||||
}
|
||||
|
||||
// Wrapper for intercepted functions with the signature int(pthread_t, void**)
// (configure() registers "pthread_join" with these argument types).
// Forwards to the variadic operator() overload. Unlike the lock/barrier wrappers,
// the identifier argument is the value of threading::get_id() rather than an address.
// NOTE(review): threading::get_id() presumably identifies the calling thread, not
// the thread being joined (_thr) -- confirm this is the intended identifier.
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_t, void**), pthread_t _thr,
|
||||
void** _tinfo) const
|
||||
{
|
||||
return (*this)(static_cast<uintptr_t>(threading::get_id()), _data, _callee, _thr,
|
||||
_tinfo);
|
||||
}
|
||||
|
||||
bool
|
||||
|
||||
+14
-2
@@ -35,7 +35,7 @@ namespace omnitrace
|
||||
// this is used to wrap pthread_mutex()
|
||||
struct pthread_mutex_gotcha : comp::base<pthread_mutex_gotcha, void>
|
||||
{
|
||||
static constexpr size_t gotcha_capacity = 3;
|
||||
static constexpr size_t gotcha_capacity = 13;
|
||||
using hash_array_t = std::array<size_t, gotcha_capacity>;
|
||||
using gotcha_data_t = comp::gotcha_data;
|
||||
|
||||
@@ -49,11 +49,23 @@ struct pthread_mutex_gotcha : comp::base<pthread_mutex_gotcha, void>
|
||||
static void shutdown();
|
||||
static void validate();
|
||||
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*), pthread_mutex_t*);
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*),
|
||||
pthread_mutex_t*) const;
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_spinlock_t*),
|
||||
pthread_spinlock_t*) const;
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_rwlock_t*),
|
||||
pthread_rwlock_t*) const;
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_barrier_t*),
|
||||
pthread_barrier_t*) const;
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_t, void**), pthread_t,
|
||||
void**) const;
|
||||
|
||||
private:
|
||||
static bool is_disabled();
|
||||
static hash_array_t& get_hashes();
|
||||
|
||||
template <typename... Args>
|
||||
auto operator()(uintptr_t&&, const gotcha_data_t&, int (*)(Args...), Args...) const;
|
||||
};
|
||||
|
||||
using pthread_mutex_gotcha_t = comp::gotcha<pthread_mutex_gotcha::gotcha_capacity,
|
||||
|
||||
+5
-2
@@ -26,6 +26,7 @@
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/dynamic_library.hpp"
|
||||
#include "library/redirect.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
@@ -122,9 +123,11 @@ roctracer::setup()
|
||||
|
||||
OMNITRACE_VERBOSE_F(1, "setting up roctracer...\n");
|
||||
|
||||
dynamic_library _amdhip64{ "OMNITRACE_ROCTRACER_LIBAMDHIP64", "libamdhip64.so" };
|
||||
|
||||
#if OMNITRACE_HIP_VERSION_MAJOR == 4 && OMNITRACE_HIP_VERSION_MINOR < 4
|
||||
auto _kfdwrapper = dynamic_library{ "OMNITRACE_ROCTRACER_LIBKFDWRAPPER",
|
||||
OMNITRACE_ROCTRACER_LIBKFDWRAPPER };
|
||||
dynamic_library _kfdwrapper{ "OMNITRACE_ROCTRACER_LIBKFDWRAPPER",
|
||||
OMNITRACE_ROCTRACER_LIBKFDWRAPPER };
|
||||
#endif
|
||||
|
||||
ROCTRACER_CALL(roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, nullptr));
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/dynamic_library.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/defines.hpp"
|
||||
|
||||
#include <timemory/environment.hpp>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
// Constructs the wrapper and immediately attempts to load the library.
//   _env   : stored as `envname` and used as the lookup key passed to tim::get_env
//   _fname : passed to tim::get_env as its second argument (presumably the default
//            value used when the environment variable is unset -- confirm against timemory)
//   _flags : stored as `flags` and later handed to dlopen() by open()
//   _store : forwarded to tim::get_env unchanged
// The result of open() is not checked here.
dynamic_library::dynamic_library(const char* _env, const char* _fname, int _flags,
|
||||
bool _store)
|
||||
: envname{ _env }
|
||||
, filename{ tim::get_env<std::string>(_env, _fname, _store) }
|
||||
, flags{ _flags }
|
||||
{
|
||||
open();
|
||||
}
|
||||
|
||||
// Destructor: delegates to close(); the return value of close() is ignored.
dynamic_library::~dynamic_library() { close(); }
|
||||
|
||||
bool
|
||||
dynamic_library::open()
|
||||
{
|
||||
if(!filename.empty())
|
||||
{
|
||||
handle = dlopen(filename.c_str(), flags);
|
||||
if(!handle)
|
||||
{
|
||||
OMNITRACE_VERBOSE(2, "[dynamic_library][%s][%s] %s\n", envname.c_str(),
|
||||
filename.c_str(), dlerror());
|
||||
}
|
||||
dlerror(); // Clear any existing error
|
||||
}
|
||||
return (handle != nullptr);
|
||||
}
|
||||
|
||||
int
|
||||
dynamic_library::close() const
|
||||
{
|
||||
if(handle) return dlclose(handle);
|
||||
return -1;
|
||||
}
|
||||
} // namespace omnitrace
|
||||
+5
-22
@@ -22,11 +22,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "library/debug.hpp"
|
||||
#include "library/defines.hpp"
|
||||
|
||||
#include <timemory/environment.hpp>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <string>
|
||||
|
||||
@@ -41,26 +38,12 @@ struct dynamic_library
|
||||
dynamic_library& operator=(dynamic_library&&) noexcept = default;
|
||||
|
||||
dynamic_library(const char* _env, const char* _fname,
|
||||
int _flags = (RTLD_NOW | RTLD_GLOBAL), bool _store = false)
|
||||
: envname{ _env }
|
||||
, filename{ tim::get_env<std::string>(_env, _fname, _store) }
|
||||
, flags{ _flags }
|
||||
{
|
||||
if(!filename.empty())
|
||||
{
|
||||
handle = dlopen(filename.c_str(), flags);
|
||||
if(!handle)
|
||||
{
|
||||
OMNITRACE_DEBUG("%s\n", dlerror());
|
||||
}
|
||||
dlerror(); // Clear any existing error
|
||||
}
|
||||
}
|
||||
int _flags = (RTLD_LAZY | RTLD_GLOBAL), bool _store = false);
|
||||
|
||||
~dynamic_library()
|
||||
{
|
||||
if(handle) dlclose(handle);
|
||||
}
|
||||
~dynamic_library();
|
||||
|
||||
bool open();
|
||||
int close() const;
|
||||
|
||||
std::string envname = {};
|
||||
std::string filename = {};
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "library/components/user_region.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
|
||||
#include <timemory/api/kokkosp.hpp>
|
||||
|
||||
@@ -83,6 +84,7 @@ extern "C"
|
||||
|
||||
void kokkosp_parse_args(int argc, char** argv)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
if(!omnitrace::config::settings_are_configured() &&
|
||||
omnitrace::get_state() < omnitrace::State::Active)
|
||||
{
|
||||
@@ -102,12 +104,14 @@ extern "C"
|
||||
|
||||
// kokkosp hook: records the given key/value pair via tim::manager::add_metadata.
void kokkosp_declare_metadata(const char* key, const char* value)
|
||||
{
|
||||
// scoped thread-state guard set to Internal for the duration of this call
// (behavior inferred from the macro name -- confirm against its definition)
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
tim::manager::add_metadata(key, value);
|
||||
}
|
||||
|
||||
void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
|
||||
const uint32_t devInfoCount, void* deviceInfo)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
tim::consume_parameters(devInfoCount, deviceInfo);
|
||||
|
||||
if(_standalone_initialized || (!omnitrace::config::settings_are_configured() &&
|
||||
@@ -138,6 +142,7 @@ extern "C"
|
||||
|
||||
void kokkosp_finalize_library()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
if(_standalone_initialized)
|
||||
{
|
||||
omnitrace_pop_trace("kokkos_main");
|
||||
@@ -156,6 +161,7 @@ extern "C"
|
||||
|
||||
void kokkosp_begin_parallel_for(const char* name, uint32_t devid, uint64_t* kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
auto pname =
|
||||
(devid > std::numeric_limits<uint16_t>::max()) // junk device number
|
||||
? TIMEMORY_JOIN(" ", "[kokkos]", name)
|
||||
@@ -168,6 +174,7 @@ extern "C"
|
||||
|
||||
void kokkosp_end_parallel_for(uint64_t kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid);
|
||||
kokkosp::stop_profiler<omnitrace::component::user_region>(kernid);
|
||||
kokkosp::destroy_profiler<omnitrace::component::user_region>(kernid);
|
||||
@@ -177,6 +184,7 @@ extern "C"
|
||||
|
||||
void kokkosp_begin_parallel_reduce(const char* name, uint32_t devid, uint64_t* kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
auto pname =
|
||||
(devid > std::numeric_limits<uint16_t>::max()) // junk device number
|
||||
? TIMEMORY_JOIN(" ", "[kokkos]", name)
|
||||
@@ -189,6 +197,7 @@ extern "C"
|
||||
|
||||
void kokkosp_end_parallel_reduce(uint64_t kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid);
|
||||
kokkosp::stop_profiler<omnitrace::component::user_region>(kernid);
|
||||
kokkosp::destroy_profiler<omnitrace::component::user_region>(kernid);
|
||||
@@ -198,6 +207,7 @@ extern "C"
|
||||
|
||||
void kokkosp_begin_parallel_scan(const char* name, uint32_t devid, uint64_t* kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
auto pname =
|
||||
(devid > std::numeric_limits<uint16_t>::max()) // junk device number
|
||||
? TIMEMORY_JOIN(" ", "[kokkos]", name)
|
||||
@@ -210,6 +220,7 @@ extern "C"
|
||||
|
||||
void kokkosp_end_parallel_scan(uint64_t kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid);
|
||||
kokkosp::stop_profiler<omnitrace::component::user_region>(kernid);
|
||||
kokkosp::destroy_profiler<omnitrace::component::user_region>(kernid);
|
||||
@@ -219,6 +230,7 @@ extern "C"
|
||||
|
||||
void kokkosp_begin_fence(const char* name, uint32_t devid, uint64_t* kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
auto pname =
|
||||
(devid > std::numeric_limits<uint16_t>::max()) // junk device number
|
||||
? TIMEMORY_JOIN(" ", "[kokkos]", name)
|
||||
@@ -231,6 +243,7 @@ extern "C"
|
||||
|
||||
void kokkosp_end_fence(uint64_t kernid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__, kernid);
|
||||
kokkosp::stop_profiler<omnitrace::component::user_region>(kernid);
|
||||
kokkosp::destroy_profiler<omnitrace::component::user_region>(kernid);
|
||||
@@ -240,6 +253,7 @@ extern "C"
|
||||
|
||||
void kokkosp_push_profile_region(const char* name)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(1, __FUNCTION__, name);
|
||||
kokkosp::get_profiler_stack<omnitrace::component::user_region>().push_back(
|
||||
kokkosp::profiler_t<omnitrace::component::user_region>(name));
|
||||
@@ -248,6 +262,7 @@ extern "C"
|
||||
|
||||
void kokkosp_pop_profile_region()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__);
|
||||
if(kokkosp::get_profiler_stack<omnitrace::component::user_region>().empty())
|
||||
return;
|
||||
@@ -259,6 +274,7 @@ extern "C"
|
||||
|
||||
void kokkosp_create_profile_section(const char* name, uint32_t* secid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
*secid = kokkosp::get_unique_id();
|
||||
auto pname = TIMEMORY_JOIN(" ", "[kokkos]", name);
|
||||
kokkosp::create_profiler<omnitrace::component::user_region>(pname, *secid);
|
||||
@@ -266,6 +282,7 @@ extern "C"
|
||||
|
||||
// kokkosp hook: destroys the user_region profiler registered under `secid`.
void kokkosp_destroy_profile_section(uint32_t secid)
|
||||
{
|
||||
// scoped thread-state guard set to Internal for the duration of this call
// (behavior inferred from the macro name -- confirm against its definition)
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::destroy_profiler<omnitrace::component::user_region>(secid);
|
||||
}
|
||||
|
||||
@@ -273,12 +290,14 @@ extern "C"
|
||||
|
||||
// kokkosp hook: logs the call (mark value 1) and starts the user_region profiler
// registered under `secid`.
void kokkosp_start_profile_section(uint32_t secid)
|
||||
{
|
||||
// scoped thread-state guard set to Internal for the duration of this call
// (behavior inferred from the macro name -- confirm against its definition)
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(1, __FUNCTION__, secid);
|
||||
kokkosp::start_profiler<omnitrace::component::user_region>(secid);
|
||||
}
|
||||
|
||||
void kokkosp_stop_profile_section(uint32_t secid)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__, secid);
|
||||
kokkosp::start_profiler<omnitrace::component::user_region>(secid);
|
||||
}
|
||||
@@ -288,6 +307,7 @@ extern "C"
|
||||
void kokkosp_allocate_data(const SpaceHandle space, const char* label,
|
||||
const void* const ptr, const uint64_t size)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(0, __FUNCTION__, space.name, label,
|
||||
TIMEMORY_JOIN("", '[', ptr, ']'), size);
|
||||
kokkosp::profiler_alloc_t<>{ TIMEMORY_JOIN(" ", "[kokkos][allocate]", space.name,
|
||||
@@ -298,6 +318,7 @@ extern "C"
|
||||
void kokkosp_deallocate_data(const SpaceHandle space, const char* label,
|
||||
const void* const ptr, const uint64_t size)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(0, __FUNCTION__, space.name, label,
|
||||
TIMEMORY_JOIN("", '[', ptr, ']'), size);
|
||||
kokkosp::profiler_alloc_t<>{ TIMEMORY_JOIN(" ", "[kokkos][deallocate]",
|
||||
@@ -311,6 +332,7 @@ extern "C"
|
||||
const void* dst_ptr, SpaceHandle src_handle,
|
||||
const char* src_name, const void* src_ptr, uint64_t size)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(1, __FUNCTION__, dst_handle.name, dst_name,
|
||||
TIMEMORY_JOIN("", '[', dst_ptr, ']'), src_handle.name,
|
||||
src_name, TIMEMORY_JOIN("", '[', src_ptr, ']'), size);
|
||||
@@ -329,6 +351,7 @@ extern "C"
|
||||
|
||||
void kokkosp_end_deep_copy()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::logger_t{}.mark(-1, __FUNCTION__);
|
||||
auto& _data = kokkosp::get_profiler_stack<omnitrace::component::user_region>();
|
||||
if(_data.empty()) return;
|
||||
@@ -341,6 +364,7 @@ extern "C"
|
||||
|
||||
// kokkosp hook: emits a named event by calling mark(name) on a temporary,
// default-constructed user_region profiler.
void kokkosp_profile_event(const char* name)
|
||||
{
|
||||
// scoped thread-state guard set to Internal for the duration of this call
// (behavior inferred from the macro name -- confirm against its definition)
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
kokkosp::profiler_t<omnitrace::component::user_region>{}.mark(name);
|
||||
}
|
||||
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user