Support for tracing mutex locking (#52)

* Parallel overhead example with locks

* Support tracing mutex locking + more

- support wrapping pthread_mutex_lock
- support wrapping pthread_mutex_unlock
- support wrapping pthread_mutex_trylock
- get_perfetto_combined_traces setting
- OMNITRACE_TRACE_THREAD_LOCKS option
- ThreadState
- critical trace includes queue id
- enabled/disabled settings in timemory
- fix OMNITRACE_TIMEMORY_COMPONENTS
- fix reading config
- fix setting categories
- applied ThreadState::Internal in various places
- utility::get_filled_array
- utility::get_reserved_vector
- utility::get_thread_index
- fork_gotcha messages about forks
- split out some pthread_gotcha functionality into pthread_create_gotcha
- handle queue id in roctracer callbacks

* Update timemory and PTL submodules

* Misc CMake updates

- Includes fix to omnitrace-static-lib{gcc,stdcxx}

* Misc cleanup to pthread_mutex_gotcha and backtrace

* Fix to duplicate field in module_function json

* Improvement to debug messages

* omnitrace-dl and common improvements

- tweak to delimit
- common::ignore message
- common::join quoting of strings
- omnitrace_set_env ignores if inited and active
- omnitrace_set_mpi ignores if inited and active

* nsync for transpose example

* Fix to thread_deleter<void> functor invoke

* Fix thread state and HIP stream enums

[ROCm/rocprofiler-systems commit: b208047741]
This commit is contained in:
Jonathan R. Madsen
2022-05-08 04:40:10 -05:00
committed by GitHub
orang tua 0094a471fd
melakukan 0d5f0fb9cf
50 mengubah file dengan 1736 tambahan dan 483 penghapusan
@@ -132,6 +132,44 @@ parse:
kwargs:
RESULT_VARIABLE: '*'
OUTPUT_VARIABLE: '*'
omnitrace_find_static_library:
flags:
- NO_CACHE
- REQUIRED
- NO_DEFAULT_PATH
- NO_PACKAGE_ROOT_PATH
- NO_CMAKE_PATH
- NO_CMAKE_ENVIRONMENT_PATH
- NO_SYSTEM_ENVIRONMENT_PATH
- CMAKE_FIND_ROOT_PATH_BOTH
- ONLY_CMAKE_FIND_ROOT_PATH
- NO_CMAKE_FIND_ROOT_PATH
kwargs:
NAMES: '*'
NAMES_PER_DIR: '*'
HINTS: '*'
PATHS: '*'
PATH_SUFFIXES: '*'
DOC: '*'
omnitrace_find_shared_library:
flags:
- NO_CACHE
- REQUIRED
- NO_DEFAULT_PATH
- NO_PACKAGE_ROOT_PATH
- NO_CMAKE_PATH
- NO_CMAKE_ENVIRONMENT_PATH
- NO_SYSTEM_ENVIRONMENT_PATH
- CMAKE_FIND_ROOT_PATH_BOTH
- ONLY_CMAKE_FIND_ROOT_PATH
- NO_CMAKE_FIND_ROOT_PATH
kwargs:
NAMES: '*'
NAMES_PER_DIR: '*'
HINTS: '*'
PATHS: '*'
PATH_SUFFIXES: '*'
DOC: '*'
override_spec: {}
vartags: []
proptags: []
@@ -301,9 +301,18 @@ target_compile_options(
omnitrace-static-libgcc
INTERFACE $<$<COMPILE_LANGUAGE:C>:$<$<C_COMPILER_ID:GNU>:-static-libgcc>>
$<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU>:-static-libgcc>>)
target_link_options(
omnitrace-static-libgcc INTERFACE
$<$<COMPILE_LANGUAGE:C>:$<$<C_COMPILER_ID:GNU,Clang>:-static-libgcc>>
$<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang>:-static-libgcc>>)
target_compile_options(
omnitrace-static-libstdcxx
INTERFACE $<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU>:-static-libstdc++>>)
INTERFACE $<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang>:-static-libstdc++>>
)
target_link_options(
omnitrace-static-libstdcxx INTERFACE
$<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang>:-static-libstdc++>>)
# ----------------------------------------------------------------------------------------#
# user customization
@@ -777,4 +777,14 @@ function(OMNITRACE_PYTHON_CONSOLE_SCRIPT SCRIPT_NAME SCRIPT_SUBMODULE)
endif()
endfunction()
function(OMNITRACE_FIND_STATIC_LIBRARY)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX})
find_library(${ARGN})
endfunction()
function(OMNITRACE_FIND_SHARED_LIBRARY)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_SHARED_LIBRARY_SUFFIX})
find_library(${ARGN})
endfunction()
cmake_policy(POP)
@@ -7,7 +7,13 @@ find_package(Threads REQUIRED)
add_executable(parallel-overhead parallel-overhead.cpp)
target_link_libraries(parallel-overhead Threads::Threads)
add_executable(parallel-overhead-locks parallel-overhead.cpp)
target_link_libraries(parallel-overhead-locks Threads::Threads)
target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1)
if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
set_target_properties(parallel-overhead PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR})
set_target_properties(parallel-overhead-locks PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR})
endif()
@@ -1,14 +1,23 @@
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <thread>
#include <vector>
#if defined(USE_LOCKS)
# include <mutex>
using auto_lock_t = std::unique_lock<std::mutex>;
long total = 0;
std::mutex mtx{};
#else
# include <atomic>
std::atomic<long> total{ 0 };
#endif
long
fib(long n) __attribute__((noinline));
void
run(size_t nitr, long) __attribute__((noinline));
@@ -21,10 +30,19 @@ fib(long n)
void
run(size_t nitr, long n)
{
#if defined(USE_LOCKS)
for(size_t i = 0; i < nitr; ++i)
{
auto _v = fib(n);
auto_lock_t _lk{ mtx };
total += _v;
}
#else
long local = 0;
for(size_t i = 0; i < nitr; ++i)
local += fib(n);
total += local;
#endif
}
int
@@ -42,7 +60,7 @@ main(int argc, char** argv)
if(argc > 2) nthread = atol(argv[2]);
if(argc > 3) nitr = atol(argv[3]);
printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n",
printf("\n[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n",
_name.c_str(), nthread, _name.c_str(), nitr, _name.c_str(), nfib);
std::vector<std::thread> threads{};
@@ -53,13 +71,16 @@ main(int argc, char** argv)
threads.emplace_back(&run, _nitr, nfib);
}
#if !defined(USE_LOCKS)
auto _nitr = std::max<size_t>(nitr - 0.25 * nitr, 1);
run(_nitr, nfib - 0.1 * nfib);
#endif
for(auto& itr : threads)
itr.join();
printf("[%s] fibonacci(%li) x %lu = %li\n", _name.c_str(), nfib, nthread,
total.load());
static_cast<long>(total));
return 0;
}
@@ -95,10 +95,12 @@ transpose_a(int* in, int* out, int M, int N)
void
run(int rank, int tid, hipStream_t stream, int argc, char** argv)
{
size_t nitr = 500;
unsigned int M = 4960 * 2;
unsigned int N = 4960 * 2;
size_t nitr = 500;
size_t nsync = 10;
unsigned int M = 4960 * 2;
unsigned int N = 4960 * 2;
if(argc > 2) nitr = atoll(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
auto_lock_t _lk{ print_lock };
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
@@ -126,10 +128,11 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
dim3 block(32, 32, 1); // transpose_a
auto t1 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < nitr; i++)
for(size_t i = 0; i < nitr; ++i)
{
transpose_a<<<grid, block, 0, stream>>>(in, out, M, N);
check_hip_error();
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
}
auto t2 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
@@ -179,15 +182,18 @@ do_a2a(int rank)
int
main(int argc, char** argv)
{
int rank = 0;
int size = 1;
int nthreads = 2;
int nitr = 5000;
int rank = 0;
int size = 1;
int nthreads = 2;
int nitr = 5000;
size_t nsync = 10;
if(argc > 1) nthreads = atoi(argv[1]);
if(argc > 2) nitr = atoi(argv[2]);
if(argc > 3) nsync = atoll(argv[3]);
printf("[transpose] Number of threads: %i\n", nthreads);
printf("[transpose] Number of iterations: %i\n", nitr);
printf("[transpose] Syncing every %zu iterations\n", nsync);
#if defined(USE_MPI)
MPI_Init(&argc, &argv);
Submodule projects/rocprofiler-systems/external/PTL updated: 4afd2bdeb9...1451b6c279
Submodule projects/rocprofiler-systems/external/timemory updated: 9ccf9ec9f6...52e7034fd4
@@ -9,7 +9,7 @@ add_executable(
${CMAKE_CURRENT_LIST_DIR}/avail.cpp ${CMAKE_CURRENT_LIST_DIR}/avail.hpp
$<TARGET_OBJECTS:omnitrace::omnitrace-object-library>)
target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include)
target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_compile_definitions(omnitrace-avail PRIVATE OMNITRACE_EXTERN_COMPONENTS=0)
target_link_libraries(omnitrace-avail PRIVATE omnitrace::omnitrace-compile-definitions
omnitrace::omnitrace-interface-library)
@@ -439,7 +439,9 @@ main(int argc, char** argv)
parser.add_argument({ "" }, "");
parser.add_argument({ "[VIEW OPTIONS]" }, "");
parser.add_argument({ "-A", "--available" }, "Only display available components")
parser
.add_argument({ "-A", "--available" },
"Only display available components/settings/hw-counters")
.max_count(1)
.action([](parser_t& p) { available_only = p.get<bool>("available"); });
parser
@@ -892,8 +894,8 @@ write_settings_info(std::ostream& os, const array_t<bool, N>& opts,
_setting_output.end());
// patch up the categories
str_set_t _not_in_category_view{};
auto _settings = tim::settings::shared_instance();
auto _not_in_category_view = str_set_t{};
auto _settings = tim::settings::shared_instance();
for(auto& itr : _setting_output)
{
auto _name = itr.find("environ")->second;
@@ -942,6 +944,19 @@ write_settings_info(std::ostream& os, const array_t<bool, N>& opts,
}),
_setting_output.end());
if(available_only)
{
_setting_output.erase(
std::remove_if(_setting_output.begin(), _setting_output.end(),
[&_settings](const auto& itr) {
auto iitr = _settings->find(itr.at("environ"));
if(iitr != _settings->end())
return (iitr->second->get_enabled() == false);
return true;
}),
_setting_output.end());
}
if(alphabetical)
{
std::sort(_setting_output.begin(), _setting_output.end(),
@@ -10,12 +10,13 @@ add_executable(
${CMAKE_CURRENT_LIST_DIR}/critical-trace.hpp
$<TARGET_OBJECTS:omnitrace::omnitrace-object-library>)
target_include_directories(omnitrace-critical-trace
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include)
target_include_directories(omnitrace-critical-trace PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_compile_definitions(omnitrace-critical-trace PRIVATE OMNITRACE_EXTERN_COMPONENTS=0)
target_link_libraries(
omnitrace-critical-trace PRIVATE omnitrace::omnitrace-compile-definitions
omnitrace::omnitrace-interface-library)
omnitrace-critical-trace
PRIVATE omnitrace::omnitrace-compile-definitions
omnitrace::omnitrace-interface-library omnitrace::omnitrace-headers
omnitrace::omnitrace-timemory)
set_target_properties(
omnitrace-critical-trace
PROPERTIES BUILD_RPATH "\$ORIGIN:${PROJECT_BINARY_DIR}:${CMAKE_BINARY_DIR}"
@@ -176,7 +176,7 @@ module_function::serialize(ArchiveT& ar, const unsigned)
}
ar(cereal::make_nvp("address_range", address_range),
cereal::make_nvp("instructions", num_instructions),
cereal::make_nvp("num_instructions", num_instructions),
cereal::make_nvp("module", module_name),
cereal::make_nvp("function", function_name),
cereal::make_nvp("signature", signature));
@@ -22,6 +22,7 @@
#pragma once
#include <cstring>
#include <string>
#include <vector>
@@ -31,6 +32,51 @@ inline namespace common
{
namespace
{
template <typename ContainerT, typename... Args>
inline auto
emplace_impl(ContainerT& _c, int, Args&&... _args)
-> decltype(_c.emplace_back(std::forward<Args>(_args)...))
{
return _c.emplace_back(std::forward<Args>(_args)...);
}
template <typename ContainerT, typename... Args>
inline auto
emplace_impl(ContainerT& _c, long, Args&&... _args)
-> decltype(_c.emplace(std::forward<Args>(_args)...))
{
return _c.emplace(std::forward<Args>(_args)...);
}
template <typename ContainerT, typename... Args>
inline auto
emplace(ContainerT& _c, Args&&... _args)
{
return emplace_impl(_c, 0, std::forward<Args>(_args)...);
}
template <typename ContainerT, typename ArgT>
inline auto
reserve_impl(ContainerT& _c, int, ArgT _arg) -> decltype(_c.reserve(_arg), bool())
{
_c.reserve(_arg);
return true;
}
template <typename ContainerT, typename ArgT>
inline auto
reserve_impl(ContainerT&, long, ArgT)
{
return false;
}
template <typename ContainerT, typename ArgT>
inline auto
reserve(ContainerT& _c, ArgT _arg)
{
return reserve_impl(_c, 0, _arg);
}
template <typename ContainerT = std::vector<std::string>>
inline ContainerT
delimit(const std::string& line, const char* delimiters = "\"',;: ");
@@ -42,6 +88,18 @@ delimit(const std::string& line, const char* delimiters)
ContainerT _result{};
size_t _beginp = 0; // position that is the beginning of the new string
size_t _delimp = 0; // position of the delimiter in the string
if(reserve(_result, 0))
{
size_t _nmax = 0;
for(char itr : line)
{
for(size_t j = 0; j < strlen(delimiters); ++j)
{
if(itr == delimiters[j]) ++_nmax;
}
}
reserve(_result, _nmax);
}
while(_beginp < line.length() && _delimp < line.length())
{
// find the first character (starting at _delimp) that is not a delimiter
@@ -56,7 +114,7 @@ delimit(const std::string& line, const char* delimiters)
// between this position and the next delimiter
_tmp = line.substr(_beginp, _delimp - _beginp);
// don't add empty strings
if(!_tmp.empty()) _result.emplace(_result.end(), _tmp);
if(!_tmp.empty()) emplace(_result, _tmp);
}
return _result;
}
@@ -60,6 +60,21 @@ get_thread_index()
return _v;
}
template <typename... Args>
auto
ignore(const char* _name, int _verbose, int _value, const char* _reason, Args... _args)
{
if(_verbose >= _value)
{
fflush(stderr);
fprintf(stderr,
"[omnitrace][" OMNITRACE_COMMON_LIBRARY_NAME
"][%li] %s(%s) was ignored :: %s\n",
get_thread_index(), _name, join(", ", _args...).c_str(), _reason);
fflush(stderr);
}
}
template <typename FuncT, typename... Args>
auto
invoke(const char* _name, int _verbose, bool& _toggle, FuncT&& _func, Args... _args)
@@ -22,8 +22,10 @@
#pragma once
#include <ios>
#include <sstream>
#include <string>
#include <type_traits>
#if !defined(OMNITRACE_FOLD_EXPRESSION)
# define OMNITRACE_FOLD_EXPRESSION(...) ((__VA_ARGS__), ...)
@@ -35,12 +37,51 @@ inline namespace common
{
namespace
{
template <typename Tp>
struct is_string_impl : std::false_type
{};
template <>
struct is_string_impl<std::string> : std::true_type
{};
template <>
struct is_string_impl<std::string_view> : std::true_type
{};
template <>
struct is_string_impl<const char*> : std::true_type
{};
template <>
struct is_string_impl<char*> : std::true_type
{};
template <typename Tp>
struct is_string : is_string_impl<std::remove_cv_t<std::decay_t<Tp>>>
{};
template <typename ArgT>
auto
as_string(ArgT&& _v, std::enable_if_t<is_string<ArgT>::value, int> = 0)
{
return std::string{ "\"" } + _v + std::string{ "\"" };
}
template <typename ArgT>
auto
as_string(ArgT&& _v, std::enable_if_t<!is_string<ArgT>::value, long> = 0)
{
return _v;
}
template <typename DelimT, typename... Args>
auto
join(DelimT&& _delim, Args&&... _args)
{
std::stringstream _ss{};
OMNITRACE_FOLD_EXPRESSION(_ss << _delim << _args);
_ss << std::boolalpha;
OMNITRACE_FOLD_EXPRESSION(_ss << _delim << as_string(_args));
auto _ret = _ss.str();
if constexpr(std::is_same<DelimT, char>::value)
{
@@ -6,43 +6,37 @@
set(CMAKE_BUILD_TYPE "Release")
set(CMAKE_SKIP_RPATH OFF)
set(BUILD_RPATH_USE_ORIGIN ON)
set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
set(CMAKE_CXX_VISIBILITY_PRESET "internal")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_library(omnitrace-dl-library SHARED)
add_library(omnitrace::omnitrace-dl-library ALIAS omnitrace-dl-library)
target_sources(omnitrace-dl-library PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/dl.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dl.hpp)
target_link_libraries(
omnitrace-dl-library
PUBLIC ${dl_LIBRARY} $<BUILD_INTERFACE:omnitrace::common-library>
$<BUILD_INTERFACE:omnitrace::omnitrace-compile-definitions>)
target_include_directories(
omnitrace-dl-library
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../omnitrace-user>)
target_link_libraries(
omnitrace-dl-library
PUBLIC
${dl_LIBRARY}
$<BUILD_INTERFACE:omnitrace::common-library>
$<BUILD_INTERFACE:omnitrace::omnitrace-compile-definitions>
$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBGCC}>,$<BUILD_INTERFACE:omnitrace::omnitrace-static-libgcc>,>
$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBSTDCXX}>,$<BUILD_INTERFACE:omnitrace::omnitrace-static-libstdcxx>,>
)
check_cxx_compiler_flag("-fno-exceptions" omnitrace_dl_library_fno_exceptions)
check_cxx_compiler_flag("-ftls-model=local-dynamic"
omnitrace_dl_library_ftls_module_local_dynamic)
if(OMNITRACE_BUILD_DEVELOPER)
if(omnitrace_dl_library_fno_exceptions)
target_compile_options(omnitrace-dl-library PRIVATE -fno-exceptions)
endif()
if(omnitrace_dl_library_ftls_module_local_dynamic)
target_compile_options(omnitrace-dl-library PRIVATE -ftls-model=local-dynamic)
endif()
endif()
add_target_cxx_flag_if_avail(omnitrace-dl-library "-ftls-model=global-dynamic")
add_target_cxx_flag_if_avail(omnitrace-dl-library "-g")
set_target_properties(
omnitrace-dl-library
PROPERTIES OUTPUT_NAME omnitrace-dl
CXX_VISIBILITY_PRESET "internal"
VERSION ${PROJECT_VERSION}
SOVERSION ${PROJECT_VERSION_MAJOR}
POSITION_INDEPENDENT_CODE ON
BUILD_RPATH "\$ORIGIN"
INSTALL_RPATH "\$ORIGIN")
@@ -122,9 +122,9 @@ const char* _omnitrace_dl_dlopen_descr = "RTLD_LAZY | RTLD_LOCAL";
/// This class contains function pointers for omnitrace's instrumentation functions
struct OMNITRACE_HIDDEN_API indirect
{
OMNITRACE_INLINE indirect(std::string omnilib, std::string userlib)
: m_omnilib{ find_path(std::move(omnilib)) }
, m_userlib{ find_path(std::move(userlib)) }
OMNITRACE_INLINE indirect(const std::string& omnilib, const std::string& userlib)
: m_omnilib{ find_path(omnilib) }
, m_userlib{ find_path(userlib) }
{
if(_omnitrace_dl_verbose >= 1)
{
@@ -218,19 +218,20 @@ struct OMNITRACE_HIDDEN_API indirect
}
}
static OMNITRACE_INLINE std::string find_path(std::string&& _path)
static OMNITRACE_INLINE std::string find_path(const std::string& _path)
{
auto _paths =
delimit(join(":", get_env("OMNITRACE_PATH", ""),
get_env("LD_LIBRARY_PATH", ""), get_env("LIBRARY_PATH", "")),
":");
auto _paths_search =
join(":", get_env("OMNITRACE_PATH", ""), get_env("LD_LIBRARY_PATH", ""),
get_env("LIBRARY_PATH", ""));
auto _paths = delimit(_paths_search, ":");
auto file_exists = [](const std::string& name) {
struct stat buffer;
return (stat(name.c_str(), &buffer) == 0);
};
for(auto&& itr : _paths)
for(const auto& itr : _paths)
{
auto _f = join('/', itr, _path);
if(file_exists(_f)) return _f;
@@ -269,10 +270,10 @@ get_indirect() OMNITRACE_HIDDEN_API;
indirect&
get_indirect()
{
static auto _v =
indirect{ get_env("OMNITRACE_LIBRARY", "libomnitrace.so"),
get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so") };
return _v;
static auto _libomni = get_env("OMNITRACE_LIBRARY", "libomnitrace.so");
static auto _libuser = get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so");
static auto* _v = new indirect{ _libomni, _libuser };
return *_v;
}
auto&
@@ -340,6 +341,10 @@ bool _omnitrace_dl_fini = (std::atexit([]() {
(::omnitrace::dl::get_thread_status() = false), \
__VA_ARGS__)
#define OMNITRACE_DL_IGNORE(...) \
::omnitrace::common::ignore(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \
__VA_ARGS__)
#define OMNITRACE_DL_INVOKE_STATUS(STATUS, ...) \
::omnitrace::common::invoke(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \
STATUS, __VA_ARGS__)
@@ -464,18 +469,30 @@ extern "C"
void omnitrace_set_env(const char* a, const char* b)
{
if(dl::get_inited() && dl::get_active())
{
OMNITRACE_DL_IGNORE(2, "already initialized and active", a, b);
return;
}
setenv(a, b, 0);
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_env_f, a, b);
}
void omnitrace_set_mpi(bool a, bool b)
{
if(dl::get_inited() && dl::get_active())
{
OMNITRACE_DL_IGNORE(2, "already initialized and active", a, b);
return;
}
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_mpi_f, a, b);
}
void omnitrace_register_source(const char* file, const char* func, size_t line,
size_t address, const char* source)
{
OMNITRACE_DL_LOG(3, "%s(\"%s\", \"%s\", %zu, %zu, \"%s\")\n", __FUNCTION__, file,
func, line, address, source);
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_register_source_f, file, func, line,
address, source);
}
@@ -8,8 +8,8 @@ add_library(omnitrace-interface-library INTERFACE)
add_library(omnitrace::omnitrace-interface-library ALIAS omnitrace-interface-library)
target_include_directories(
omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/include)
omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR})
target_include_directories(omnitrace-interface-library SYSTEM
INTERFACE ${perfetto_DIR}/sdk)
@@ -54,6 +54,7 @@ set(library_sources
${CMAKE_CURRENT_LIST_DIR}/library/coverage.cpp
${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.cpp
${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.cpp
${CMAKE_CURRENT_LIST_DIR}/library/debug.cpp
${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp
${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp
${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp
@@ -70,6 +71,8 @@ set(library_sources
${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_create_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_mutex_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/user_region.cpp
${perfetto_DIR}/sdk/perfetto.cc)
@@ -92,6 +95,7 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/library/thread_data.hpp
${CMAKE_CURRENT_LIST_DIR}/library/thread_sampler.hpp
${CMAKE_CURRENT_LIST_DIR}/library/timemory.hpp
${CMAKE_CURRENT_LIST_DIR}/library/utility.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/fwd.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.hpp
@@ -102,6 +106,8 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer_callbacks.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_create_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_mutex_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/user_region.hpp
${perfetto_DIR}/sdk/perfetto.h)
@@ -119,7 +125,8 @@ if(OMNITRACE_USE_ROCM_SMI)
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.cpp)
endif()
target_link_libraries(omnitrace-object-library PRIVATE omnitrace-interface-library)
target_link_libraries(omnitrace-object-library
PRIVATE omnitrace::omnitrace-interface-library)
if(OMNITRACE_DYNINST_API_RT)
get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}"
@@ -27,6 +27,7 @@
#include "library/components/fwd.hpp"
#include "library/components/mpi_gotcha.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/pthread_mutex_gotcha.hpp"
#include "library/config.hpp"
#include "library/coverage.hpp"
#include "library/critical_trace.hpp"
@@ -221,7 +222,7 @@ omnitrace_push_trace_hidden(const char* name)
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
auto _ts = comp::wall_clock::record();
add_critical_trace<Device::CPU, Phase::BEGIN>(
threading::get_id(), _cid, 0, _parent_cid, _ts, 0,
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0,
critical_trace::add_hash_id(name), _depth);
}
}
@@ -263,7 +264,7 @@ omnitrace_pop_trace_hidden(const char* name)
auto _ts = comp::wall_clock::record();
std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid);
add_critical_trace<Device::CPU, Phase::END>(
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts,
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0,
critical_trace::add_hash_id(name), _depth);
}
}
@@ -436,6 +437,8 @@ omnitrace_init_library_hidden()
if(get_state() != State::PreInit || get_state() == State::Init || _once) return;
_once = true;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s. Setting to %s...\n",
std::to_string(get_state()).c_str(),
std::to_string(State::Init).c_str());
@@ -445,7 +448,7 @@ omnitrace_init_library_hidden()
"glibc's backtrace() occurs...\n");
{
std::stringstream _ss{};
tim::print_backtrace<64>(_ss);
tim::print_backtrace<16>(_ss);
(void) _ss;
}
@@ -471,7 +474,7 @@ omnitrace_init_library_hidden()
// below will effectively do:
// get_cpu_cid_stack(0)->emplace_back(-1);
// plus query some env variables
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0);
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0);
if(gpu::device_count() == 0 && get_state() != State::Active)
{
@@ -550,6 +553,8 @@ omnitrace_init_tooling_hidden()
if(get_state() != State::PreInit || get_state() == State::Init || _once) return false;
_once = true;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
OMNITRACE_CONDITIONAL_THROW(
get_state() == State::Init,
"%s called after omnitrace_init_library() was explicitly called",
@@ -571,6 +576,9 @@ omnitrace_init_tooling_hidden()
OMNITRACE_DEBUG_F("\n");
auto _dtor = scope::destructor{ []() {
// if set to finalized, don't continue
if(get_state() > State::Active) return;
if(config::get_trace_thread_locks()) pthread_mutex_gotcha::validate();
if(get_use_thread_sampling())
{
pthread_gotcha::push_enable_sampling_on_child_threads(false);
@@ -595,6 +603,12 @@ omnitrace_init_tooling_hidden()
sampling::block_signals();
}
if(get_use_critical_trace())
{
// initialize the thread pool
(void) tasking::critical_trace::get_task_group();
}
if(get_use_timemory())
{
comp::user_global_bundle::global_init();
@@ -892,8 +906,8 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a
});
std::atexit([]() {
// if not already finalized then we should finalize
if(get_state() != State::Finalized) omnitrace_finalize_hidden();
// if active (not already finalized) then we should finalize
if(get_state() == State::Active) omnitrace_finalize_hidden();
});
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(
@@ -940,6 +954,8 @@ omnitrace_finalize_hidden(void)
// disable thread id recycling during finalization
threading::recycle_ids() = false;
set_thread_state(ThreadState::Completed);
// return if not active
if(get_state() != State::Active)
{
@@ -1193,25 +1209,33 @@ omnitrace_finalize_hidden(void)
using char_vec_t = std::vector<char>;
OMNITRACE_VERBOSE_F(3, "Getting the trace data...\n");
#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0
using perfetto_mpi_get_t = tim::operation::finalize::mpi_get<char_vec_t, true>;
char_vec_t _trace_data{ tracing_session->ReadTraceBlocking() };
std::vector<char_vec_t> _rank_data = {};
auto _combine = [](char_vec_t& _dst, const char_vec_t& _src) -> char_vec_t& {
_dst.reserve(_dst.size() + _src.size());
for(auto&& itr : _src)
_dst.emplace_back(itr);
return _dst;
};
perfetto_mpi_get_t{ _rank_data, _trace_data, _combine };
auto trace_data = char_vec_t{};
for(auto& itr : _rank_data)
trace_data =
(trace_data.empty()) ? std::move(itr) : _combine(trace_data, itr);
#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0
if(get_perfetto_combined_traces())
{
using perfetto_mpi_get_t =
tim::operation::finalize::mpi_get<char_vec_t, true>;
char_vec_t _trace_data{ tracing_session->ReadTraceBlocking() };
std::vector<char_vec_t> _rank_data = {};
auto _combine = [](char_vec_t& _dst, const char_vec_t& _src) -> char_vec_t& {
_dst.reserve(_dst.size() + _src.size());
for(auto&& itr : _src)
_dst.emplace_back(itr);
return _dst;
};
perfetto_mpi_get_t{ _rank_data, _trace_data, _combine };
for(auto& itr : _rank_data)
trace_data =
(trace_data.empty()) ? std::move(itr) : _combine(trace_data, itr);
}
else
{
trace_data = tracing_session->ReadTraceBlocking();
}
#else
char_vec_t trace_data{ tracing_session->ReadTraceBlocking() };
trace_data = tracing_session->ReadTraceBlocking();
#endif
if(!trace_data.empty())
@@ -53,8 +53,8 @@ template <critical_trace::Device DevID, critical_trace::Phase PhaseID,
bool UpdateStack = true>
inline void
add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, size_t _hash,
uint16_t _depth, uint16_t _prio = 0)
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, uintptr_t _queue,
size_t _hash, uint16_t _depth, uint16_t _prio = 0)
{
// clang-format off
// these are used to create unique type mutexes
@@ -62,6 +62,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
struct cpu_cid_stack {};
// clang-format on
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
using tim::type_mutex;
using auto_lock_t = tim::auto_lock_t;
static constexpr auto num_mutexes = max_supported_threads;
@@ -80,9 +82,9 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
if(!_self_lk.owns_lock()) _self_lk.lock();
auto& _critical_trace = critical_trace::get(_self_tid);
_critical_trace->emplace_back(
critical_trace::entry{ _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid,
_gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash });
_critical_trace->emplace_back(critical_trace::entry{
_prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid,
_ts_beg, _ts_val, _queue, _hash });
}
if constexpr(UpdateStack)
@@ -119,6 +121,6 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
}
tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val,
_hash, _depth, _prio);
_queue, _hash, _depth, _prio);
}
} // namespace omnitrace
@@ -271,7 +271,7 @@ backtrace::sample(int signum)
m_ts = clock_type::now();
m_thr_cpu_ts = tim::get_clock_thread_now<int64_t, std::nano>();
m_mem_peak = tim::get_peak_rss(RUSAGE_THREAD);
m_data = tim::get_unw_backtrace<128, 4, false>();
m_data = tim::get_unw_backtrace<stack_depth, 4, false>();
auto* itr = m_data.begin();
for(; itr != m_data.end(); ++itr, ++m_size)
{
@@ -52,8 +52,10 @@ struct backtrace
, tim::concepts::component
{
static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE;
static constexpr size_t buffer_width = 512;
static constexpr size_t stack_depth = 128;
using data_t = std::array<char[512], 128>;
using data_t = std::array<char[buffer_width], stack_depth>;
using clock_type = std::chrono::steady_clock;
using time_point_type = typename clock_type::time_point;
using value_type = void;
@@ -23,6 +23,12 @@
#include "library/components/fork_gotcha.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/state.hpp"
#include <timemory/backends/process.hpp>
#include <timemory/backends/threading.hpp>
#include <unistd.h>
namespace omnitrace
{
@@ -37,6 +43,8 @@ fork_gotcha::configure()
void
fork_gotcha::audit(const gotcha_data_t&, audit::incoming)
{
OMNITRACE_VERBOSE(1, "fork() called on PID %i (rank: %i), TID %li\n",
process::get_id(), dmp::rank(), threading::get_id());
OMNITRACE_CONDITIONAL_BASIC_PRINT(
get_debug_env(),
"Warning! Calling fork() within an OpenMPI application using libfabric "
@@ -45,9 +53,13 @@ fork_gotcha::audit(const gotcha_data_t&, audit::incoming)
}
void
fork_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid)
fork_gotcha::audit(const gotcha_data_t&, audit::outgoing, pid_t _pid)
{
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "%s() return PID %i\n",
_data.tool_id.c_str(), (int) _pid);
if(_pid != 0)
{
OMNITRACE_VERBOSE(1, "fork() called on PID %i created PID %i\n", getppid(), _pid);
tim::settings::use_output_suffix() = true;
tim::settings::default_process_suffix() = process::get_id();
}
}
} // namespace omnitrace
@@ -24,10 +24,13 @@
#include "library/components/fwd.hpp"
#include "library/defines.hpp"
#include "library/runtime.hpp"
#include "library/state.hpp"
#include "library/timemory.hpp"
#include "timemory/mpl/concepts.hpp"
#include "timemory/mpl/function_traits.hpp"
#include "timemory/utility/macros.hpp"
#include <timemory/mpl/concepts.hpp>
#include <timemory/mpl/function_traits.hpp>
#include <timemory/utility/macros.hpp>
#include <type_traits>
#include <utility>
@@ -71,6 +74,7 @@ struct functors : comp::base<functors<ApiT, StartFuncT, StopFuncT>, void>
int> = 0>
static auto start(Args&&... _args)
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
get_functors().first(std::forward<Args>(_args)...);
}
@@ -79,6 +83,7 @@ struct functors : comp::base<functors<ApiT, StartFuncT, StopFuncT>, void>
int> = 0>
static auto stop(Args&&... _args)
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
get_functors().second(std::forward<Args>(_args)...);
}
@@ -87,24 +92,28 @@ struct functors : comp::base<functors<ApiT, StartFuncT, StopFuncT>, void>
template <typename Tp = this_type, enable_if_t<Tp::begin_supports_cstr, int> = 0>
void start()
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
get_functors().first(m_prefix);
}
template <typename Tp = this_type, enable_if_t<Tp::end_supports_cstr, int> = 0>
void stop()
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
get_functors().second(m_prefix);
}
template <typename Tp = this_type, enable_if_t<Tp::begin_supports_void, int> = 0>
void start()
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
get_functors().first();
}
template <typename Tp = this_type, enable_if_t<Tp::end_supports_void, int> = 0>
void stop()
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
get_functors().second();
}
@@ -46,8 +46,7 @@ omnitrace_mpi_set_attr()
return MPI_SUCCESS;
};
static auto _mpi_fini = [](MPI_Comm, int, void*, void*) {
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(),
"MPI Comm attribute finalize\n");
OMNITRACE_DEBUG("MPI Comm attribute finalize\n");
if(mpip_index != std::numeric_limits<uint64_t>::max())
comp::deactivate_mpip<tim::component_tuple<omnitrace::component::omnitrace>,
api::omnitrace>(mpip_index);
@@ -83,8 +82,7 @@ mpi_gotcha::configure()
void
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***)
{
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s(int*, char***)\n",
_data.tool_id.c_str());
OMNITRACE_BASIC_DEBUG_F("%s(int*, char***)\n", _data.tool_id.c_str());
if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit);
@@ -98,8 +96,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***)
void
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, int, int*)
{
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s(int*, char***, int, int*)\n",
_data.tool_id.c_str());
OMNITRACE_BASIC_DEBUG_F("%s(int*, char***, int, int*)\n", _data.tool_id.c_str());
if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit);
@@ -113,7 +110,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, in
void
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming)
{
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s()\n", _data.tool_id.c_str());
OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str());
if(mpip_index != std::numeric_limits<uint64_t>::max())
comp::deactivate_mpip<tim::component_tuple<omnitrace::component::omnitrace>,
@@ -130,7 +127,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming)
void
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t, int* _val)
{
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s()\n", _data.tool_id.c_str());
OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str());
omnitrace_push_trace_hidden(_data.tool_id.c_str());
if(_data.tool_id == "MPI_Comm_rank")
@@ -151,8 +148,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t, int* _val
void
mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
{
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s() returned %i\n",
_data.tool_id.c_str(), (int) _retval);
OMNITRACE_BASIC_DEBUG_F("%s() returned %i\n", _data.tool_id.c_str(), (int) _retval);
if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Init") == 0)
{
@@ -164,8 +160,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
// were excluded via a regex expression)
if(get_use_mpip())
{
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env() || get_verbose_env() > 0,
"Activating MPI wrappers...\n");
OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n");
// use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST
// to control the gotcha bindings at runtime
@@ -186,13 +181,12 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
tim::mpi::set_rank(m_rank);
tim::settings::default_process_suffix() = m_rank;
get_perfetto_output_filename().clear();
OMNITRACE_CONDITIONAL_BASIC_PRINT(
get_debug() || get_verbose() > 0, "[pid=%i] MPI rank: %i (%i)\n",
process::get_id(), tim::mpi::rank(), m_rank);
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
process::get_id(), tim::mpi::rank(), m_rank);
}
else
{
OMNITRACE_BASIC_PRINT_F("%s() returned %i :: nullptr to rank\n",
OMNITRACE_BASIC_VERBOSE(0, "%s() returned %i :: nullptr to rank\n",
_data.tool_id.c_str(), (int) _retval);
}
}
@@ -202,19 +196,19 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
{
m_size = std::max<int>(*m_size_ptr, m_size);
tim::mpi::set_size(m_size);
OMNITRACE_CONDITIONAL_BASIC_PRINT(
get_debug() || get_verbose() > 0, "[pid=%i] MPI size: %i (%i)\n",
process::get_id(), tim::mpi::size(), m_size);
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n",
process::get_id(), tim::mpi::size(), m_size);
}
else
{
OMNITRACE_BASIC_PRINT_F("%s() returned %i :: nullptr to size\n",
OMNITRACE_BASIC_VERBOSE(0, "%s() returned %i :: nullptr to size\n",
_data.tool_id.c_str(), (int) _retval);
}
}
else
{
OMNITRACE_BASIC_PRINT_F("%s() returned %i :: unexpected function wrapper\n",
OMNITRACE_BASIC_VERBOSE(0,
"%s() returned %i :: unexpected function wrapper\n",
_data.tool_id.c_str(), (int) _retval);
}
}
@@ -0,0 +1,301 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/omnitrace.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/components/roctracer.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include <bits/stdint-intn.h>
#include <timemory/backends/threading.hpp>
#include <timemory/sampling/allocator.hpp>
#include <timemory/utility/types.hpp>
#include <ostream>
#include <pthread.h>
namespace omnitrace
{
namespace sampling
{
std::set<int>
setup();
std::set<int>
shutdown();
} // namespace sampling
namespace mpl = tim::mpl;
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
using wall_pw_t = mpl::piecewise_select<comp::wall_clock>; // only wall-clock
using main_pw_t = mpl::piecewise_ignore<comp::wall_clock>; // exclude wall-clock
namespace
{
auto* is_shutdown = new bool{ false }; // intentional data leak
auto* bundles = new std::map<int64_t, std::shared_ptr<bundle_t>>{};
auto* bundles_mutex = new std::mutex{};
auto bundles_dtor = scope::destructor{ []() {
omnitrace::pthread_create_gotcha::shutdown();
delete bundles;
delete bundles_mutex;
bundles = nullptr;
bundles_mutex = nullptr;
} };
inline void
start_bundle(bundle_t& _bundle)
{
if(!get_use_timemory()) return;
OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str());
_bundle.push();
_bundle.start();
}
inline void
stop_bundle(bundle_t& _bundle, int64_t _tid)
{
if(!get_use_timemory()) return;
OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n",
_bundle.key().c_str(), _tid);
_bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value
// update roctracer_data
_bundle.store(std::plus<double>{},
_bundle.get<comp::wall_clock>()->get() * units::sec);
// stop all other components including roctracer_data after update
_bundle.stop(main_pw_t{});
// exclude popping wall-clock
_bundle.pop(_tid);
}
} // namespace
//--------------------------------------------------------------------------------------//
pthread_create_gotcha::wrapper::wrapper(routine_t _routine, void* _arg,
bool _enable_sampling, int64_t _parent,
promise_t* _p)
: m_enable_sampling{ _enable_sampling }
, m_parent_tid{ _parent }
, m_routine{ _routine }
, m_arg{ _arg }
, m_promise{ _p }
{}
void*
pthread_create_gotcha::wrapper::operator()() const
{
if(is_shutdown && *is_shutdown)
{
if(m_promise) m_promise->set_value();
// execute the original function
return m_routine(m_arg);
}
set_thread_state(ThreadState::Internal);
int64_t _tid = -1;
auto _is_sampling = false;
auto _bundle = std::shared_ptr<bundle_t>{};
auto _signals = std::set<int>{};
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
auto _dtor = scope::destructor{ [&]() {
set_thread_state(ThreadState::Internal);
if(_is_sampling)
{
sampling::block_signals(_signals);
sampling::shutdown();
}
pthread_create_gotcha::shutdown(_tid);
set_thread_state(ThreadState::Completed);
} };
auto _active = (get_state() == omnitrace::State::Active && bundles != nullptr &&
bundles_mutex != nullptr);
if(_active && !_coverage)
{
_tid = threading::get_id();
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
if(bundles && bundles_mutex)
{
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
_bundle = bundles->emplace(_tid, std::make_shared<bundle_t>("start_thread"))
.first->second;
}
if(_bundle) start_bundle(*_bundle);
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
if(m_enable_sampling)
{
// initialize thread-local statics
(void) tim::get_unw_backtrace<12, 1, false>();
_is_sampling = true;
pthread_gotcha::push_enable_sampling_on_child_threads(false);
_signals = sampling::setup();
pthread_gotcha::pop_enable_sampling_on_child_threads();
sampling::unblock_signals();
}
}
if(m_promise) m_promise->set_value();
set_thread_state(ThreadState::Enabled);
// execute the original function
return m_routine(m_arg);
}
void*
pthread_create_gotcha::wrapper::wrap(void* _arg)
{
if(_arg == nullptr) return nullptr;
// convert the argument
wrapper* _wrapper = static_cast<wrapper*>(_arg);
// execute the original function
return (*_wrapper)();
}
void
pthread_create_gotcha::configure()
{
pthread_create_gotcha_t::get_initializer() = []() {
pthread_create_gotcha_t::template configure<
0, int, pthread_t*, const pthread_attr_t*, void* (*) (void*), void*>(
"pthread_create");
};
}
void
pthread_create_gotcha::shutdown()
{
if(is_shutdown)
{
if(*is_shutdown) return;
*is_shutdown = true;
}
if(!bundles_mutex || !bundles) return;
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
unsigned long _ndangling = 0;
for(auto itr : *bundles)
{
if(itr.second)
{
stop_bundle(*itr.second, itr.first);
++_ndangling;
}
itr.second.reset();
}
bundles->clear();
OMNITRACE_CONDITIONAL_BASIC_PRINT(
(get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0,
"[pthread_create_gotcha::shutdown] cleaned up %lu dangling bundles\n",
_ndangling);
}
void
pthread_create_gotcha::shutdown(int64_t _tid)
{
if(is_shutdown && *is_shutdown) return;
if(!bundles_mutex || !bundles) return;
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
auto itr = bundles->find(_tid);
if(itr != bundles->end())
{
if(itr->second) stop_bundle(*itr->second, itr->first);
itr->second.reset();
bundles->erase(itr);
}
}
// pthread_create
int
pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
void* (*start_routine)(void*), void* arg) const
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
bundle_t _bundle{ "pthread_create" };
auto _enable_sampling = pthread_gotcha::sampling_enabled_on_child_threads();
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
auto _active = (get_state() == omnitrace::State::Active);
int64_t _tid = (_active) ? threading::get_id() : 0;
if(_active)
{
OMNITRACE_VERBOSE(1, "Creating new thread on PID %i (rank: %i), TID %li\n",
process::get_id(), dmp::rank(), _tid);
}
// ensure that cpu cid stack exists on the parent thread if active
if(!_coverage && _active) get_cpu_cid_stack();
if(!get_use_sampling() || !_enable_sampling)
{
auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr);
// create the thread
auto _ret =
::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_obj));
return _ret;
}
// block the signals in entire process
OMNITRACE_DEBUG("blocking signals...\n");
tim::sampling::block_signals({ SIGALRM, SIGPROF },
tim::sampling::sigmask_scope::process);
start_bundle(_bundle);
// promise set by thread when signal handler is configured
auto _promise = std::promise<void>{};
auto _fut = _promise.get_future();
auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise);
// create the thread
auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_wrap));
// wait for thread to set promise
OMNITRACE_DEBUG("waiting for child to signal it is setup...\n");
_fut.wait();
stop_bundle(_bundle, threading::get_id());
// unblock the signals in the entire process
OMNITRACE_DEBUG("unblocking signals...\n");
tim::sampling::unblock_signals({ SIGALRM, SIGPROF },
tim::sampling::sigmask_scope::process);
OMNITRACE_DEBUG("returning success...\n");
return _ret;
}
} // namespace omnitrace
@@ -0,0 +1,71 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "library/common.hpp"
#include "library/defines.hpp"
#include "library/timemory.hpp"
#include <cstdint>
#include <future>
namespace omnitrace
{
struct pthread_create_gotcha : tim::component::base<pthread_create_gotcha, void>
{
struct wrapper
{
using routine_t = void* (*) (void*);
using promise_t = std::promise<void>;
wrapper(routine_t _routine, void* _arg, bool, int64_t, promise_t*);
void* operator()() const;
static void* wrap(void* _arg);
private:
bool m_enable_sampling = false;
int64_t m_parent_tid = 0;
routine_t m_routine = nullptr;
void* m_arg = nullptr;
promise_t* m_promise = nullptr;
};
TIMEMORY_DEFAULT_OBJECT(pthread_create_gotcha)
// string id for component
static std::string label() { return "pthread_create_gotcha"; }
// generate the gotcha wrappers
static void configure();
static void shutdown();
static void shutdown(int64_t);
// pthread_create
int operator()(pthread_t* thread, const pthread_attr_t* attr,
void* (*start_routine)(void*), void* arg) const;
};
using pthread_create_gotcha_t =
tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>;
} // namespace omnitrace
@@ -22,224 +22,62 @@
#include "library/components/pthread_gotcha.hpp"
#include "library/components/omnitrace.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/components/pthread_mutex_gotcha.hpp"
#include "library/components/roctracer.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include "library/utility.hpp"
#include <timemory/backends/threading.hpp>
#include <timemory/sampling/allocator.hpp>
#include <timemory/utility/types.hpp>
#include <ostream>
#include <pthread.h>
#include <array>
#include <vector>
namespace omnitrace
{
namespace sampling
{
std::set<int>
setup();
std::set<int>
shutdown();
} // namespace sampling
namespace mpl = tim::mpl;
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
using wall_pw_t = mpl::piecewise_select<comp::wall_clock>; // only wall-clock
using main_pw_t = mpl::piecewise_ignore<comp::wall_clock>; // exclude wall-clock
namespace
{
auto* is_shutdown = new bool{ false }; // intentional data leak
auto* bundles = new std::map<int64_t, std::shared_ptr<bundle_t>>{};
auto* bundles_mutex = new std::mutex{};
auto bundles_dtor = scope::destructor{ []() {
omnitrace::pthread_gotcha::shutdown();
delete bundles;
delete bundles_mutex;
bundles = nullptr;
bundles_mutex = nullptr;
} };
using bundle_t = tim::lightweight_tuple<pthread_create_gotcha_t, pthread_mutex_gotcha_t>;
inline void
start_bundle(bundle_t& _bundle)
auto&
get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index())
{
if(!get_use_timemory()) return;
OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str());
if(comp::roctracer::is_setup())
{
_bundle.push();
_bundle.start();
}
else
{
_bundle.push(wall_pw_t{});
_bundle.start(wall_pw_t{});
}
}
inline void
stop_bundle(bundle_t& _bundle, int64_t _tid)
{
if(!get_use_timemory()) return;
OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n",
_bundle.key().c_str(), _tid);
_bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value
// update roctracer_data
_bundle.store(std::plus<double>{},
_bundle.get<comp::wall_clock>()->get() * units::sec);
// stop all other components including roctracer_data after update
_bundle.stop(main_pw_t{});
// exclude popping wall-clock
_bundle.pop(_tid);
}
auto
get_thread_index()
{
static std::atomic<int64_t> _c{ 0 };
static thread_local int64_t _v = _c++;
return _v;
static auto _v = utility::get_filled_array<OMNITRACE_MAX_THREADS>(
[]() { return utility::get_reserved_vector<bool>(32); });
return _v.at(_idx);
}
auto&
get_sampling_on_child_threads_history(int64_t _idx = get_thread_index())
get_bundle()
{
static auto _v = std::array<std::vector<bool>, OMNITRACE_MAX_THREADS>{};
return _v.at(_idx);
static auto _v = std::unique_ptr<bundle_t>{};
if(!_v) _v = std::make_unique<bundle_t>("pthread_gotcha");
return _v;
}
} // namespace
//--------------------------------------------------------------------------------------//
pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sampling,
int64_t _parent, promise_t* _p)
: m_enable_sampling{ _enable_sampling }
, m_parent_tid{ _parent }
, m_routine{ _routine }
, m_arg{ _arg }
, m_promise{ _p }
{}
void*
pthread_gotcha::wrapper::operator()() const
{
if(is_shutdown && *is_shutdown)
{
if(m_promise) m_promise->set_value();
// execute the original function
return m_routine(m_arg);
}
int64_t _tid = -1;
auto _is_sampling = false;
auto _bundle = std::shared_ptr<bundle_t>{};
auto _signals = std::set<int>{};
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
auto _dtor = scope::destructor{ [&]() {
if(_is_sampling)
{
sampling::block_signals(_signals);
sampling::shutdown();
}
if(!bundles || !bundles_mutex) return;
if(_bundle && get_state() < omnitrace::State::Finalized)
{
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
stop_bundle(*_bundle, _tid);
_bundle.reset();
bundles->erase(_tid);
}
} };
auto _active = (get_state() == omnitrace::State::Active && bundles && bundles_mutex);
if(_active && !_coverage)
{
_tid = threading::get_id();
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
if(bundles && bundles_mutex)
{
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
if(comp::roctracer::is_setup())
_bundle =
bundles->emplace(_tid, std::make_shared<bundle_t>("start_thread"))
.first->second;
}
if(_bundle) start_bundle(*_bundle);
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
if(m_enable_sampling)
{
// initialize thread-local statics
(void) tim::get_unw_backtrace<12, 1, false>();
_is_sampling = true;
push_enable_sampling_on_child_threads(false);
_signals = sampling::setup();
pop_enable_sampling_on_child_threads();
sampling::unblock_signals();
}
}
if(m_promise) m_promise->set_value();
// execute the original function
return m_routine(m_arg);
}
void*
pthread_gotcha::wrapper::wrap(void* _arg)
{
if(_arg == nullptr) return nullptr;
// convert the argument
wrapper* _wrapper = static_cast<wrapper*>(_arg);
// execute the original function
return (*_wrapper)();
}
void
pthread_gotcha::configure()
{
pthread_gotcha_t::get_initializer() = []() {
pthread_gotcha_t::template configure<0, int, pthread_t*, const pthread_attr_t*,
void* (*) (void*), void*>("pthread_create");
};
pthread_create_gotcha::configure();
pthread_mutex_gotcha::configure();
}
void
pthread_gotcha::shutdown()
{
if(is_shutdown)
{
if(*is_shutdown) return;
*is_shutdown = true;
}
if(!bundles_mutex || !bundles) return;
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
unsigned long _ndangling = 0;
for(auto itr : *bundles)
{
if(itr.second)
{
stop_bundle(*itr.second, itr.first);
++_ndangling;
}
itr.second.reset();
}
bundles->clear();
OMNITRACE_CONDITIONAL_BASIC_PRINT(
(get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0,
"[pthread_gotcha::shutdown] cleaned up %lu dangling bundles\n", _ndangling);
pthread_create_gotcha::shutdown();
pthread_mutex_gotcha::shutdown();
}
bool
@@ -287,57 +125,16 @@ pthread_gotcha::sampling_on_child_threads()
return _v;
}
// pthread_create
int
pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
void* (*start_routine)(void*), void* arg) const
void
pthread_gotcha::start()
{
bundle_t _bundle{ "pthread_create" };
auto _enable_sampling = sampling_enabled_on_child_threads();
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
auto _active = (get_state() == omnitrace::State::Active);
int64_t _tid = (_active) ? threading::get_id() : 0;
// ensure that cpu cid stack exists on the parent thread if active
if(!_coverage && _active) get_cpu_cid_stack();
if(!get_use_sampling() || !_enable_sampling)
{
auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr);
// create the thread
auto _ret =
::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_obj));
return _ret;
}
// block the signals in entire process
OMNITRACE_DEBUG("blocking signals...\n");
tim::sampling::block_signals({ SIGALRM, SIGPROF },
tim::sampling::sigmask_scope::process);
start_bundle(_bundle);
// promise set by thread when signal handler is configured
auto _promise = std::promise<void>{};
auto _fut = _promise.get_future();
auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise);
// create the thread
auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_wrap));
// wait for thread to set promise
OMNITRACE_DEBUG("waiting for child to signal it is setup...\n");
_fut.wait();
stop_bundle(_bundle, threading::get_id());
// unblock the signals in the entire process
OMNITRACE_DEBUG("unblocking signals...\n");
tim::sampling::unblock_signals({ SIGALRM, SIGPROF },
tim::sampling::sigmask_scope::process);
OMNITRACE_DEBUG("returning success...\n");
return _ret;
get_bundle()->start();
}
void
pthread_gotcha::stop()
{
get_bundle()->stop();
get_bundle().reset();
}
} // namespace omnitrace
@@ -33,24 +33,6 @@ namespace omnitrace
{
struct pthread_gotcha : tim::component::base<pthread_gotcha, void>
{
struct wrapper
{
using routine_t = void* (*) (void*);
using promise_t = std::promise<void>;
wrapper(routine_t _routine, void* _arg, bool, int64_t, promise_t*);
void* operator()() const;
static void* wrap(void* _arg);
private:
bool m_enable_sampling = false;
int64_t m_parent_tid = 0;
routine_t m_routine = nullptr;
void* m_arg = nullptr;
promise_t* m_promise = nullptr;
};
TIMEMORY_DEFAULT_OBJECT(pthread_gotcha)
// string id for component
@@ -72,13 +54,10 @@ struct pthread_gotcha : tim::component::base<pthread_gotcha, void>
// make sure every newly created thead starts with this value
static void set_sampling_on_all_future_threads(bool _v);
// pthread_create
int operator()(pthread_t* thread, const pthread_attr_t* attr,
void* (*start_routine)(void*), void* arg) const;
static void start();
static void stop();
private:
static bool& sampling_on_child_threads();
};
using pthread_gotcha_t = tim::component::gotcha<2, std::tuple<>, pthread_gotcha>;
} // namespace omnitrace
@@ -0,0 +1,174 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/pthread_mutex_gotcha.hpp"
#include "library.hpp"
#include "library/components/pthread_gotcha.hpp"
#include "library/config.hpp"
#include "library/critical_trace.hpp"
#include "library/debug.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_sampler.hpp"
#include "library/utility.hpp"
#include "timemory/backends/threading.hpp"
#include <timemory/utility/signals.hpp>
#include <timemory/utility/types.hpp>
#include <pthread.h>
namespace omnitrace
{
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
pthread_mutex_gotcha::hash_array_t&
pthread_mutex_gotcha::get_hashes()
{
// theoretically, this private function will NEVER be called until it
// is called by a gotcha wrapper, which means the tool ids should be
// fully populated. If that fails to be the case for some reason,
// we could see weird results.
static auto _v = []() {
const auto& _data = pthread_mutex_gotcha_t::get_gotcha_data();
hash_array_t _init{};
for(size_t i = 0; i < gotcha_capacity; ++i)
{
auto&& _id = _data.at(i).tool_id;
if(!_id.empty())
_init.at(i) = critical_trace::add_hash_id(_id.c_str());
else
{
OMNITRACE_VERBOSE(
1,
"WARNING!!! pthread_mutex_gotcha tool id at index %zu was empty!\n",
i);
}
OMNITRACE_CI_FAIL(
_id.empty() || _init.at(i) == 0,
"pthread_mutex_gotcha tool id at index %zu has no hash value\n", i);
}
return _init;
}();
return _v;
}
void
pthread_mutex_gotcha::configure()
{
pthread_mutex_gotcha_t::get_initializer() = []() {
if(config::get_trace_thread_locks())
{
pthread_mutex_gotcha::validate();
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" });
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<1, int, pthread_mutex_t*>{ "pthread_mutex_unlock" });
pthread_mutex_gotcha_t::configure(
comp::gotcha_config<2, int, pthread_mutex_t*>{ "pthread_mutex_trylock" });
}
};
}
void
pthread_mutex_gotcha::shutdown()
{}
void
pthread_mutex_gotcha::validate()
{
if(config::get_trace_thread_locks() && config::get_use_perfetto())
{
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F(
"The overhead of all the mutex locking internally by perfetto is\n")
OMNITRACE_PRINT_F(
"so significant that all timing data is rendered meaningless.\n");
OMNITRACE_PRINT_F(
"However, mutex locking is effectively non-existant in timemory.\n");
OMNITRACE_PRINT_F("If you want to trace the mutex locking:\n")
OMNITRACE_PRINT_F(" OMNITRACE_USE_TIMEMORY=ON\n");
OMNITRACE_PRINT_F(" OMNITRACE_USE_PERFETTO=OFF\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_PRINT_F("\n");
OMNITRACE_FAIL_F("OMNITRACE_USE_PERFETTO and OMNITRACE_TRACE_THREAD_LOCKS cannot "
"both be enabled.\n");
}
}
int
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
int (*_callee)(pthread_mutex_t*),
pthread_mutex_t* _mutex)
{
if(is_disabled())
{
if(!_callee)
{
OMNITRACE_PRINT("Warning! nullptr to %s\n", _data.tool_id.c_str());
return EINVAL;
}
return (*_callee)(_mutex);
}
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
int64_t _ts = 0;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
if(get_use_critical_trace())
{
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
_ts = comp::wall_clock::record();
}
omnitrace_push_region(_data.tool_id.c_str());
auto _ret = (*_callee)(_mutex);
omnitrace_pop_region(_data.tool_id.c_str());
if(get_use_critical_trace())
{
add_critical_trace<Device::CPU, Phase::DELTA>(
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(),
reinterpret_cast<uintptr_t>(_mutex), get_hashes().at(_data.index), _depth);
}
return _ret;
}
bool
pthread_mutex_gotcha::is_disabled()
{
return (omnitrace::get_state() != omnitrace::State::Active ||
omnitrace::get_thread_state() != omnitrace::ThreadState::Enabled ||
(get_use_sampling() && !pthread_gotcha::sampling_enabled_on_child_threads()));
}
} // namespace omnitrace
@@ -0,0 +1,61 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include "library/common.hpp"
#include "library/defines.hpp"
#include "library/timemory.hpp"
#include <array>
#include <cstddef>
#include <string>
namespace omnitrace
{
// this is used to wrap pthread_mutex()
struct pthread_mutex_gotcha : comp::base<pthread_mutex_gotcha, void>
{
static constexpr size_t gotcha_capacity = 3;
using hash_array_t = std::array<size_t, gotcha_capacity>;
using gotcha_data_t = comp::gotcha_data;
TIMEMORY_DEFAULT_OBJECT(pthread_mutex_gotcha)
// string id for component
static std::string label() { return "pthread_mutex_gotcha"; }
// generate the gotcha wrappers
static void configure();
static void shutdown();
static void validate();
int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*), pthread_mutex_t*);
private:
static bool is_disabled();
static hash_array_t& get_hashes();
};
using pthread_mutex_gotcha_t = comp::gotcha<pthread_mutex_gotcha::gotcha_capacity,
quirk::fast, pthread_mutex_gotcha>;
} // namespace omnitrace
@@ -25,11 +25,13 @@
#include "library/config.hpp"
#include "library/critical_trace.hpp"
#include "library/debug.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include "library/thread_data.hpp"
#include <timemory/backends/cpu.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/utility/types.hpp>
#include <atomic>
#include <chrono>
@@ -171,6 +173,8 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
return;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
(void) arg;
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
OMNITRACE_CONDITIONAL_PRINT_F(
@@ -279,6 +283,8 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
return;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
sampling::block_signals();
static const char* copy_op_name = "hsa_async_copy";
@@ -359,6 +365,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
return;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
using Device = critical_trace::Device;
using Phase = critical_trace::Phase;
@@ -388,12 +396,86 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
op_name, cid, data->correlation_id,
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
int64_t _ts = comp::wall_clock::record();
auto _tid = threading::get_id();
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
auto _corr_id = data->correlation_id;
int64_t _ts = comp::wall_clock::record();
auto _tid = threading::get_id();
uint64_t _cid = 0;
uint64_t _parent_cid = 0;
uint16_t _depth = 0;
uintptr_t _queue = 0;
auto _corr_id = data->correlation_id;
#define OMNITRACE_HIP_API_QUEUE_CASE(API_FUNC, VARIABLE) \
case HIP_API_ID_##API_FUNC: \
_queue = reinterpret_cast<uintptr_t>(data->args.API_FUNC.VARIABLE); \
break;
#define OMNITRACE_HIP_API_QUEUE_CASE_ALT(API_FUNC, UNION, VARIABLE) \
case HIP_API_ID_##API_FUNC: \
_queue = reinterpret_cast<uintptr_t>(data->args.UNION.VARIABLE); \
break;
switch(cid)
{
OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipModuleLaunchKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipHccModuleLaunchKernel, hStream)
OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchCooperativeKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtLaunchKernel, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtModuleLaunchKernel, hStream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamCreateWithCUMask, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamGetCUMask, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamSynchronize, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipConfigureCall, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipDrvMemcpy3DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipEventRecord, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemPrefetchAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DFromArrayAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DToArrayAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy3DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoDAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoHAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyFromSymbolAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyHtoDAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyParam2DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyPeerAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyToSymbolAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyWithStream, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemset2DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemset3DAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD16Async, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD32Async, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD8Async, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAddCallback, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAttachMemAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamDestroy, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetFlags, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetPriority, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamQuery, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitEvent, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue32, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue64, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue32, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue64, stream)
#if OMNITRACE_HIP_VERSION_MAJOR >= 4 && OMNITRACE_HIP_VERSION_MINOR >= 5
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphLaunch, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsMapResources, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsUnmapResources, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipSignalExternalSemaphoresAsync, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamBeginCapture, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamEndCapture, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipWaitExternalSemaphoresAsync, stream)
#endif
#if OMNITRACE_HIP_VERSION_MAJOR >= 5
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamIsCapturing, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo_v2, stream)
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamUpdateCaptureDependencies, stream)
#endif
default: break;
}
if(data->phase == ACTIVITY_API_PHASE_ENTER)
{
@@ -401,12 +483,24 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
switch(cid)
{
case HIP_API_ID_hipLaunchKernel:
case HIP_API_ID_hipLaunchCooperativeKernel:
{
_name = hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
data->args.hipLaunchKernel.stream);
break;
}
case HIP_API_ID_hipLaunchCooperativeKernel:
{
_name =
hipKernelNameRefByPtr(data->args.hipLaunchCooperativeKernel.f,
data->args.hipLaunchCooperativeKernel.stream);
if(!_name)
{
_name =
hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
data->args.hipLaunchKernel.stream);
}
break;
}
case HIP_API_ID_hipHccModuleLaunchKernel:
{
_name = hipKernelNameRef(data->args.hipHccModuleLaunchKernel.f);
@@ -468,7 +562,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_critical_trace() || get_use_rocm_smi())
{
add_critical_trace<Device::CPU, Phase::BEGIN>(
_tid, _cid, _corr_id, _parent_cid, _ts, 0,
_tid, _cid, _corr_id, _parent_cid, _ts, 0, _queue,
critical_trace::add_hash_id(op_name), _depth);
}
@@ -517,7 +611,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
if(get_use_critical_trace() || get_use_rocm_smi())
{
add_critical_trace<Device::CPU, Phase::END>(
_tid, _cid, _corr_id, _parent_cid, _ts, _ts,
_tid, _cid, _corr_id, _parent_cid, _ts, _ts, _queue,
critical_trace::add_hash_id(op_name), _depth);
}
}
@@ -531,6 +625,8 @@ hip_activity_callback(const char* begin, const char* end, void*)
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
return;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
sampling::block_signals();
static thread_local auto _once = (threading::set_thread_name("omni.roctracer"), true);
@@ -650,13 +746,13 @@ hip_activity_callback(const char* begin, const char* end, void*)
auto _hash = critical_trace::add_hash_id(_name);
uint16_t _prio = _laps + 1; // priority
add_critical_trace<Device::GPU, Phase::DELTA, false>(
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1, _prio);
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, record->queue_id, _hash,
_depth + 1, _prio);
}
if(_found && _name != nullptr && get_use_timemory())
{
auto _func = [_depth, _tid, _cid, _laps, _beg_ns, _end_ns, _corr_id,
_name]() {
auto _func = [_beg_ns, _end_ns, _name]() {
roctracer_bundle_t _bundle{ _name, _scope };
_bundle.start()
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
@@ -725,6 +821,8 @@ extern "C"
if(!config::settings_are_configured() && get_state() < State::Active)
omnitrace_init_tooling_hidden();
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
static auto _setup = [=]() {
try
{
@@ -29,6 +29,7 @@
#include <timemory/backends/process.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/environment.hpp>
#include <timemory/environment/types.hpp>
#include <timemory/sampling/allocator.hpp>
#include <timemory/settings.hpp>
#include <timemory/settings/types.hpp>
@@ -74,7 +75,7 @@ get_setting_name(std::string _v)
{ \
auto _ret = _config->insert<TYPE, TYPE>( \
ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, INITIAL_VALUE, \
std::set<std::string>{ "custom", "omnitrace", "omnitrace-library", \
std::set<std::string>{ "custom", "omnitrace", "omnitrace_library", \
__VA_ARGS__ }); \
if(!_ret.second) \
OMNITRACE_PRINT("Warning! Duplicate setting: %s / %s\n", \
@@ -87,7 +88,7 @@ get_setting_name(std::string _v)
{ \
auto _ret = _config->insert<TYPE, TYPE>( \
ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, INITIAL_VALUE, \
std::set<std::string>{ "custom", "omnitrace", "omnitrace-library", \
std::set<std::string>{ "custom", "omnitrace", "omnitrace_library", \
__VA_ARGS__ }, \
std::vector<std::string>{ CMD_LINE }); \
if(!_ret.second) \
@@ -147,28 +148,23 @@ configure_settings(bool _init)
if(_omnitrace_debug) tim::set_env("TIMEMORY_DEBUG_SETTINGS", "1", 0);
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_PERFETTO", "Enable perfetto backend",
_default_perfetto_v, "backend", "perfetto",
"instrumentation");
_default_perfetto_v, "backend", "perfetto");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_TIMEMORY", "Enable timemory backend",
!_config->get<bool>("OMNITRACE_USE_PERFETTO"), "backend",
"timemory", "instrumentation", "sampling");
"timemory");
#if defined(OMNITRACE_USE_ROCTRACER) && OMNITRACE_USE_ROCTRACER > 0
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_ROCTRACER", "Enable ROCM tracing", true,
"backend", "roctracer", "rocm");
#endif
#if defined(OMNITRACE_USE_ROCM_SMI) && OMNITRACE_USE_ROCM_SMI > 0
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_USE_ROCM_SMI",
"Enable sampling GPU power, temp, utilization, and memory usage", true, "backend",
"rocm-smi", "rocm");
"rocm_smi", "rocm");
OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_ROCM_SMI_DEVICES",
"Devices to query when OMNITRACE_USE_ROCM_SMI=ON", "all",
"backend", "rocm-smi", "rocm");
#endif
"backend", "rocm_smi", "rocm");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_SAMPLING",
"Enable statistical sampling of call-stack", false,
@@ -177,12 +173,12 @@ configure_settings(bool _init)
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_THREAD_SAMPLING",
"Enable a background thread which samples system metrics "
"such as the CPU/GPU freq, power, etc.",
true, "backend", "sampling");
true, "backend", "sampling", "thread_sampling");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_USE_PID",
"Enable tagging filenames with process identifier (either MPI rank or pid)", true,
"io");
"io", "filename");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_KOKKOSP",
"Enable support for Kokkos Tools", false, "kokkos",
@@ -192,11 +188,9 @@ configure_settings(bool _init)
bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false,
"--omnitrace-kokkos-kernel-logger", "kokkos", "debugging");
#if defined(OMNITRACE_USE_OMPT) && OMNITRACE_USE_OMPT > 0
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_OMPT",
"Enable support for OpenMP-Tools", false, "openmp", "ompt",
"backend");
#endif
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_CODE_COVERAGE",
"Enable support for code coverage", false, "coverage",
@@ -205,25 +199,25 @@ configure_settings(bool _init)
OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_INSTRUMENTATION_INTERVAL",
"Instrumentation only takes measurements once every N "
"function calls (not statistical)",
1, "instrumentation");
1, "instrumentation", "data_sampling");
OMNITRACE_CONFIG_SETTING(
double, "OMNITRACE_SAMPLING_FREQ",
"Number of software interrupts per second when OMNITTRACE_USE_SAMPLING=ON", 10.0,
"sampling");
"sampling", "thread_sampling");
OMNITRACE_CONFIG_SETTING(
double, "OMNITRACE_SAMPLING_DELAY",
"Number of seconds to wait before the first sampling signal is delivered, "
"increasing this value can fix deadlocks during init",
0.5, "sampling");
0.5, "sampling", "thread_sampling");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_SAMPLING_CPUS",
"CPUs to collect frequency information for. Values should be separated by commas "
"and can be explicit or ranges, e.g. 0,1,5-8. An empty value implies 'all' and "
"'none' suppresses all CPU frequency sampling",
"", "sampling");
"", "thread_sampling");
auto _backend = tim::get_env_choice<std::string>(
"OMNITRACE_BACKEND",
@@ -235,30 +229,36 @@ configure_settings(bool _init)
OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_BACKEND",
"Specify the perfetto backend to activate. Options are: "
"'inprocess', 'system', or 'all'",
_backend, "perfetto", "instrumentation", "sampling");
_backend, "perfetto");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE",
"Enable generation of the critical trace", false, "backend",
"critical-trace", "instrumentation");
"critical_trace");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_TRACE_THREAD_LOCKS",
"Enable tracking calls to pthread_mutex_lock, "
"pthread_mutex_unlock, pthread_mutex_trylock",
false, "backend", "parallelism", "gotcha");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_FLAT_SAMPLING",
"Ignore hierarchy in all statistical sampling entries",
_config->get_flat_profile(), "sampling", "data_layout");
_config->get_flat_profile(), "timemory", "sampling",
"data_layout");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_TIMELINE_SAMPLING",
"Create unique entries for every sample when statistical sampling is enabled",
_config->get_timeline_profile(), "sampling", "data_layout");
_config->get_timeline_profile(), "timemory", "sampling", "data_layout");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_ROCTRACER_FLAT_PROFILE",
"Ignore hierarchy in all kernels entries with timemory backend",
_config->get_flat_profile(), "roctracer", "data_layout", "rocm");
_config->get_flat_profile(), "timemory", "roctracer", "data_layout", "rocm");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_ROCTRACER_TIMELINE_PROFILE",
"Create unique entries for every kernel with timemory backend",
_config->get_timeline_profile(), "roctracer", "data_layout", "rocm");
_config->get_timeline_profile(), "timemory", "roctracer", "data_layout", "rocm");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HSA_ACTIVITY",
"Enable HSA activity tracing support", false, "roctracer",
@@ -273,12 +273,12 @@ configure_settings(bool _init)
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE_DEBUG",
"Enable debugging for critical trace", _omnitrace_debug,
"debugging", "critical-trace");
"debugging", "critical_trace");
OMNITRACE_CONFIG_SETTING(
bool, "OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES",
"Include names in serialization of critical trace (mainly for debugging)",
_omnitrace_debug, "debugging", "critical-trace");
_omnitrace_debug, "debugging", "critical_trace");
OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_SHMEM_SIZE_HINT_KB",
"Hint for shared-memory buffer size in perfetto (in KB)",
@@ -288,37 +288,36 @@ configure_settings(bool _init)
"Size of perfetto buffer (in KB)", 1024000, "perfetto",
"data");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_COMBINE_PERFETTO_TRACES",
"Combine Perfetto traces", true, "perfetto", "data");
OMNITRACE_CONFIG_SETTING(int64_t, "OMNITRACE_CRITICAL_TRACE_COUNT",
"Number of critical trace to export (0 == all)", 0, "data",
"critical-trace");
"critical_trace");
OMNITRACE_CONFIG_SETTING(uint64_t, "OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT",
"Number of critical trace records to store in thread-local "
"memory before submitting to shared buffer",
2000, "data", "critical-trace");
2000, "data", "critical_trace");
OMNITRACE_CONFIG_SETTING(
uint64_t, "OMNITRACE_CRITICAL_TRACE_NUM_THREADS",
"Number of threads to use when generating the critical trace",
std::min<uint64_t>(8, std::thread::hardware_concurrency()), "parallelism",
"critical-trace");
"critical_trace");
OMNITRACE_CONFIG_SETTING(
int64_t, "OMNITRACE_CRITICAL_TRACE_PER_ROW",
"How many critical traces per row in perfetto (0 == all in one row)", 0, "io",
"critical-trace");
"critical_trace");
OMNITRACE_CONFIG_SETTING(
std::string, "OMNITRACE_TIMEMORY_COMPONENTS",
"List of components to collect via timemory (see timemory-avail)", "wall_clock",
"timemory", "component", "instrumentation");
"timemory", "component");
OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_OUTPUT_FILE", "Perfetto filename",
"", "perfetto", "io");
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_SETTINGS_DESC",
"Provide descriptions when printing settings", false,
"debugging");
"", "perfetto", "io", "filename");
_config->get_flamegraph_output() = false;
_config->get_cout_output() = false;
@@ -334,7 +333,7 @@ configure_settings(bool _init)
_config->get_max_thread_bookmarks() = 1;
_config->get_timing_units() = "sec";
_config->get_memory_units() = "MB";
_config->get_papi_events() = "PAPI_TOT_CYC, PAPI_TOT_INS";
_config->get_papi_events() = "PAPI_TOT_CYC";
// settings native to timemory but critically and/or extensively used by omnitrace
auto _add_omnitrace_category = [](auto itr) {
@@ -342,7 +341,7 @@ configure_settings(bool _init)
{
auto _categories = itr->second->get_categories();
_categories.emplace("omnitrace");
_categories.emplace("omnitrace-library");
_categories.emplace("omnitrace_library");
itr->second->set_categories(_categories);
}
};
@@ -396,8 +395,7 @@ configure_settings(bool _init)
_config->read(itr);
}
_config->get_global_components() =
_config->get<std::string>("OMNITRACE_TIMEMORY_COMPONENTS");
settings::suppress_config() = true;
// always initialize timemory because gotcha wrappers are always used
auto _cmd = tim::read_command_line(process::get_id());
@@ -408,9 +406,6 @@ configure_settings(bool _init)
if(_pos < _exe.length() - 1) _exe = _exe.substr(_pos + 1);
get_exe_name() = _exe;
scope::get_fields()[scope::flat::value] = tim::settings::flat_profile();
scope::get_fields()[scope::timeline::value] = tim::settings::timeline_profile();
bool _found_sep = false;
for(const auto& itr : _cmd)
{
@@ -425,8 +420,13 @@ configure_settings(bool _init)
tim::timemory_init(_cmd, _parser, "omnitrace-");
}
_config->get_global_components() =
_config->get<std::string>("OMNITRACE_TIMEMORY_COMPONENTS");
scope::get_fields()[scope::flat::value] = _config->get_flat_profile();
scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile();
settings::suppress_parsing() = true;
settings::suppress_config() = true;
settings::use_output_suffix() = _config->get<bool>("OMNITRACE_USE_PID");
#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS)
if(tim::dmp::is_initialized()) settings::default_process_suffix() = tim::dmp::rank();
@@ -485,6 +485,60 @@ configure_settings(bool _init)
_old_handler = signal(_dyninst_trampoline_signal,
static_cast<signal_handler_t>(_trampoline_handler));
}
auto _handle_use_option = [](const std::string& _opt, const std::string& _category) {
if(!_config->get<bool>(_opt))
{
auto _disabled = _config->disable_category(_category);
_config->enable(_opt);
for(auto&& itr : _disabled)
OMNITRACE_BASIC_VERBOSE(3, "[%s=OFF] disabled option :: '%s'\n",
_opt.c_str(), itr.c_str());
return false;
}
auto _enabled = _config->enable_category(_category);
for(auto&& itr : _enabled)
OMNITRACE_BASIC_VERBOSE(3, "[%s=ON] enabled option :: '%s'\n",
_opt.c_str(), itr.c_str());
return true;
};
_handle_use_option("OMNITRACE_USE_SAMPLING", "sampling");
_handle_use_option("OMNITRACE_USE_THREAD_SAMPLING", "thread_sampling");
_handle_use_option("OMNITRACE_USE_KOKKOSP", "kokkos");
_handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto");
_handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory");
_handle_use_option("OMNITRACE_CRITICAL_TRACE", "critical_trace");
_handle_use_option("OMNITRACE_USE_OMPT", "ompt");
_handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi");
_handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer");
#if !defined(OMNITRACE_USE_ROCTRACER) || OMNITRACE_USE_ROCTRACER == 0
_config->disable_category("roctracer");
#endif
#if !defined(OMNITRACE_USE_ROCM_SMI) || OMNITRACE_USE_ROCM_SMI == 0
_config->disable_category("rocm_smi");
#endif
#if defined(OMNITRACE_USE_OMPT) || OMNITRACE_USE_OMPT == 0
_config->disable_category("ompt");
#endif
// user bundle components
_config->disable_category("throttle");
_config->disable("components");
_config->disable("global_components");
_config->disable("ompt_components");
_config->disable("kokkos_components");
_config->disable("trace_components");
_config->disable("profiler_components");
_config->disable("destructor_report");
_config->disable("stack_clearing");
#if !defined(TIMEMORY_USE_MPI) || TIMEMORY_USE_MPI == 0
_config->disable("OMNITRACE_COMBINE_PERFETTO_TRACES");
#endif
}
void
@@ -512,7 +566,8 @@ print_settings(
std::stringstream _os{};
bool _md = tim::get_env<bool>("OMNITRACE_SETTINGS_DESC_MARKDOWN", false);
bool _print_desc = get_debug() || tim::get_env("OMNITRACE_SETTINGS_DESC", false);
bool _md = tim::get_env<bool>("OMNITRACE_SETTINGS_DESC_MARKDOWN", false);
constexpr size_t nfields = 3;
using str_array_t = std::array<std::string, nfields>;
@@ -521,6 +576,7 @@ print_settings(
_widths.fill(0);
for(const auto& itr : *get_config())
{
if(!itr.second->get_enabled()) continue;
if(_filter(itr.first, itr.second->get_categories()))
{
auto _disp = itr.second->get_display(std::ios::boolalpha);
@@ -549,8 +605,6 @@ print_settings(
return lhs.at(0) < rhs.at(0);
});
bool _print_desc = get_debug() || get_config()->get<bool>("OMNITRACE_SETTINGS_DESC");
auto tot_width = std::accumulate(_widths.begin(), _widths.end(), 0);
if(!_print_desc) tot_width -= _widths.back() + 4;
@@ -945,6 +999,17 @@ get_perfetto_buffer_size()
return static_cast<tim::tsettings<size_t>&>(*_v->second).get();
}
bool
get_perfetto_combined_traces()
{
#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0
static auto _v = get_config()->find("OMNITRACE_COMBINE_PERFETTO_TRACES");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
#else
return false;
#endif
}
uint64_t
get_critical_trace_update_freq()
{
@@ -1062,6 +1127,13 @@ get_rocm_smi_devices()
#endif
}
bool
get_trace_thread_locks()
{
static auto _v = get_config()->find("OMNITRACE_TRACE_THREAD_LOCKS");
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
}
bool
get_debug_tid()
{
@@ -199,6 +199,9 @@ get_perfetto_shmem_size_hint();
size_t
get_perfetto_buffer_size();
bool
get_perfetto_combined_traces();
uint64_t
get_critical_trace_update_freq();
@@ -238,6 +241,9 @@ get_rocm_smi_devices();
int64_t
get_critical_trace_per_row();
bool
get_trace_thread_locks();
} // namespace config
//
@@ -26,6 +26,7 @@
#include "library/defines.hpp"
#include "library/perfetto.hpp"
#include "library/ptl.hpp"
#include "library/runtime.hpp"
#include "library/thread_data.hpp"
#include <PTL/ThreadPool.hh>
@@ -109,9 +110,9 @@ get_combined_hash(Arg0&& _zero, Arg1&& _one, Args&&... _args)
bool
entry::operator==(const entry& rhs) const
{
return (device == rhs.device && depth == rhs.depth && priority == rhs.priority &&
tid == rhs.tid && cpu_cid == rhs.cpu_cid && gpu_cid == rhs.gpu_cid &&
hash == rhs.hash);
return std::tie(device, depth, priority, tid, cpu_cid, gpu_cid, queue_id, hash) ==
std::tie(rhs.device, rhs.depth, rhs.priority, rhs.tid, rhs.cpu_cid,
rhs.gpu_cid, rhs.queue_id, rhs.hash);
}
bool
@@ -132,6 +133,10 @@ entry::operator<(const entry& rhs) const
auto _par_eq = (parent_cid == rhs.parent_cid);
if(!_par_eq) return (parent_cid < rhs.parent_cid);
// sort by queue ids
auto _queue_eq = (queue_id == rhs.queue_id);
if(!_queue_eq) return (queue_id < rhs.queue_id);
// sort by priority
auto _prio_eq = (priority == rhs.priority);
if(!_prio_eq) return (priority < rhs.priority);
@@ -143,8 +148,8 @@ entry::operator<(const entry& rhs) const
bool
entry::operator>(const entry& rhs) const
{
return (!(*this < rhs) && begin_ns != rhs.begin_ns && cpu_cid != rhs.cpu_cid &&
gpu_cid != rhs.gpu_cid);
return (!(*this < rhs) && std::tie(begin_ns, cpu_cid, gpu_cid) !=
std::tie(rhs.begin_ns, rhs.cpu_cid, rhs.gpu_cid));
}
entry&
@@ -171,7 +176,7 @@ size_t
entry::get_hash() const
{
return get_combined_hash(hash, static_cast<short>(device), static_cast<short>(phase),
tid, cpu_cid, gpu_cid, priority);
tid, cpu_cid, gpu_cid, queue_id, priority);
}
int64_t
@@ -293,6 +298,7 @@ entry::write(std::ostream& _os) const
_os << " parent: " << static_cast<int64_t>(parent_cid);
_os << ", tid: " << tid;
_os << ", depth: " << depth;
_os << ", queue: " << queue_id;
_os << ", priority: " << priority;
if(phase == Phase::DELTA)
{
@@ -423,6 +429,7 @@ template <>
void
call_chain::generate_perfetto<Device::CPU>(std::set<entry>& _used) const
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
static std::set<std::string> _static_strings{};
static std::mutex _static_mutex{};
for(const auto& itr : *this)
@@ -444,6 +451,7 @@ template <>
void
call_chain::generate_perfetto<Device::GPU>(std::set<entry>& _used) const
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
static std::set<std::string> _static_strings{};
static std::mutex _static_mutex{};
for(const auto& itr : *this)
@@ -465,6 +473,7 @@ template <>
void
call_chain::generate_perfetto<Device::ANY>(std::set<entry>& _used) const
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
static std::set<std::string> _static_strings{};
static std::mutex _static_mutex{};
for(const auto& itr : *this)
@@ -509,6 +518,7 @@ get(int64_t _tid)
void
add_hash_id(const hash_ids& _labels)
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
if(!tasking::critical_trace::get_task_group().pool()) return;
tasking::critical_trace::get_task_group().exec([_labels]() {
@@ -539,6 +549,7 @@ void
update(int64_t _tid)
{
if(!get_use_critical_trace() && !get_use_rocm_smi()) return;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
if(!tasking::critical_trace::get_task_group().pool()) return;
call_chain _data{};
@@ -550,6 +561,7 @@ void
compute(int64_t _tid)
{
update(_tid);
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
if(!tasking::critical_trace::get_task_group().pool()) return;
tasking::critical_trace::get_task_group().exec(compute_critical_trace);
@@ -723,6 +735,7 @@ combine_critical_path(call_chain& _targ, call_chain _chain)
_combined.emplace_back(itr);
std::sort(_combined.begin(), _combined.end());
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
std::unique_lock<std::mutex> _lk{ complete_call_mutex };
for(auto& itr : _combined)
_targ.emplace_back(itr);
@@ -751,6 +764,8 @@ update_critical_path(call_chain _chain, int64_t)
void
compute_critical_trace()
{
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
static bool _computed = false;
std::unique_lock<std::mutex> _lk{ complete_call_mutex };
@@ -825,6 +840,7 @@ get_entries(int64_t _ts, const std::function<bool(const entry&)>& _eval)
}
*_targ = _v;
};
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
size_t _n = 0;
std::vector<std::pair<std::string, entry>> _v{};
@@ -65,17 +65,18 @@ struct entry
entry& operator=(const entry&) = default;
entry& operator=(entry&&) noexcept = default;
uint16_t priority = 0; // priority value (for sorting)
Device device = Device::CPU; // which device it executed on
Phase phase = Phase::NONE; // start / stop / unspecified
uint16_t depth = 0; // call-stack depth
int64_t tid = 0; // thread id it was registered on
uint64_t cpu_cid = 0; // CPU correlation id
uint64_t gpu_cid = 0; // GPU correlation id
uint64_t parent_cid = 0; // parent CPU correlation id
int64_t begin_ns = 0; // timestamp of start
int64_t end_ns = 0; // timestamp of end
size_t hash = 0; // hash for name
uint16_t priority = 0; /// priority value (for sorting)
Device device = Device::CPU; /// which device it executed on
Phase phase = Phase::NONE; /// start / stop / unspecified
uint16_t depth = 0; /// call-stack depth
int64_t tid = 0; /// thread id it was registered on
uint64_t cpu_cid = 0; /// CPU correlation id
uint64_t gpu_cid = 0; /// GPU correlation id
uint64_t parent_cid = 0; /// parent CPU correlation id
int64_t begin_ns = 0; /// timestamp of start
int64_t end_ns = 0; /// timestamp of end
uintptr_t queue_id = 0; /// stream id (GPU) or mutex id
size_t hash = 0; /// hash for name
bool operator==(const entry& rhs) const;
bool operator!=(const entry& rhs) const { return !(*this == rhs); }
@@ -127,7 +128,8 @@ entry::save(Archive& ar, unsigned int) const
cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid),
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name),
cereal::make_nvp("queue", queue_id), cereal::make_nvp("hash", hash),
cereal::make_nvp("name", _name),
cereal::make_nvp("demangled_name", tim::demangle(_name)));
}
@@ -144,6 +146,7 @@ entry::load(Archive& ar, unsigned int)
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name),
cereal::make_nvp("queue", queue_id),
cereal::make_nvp("demangled_name", _demangled_name));
tim::get_hash_ids()->emplace(hash, _name);
@@ -0,0 +1,50 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/debug.hpp"
#include "library/runtime.hpp"
#include "library/state.hpp"
namespace omnitrace
{
namespace debug
{
lock::lock()
: m_lk{ tim::type_mutex<decltype(std::cerr)>(), std::defer_lock }
{
if(!m_lk.owns_lock())
{
push_thread_state(ThreadState::Internal);
m_lk.lock();
}
}
lock::~lock()
{
if(m_lk.owns_lock())
{
m_lk.unlock();
pop_thread_state();
}
}
} // namespace debug
} // namespace omnitrace
@@ -27,6 +27,10 @@
#include <timemory/api.hpp>
#include <timemory/backends/dmp.hpp>
#include <timemory/backends/process.hpp>
#include <timemory/backends/threading.hpp>
#include <timemory/mpl/concepts.hpp>
#include <timemory/utility/backtrace.hpp>
#include <timemory/utility/locking.hpp>
#include <timemory/utility/utility.hpp>
#include <array>
@@ -66,6 +70,25 @@ get_critical_trace_debug();
namespace debug
{
struct lock
{
lock();
~lock();
private:
tim::auto_lock_t m_lk;
};
//
template <typename Arg, typename... Args>
bool
is_bracket(Arg&& _arg, Args&&...)
{
if constexpr(::tim::concepts::is_string_type<Arg>::value)
return (::std::string_view{ _arg }.empty()) ? false : _arg[0] == '[';
else
return false;
}
//
namespace
{
template <typename T, size_t... Idx>
@@ -79,10 +102,6 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
} // namespace debug
} // namespace omnitrace
#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y
#define OMNITRACE_LINESTR TIMEMORY_STRINGIZE(__LINE__)
#define OMNITRACE_VARIABLE(LABEL) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, LABEL)
#if !defined(OMNITRACE_DEBUG_BUFFER_LEN)
# define OMNITRACE_DEBUG_BUFFER_LEN 2048
#endif
@@ -129,14 +148,17 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
.data()
#endif
//--------------------------------------------------------------------------------------//
#define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \
if((COND) && ::omnitrace::config::get_debug_tid() && \
::omnitrace::config::get_debug_pid()) \
{ \
fflush(stderr); \
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
fprintf(stderr, "[omnitrace][%i][%li] ", OMNITRACE_PROCESS_IDENTIFIER, \
OMNITRACE_THREAD_IDENTIFIER); \
::omnitrace::debug::lock _lk{}; \
fprintf(stderr, "[omnitrace][%i][%li]%s", OMNITRACE_PROCESS_IDENTIFIER, \
OMNITRACE_THREAD_IDENTIFIER, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
fflush(stderr); \
}
@@ -146,8 +168,9 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
::omnitrace::config::get_debug_pid()) \
{ \
fflush(stderr); \
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
fprintf(stderr, "[omnitrace] "); \
::omnitrace::debug::lock _lk{}; \
fprintf(stderr, "[omnitrace]%s", \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
fflush(stderr); \
}
@@ -157,9 +180,10 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
::omnitrace::config::get_debug_pid()) \
{ \
fflush(stderr); \
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
fprintf(stderr, "[omnitrace][%i][%li][%s] ", OMNITRACE_PROCESS_IDENTIFIER, \
OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION); \
::omnitrace::debug::lock _lk{}; \
fprintf(stderr, "[omnitrace][%i][%li][%s]%s", OMNITRACE_PROCESS_IDENTIFIER, \
OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
fflush(stderr); \
}
@@ -169,19 +193,23 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
::omnitrace::config::get_debug_pid()) \
{ \
fflush(stderr); \
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
fprintf(stderr, "[omnitrace][%s] ", OMNITRACE_FUNCTION); \
::omnitrace::debug::lock _lk{}; \
fprintf(stderr, "[omnitrace][%s]%s", OMNITRACE_FUNCTION, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
fflush(stderr); \
}
//--------------------------------------------------------------------------------------//
#define OMNITRACE_CONDITIONAL_THROW(COND, ...) \
if(COND) \
{ \
char _msg_buffer[OMNITRACE_DEBUG_BUFFER_LEN]; \
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%i][%li][%s] ", \
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%i][%li][%s]%s", \
OMNITRACE_PROCESS_IDENTIFIER, OMNITRACE_THREAD_IDENTIFIER, \
OMNITRACE_FUNCTION); \
OMNITRACE_FUNCTION, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
auto len = strlen(_msg_buffer); \
snprintf(_msg_buffer + len, OMNITRACE_DEBUG_BUFFER_LEN - len, __VA_ARGS__); \
throw std::runtime_error(_msg_buffer); \
@@ -191,8 +219,9 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
if(COND) \
{ \
char _msg_buffer[OMNITRACE_DEBUG_BUFFER_LEN]; \
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%s] ", \
OMNITRACE_FUNCTION); \
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%s]%s", \
OMNITRACE_FUNCTION, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
auto len = strlen(_msg_buffer); \
snprintf(_msg_buffer + len, OMNITRACE_DEBUG_BUFFER_LEN - len, __VA_ARGS__); \
throw std::runtime_error(_msg_buffer); \
@@ -206,8 +235,71 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
OMNITRACE_CONDITIONAL_BASIC_THROW( \
::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__)
#define OMNITRACE_STRINGIZE(...) #__VA_ARGS__
#define OMNITRACE_ESC(...) __VA_ARGS__
//--------------------------------------------------------------------------------------//
#define OMNITRACE_CONDITIONAL_FAIL(COND, ...) \
if(COND) \
{ \
fflush(stderr); \
fprintf(stderr, "[omnitrace][%i][%li]%s", OMNITRACE_PROCESS_IDENTIFIER, \
OMNITRACE_THREAD_IDENTIFIER, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
::omnitrace::set_state(::omnitrace::State::Finalized); \
::tim::disable_signal_detection(); \
::tim::print_demangled_backtrace<64>(); \
::std::exit(EXIT_FAILURE); \
}
#define OMNITRACE_CONDITIONAL_BASIC_FAIL(COND, ...) \
if(COND) \
{ \
fflush(stderr); \
fprintf(stderr, "[omnitrace]%s", \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
::omnitrace::set_state(::omnitrace::State::Finalized); \
::tim::disable_signal_detection(); \
::tim::print_demangled_backtrace<64>(); \
::std::exit(EXIT_FAILURE); \
}
#define OMNITRACE_CONDITIONAL_FAIL_F(COND, ...) \
if(COND) \
{ \
fflush(stderr); \
fprintf(stderr, "[omnitrace][%i][%li][%s]%s", OMNITRACE_PROCESS_IDENTIFIER, \
OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
::omnitrace::set_state(::omnitrace::State::Finalized); \
::tim::disable_signal_detection(); \
::tim::print_demangled_backtrace<64>(); \
::std::exit(EXIT_FAILURE); \
}
#define OMNITRACE_CONDITIONAL_BASIC_FAIL_F(COND, ...) \
if(COND) \
{ \
fflush(stderr); \
fprintf(stderr, "[omnitrace][%s]%s", OMNITRACE_FUNCTION, \
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
fprintf(stderr, __VA_ARGS__); \
::omnitrace::set_state(::omnitrace::State::Finalized); \
::tim::disable_signal_detection(); \
::tim::print_demangled_backtrace<64>(); \
::std::exit(EXIT_FAILURE); \
}
#define OMNITRACE_CI_FAIL(COND, ...) \
OMNITRACE_CONDITIONAL_FAIL(::omnitrace::get_is_continuous_integration() && (COND), \
__VA_ARGS__)
#define OMNITRACE_CI_BASIC_FAIL(COND, ...) \
OMNITRACE_CONDITIONAL_BASIC_FAIL( \
::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__)
//--------------------------------------------------------------------------------------//
//--------------------------------------------------------------------------------------//
//
@@ -287,6 +379,20 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
#define OMNITRACE_BASIC_THROW(...) OMNITRACE_CONDITIONAL_BASIC_THROW(true, __VA_ARGS__)
//--------------------------------------------------------------------------------------//
//
// Fail macros
//
//--------------------------------------------------------------------------------------//
#define OMNITRACE_FAIL(...) OMNITRACE_CONDITIONAL_FAIL(true, __VA_ARGS__)
#define OMNITRACE_FAIL_F(...) OMNITRACE_CONDITIONAL_FAIL_F(true, __VA_ARGS__)
#define OMNITRACE_BASIC_FAIL(...) OMNITRACE_CONDITIONAL_BASIC_FAIL(true, __VA_ARGS__)
#define OMNITRACE_BASIC_FAIL_F(...) OMNITRACE_CONDITIONAL_BASIC_FAIL_F(true, __VA_ARGS__)
#include <string>
namespace std
@@ -67,3 +67,10 @@
# endif
# include <cassert>
#endif
#define OMNITRACE_STRINGIZE(X) OMNITRACE_STRINGIZE2(X)
#define OMNITRACE_STRINGIZE2(X) #X
#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y
#define OMNITRACE_VARIABLE(Y) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, Y)
#define OMNITRACE_LINESTR OMNITRACE_STRINGIZE(__LINE__)
#define OMNITRACE_ESC(...) __VA_ARGS__
@@ -24,9 +24,11 @@
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/defines.hpp"
#include "library/runtime.hpp"
#include "library/sampling.hpp"
#include <PTL/ThreadPool.hh>
#include <timemory/utility/declaration.hpp>
namespace omnitrace
@@ -42,9 +44,10 @@ auto _thread_pool_cfg = []() {
_v.use_tbb = false;
_v.verbose = -1;
_v.initializer = []() {
set_thread_state(ThreadState::Internal);
sampling::block_signals();
threading::set_thread_name(
TIMEMORY_JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str());
JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str());
};
_v.finalizer = []() {};
_v.priority = 5;
@@ -26,6 +26,7 @@
#include "library/debug.hpp"
#include "library/defines.hpp"
#include "library/thread_data.hpp"
#include "library/utility.hpp"
#include <timemory/backends/dmp.hpp>
#include <timemory/backends/mpi.hpp>
@@ -98,6 +99,8 @@ create_cpu_cid_entry(int64_t _tid)
{
using tim::auto_lock_t;
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
// unique lock for _tid
auto& _mtx = get_cpu_cid_stack_lock(_tid);
auto_lock_t _lk{ _mtx, std::defer_lock };
@@ -166,4 +169,50 @@ get_gotcha_bundle()
"omnitrace", quirk::config<quirk::auto_start>{}));
return _v;
}
namespace
{
auto&
get_thread_state_history(int64_t _idx = utility::get_thread_index())
{
static auto _v = utility::get_filled_array<OMNITRACE_MAX_THREADS>(
[]() { return utility::get_reserved_vector<ThreadState>(32); });
return _v.at(_idx);
}
} // namespace
ThreadState&
get_thread_state()
{
static thread_local ThreadState _v{ ThreadState::Enabled };
return _v;
}
ThreadState
set_thread_state(ThreadState _n)
{
auto _o = get_thread_state();
get_thread_state() = _n;
return _o;
}
ThreadState
push_thread_state(ThreadState _v)
{
return get_thread_state_history().emplace_back(set_thread_state(_v));
}
ThreadState
pop_thread_state()
{
auto& _hist = get_thread_state_history();
if(!_hist.empty())
{
set_thread_state(_hist.back());
_hist.pop_back();
}
return get_thread_state();
}
} // namespace omnitrace
@@ -46,7 +46,7 @@ namespace omnitrace
// bundle of components around omnitrace_init and omnitrace_finalize
using main_bundle_t =
tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::cpu_clock,
comp::cpu_util, pthread_gotcha_t>;
comp::cpu_util, pthread_gotcha>;
using gotcha_bundle_t = tim::lightweight_tuple<fork_gotcha_t, mpi_gotcha_t>;
@@ -89,4 +89,27 @@ get_cpu_cid_entry(uint64_t _cid, int64_t _tid = threading::get_id());
tim::mutex_t&
get_cpu_cid_stack_lock(int64_t _tid = threading::get_id());
ThreadState&
get_thread_state();
/// returns old state
ThreadState set_thread_state(ThreadState);
ThreadState push_thread_state(ThreadState);
ThreadState
pop_thread_state();
struct scoped_thread_state
{
scoped_thread_state(ThreadState _v) { push_thread_state(_v); }
~scoped_thread_state() { pop_thread_state(); }
};
} // namespace omnitrace
#define OMNITRACE_SCOPED_THREAD_STATE(STATE) \
::omnitrace::scoped_thread_state OMNITRACE_VARIABLE( \
OMNITRACE_VAR_NAME_COMBINE(scoped_thread_state_, __LINE__)) \
{ \
::omnitrace::STATE \
}
@@ -40,6 +40,19 @@ to_string(omnitrace::State _v)
return {};
}
std::string
to_string(omnitrace::ThreadState _v)
{
switch(_v)
{
case omnitrace::ThreadState::Enabled: return "Enabled";
case omnitrace::ThreadState::Internal: return "Internal";
case omnitrace::ThreadState::Disabled: return "Disabled";
case omnitrace::ThreadState::Completed: return "Completed";
}
return {};
}
std::string
to_string(omnitrace::Mode _v)
{
@@ -36,6 +36,15 @@ enum class State : unsigned short
Finalized
};
// used for specifying the state of omnitrace
enum class ThreadState : unsigned short
{
Enabled = 0,
Internal,
Disabled,
Completed,
};
enum class Mode : unsigned short
{
Trace = 0,
@@ -51,6 +60,9 @@ namespace std
std::string
to_string(omnitrace::State _v);
std::string
to_string(omnitrace::ThreadState _v);
std::string
to_string(omnitrace::Mode _v);
} // namespace std
@@ -21,6 +21,10 @@
// SOFTWARE.
#include "library/thread_data.hpp"
#include "library/components/pthread_create_gotcha.hpp"
#include "library/utility.hpp"
#include <timemory/backends/threading.hpp>
namespace omnitrace
{
@@ -30,4 +34,15 @@ instrumentation_bundles::instances()
static auto _v = instance_array_t{};
return _v;
}
void
thread_deleter<void>::operator()() const
{
pthread_create_gotcha::shutdown(threading::get_id());
set_thread_state(ThreadState::Completed);
if(get_state() != State::Finalized && threading::get_id() == 0)
omnitrace_finalize_hidden();
}
template struct thread_deleter<void>;
} // namespace omnitrace
@@ -26,6 +26,7 @@
#include "library/common.hpp"
#include "library/config.hpp"
#include "library/defines.hpp"
#include "library/state.hpp"
#include "library/timemory.hpp"
#include <array>
@@ -40,6 +41,8 @@
namespace omnitrace
{
ThreadState set_thread_state(ThreadState);
// bundle of components used in instrumentation
using instrumentation_bundle_t =
tim::component_bundle<api::omnitrace, comp::wall_clock*, comp::user_global_bundle*>;
@@ -56,12 +59,20 @@ using unique_ptr_t = std::unique_ptr<Tp, thread_deleter<Tp>>;
static constexpr size_t max_supported_threads = OMNITRACE_MAX_THREADS;
template <>
struct thread_deleter<void>
{
void operator()() const;
};
extern template struct thread_deleter<void>;
template <typename Tp>
struct thread_deleter
{
void operator()(Tp* ptr) const
{
if(get_state() != State::Finalized) omnitrace_finalize_hidden();
thread_deleter<void>{}();
delete ptr;
}
};
@@ -74,6 +74,7 @@ get_sampler_is_sampling()
void
sampler::poll(std::atomic<State>* _state, nsec_t _interval, promise_t* _ready)
{
set_thread_state(ThreadState::Internal);
threading::set_thread_name("omni.sampler");
// notify thread started
@@ -0,0 +1,70 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>
namespace omnitrace
{
namespace utility
{
namespace
{
/// provides an alternative thread index for when using threading::get_id() is not
/// desirable
inline auto
get_thread_index()
{
static std::atomic<int64_t> _c{ 0 };
static thread_local int64_t _v = _c++;
return _v;
}
/// fills any array with the result of the functor
template <size_t N, typename FuncT>
inline auto
get_filled_array(FuncT&& _func)
{
using Tp = std::decay_t<decltype(_func())>;
std::array<Tp, N> _v{};
for(auto& itr : _v)
itr = std::move(_func());
return _v;
}
/// returns a vector with a preallocated buffer
template <typename... Tp>
inline auto
get_reserved_vector(size_t _n)
{
std::vector<Tp...> _v{};
_v.reserve(_n);
return _v;
}
} // namespace
} // namespace utility
} // namespace omnitrace
@@ -63,12 +63,10 @@ endfunction()
add_library(omnitrace-python-compile-options INTERFACE)
add_library(omnitrace::omnitrace-python-compile-options ALIAS
omnitrace-python-compile-options)
add_cxx_flag_if_avail("-frtti" omnitrace-python-compile-options)
add_cxx_flag_if_avail("-Wno-unused-value" omnitrace-python-compile-options)
add_cxx_flag_if_avail("-Wno-range-loop-analysis" omnitrace-python-compile-options)
add_cxx_flag_if_avail("-ftls-model=global-dynamic" omnitrace-python-compile-options)
add_cxx_flag_if_avail("-Wno-deprecated-declarations" omnitrace-python-compile-options)
add_cxx_flag_if_avail("-Wno-unused-but-set-parameter" omnitrace-python-compile-options)
add_target_cxx_flag_if_avail(
omnitrace-python-compile-options "-Wno-unused-value" "-Wno-range-loop-analysis"
"-Wno-deprecated-declarations" "-Wno-unused-but-set-parameter"
"-ftls-model=global-dynamic")
file(GLOB pyheaders ${CMAKE_CURRENT_LIST_DIR}/libpyomnitrace*.hpp)
set(pysources ${CMAKE_CURRENT_LIST_DIR}/libpyomnitrace.cpp)
@@ -34,6 +34,18 @@ set(_base_environment
"LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}"
)
set(_lock_environment
"OMNITRACE_USE_SAMPLING=OFF"
"OMNITRACE_CRITICAL_TRACE=ON"
"OMNITRACE_COLLAPSE_THREADS=ON"
"OMNITRACE_TRACE_THREAD_LOCKS=ON"
"OMNITRACE_COUT_OUTPUT=ON"
"OMNITRACE_TIME_OUTPUT=OFF"
"OMNITRACE_FLAT_PROFILE=ON"
"OMNITRACE_TIMELINE_PROFILE=OFF"
"LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}"
)
set(_ompt_environment
"OMNITRACE_USE_PERFETTO=ON"
"OMNITRACE_USE_TIMEMORY=ON"
@@ -144,7 +156,7 @@ function(OMNITRACE_ADD_TEST)
list(APPEND TEST_ENVIRONMENT "OMNITRACE_USE_PID=OFF")
endif()
if(NOT SKIP_BASELINE)
if(NOT TEST_SKIP_BASELINE)
add_test(
NAME ${TEST_NAME}-baseline
COMMAND ${COMMAND_PREFIX} $<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
@@ -389,7 +401,7 @@ omnitrace_add_test(
TARGET transpose
MPI ${TRANSPOSE_USE_MPI}
NUM_PROCS ${NUM_PROCS}
REWRITE_ARGS -e -v 2
REWRITE_ARGS -e -v 2 --print-instructions
RUNTIME_ARGS -e -v 1 --label file line return args
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON")
@@ -410,6 +422,30 @@ omnitrace_add_test(
RUN_ARGS 10 ${NUM_THREADS} 1000
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF")
omnitrace_add_test(
SKIP_RUNTIME SKIP_SAMPLING
NAME parallel-overhead-locks-timemory
TARGET parallel-overhead-locks
LABELS "locks"
REWRITE_ARGS -e -v 2 --min-instructions=4
RUN_ARGS 10 4 1000
ENVIRONMENT
"${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF"
REWRITE_RUN_PASS_REGEX
"start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000"
)
omnitrace_add_test(
SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING
NAME parallel-overhead-locks-perfetto
TARGET parallel-overhead-locks
LABELS "locks"
REWRITE_ARGS -e -v 2 --min-instructions=8
RUN_ARGS 10 4 1000
ENVIRONMENT
"${_lock_environment};OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON"
PROPERTIES WILL_FAIL ON)
omnitrace_add_test(
NAME user-api
TARGET user-api