Support for tracing mutex locking (#52)
* Parallel overhead example with locks
* Support tracing mutex locking + more
- support wrapping pthread_mutex_lock
- support wrapping pthread_mutex_unlock
- support wrapping pthread_mutex_trylock
- get_perfetto_combined_traces setting
- OMNITRACE_TRACE_THREAD_LOCKS option
- ThreadState
- critical trace includes queue id
- enabled/disabled settings in timemory
- fix OMNITRACE_TIMEMORY_COMPONENTS
- fix reading config
- fix setting categories
- applied ThreadState::Internal in various places
- utility::get_filled_array
- utility::get_reserved_vector
- utility::get_thread_index
- fork_gotcha messages about forks
- split out some pthread_gotcha functionality into pthread_create_gotcha
- handle queue id in roctracer callbacks
* Update timemory and PTL submodules
* Misc CMake updates
- Includes fix to omnitrace-static-lib{gcc,stdcxx}
* Misc cleanup to pthread_mutex_gotcha and backtrace
* Fix to duplicate field in module_function json
* Improvement to debug messages
* omnitrace-dl and common improvements
- tweak to delimit
- common::ignore message
- common::join quoting of strings
- omnitrace_set_env ignores if inited and active
- omnitrace_set_mpi ignores if inited and active
* nsync for transpose example
* Fix to thread_deleter<void> functor invoke
* Fix thread state and HIP stream enums
[ROCm/rocprofiler-systems commit: b208047741]
This commit is contained in:
committed by
GitHub
orang tua
0094a471fd
melakukan
0d5f0fb9cf
@@ -132,6 +132,44 @@ parse:
|
||||
kwargs:
|
||||
RESULT_VARIABLE: '*'
|
||||
OUTPUT_VARIABLE: '*'
|
||||
omnitrace_find_static_library:
|
||||
flags:
|
||||
- NO_CACHE
|
||||
- REQUIRED
|
||||
- NO_DEFAULT_PATH
|
||||
- NO_PACKAGE_ROOT_PATH
|
||||
- NO_CMAKE_PATH
|
||||
- NO_CMAKE_ENVIRONMENT_PATH
|
||||
- NO_SYSTEM_ENVIRONMENT_PATH
|
||||
- CMAKE_FIND_ROOT_PATH_BOTH
|
||||
- ONLY_CMAKE_FIND_ROOT_PATH
|
||||
- NO_CMAKE_FIND_ROOT_PATH
|
||||
kwargs:
|
||||
NAMES: '*'
|
||||
NAMES_PER_DIR: '*'
|
||||
HINTS: '*'
|
||||
PATHS: '*'
|
||||
PATH_SUFFIXES: '*'
|
||||
DOC: '*'
|
||||
omnitrace_find_shared_library:
|
||||
flags:
|
||||
- NO_CACHE
|
||||
- REQUIRED
|
||||
- NO_DEFAULT_PATH
|
||||
- NO_PACKAGE_ROOT_PATH
|
||||
- NO_CMAKE_PATH
|
||||
- NO_CMAKE_ENVIRONMENT_PATH
|
||||
- NO_SYSTEM_ENVIRONMENT_PATH
|
||||
- CMAKE_FIND_ROOT_PATH_BOTH
|
||||
- ONLY_CMAKE_FIND_ROOT_PATH
|
||||
- NO_CMAKE_FIND_ROOT_PATH
|
||||
kwargs:
|
||||
NAMES: '*'
|
||||
NAMES_PER_DIR: '*'
|
||||
HINTS: '*'
|
||||
PATHS: '*'
|
||||
PATH_SUFFIXES: '*'
|
||||
DOC: '*'
|
||||
override_spec: {}
|
||||
vartags: []
|
||||
proptags: []
|
||||
|
||||
@@ -301,9 +301,18 @@ target_compile_options(
|
||||
omnitrace-static-libgcc
|
||||
INTERFACE $<$<COMPILE_LANGUAGE:C>:$<$<C_COMPILER_ID:GNU>:-static-libgcc>>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU>:-static-libgcc>>)
|
||||
target_link_options(
|
||||
omnitrace-static-libgcc INTERFACE
|
||||
$<$<COMPILE_LANGUAGE:C>:$<$<C_COMPILER_ID:GNU,Clang>:-static-libgcc>>
|
||||
$<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang>:-static-libgcc>>)
|
||||
|
||||
target_compile_options(
|
||||
omnitrace-static-libstdcxx
|
||||
INTERFACE $<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU>:-static-libstdc++>>)
|
||||
INTERFACE $<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang>:-static-libstdc++>>
|
||||
)
|
||||
target_link_options(
|
||||
omnitrace-static-libstdcxx INTERFACE
|
||||
$<$<COMPILE_LANGUAGE:CXX>:$<$<CXX_COMPILER_ID:GNU,Clang>:-static-libstdc++>>)
|
||||
|
||||
# ----------------------------------------------------------------------------------------#
|
||||
# user customization
|
||||
|
||||
@@ -777,4 +777,14 @@ function(OMNITRACE_PYTHON_CONSOLE_SCRIPT SCRIPT_NAME SCRIPT_SUBMODULE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(OMNITRACE_FIND_STATIC_LIBRARY)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX})
|
||||
find_library(${ARGN})
|
||||
endfunction()
|
||||
|
||||
function(OMNITRACE_FIND_SHARED_LIBRARY)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_SHARED_LIBRARY_SUFFIX})
|
||||
find_library(${ARGN})
|
||||
endfunction()
|
||||
|
||||
cmake_policy(POP)
|
||||
|
||||
@@ -7,7 +7,13 @@ find_package(Threads REQUIRED)
|
||||
add_executable(parallel-overhead parallel-overhead.cpp)
|
||||
target_link_libraries(parallel-overhead Threads::Threads)
|
||||
|
||||
add_executable(parallel-overhead-locks parallel-overhead.cpp)
|
||||
target_link_libraries(parallel-overhead-locks Threads::Threads)
|
||||
target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1)
|
||||
|
||||
if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
|
||||
set_target_properties(parallel-overhead PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR})
|
||||
set_target_properties(parallel-overhead-locks PROPERTIES RUNTIME_OUTPUT_DIRECTORY
|
||||
${CMAKE_BINARY_DIR})
|
||||
endif()
|
||||
|
||||
@@ -1,14 +1,23 @@
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#if defined(USE_LOCKS)
|
||||
# include <mutex>
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
long total = 0;
|
||||
std::mutex mtx{};
|
||||
#else
|
||||
# include <atomic>
|
||||
std::atomic<long> total{ 0 };
|
||||
#endif
|
||||
|
||||
long
|
||||
fib(long n) __attribute__((noinline));
|
||||
|
||||
void
|
||||
run(size_t nitr, long) __attribute__((noinline));
|
||||
|
||||
@@ -21,10 +30,19 @@ fib(long n)
|
||||
void
|
||||
run(size_t nitr, long n)
|
||||
{
|
||||
#if defined(USE_LOCKS)
|
||||
for(size_t i = 0; i < nitr; ++i)
|
||||
{
|
||||
auto _v = fib(n);
|
||||
auto_lock_t _lk{ mtx };
|
||||
total += _v;
|
||||
}
|
||||
#else
|
||||
long local = 0;
|
||||
for(size_t i = 0; i < nitr; ++i)
|
||||
local += fib(n);
|
||||
total += local;
|
||||
#endif
|
||||
}
|
||||
|
||||
int
|
||||
@@ -42,7 +60,7 @@ main(int argc, char** argv)
|
||||
if(argc > 2) nthread = atol(argv[2]);
|
||||
if(argc > 3) nitr = atol(argv[3]);
|
||||
|
||||
printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n",
|
||||
printf("\n[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n",
|
||||
_name.c_str(), nthread, _name.c_str(), nitr, _name.c_str(), nfib);
|
||||
|
||||
std::vector<std::thread> threads{};
|
||||
@@ -53,13 +71,16 @@ main(int argc, char** argv)
|
||||
threads.emplace_back(&run, _nitr, nfib);
|
||||
}
|
||||
|
||||
#if !defined(USE_LOCKS)
|
||||
auto _nitr = std::max<size_t>(nitr - 0.25 * nitr, 1);
|
||||
run(_nitr, nfib - 0.1 * nfib);
|
||||
#endif
|
||||
|
||||
for(auto& itr : threads)
|
||||
itr.join();
|
||||
|
||||
printf("[%s] fibonacci(%li) x %lu = %li\n", _name.c_str(), nfib, nthread,
|
||||
total.load());
|
||||
static_cast<long>(total));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -95,10 +95,12 @@ transpose_a(int* in, int* out, int M, int N)
|
||||
void
|
||||
run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
{
|
||||
size_t nitr = 500;
|
||||
unsigned int M = 4960 * 2;
|
||||
unsigned int N = 4960 * 2;
|
||||
size_t nitr = 500;
|
||||
size_t nsync = 10;
|
||||
unsigned int M = 4960 * 2;
|
||||
unsigned int N = 4960 * 2;
|
||||
if(argc > 2) nitr = atoll(argv[2]);
|
||||
if(argc > 3) nsync = atoll(argv[3]);
|
||||
|
||||
auto_lock_t _lk{ print_lock };
|
||||
std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl;
|
||||
@@ -126,10 +128,11 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv)
|
||||
dim3 block(32, 32, 1); // transpose_a
|
||||
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
for(size_t i = 0; i < nitr; i++)
|
||||
for(size_t i = 0; i < nitr; ++i)
|
||||
{
|
||||
transpose_a<<<grid, block, 0, stream>>>(in, out, M, N);
|
||||
check_hip_error();
|
||||
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
}
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
@@ -179,15 +182,18 @@ do_a2a(int rank)
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int rank = 0;
|
||||
int size = 1;
|
||||
int nthreads = 2;
|
||||
int nitr = 5000;
|
||||
int rank = 0;
|
||||
int size = 1;
|
||||
int nthreads = 2;
|
||||
int nitr = 5000;
|
||||
size_t nsync = 10;
|
||||
if(argc > 1) nthreads = atoi(argv[1]);
|
||||
if(argc > 2) nitr = atoi(argv[2]);
|
||||
if(argc > 3) nsync = atoll(argv[3]);
|
||||
|
||||
printf("[transpose] Number of threads: %i\n", nthreads);
|
||||
printf("[transpose] Number of iterations: %i\n", nitr);
|
||||
printf("[transpose] Syncing every %zu iterations\n", nsync);
|
||||
|
||||
#if defined(USE_MPI)
|
||||
MPI_Init(&argc, &argv);
|
||||
|
||||
+1
-1
Submodule projects/rocprofiler-systems/external/PTL updated: 4afd2bdeb9...1451b6c279
+1
-1
Submodule projects/rocprofiler-systems/external/timemory updated: 9ccf9ec9f6...52e7034fd4
@@ -9,7 +9,7 @@ add_executable(
|
||||
${CMAKE_CURRENT_LIST_DIR}/avail.cpp ${CMAKE_CURRENT_LIST_DIR}/avail.hpp
|
||||
$<TARGET_OBJECTS:omnitrace::omnitrace-object-library>)
|
||||
|
||||
target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include)
|
||||
target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_compile_definitions(omnitrace-avail PRIVATE OMNITRACE_EXTERN_COMPONENTS=0)
|
||||
target_link_libraries(omnitrace-avail PRIVATE omnitrace::omnitrace-compile-definitions
|
||||
omnitrace::omnitrace-interface-library)
|
||||
|
||||
@@ -439,7 +439,9 @@ main(int argc, char** argv)
|
||||
|
||||
parser.add_argument({ "" }, "");
|
||||
parser.add_argument({ "[VIEW OPTIONS]" }, "");
|
||||
parser.add_argument({ "-A", "--available" }, "Only display available components")
|
||||
parser
|
||||
.add_argument({ "-A", "--available" },
|
||||
"Only display available components/settings/hw-counters")
|
||||
.max_count(1)
|
||||
.action([](parser_t& p) { available_only = p.get<bool>("available"); });
|
||||
parser
|
||||
@@ -892,8 +894,8 @@ write_settings_info(std::ostream& os, const array_t<bool, N>& opts,
|
||||
_setting_output.end());
|
||||
|
||||
// patch up the categories
|
||||
str_set_t _not_in_category_view{};
|
||||
auto _settings = tim::settings::shared_instance();
|
||||
auto _not_in_category_view = str_set_t{};
|
||||
auto _settings = tim::settings::shared_instance();
|
||||
for(auto& itr : _setting_output)
|
||||
{
|
||||
auto _name = itr.find("environ")->second;
|
||||
@@ -942,6 +944,19 @@ write_settings_info(std::ostream& os, const array_t<bool, N>& opts,
|
||||
}),
|
||||
_setting_output.end());
|
||||
|
||||
if(available_only)
|
||||
{
|
||||
_setting_output.erase(
|
||||
std::remove_if(_setting_output.begin(), _setting_output.end(),
|
||||
[&_settings](const auto& itr) {
|
||||
auto iitr = _settings->find(itr.at("environ"));
|
||||
if(iitr != _settings->end())
|
||||
return (iitr->second->get_enabled() == false);
|
||||
return true;
|
||||
}),
|
||||
_setting_output.end());
|
||||
}
|
||||
|
||||
if(alphabetical)
|
||||
{
|
||||
std::sort(_setting_output.begin(), _setting_output.end(),
|
||||
|
||||
@@ -10,12 +10,13 @@ add_executable(
|
||||
${CMAKE_CURRENT_LIST_DIR}/critical-trace.hpp
|
||||
$<TARGET_OBJECTS:omnitrace::omnitrace-object-library>)
|
||||
|
||||
target_include_directories(omnitrace-critical-trace
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include)
|
||||
target_include_directories(omnitrace-critical-trace PRIVATE ${CMAKE_CURRENT_LIST_DIR})
|
||||
target_compile_definitions(omnitrace-critical-trace PRIVATE OMNITRACE_EXTERN_COMPONENTS=0)
|
||||
target_link_libraries(
|
||||
omnitrace-critical-trace PRIVATE omnitrace::omnitrace-compile-definitions
|
||||
omnitrace::omnitrace-interface-library)
|
||||
omnitrace-critical-trace
|
||||
PRIVATE omnitrace::omnitrace-compile-definitions
|
||||
omnitrace::omnitrace-interface-library omnitrace::omnitrace-headers
|
||||
omnitrace::omnitrace-timemory)
|
||||
set_target_properties(
|
||||
omnitrace-critical-trace
|
||||
PROPERTIES BUILD_RPATH "\$ORIGIN:${PROJECT_BINARY_DIR}:${CMAKE_BINARY_DIR}"
|
||||
|
||||
@@ -176,7 +176,7 @@ module_function::serialize(ArchiveT& ar, const unsigned)
|
||||
}
|
||||
|
||||
ar(cereal::make_nvp("address_range", address_range),
|
||||
cereal::make_nvp("instructions", num_instructions),
|
||||
cereal::make_nvp("num_instructions", num_instructions),
|
||||
cereal::make_nvp("module", module_name),
|
||||
cereal::make_nvp("function", function_name),
|
||||
cereal::make_nvp("signature", signature));
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -31,6 +32,51 @@ inline namespace common
|
||||
{
|
||||
namespace
|
||||
{
|
||||
template <typename ContainerT, typename... Args>
|
||||
inline auto
|
||||
emplace_impl(ContainerT& _c, int, Args&&... _args)
|
||||
-> decltype(_c.emplace_back(std::forward<Args>(_args)...))
|
||||
{
|
||||
return _c.emplace_back(std::forward<Args>(_args)...);
|
||||
}
|
||||
|
||||
template <typename ContainerT, typename... Args>
|
||||
inline auto
|
||||
emplace_impl(ContainerT& _c, long, Args&&... _args)
|
||||
-> decltype(_c.emplace(std::forward<Args>(_args)...))
|
||||
{
|
||||
return _c.emplace(std::forward<Args>(_args)...);
|
||||
}
|
||||
|
||||
template <typename ContainerT, typename... Args>
|
||||
inline auto
|
||||
emplace(ContainerT& _c, Args&&... _args)
|
||||
{
|
||||
return emplace_impl(_c, 0, std::forward<Args>(_args)...);
|
||||
}
|
||||
|
||||
template <typename ContainerT, typename ArgT>
|
||||
inline auto
|
||||
reserve_impl(ContainerT& _c, int, ArgT _arg) -> decltype(_c.reserve(_arg), bool())
|
||||
{
|
||||
_c.reserve(_arg);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename ContainerT, typename ArgT>
|
||||
inline auto
|
||||
reserve_impl(ContainerT&, long, ArgT)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename ContainerT, typename ArgT>
|
||||
inline auto
|
||||
reserve(ContainerT& _c, ArgT _arg)
|
||||
{
|
||||
return reserve_impl(_c, 0, _arg);
|
||||
}
|
||||
|
||||
template <typename ContainerT = std::vector<std::string>>
|
||||
inline ContainerT
|
||||
delimit(const std::string& line, const char* delimiters = "\"',;: ");
|
||||
@@ -42,6 +88,18 @@ delimit(const std::string& line, const char* delimiters)
|
||||
ContainerT _result{};
|
||||
size_t _beginp = 0; // position that is the beginning of the new string
|
||||
size_t _delimp = 0; // position of the delimiter in the string
|
||||
if(reserve(_result, 0))
|
||||
{
|
||||
size_t _nmax = 0;
|
||||
for(char itr : line)
|
||||
{
|
||||
for(size_t j = 0; j < strlen(delimiters); ++j)
|
||||
{
|
||||
if(itr == delimiters[j]) ++_nmax;
|
||||
}
|
||||
}
|
||||
reserve(_result, _nmax);
|
||||
}
|
||||
while(_beginp < line.length() && _delimp < line.length())
|
||||
{
|
||||
// find the first character (starting at _delimp) that is not a delimiter
|
||||
@@ -56,7 +114,7 @@ delimit(const std::string& line, const char* delimiters)
|
||||
// between this position and the next delimiter
|
||||
_tmp = line.substr(_beginp, _delimp - _beginp);
|
||||
// don't add empty strings
|
||||
if(!_tmp.empty()) _result.emplace(_result.end(), _tmp);
|
||||
if(!_tmp.empty()) emplace(_result, _tmp);
|
||||
}
|
||||
return _result;
|
||||
}
|
||||
|
||||
@@ -60,6 +60,21 @@ get_thread_index()
|
||||
return _v;
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
auto
|
||||
ignore(const char* _name, int _verbose, int _value, const char* _reason, Args... _args)
|
||||
{
|
||||
if(_verbose >= _value)
|
||||
{
|
||||
fflush(stderr);
|
||||
fprintf(stderr,
|
||||
"[omnitrace][" OMNITRACE_COMMON_LIBRARY_NAME
|
||||
"][%li] %s(%s) was ignored :: %s\n",
|
||||
get_thread_index(), _name, join(", ", _args...).c_str(), _reason);
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename FuncT, typename... Args>
|
||||
auto
|
||||
invoke(const char* _name, int _verbose, bool& _toggle, FuncT&& _func, Args... _args)
|
||||
|
||||
@@ -22,8 +22,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ios>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#if !defined(OMNITRACE_FOLD_EXPRESSION)
|
||||
# define OMNITRACE_FOLD_EXPRESSION(...) ((__VA_ARGS__), ...)
|
||||
@@ -35,12 +37,51 @@ inline namespace common
|
||||
{
|
||||
namespace
|
||||
{
|
||||
template <typename Tp>
|
||||
struct is_string_impl : std::false_type
|
||||
{};
|
||||
|
||||
template <>
|
||||
struct is_string_impl<std::string> : std::true_type
|
||||
{};
|
||||
|
||||
template <>
|
||||
struct is_string_impl<std::string_view> : std::true_type
|
||||
{};
|
||||
|
||||
template <>
|
||||
struct is_string_impl<const char*> : std::true_type
|
||||
{};
|
||||
|
||||
template <>
|
||||
struct is_string_impl<char*> : std::true_type
|
||||
{};
|
||||
|
||||
template <typename Tp>
|
||||
struct is_string : is_string_impl<std::remove_cv_t<std::decay_t<Tp>>>
|
||||
{};
|
||||
|
||||
template <typename ArgT>
|
||||
auto
|
||||
as_string(ArgT&& _v, std::enable_if_t<is_string<ArgT>::value, int> = 0)
|
||||
{
|
||||
return std::string{ "\"" } + _v + std::string{ "\"" };
|
||||
}
|
||||
|
||||
template <typename ArgT>
|
||||
auto
|
||||
as_string(ArgT&& _v, std::enable_if_t<!is_string<ArgT>::value, long> = 0)
|
||||
{
|
||||
return _v;
|
||||
}
|
||||
|
||||
template <typename DelimT, typename... Args>
|
||||
auto
|
||||
join(DelimT&& _delim, Args&&... _args)
|
||||
{
|
||||
std::stringstream _ss{};
|
||||
OMNITRACE_FOLD_EXPRESSION(_ss << _delim << _args);
|
||||
_ss << std::boolalpha;
|
||||
OMNITRACE_FOLD_EXPRESSION(_ss << _delim << as_string(_args));
|
||||
auto _ret = _ss.str();
|
||||
if constexpr(std::is_same<DelimT, char>::value)
|
||||
{
|
||||
|
||||
@@ -6,43 +6,37 @@
|
||||
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
set(CMAKE_SKIP_RPATH OFF)
|
||||
set(BUILD_RPATH_USE_ORIGIN ON)
|
||||
set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
|
||||
set(CMAKE_CXX_VISIBILITY_PRESET "internal")
|
||||
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
add_library(omnitrace-dl-library SHARED)
|
||||
add_library(omnitrace::omnitrace-dl-library ALIAS omnitrace-dl-library)
|
||||
|
||||
target_sources(omnitrace-dl-library PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/dl.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/dl.hpp)
|
||||
target_link_libraries(
|
||||
omnitrace-dl-library
|
||||
PUBLIC ${dl_LIBRARY} $<BUILD_INTERFACE:omnitrace::common-library>
|
||||
$<BUILD_INTERFACE:omnitrace::omnitrace-compile-definitions>)
|
||||
target_include_directories(
|
||||
omnitrace-dl-library
|
||||
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../omnitrace-user>)
|
||||
target_link_libraries(
|
||||
omnitrace-dl-library
|
||||
PUBLIC
|
||||
${dl_LIBRARY}
|
||||
$<BUILD_INTERFACE:omnitrace::common-library>
|
||||
$<BUILD_INTERFACE:omnitrace::omnitrace-compile-definitions>
|
||||
$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBGCC}>,$<BUILD_INTERFACE:omnitrace::omnitrace-static-libgcc>,>
|
||||
$<IF:$<BOOL:${OMNITRACE_BUILD_STATIC_LIBSTDCXX}>,$<BUILD_INTERFACE:omnitrace::omnitrace-static-libstdcxx>,>
|
||||
)
|
||||
|
||||
check_cxx_compiler_flag("-fno-exceptions" omnitrace_dl_library_fno_exceptions)
|
||||
check_cxx_compiler_flag("-ftls-model=local-dynamic"
|
||||
omnitrace_dl_library_ftls_module_local_dynamic)
|
||||
|
||||
if(OMNITRACE_BUILD_DEVELOPER)
|
||||
if(omnitrace_dl_library_fno_exceptions)
|
||||
target_compile_options(omnitrace-dl-library PRIVATE -fno-exceptions)
|
||||
endif()
|
||||
|
||||
if(omnitrace_dl_library_ftls_module_local_dynamic)
|
||||
target_compile_options(omnitrace-dl-library PRIVATE -ftls-model=local-dynamic)
|
||||
endif()
|
||||
endif()
|
||||
add_target_cxx_flag_if_avail(omnitrace-dl-library "-ftls-model=global-dynamic")
|
||||
add_target_cxx_flag_if_avail(omnitrace-dl-library "-g")
|
||||
|
||||
set_target_properties(
|
||||
omnitrace-dl-library
|
||||
PROPERTIES OUTPUT_NAME omnitrace-dl
|
||||
CXX_VISIBILITY_PRESET "internal"
|
||||
VERSION ${PROJECT_VERSION}
|
||||
SOVERSION ${PROJECT_VERSION_MAJOR}
|
||||
POSITION_INDEPENDENT_CODE ON
|
||||
BUILD_RPATH "\$ORIGIN"
|
||||
INSTALL_RPATH "\$ORIGIN")
|
||||
|
||||
|
||||
@@ -122,9 +122,9 @@ const char* _omnitrace_dl_dlopen_descr = "RTLD_LAZY | RTLD_LOCAL";
|
||||
/// This class contains function pointers for omnitrace's instrumentation functions
|
||||
struct OMNITRACE_HIDDEN_API indirect
|
||||
{
|
||||
OMNITRACE_INLINE indirect(std::string omnilib, std::string userlib)
|
||||
: m_omnilib{ find_path(std::move(omnilib)) }
|
||||
, m_userlib{ find_path(std::move(userlib)) }
|
||||
OMNITRACE_INLINE indirect(const std::string& omnilib, const std::string& userlib)
|
||||
: m_omnilib{ find_path(omnilib) }
|
||||
, m_userlib{ find_path(userlib) }
|
||||
{
|
||||
if(_omnitrace_dl_verbose >= 1)
|
||||
{
|
||||
@@ -218,19 +218,20 @@ struct OMNITRACE_HIDDEN_API indirect
|
||||
}
|
||||
}
|
||||
|
||||
static OMNITRACE_INLINE std::string find_path(std::string&& _path)
|
||||
static OMNITRACE_INLINE std::string find_path(const std::string& _path)
|
||||
{
|
||||
auto _paths =
|
||||
delimit(join(":", get_env("OMNITRACE_PATH", ""),
|
||||
get_env("LD_LIBRARY_PATH", ""), get_env("LIBRARY_PATH", "")),
|
||||
":");
|
||||
auto _paths_search =
|
||||
join(":", get_env("OMNITRACE_PATH", ""), get_env("LD_LIBRARY_PATH", ""),
|
||||
get_env("LIBRARY_PATH", ""));
|
||||
|
||||
auto _paths = delimit(_paths_search, ":");
|
||||
|
||||
auto file_exists = [](const std::string& name) {
|
||||
struct stat buffer;
|
||||
return (stat(name.c_str(), &buffer) == 0);
|
||||
};
|
||||
|
||||
for(auto&& itr : _paths)
|
||||
for(const auto& itr : _paths)
|
||||
{
|
||||
auto _f = join('/', itr, _path);
|
||||
if(file_exists(_f)) return _f;
|
||||
@@ -269,10 +270,10 @@ get_indirect() OMNITRACE_HIDDEN_API;
|
||||
indirect&
|
||||
get_indirect()
|
||||
{
|
||||
static auto _v =
|
||||
indirect{ get_env("OMNITRACE_LIBRARY", "libomnitrace.so"),
|
||||
get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so") };
|
||||
return _v;
|
||||
static auto _libomni = get_env("OMNITRACE_LIBRARY", "libomnitrace.so");
|
||||
static auto _libuser = get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so");
|
||||
static auto* _v = new indirect{ _libomni, _libuser };
|
||||
return *_v;
|
||||
}
|
||||
|
||||
auto&
|
||||
@@ -340,6 +341,10 @@ bool _omnitrace_dl_fini = (std::atexit([]() {
|
||||
(::omnitrace::dl::get_thread_status() = false), \
|
||||
__VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_DL_IGNORE(...) \
|
||||
::omnitrace::common::ignore(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \
|
||||
__VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_DL_INVOKE_STATUS(STATUS, ...) \
|
||||
::omnitrace::common::invoke(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \
|
||||
STATUS, __VA_ARGS__)
|
||||
@@ -464,18 +469,30 @@ extern "C"
|
||||
|
||||
void omnitrace_set_env(const char* a, const char* b)
|
||||
{
|
||||
if(dl::get_inited() && dl::get_active())
|
||||
{
|
||||
OMNITRACE_DL_IGNORE(2, "already initialized and active", a, b);
|
||||
return;
|
||||
}
|
||||
setenv(a, b, 0);
|
||||
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_env_f, a, b);
|
||||
}
|
||||
|
||||
void omnitrace_set_mpi(bool a, bool b)
|
||||
{
|
||||
if(dl::get_inited() && dl::get_active())
|
||||
{
|
||||
OMNITRACE_DL_IGNORE(2, "already initialized and active", a, b);
|
||||
return;
|
||||
}
|
||||
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_mpi_f, a, b);
|
||||
}
|
||||
|
||||
void omnitrace_register_source(const char* file, const char* func, size_t line,
|
||||
size_t address, const char* source)
|
||||
{
|
||||
OMNITRACE_DL_LOG(3, "%s(\"%s\", \"%s\", %zu, %zu, \"%s\")\n", __FUNCTION__, file,
|
||||
func, line, address, source);
|
||||
OMNITRACE_DL_INVOKE(get_indirect().omnitrace_register_source_f, file, func, line,
|
||||
address, source);
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@ add_library(omnitrace-interface-library INTERFACE)
|
||||
add_library(omnitrace::omnitrace-interface-library ALIAS omnitrace-interface-library)
|
||||
|
||||
target_include_directories(
|
||||
omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include
|
||||
${CMAKE_CURRENT_BINARY_DIR}/include)
|
||||
omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
target_include_directories(omnitrace-interface-library SYSTEM
|
||||
INTERFACE ${perfetto_DIR}/sdk)
|
||||
|
||||
@@ -54,6 +54,7 @@ set(library_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/coverage.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/debug.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp
|
||||
@@ -70,6 +71,8 @@ set(library_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_create_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_mutex_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/user_region.cpp
|
||||
${perfetto_DIR}/sdk/perfetto.cc)
|
||||
|
||||
@@ -92,6 +95,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/thread_data.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/thread_sampler.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/timemory.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/utility.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/fwd.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.hpp
|
||||
@@ -102,6 +106,8 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer_callbacks.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_create_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_mutex_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/user_region.hpp
|
||||
${perfetto_DIR}/sdk/perfetto.h)
|
||||
|
||||
@@ -119,7 +125,8 @@ if(OMNITRACE_USE_ROCM_SMI)
|
||||
PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.cpp)
|
||||
endif()
|
||||
|
||||
target_link_libraries(omnitrace-object-library PRIVATE omnitrace-interface-library)
|
||||
target_link_libraries(omnitrace-object-library
|
||||
PRIVATE omnitrace::omnitrace-interface-library)
|
||||
|
||||
if(OMNITRACE_DYNINST_API_RT)
|
||||
get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}"
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/components/mpi_gotcha.hpp"
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/components/pthread_mutex_gotcha.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/coverage.hpp"
|
||||
#include "library/critical_trace.hpp"
|
||||
@@ -221,7 +222,7 @@ omnitrace_push_trace_hidden(const char* name)
|
||||
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
|
||||
auto _ts = comp::wall_clock::record();
|
||||
add_critical_trace<Device::CPU, Phase::BEGIN>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, 0,
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0,
|
||||
critical_trace::add_hash_id(name), _depth);
|
||||
}
|
||||
}
|
||||
@@ -263,7 +264,7 @@ omnitrace_pop_trace_hidden(const char* name)
|
||||
auto _ts = comp::wall_clock::record();
|
||||
std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid);
|
||||
add_critical_trace<Device::CPU, Phase::END>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts,
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0,
|
||||
critical_trace::add_hash_id(name), _depth);
|
||||
}
|
||||
}
|
||||
@@ -436,6 +437,8 @@ omnitrace_init_library_hidden()
|
||||
if(get_state() != State::PreInit || get_state() == State::Init || _once) return;
|
||||
_once = true;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s. Setting to %s...\n",
|
||||
std::to_string(get_state()).c_str(),
|
||||
std::to_string(State::Init).c_str());
|
||||
@@ -445,7 +448,7 @@ omnitrace_init_library_hidden()
|
||||
"glibc's backtrace() occurs...\n");
|
||||
{
|
||||
std::stringstream _ss{};
|
||||
tim::print_backtrace<64>(_ss);
|
||||
tim::print_backtrace<16>(_ss);
|
||||
(void) _ss;
|
||||
}
|
||||
|
||||
@@ -471,7 +474,7 @@ omnitrace_init_library_hidden()
|
||||
// below will effectively do:
|
||||
// get_cpu_cid_stack(0)->emplace_back(-1);
|
||||
// plus query some env variables
|
||||
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0);
|
||||
add_critical_trace<Device::CPU, Phase::NONE>(0, -1, 0, 0, 0, 0, 0, 0, 0);
|
||||
|
||||
if(gpu::device_count() == 0 && get_state() != State::Active)
|
||||
{
|
||||
@@ -550,6 +553,8 @@ omnitrace_init_tooling_hidden()
|
||||
if(get_state() != State::PreInit || get_state() == State::Init || _once) return false;
|
||||
_once = true;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
OMNITRACE_CONDITIONAL_THROW(
|
||||
get_state() == State::Init,
|
||||
"%s called after omnitrace_init_library() was explicitly called",
|
||||
@@ -571,6 +576,9 @@ omnitrace_init_tooling_hidden()
|
||||
OMNITRACE_DEBUG_F("\n");
|
||||
|
||||
auto _dtor = scope::destructor{ []() {
|
||||
// if set to finalized, don't continue
|
||||
if(get_state() > State::Active) return;
|
||||
if(config::get_trace_thread_locks()) pthread_mutex_gotcha::validate();
|
||||
if(get_use_thread_sampling())
|
||||
{
|
||||
pthread_gotcha::push_enable_sampling_on_child_threads(false);
|
||||
@@ -595,6 +603,12 @@ omnitrace_init_tooling_hidden()
|
||||
sampling::block_signals();
|
||||
}
|
||||
|
||||
if(get_use_critical_trace())
|
||||
{
|
||||
// initialize the thread pool
|
||||
(void) tasking::critical_trace::get_task_group();
|
||||
}
|
||||
|
||||
if(get_use_timemory())
|
||||
{
|
||||
comp::user_global_bundle::global_init();
|
||||
@@ -892,8 +906,8 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a
|
||||
});
|
||||
|
||||
std::atexit([]() {
|
||||
// if not already finalized then we should finalize
|
||||
if(get_state() != State::Finalized) omnitrace_finalize_hidden();
|
||||
// if active (not already finalized) then we should finalize
|
||||
if(get_state() == State::Active) omnitrace_finalize_hidden();
|
||||
});
|
||||
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(
|
||||
@@ -940,6 +954,8 @@ omnitrace_finalize_hidden(void)
|
||||
// disable thread id recycling during finalization
|
||||
threading::recycle_ids() = false;
|
||||
|
||||
set_thread_state(ThreadState::Completed);
|
||||
|
||||
// return if not active
|
||||
if(get_state() != State::Active)
|
||||
{
|
||||
@@ -1193,25 +1209,33 @@ omnitrace_finalize_hidden(void)
|
||||
using char_vec_t = std::vector<char>;
|
||||
OMNITRACE_VERBOSE_F(3, "Getting the trace data...\n");
|
||||
|
||||
#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0
|
||||
using perfetto_mpi_get_t = tim::operation::finalize::mpi_get<char_vec_t, true>;
|
||||
|
||||
char_vec_t _trace_data{ tracing_session->ReadTraceBlocking() };
|
||||
std::vector<char_vec_t> _rank_data = {};
|
||||
auto _combine = [](char_vec_t& _dst, const char_vec_t& _src) -> char_vec_t& {
|
||||
_dst.reserve(_dst.size() + _src.size());
|
||||
for(auto&& itr : _src)
|
||||
_dst.emplace_back(itr);
|
||||
return _dst;
|
||||
};
|
||||
|
||||
perfetto_mpi_get_t{ _rank_data, _trace_data, _combine };
|
||||
auto trace_data = char_vec_t{};
|
||||
for(auto& itr : _rank_data)
|
||||
trace_data =
|
||||
(trace_data.empty()) ? std::move(itr) : _combine(trace_data, itr);
|
||||
#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0
|
||||
if(get_perfetto_combined_traces())
|
||||
{
|
||||
using perfetto_mpi_get_t =
|
||||
tim::operation::finalize::mpi_get<char_vec_t, true>;
|
||||
|
||||
char_vec_t _trace_data{ tracing_session->ReadTraceBlocking() };
|
||||
std::vector<char_vec_t> _rank_data = {};
|
||||
auto _combine = [](char_vec_t& _dst, const char_vec_t& _src) -> char_vec_t& {
|
||||
_dst.reserve(_dst.size() + _src.size());
|
||||
for(auto&& itr : _src)
|
||||
_dst.emplace_back(itr);
|
||||
return _dst;
|
||||
};
|
||||
|
||||
perfetto_mpi_get_t{ _rank_data, _trace_data, _combine };
|
||||
for(auto& itr : _rank_data)
|
||||
trace_data =
|
||||
(trace_data.empty()) ? std::move(itr) : _combine(trace_data, itr);
|
||||
}
|
||||
else
|
||||
{
|
||||
trace_data = tracing_session->ReadTraceBlocking();
|
||||
}
|
||||
#else
|
||||
char_vec_t trace_data{ tracing_session->ReadTraceBlocking() };
|
||||
trace_data = tracing_session->ReadTraceBlocking();
|
||||
#endif
|
||||
|
||||
if(!trace_data.empty())
|
||||
|
||||
@@ -53,8 +53,8 @@ template <critical_trace::Device DevID, critical_trace::Phase PhaseID,
|
||||
bool UpdateStack = true>
|
||||
inline void
|
||||
add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, size_t _hash,
|
||||
uint16_t _depth, uint16_t _prio = 0)
|
||||
size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, uintptr_t _queue,
|
||||
size_t _hash, uint16_t _depth, uint16_t _prio = 0)
|
||||
{
|
||||
// clang-format off
|
||||
// these are used to create unique type mutexes
|
||||
@@ -62,6 +62,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
struct cpu_cid_stack {};
|
||||
// clang-format on
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
using tim::type_mutex;
|
||||
using auto_lock_t = tim::auto_lock_t;
|
||||
static constexpr auto num_mutexes = max_supported_threads;
|
||||
@@ -80,9 +82,9 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
if(!_self_lk.owns_lock()) _self_lk.lock();
|
||||
|
||||
auto& _critical_trace = critical_trace::get(_self_tid);
|
||||
_critical_trace->emplace_back(
|
||||
critical_trace::entry{ _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid,
|
||||
_gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash });
|
||||
_critical_trace->emplace_back(critical_trace::entry{
|
||||
_prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid,
|
||||
_ts_beg, _ts_val, _queue, _hash });
|
||||
}
|
||||
|
||||
if constexpr(UpdateStack)
|
||||
@@ -119,6 +121,6 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid,
|
||||
}
|
||||
|
||||
tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val,
|
||||
_hash, _depth, _prio);
|
||||
_queue, _hash, _depth, _prio);
|
||||
}
|
||||
} // namespace omnitrace
|
||||
|
||||
+1
-1
@@ -271,7 +271,7 @@ backtrace::sample(int signum)
|
||||
m_ts = clock_type::now();
|
||||
m_thr_cpu_ts = tim::get_clock_thread_now<int64_t, std::nano>();
|
||||
m_mem_peak = tim::get_peak_rss(RUSAGE_THREAD);
|
||||
m_data = tim::get_unw_backtrace<128, 4, false>();
|
||||
m_data = tim::get_unw_backtrace<stack_depth, 4, false>();
|
||||
auto* itr = m_data.begin();
|
||||
for(; itr != m_data.end(); ++itr, ++m_size)
|
||||
{
|
||||
|
||||
+3
-1
@@ -52,8 +52,10 @@ struct backtrace
|
||||
, tim::concepts::component
|
||||
{
|
||||
static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE;
|
||||
static constexpr size_t buffer_width = 512;
|
||||
static constexpr size_t stack_depth = 128;
|
||||
|
||||
using data_t = std::array<char[512], 128>;
|
||||
using data_t = std::array<char[buffer_width], stack_depth>;
|
||||
using clock_type = std::chrono::steady_clock;
|
||||
using time_point_type = typename clock_type::time_point;
|
||||
using value_type = void;
|
||||
|
||||
+15
-3
@@ -23,6 +23,12 @@
|
||||
#include "library/components/fork_gotcha.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/state.hpp"
|
||||
|
||||
#include <timemory/backends/process.hpp>
|
||||
#include <timemory/backends/threading.hpp>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
@@ -37,6 +43,8 @@ fork_gotcha::configure()
|
||||
void
|
||||
fork_gotcha::audit(const gotcha_data_t&, audit::incoming)
|
||||
{
|
||||
OMNITRACE_VERBOSE(1, "fork() called on PID %i (rank: %i), TID %li\n",
|
||||
process::get_id(), dmp::rank(), threading::get_id());
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(
|
||||
get_debug_env(),
|
||||
"Warning! Calling fork() within an OpenMPI application using libfabric "
|
||||
@@ -45,9 +53,13 @@ fork_gotcha::audit(const gotcha_data_t&, audit::incoming)
|
||||
}
|
||||
|
||||
void
|
||||
fork_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid)
|
||||
fork_gotcha::audit(const gotcha_data_t&, audit::outgoing, pid_t _pid)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "%s() return PID %i\n",
|
||||
_data.tool_id.c_str(), (int) _pid);
|
||||
if(_pid != 0)
|
||||
{
|
||||
OMNITRACE_VERBOSE(1, "fork() called on PID %i created PID %i\n", getppid(), _pid);
|
||||
tim::settings::use_output_suffix() = true;
|
||||
tim::settings::default_process_suffix() = process::get_id();
|
||||
}
|
||||
}
|
||||
} // namespace omnitrace
|
||||
|
||||
+12
-3
@@ -24,10 +24,13 @@
|
||||
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/state.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
#include "timemory/mpl/concepts.hpp"
|
||||
#include "timemory/mpl/function_traits.hpp"
|
||||
#include "timemory/utility/macros.hpp"
|
||||
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/mpl/function_traits.hpp>
|
||||
#include <timemory/utility/macros.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
@@ -71,6 +74,7 @@ struct functors : comp::base<functors<ApiT, StartFuncT, StopFuncT>, void>
|
||||
int> = 0>
|
||||
static auto start(Args&&... _args)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
get_functors().first(std::forward<Args>(_args)...);
|
||||
}
|
||||
|
||||
@@ -79,6 +83,7 @@ struct functors : comp::base<functors<ApiT, StartFuncT, StopFuncT>, void>
|
||||
int> = 0>
|
||||
static auto stop(Args&&... _args)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
get_functors().second(std::forward<Args>(_args)...);
|
||||
}
|
||||
|
||||
@@ -87,24 +92,28 @@ struct functors : comp::base<functors<ApiT, StartFuncT, StopFuncT>, void>
|
||||
template <typename Tp = this_type, enable_if_t<Tp::begin_supports_cstr, int> = 0>
|
||||
void start()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
get_functors().first(m_prefix);
|
||||
}
|
||||
|
||||
template <typename Tp = this_type, enable_if_t<Tp::end_supports_cstr, int> = 0>
|
||||
void stop()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
get_functors().second(m_prefix);
|
||||
}
|
||||
|
||||
template <typename Tp = this_type, enable_if_t<Tp::begin_supports_void, int> = 0>
|
||||
void start()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
get_functors().first();
|
||||
}
|
||||
|
||||
template <typename Tp = this_type, enable_if_t<Tp::end_supports_void, int> = 0>
|
||||
void stop()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
get_functors().second();
|
||||
}
|
||||
|
||||
|
||||
+15
-21
@@ -46,8 +46,7 @@ omnitrace_mpi_set_attr()
|
||||
return MPI_SUCCESS;
|
||||
};
|
||||
static auto _mpi_fini = [](MPI_Comm, int, void*, void*) {
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(),
|
||||
"MPI Comm attribute finalize\n");
|
||||
OMNITRACE_DEBUG("MPI Comm attribute finalize\n");
|
||||
if(mpip_index != std::numeric_limits<uint64_t>::max())
|
||||
comp::deactivate_mpip<tim::component_tuple<omnitrace::component::omnitrace>,
|
||||
api::omnitrace>(mpip_index);
|
||||
@@ -83,8 +82,7 @@ mpi_gotcha::configure()
|
||||
void
|
||||
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s(int*, char***)\n",
|
||||
_data.tool_id.c_str());
|
||||
OMNITRACE_BASIC_DEBUG_F("%s(int*, char***)\n", _data.tool_id.c_str());
|
||||
|
||||
if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit);
|
||||
|
||||
@@ -98,8 +96,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***)
|
||||
void
|
||||
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, int, int*)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s(int*, char***, int, int*)\n",
|
||||
_data.tool_id.c_str());
|
||||
OMNITRACE_BASIC_DEBUG_F("%s(int*, char***, int, int*)\n", _data.tool_id.c_str());
|
||||
|
||||
if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit);
|
||||
|
||||
@@ -113,7 +110,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, in
|
||||
void
|
||||
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s()\n", _data.tool_id.c_str());
|
||||
OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str());
|
||||
|
||||
if(mpip_index != std::numeric_limits<uint64_t>::max())
|
||||
comp::deactivate_mpip<tim::component_tuple<omnitrace::component::omnitrace>,
|
||||
@@ -130,7 +127,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming)
|
||||
void
|
||||
mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t, int* _val)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s()\n", _data.tool_id.c_str());
|
||||
OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str());
|
||||
|
||||
omnitrace_push_trace_hidden(_data.tool_id.c_str());
|
||||
if(_data.tool_id == "MPI_Comm_rank")
|
||||
@@ -151,8 +148,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t, int* _val
|
||||
void
|
||||
mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s() returned %i\n",
|
||||
_data.tool_id.c_str(), (int) _retval);
|
||||
OMNITRACE_BASIC_DEBUG_F("%s() returned %i\n", _data.tool_id.c_str(), (int) _retval);
|
||||
|
||||
if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Init") == 0)
|
||||
{
|
||||
@@ -164,8 +160,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
// were excluded via a regex expression)
|
||||
if(get_use_mpip())
|
||||
{
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env() || get_verbose_env() > 0,
|
||||
"Activating MPI wrappers...\n");
|
||||
OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n");
|
||||
|
||||
// use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST
|
||||
// to control the gotcha bindings at runtime
|
||||
@@ -186,13 +181,12 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
tim::mpi::set_rank(m_rank);
|
||||
tim::settings::default_process_suffix() = m_rank;
|
||||
get_perfetto_output_filename().clear();
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(
|
||||
get_debug() || get_verbose() > 0, "[pid=%i] MPI rank: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::rank(), m_rank);
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::rank(), m_rank);
|
||||
}
|
||||
else
|
||||
{
|
||||
OMNITRACE_BASIC_PRINT_F("%s() returned %i :: nullptr to rank\n",
|
||||
OMNITRACE_BASIC_VERBOSE(0, "%s() returned %i :: nullptr to rank\n",
|
||||
_data.tool_id.c_str(), (int) _retval);
|
||||
}
|
||||
}
|
||||
@@ -202,19 +196,19 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
{
|
||||
m_size = std::max<int>(*m_size_ptr, m_size);
|
||||
tim::mpi::set_size(m_size);
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(
|
||||
get_debug() || get_verbose() > 0, "[pid=%i] MPI size: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::size(), m_size);
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::size(), m_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
OMNITRACE_BASIC_PRINT_F("%s() returned %i :: nullptr to size\n",
|
||||
OMNITRACE_BASIC_VERBOSE(0, "%s() returned %i :: nullptr to size\n",
|
||||
_data.tool_id.c_str(), (int) _retval);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
OMNITRACE_BASIC_PRINT_F("%s() returned %i :: unexpected function wrapper\n",
|
||||
OMNITRACE_BASIC_VERBOSE(0,
|
||||
"%s() returned %i :: unexpected function wrapper\n",
|
||||
_data.tool_id.c_str(), (int) _retval);
|
||||
}
|
||||
}
|
||||
|
||||
+301
@@ -0,0 +1,301 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/pthread_create_gotcha.hpp"
|
||||
#include "library/components/omnitrace.hpp"
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
#include <bits/stdint-intn.h>
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/sampling/allocator.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <ostream>
|
||||
#include <pthread.h>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace sampling
|
||||
{
|
||||
std::set<int>
|
||||
setup();
|
||||
std::set<int>
|
||||
shutdown();
|
||||
} // namespace sampling
|
||||
|
||||
namespace mpl = tim::mpl;
|
||||
|
||||
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
|
||||
using wall_pw_t = mpl::piecewise_select<comp::wall_clock>; // only wall-clock
|
||||
using main_pw_t = mpl::piecewise_ignore<comp::wall_clock>; // exclude wall-clock
|
||||
|
||||
namespace
|
||||
{
|
||||
auto* is_shutdown = new bool{ false }; // intentional data leak
|
||||
auto* bundles = new std::map<int64_t, std::shared_ptr<bundle_t>>{};
|
||||
auto* bundles_mutex = new std::mutex{};
|
||||
auto bundles_dtor = scope::destructor{ []() {
|
||||
omnitrace::pthread_create_gotcha::shutdown();
|
||||
delete bundles;
|
||||
delete bundles_mutex;
|
||||
bundles = nullptr;
|
||||
bundles_mutex = nullptr;
|
||||
} };
|
||||
|
||||
inline void
|
||||
start_bundle(bundle_t& _bundle)
|
||||
{
|
||||
if(!get_use_timemory()) return;
|
||||
OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str());
|
||||
_bundle.push();
|
||||
_bundle.start();
|
||||
}
|
||||
|
||||
inline void
|
||||
stop_bundle(bundle_t& _bundle, int64_t _tid)
|
||||
{
|
||||
if(!get_use_timemory()) return;
|
||||
OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n",
|
||||
_bundle.key().c_str(), _tid);
|
||||
_bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value
|
||||
// update roctracer_data
|
||||
_bundle.store(std::plus<double>{},
|
||||
_bundle.get<comp::wall_clock>()->get() * units::sec);
|
||||
// stop all other components including roctracer_data after update
|
||||
_bundle.stop(main_pw_t{});
|
||||
// exclude popping wall-clock
|
||||
_bundle.pop(_tid);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
pthread_create_gotcha::wrapper::wrapper(routine_t _routine, void* _arg,
|
||||
bool _enable_sampling, int64_t _parent,
|
||||
promise_t* _p)
|
||||
: m_enable_sampling{ _enable_sampling }
|
||||
, m_parent_tid{ _parent }
|
||||
, m_routine{ _routine }
|
||||
, m_arg{ _arg }
|
||||
, m_promise{ _p }
|
||||
{}
|
||||
|
||||
void*
|
||||
pthread_create_gotcha::wrapper::operator()() const
|
||||
{
|
||||
if(is_shutdown && *is_shutdown)
|
||||
{
|
||||
if(m_promise) m_promise->set_value();
|
||||
// execute the original function
|
||||
return m_routine(m_arg);
|
||||
}
|
||||
|
||||
set_thread_state(ThreadState::Internal);
|
||||
|
||||
int64_t _tid = -1;
|
||||
auto _is_sampling = false;
|
||||
auto _bundle = std::shared_ptr<bundle_t>{};
|
||||
auto _signals = std::set<int>{};
|
||||
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
|
||||
auto _dtor = scope::destructor{ [&]() {
|
||||
set_thread_state(ThreadState::Internal);
|
||||
if(_is_sampling)
|
||||
{
|
||||
sampling::block_signals(_signals);
|
||||
sampling::shutdown();
|
||||
}
|
||||
|
||||
pthread_create_gotcha::shutdown(_tid);
|
||||
set_thread_state(ThreadState::Completed);
|
||||
} };
|
||||
|
||||
auto _active = (get_state() == omnitrace::State::Active && bundles != nullptr &&
|
||||
bundles_mutex != nullptr);
|
||||
|
||||
if(_active && !_coverage)
|
||||
{
|
||||
_tid = threading::get_id();
|
||||
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
|
||||
if(bundles && bundles_mutex)
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
_bundle = bundles->emplace(_tid, std::make_shared<bundle_t>("start_thread"))
|
||||
.first->second;
|
||||
}
|
||||
if(_bundle) start_bundle(*_bundle);
|
||||
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
|
||||
if(m_enable_sampling)
|
||||
{
|
||||
// initialize thread-local statics
|
||||
(void) tim::get_unw_backtrace<12, 1, false>();
|
||||
_is_sampling = true;
|
||||
pthread_gotcha::push_enable_sampling_on_child_threads(false);
|
||||
_signals = sampling::setup();
|
||||
pthread_gotcha::pop_enable_sampling_on_child_threads();
|
||||
sampling::unblock_signals();
|
||||
}
|
||||
}
|
||||
|
||||
if(m_promise) m_promise->set_value();
|
||||
|
||||
set_thread_state(ThreadState::Enabled);
|
||||
// execute the original function
|
||||
return m_routine(m_arg);
|
||||
}
|
||||
|
||||
void*
|
||||
pthread_create_gotcha::wrapper::wrap(void* _arg)
|
||||
{
|
||||
if(_arg == nullptr) return nullptr;
|
||||
|
||||
// convert the argument
|
||||
wrapper* _wrapper = static_cast<wrapper*>(_arg);
|
||||
|
||||
// execute the original function
|
||||
return (*_wrapper)();
|
||||
}
|
||||
|
||||
void
|
||||
pthread_create_gotcha::configure()
|
||||
{
|
||||
pthread_create_gotcha_t::get_initializer() = []() {
|
||||
pthread_create_gotcha_t::template configure<
|
||||
0, int, pthread_t*, const pthread_attr_t*, void* (*) (void*), void*>(
|
||||
"pthread_create");
|
||||
};
|
||||
}
|
||||
|
||||
void
|
||||
pthread_create_gotcha::shutdown()
|
||||
{
|
||||
if(is_shutdown)
|
||||
{
|
||||
if(*is_shutdown) return;
|
||||
*is_shutdown = true;
|
||||
}
|
||||
|
||||
if(!bundles_mutex || !bundles) return;
|
||||
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
unsigned long _ndangling = 0;
|
||||
for(auto itr : *bundles)
|
||||
{
|
||||
if(itr.second)
|
||||
{
|
||||
stop_bundle(*itr.second, itr.first);
|
||||
++_ndangling;
|
||||
}
|
||||
itr.second.reset();
|
||||
}
|
||||
|
||||
bundles->clear();
|
||||
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(
|
||||
(get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0,
|
||||
"[pthread_create_gotcha::shutdown] cleaned up %lu dangling bundles\n",
|
||||
_ndangling);
|
||||
}
|
||||
|
||||
void
|
||||
pthread_create_gotcha::shutdown(int64_t _tid)
|
||||
{
|
||||
if(is_shutdown && *is_shutdown) return;
|
||||
|
||||
if(!bundles_mutex || !bundles) return;
|
||||
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
auto itr = bundles->find(_tid);
|
||||
if(itr != bundles->end())
|
||||
{
|
||||
if(itr->second) stop_bundle(*itr->second, itr->first);
|
||||
itr->second.reset();
|
||||
bundles->erase(itr);
|
||||
}
|
||||
}
|
||||
|
||||
// pthread_create
|
||||
int
|
||||
pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
void* (*start_routine)(void*), void* arg) const
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
bundle_t _bundle{ "pthread_create" };
|
||||
auto _enable_sampling = pthread_gotcha::sampling_enabled_on_child_threads();
|
||||
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
|
||||
auto _active = (get_state() == omnitrace::State::Active);
|
||||
int64_t _tid = (_active) ? threading::get_id() : 0;
|
||||
|
||||
if(_active)
|
||||
{
|
||||
OMNITRACE_VERBOSE(1, "Creating new thread on PID %i (rank: %i), TID %li\n",
|
||||
process::get_id(), dmp::rank(), _tid);
|
||||
}
|
||||
|
||||
// ensure that cpu cid stack exists on the parent thread if active
|
||||
if(!_coverage && _active) get_cpu_cid_stack();
|
||||
|
||||
if(!get_use_sampling() || !_enable_sampling)
|
||||
{
|
||||
auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr);
|
||||
// create the thread
|
||||
auto _ret =
|
||||
::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_obj));
|
||||
return _ret;
|
||||
}
|
||||
|
||||
// block the signals in entire process
|
||||
OMNITRACE_DEBUG("blocking signals...\n");
|
||||
tim::sampling::block_signals({ SIGALRM, SIGPROF },
|
||||
tim::sampling::sigmask_scope::process);
|
||||
|
||||
start_bundle(_bundle);
|
||||
|
||||
// promise set by thread when signal handler is configured
|
||||
auto _promise = std::promise<void>{};
|
||||
auto _fut = _promise.get_future();
|
||||
auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise);
|
||||
|
||||
// create the thread
|
||||
auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_wrap));
|
||||
|
||||
// wait for thread to set promise
|
||||
OMNITRACE_DEBUG("waiting for child to signal it is setup...\n");
|
||||
_fut.wait();
|
||||
|
||||
stop_bundle(_bundle, threading::get_id());
|
||||
|
||||
// unblock the signals in the entire process
|
||||
OMNITRACE_DEBUG("unblocking signals...\n");
|
||||
tim::sampling::unblock_signals({ SIGALRM, SIGPROF },
|
||||
tim::sampling::sigmask_scope::process);
|
||||
|
||||
OMNITRACE_DEBUG("returning success...\n");
|
||||
return _ret;
|
||||
}
|
||||
|
||||
} // namespace omnitrace
|
||||
+71
@@ -0,0 +1,71 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "library/common.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <future>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
struct pthread_create_gotcha : tim::component::base<pthread_create_gotcha, void>
|
||||
{
|
||||
struct wrapper
|
||||
{
|
||||
using routine_t = void* (*) (void*);
|
||||
using promise_t = std::promise<void>;
|
||||
|
||||
wrapper(routine_t _routine, void* _arg, bool, int64_t, promise_t*);
|
||||
void* operator()() const;
|
||||
|
||||
static void* wrap(void* _arg);
|
||||
|
||||
private:
|
||||
bool m_enable_sampling = false;
|
||||
int64_t m_parent_tid = 0;
|
||||
routine_t m_routine = nullptr;
|
||||
void* m_arg = nullptr;
|
||||
promise_t* m_promise = nullptr;
|
||||
};
|
||||
|
||||
TIMEMORY_DEFAULT_OBJECT(pthread_create_gotcha)
|
||||
|
||||
// string id for component
|
||||
static std::string label() { return "pthread_create_gotcha"; }
|
||||
|
||||
// generate the gotcha wrappers
|
||||
static void configure();
|
||||
static void shutdown();
|
||||
static void shutdown(int64_t);
|
||||
|
||||
// pthread_create
|
||||
int operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
void* (*start_routine)(void*), void* arg) const;
|
||||
};
|
||||
|
||||
using pthread_create_gotcha_t =
|
||||
tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>;
|
||||
} // namespace omnitrace
|
||||
+29
-232
@@ -22,224 +22,62 @@
|
||||
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/components/omnitrace.hpp"
|
||||
#include "library/components/pthread_create_gotcha.hpp"
|
||||
#include "library/components/pthread_mutex_gotcha.hpp"
|
||||
#include "library/components/roctracer.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/utility.hpp"
|
||||
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/sampling/allocator.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <ostream>
|
||||
#include <pthread.h>
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace sampling
|
||||
{
|
||||
std::set<int>
|
||||
setup();
|
||||
std::set<int>
|
||||
shutdown();
|
||||
} // namespace sampling
|
||||
|
||||
namespace mpl = tim::mpl;
|
||||
|
||||
using bundle_t = tim::lightweight_tuple<comp::wall_clock, comp::roctracer_data>;
|
||||
using wall_pw_t = mpl::piecewise_select<comp::wall_clock>; // only wall-clock
|
||||
using main_pw_t = mpl::piecewise_ignore<comp::wall_clock>; // exclude wall-clock
|
||||
|
||||
namespace
|
||||
{
|
||||
auto* is_shutdown = new bool{ false }; // intentional data leak
|
||||
auto* bundles = new std::map<int64_t, std::shared_ptr<bundle_t>>{};
|
||||
auto* bundles_mutex = new std::mutex{};
|
||||
auto bundles_dtor = scope::destructor{ []() {
|
||||
omnitrace::pthread_gotcha::shutdown();
|
||||
delete bundles;
|
||||
delete bundles_mutex;
|
||||
bundles = nullptr;
|
||||
bundles_mutex = nullptr;
|
||||
} };
|
||||
using bundle_t = tim::lightweight_tuple<pthread_create_gotcha_t, pthread_mutex_gotcha_t>;
|
||||
|
||||
inline void
|
||||
start_bundle(bundle_t& _bundle)
|
||||
auto&
|
||||
get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index())
|
||||
{
|
||||
if(!get_use_timemory()) return;
|
||||
OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str());
|
||||
if(comp::roctracer::is_setup())
|
||||
{
|
||||
_bundle.push();
|
||||
_bundle.start();
|
||||
}
|
||||
else
|
||||
{
|
||||
_bundle.push(wall_pw_t{});
|
||||
_bundle.start(wall_pw_t{});
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
stop_bundle(bundle_t& _bundle, int64_t _tid)
|
||||
{
|
||||
if(!get_use_timemory()) return;
|
||||
OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n",
|
||||
_bundle.key().c_str(), _tid);
|
||||
_bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value
|
||||
// update roctracer_data
|
||||
_bundle.store(std::plus<double>{},
|
||||
_bundle.get<comp::wall_clock>()->get() * units::sec);
|
||||
// stop all other components including roctracer_data after update
|
||||
_bundle.stop(main_pw_t{});
|
||||
// exclude popping wall-clock
|
||||
_bundle.pop(_tid);
|
||||
}
|
||||
|
||||
auto
|
||||
get_thread_index()
|
||||
{
|
||||
static std::atomic<int64_t> _c{ 0 };
|
||||
static thread_local int64_t _v = _c++;
|
||||
return _v;
|
||||
static auto _v = utility::get_filled_array<OMNITRACE_MAX_THREADS>(
|
||||
[]() { return utility::get_reserved_vector<bool>(32); });
|
||||
return _v.at(_idx);
|
||||
}
|
||||
|
||||
auto&
|
||||
get_sampling_on_child_threads_history(int64_t _idx = get_thread_index())
|
||||
get_bundle()
|
||||
{
|
||||
static auto _v = std::array<std::vector<bool>, OMNITRACE_MAX_THREADS>{};
|
||||
return _v.at(_idx);
|
||||
static auto _v = std::unique_ptr<bundle_t>{};
|
||||
if(!_v) _v = std::make_unique<bundle_t>("pthread_gotcha");
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sampling,
|
||||
int64_t _parent, promise_t* _p)
|
||||
: m_enable_sampling{ _enable_sampling }
|
||||
, m_parent_tid{ _parent }
|
||||
, m_routine{ _routine }
|
||||
, m_arg{ _arg }
|
||||
, m_promise{ _p }
|
||||
{}
|
||||
|
||||
void*
|
||||
pthread_gotcha::wrapper::operator()() const
|
||||
{
|
||||
if(is_shutdown && *is_shutdown)
|
||||
{
|
||||
if(m_promise) m_promise->set_value();
|
||||
// execute the original function
|
||||
return m_routine(m_arg);
|
||||
}
|
||||
|
||||
int64_t _tid = -1;
|
||||
auto _is_sampling = false;
|
||||
auto _bundle = std::shared_ptr<bundle_t>{};
|
||||
auto _signals = std::set<int>{};
|
||||
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
|
||||
auto _dtor = scope::destructor{ [&]() {
|
||||
if(_is_sampling)
|
||||
{
|
||||
sampling::block_signals(_signals);
|
||||
sampling::shutdown();
|
||||
}
|
||||
|
||||
if(!bundles || !bundles_mutex) return;
|
||||
if(_bundle && get_state() < omnitrace::State::Finalized)
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
stop_bundle(*_bundle, _tid);
|
||||
_bundle.reset();
|
||||
bundles->erase(_tid);
|
||||
}
|
||||
} };
|
||||
|
||||
auto _active = (get_state() == omnitrace::State::Active && bundles && bundles_mutex);
|
||||
|
||||
if(_active && !_coverage)
|
||||
{
|
||||
_tid = threading::get_id();
|
||||
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
|
||||
if(bundles && bundles_mutex)
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
if(comp::roctracer::is_setup())
|
||||
_bundle =
|
||||
bundles->emplace(_tid, std::make_shared<bundle_t>("start_thread"))
|
||||
.first->second;
|
||||
}
|
||||
if(_bundle) start_bundle(*_bundle);
|
||||
get_cpu_cid_stack(threading::get_id(), m_parent_tid);
|
||||
if(m_enable_sampling)
|
||||
{
|
||||
// initialize thread-local statics
|
||||
(void) tim::get_unw_backtrace<12, 1, false>();
|
||||
_is_sampling = true;
|
||||
push_enable_sampling_on_child_threads(false);
|
||||
_signals = sampling::setup();
|
||||
pop_enable_sampling_on_child_threads();
|
||||
sampling::unblock_signals();
|
||||
}
|
||||
}
|
||||
|
||||
if(m_promise) m_promise->set_value();
|
||||
|
||||
// execute the original function
|
||||
return m_routine(m_arg);
|
||||
}
|
||||
|
||||
void*
|
||||
pthread_gotcha::wrapper::wrap(void* _arg)
|
||||
{
|
||||
if(_arg == nullptr) return nullptr;
|
||||
|
||||
// convert the argument
|
||||
wrapper* _wrapper = static_cast<wrapper*>(_arg);
|
||||
|
||||
// execute the original function
|
||||
return (*_wrapper)();
|
||||
}
|
||||
|
||||
void
|
||||
pthread_gotcha::configure()
|
||||
{
|
||||
pthread_gotcha_t::get_initializer() = []() {
|
||||
pthread_gotcha_t::template configure<0, int, pthread_t*, const pthread_attr_t*,
|
||||
void* (*) (void*), void*>("pthread_create");
|
||||
};
|
||||
pthread_create_gotcha::configure();
|
||||
pthread_mutex_gotcha::configure();
|
||||
}
|
||||
|
||||
void
|
||||
pthread_gotcha::shutdown()
|
||||
{
|
||||
if(is_shutdown)
|
||||
{
|
||||
if(*is_shutdown) return;
|
||||
*is_shutdown = true;
|
||||
}
|
||||
|
||||
if(!bundles_mutex || !bundles) return;
|
||||
|
||||
std::unique_lock<std::mutex> _lk{ *bundles_mutex };
|
||||
unsigned long _ndangling = 0;
|
||||
for(auto itr : *bundles)
|
||||
{
|
||||
if(itr.second)
|
||||
{
|
||||
stop_bundle(*itr.second, itr.first);
|
||||
++_ndangling;
|
||||
}
|
||||
itr.second.reset();
|
||||
}
|
||||
|
||||
bundles->clear();
|
||||
|
||||
OMNITRACE_CONDITIONAL_BASIC_PRINT(
|
||||
(get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0,
|
||||
"[pthread_gotcha::shutdown] cleaned up %lu dangling bundles\n", _ndangling);
|
||||
pthread_create_gotcha::shutdown();
|
||||
pthread_mutex_gotcha::shutdown();
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -287,57 +125,16 @@ pthread_gotcha::sampling_on_child_threads()
|
||||
return _v;
|
||||
}
|
||||
|
||||
// pthread_create
|
||||
int
|
||||
pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
void* (*start_routine)(void*), void* arg) const
|
||||
void
|
||||
pthread_gotcha::start()
|
||||
{
|
||||
bundle_t _bundle{ "pthread_create" };
|
||||
auto _enable_sampling = sampling_enabled_on_child_threads();
|
||||
auto _coverage = (get_mode() == omnitrace::Mode::Coverage);
|
||||
auto _active = (get_state() == omnitrace::State::Active);
|
||||
int64_t _tid = (_active) ? threading::get_id() : 0;
|
||||
|
||||
// ensure that cpu cid stack exists on the parent thread if active
|
||||
if(!_coverage && _active) get_cpu_cid_stack();
|
||||
|
||||
if(!get_use_sampling() || !_enable_sampling)
|
||||
{
|
||||
auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr);
|
||||
// create the thread
|
||||
auto _ret =
|
||||
::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_obj));
|
||||
return _ret;
|
||||
}
|
||||
|
||||
// block the signals in entire process
|
||||
OMNITRACE_DEBUG("blocking signals...\n");
|
||||
tim::sampling::block_signals({ SIGALRM, SIGPROF },
|
||||
tim::sampling::sigmask_scope::process);
|
||||
|
||||
start_bundle(_bundle);
|
||||
|
||||
// promise set by thread when signal handler is configured
|
||||
auto _promise = std::promise<void>{};
|
||||
auto _fut = _promise.get_future();
|
||||
auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise);
|
||||
|
||||
// create the thread
|
||||
auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast<void*>(_wrap));
|
||||
|
||||
// wait for thread to set promise
|
||||
OMNITRACE_DEBUG("waiting for child to signal it is setup...\n");
|
||||
_fut.wait();
|
||||
|
||||
stop_bundle(_bundle, threading::get_id());
|
||||
|
||||
// unblock the signals in the entire process
|
||||
OMNITRACE_DEBUG("unblocking signals...\n");
|
||||
tim::sampling::unblock_signals({ SIGALRM, SIGPROF },
|
||||
tim::sampling::sigmask_scope::process);
|
||||
|
||||
OMNITRACE_DEBUG("returning success...\n");
|
||||
return _ret;
|
||||
get_bundle()->start();
|
||||
}
|
||||
|
||||
void
|
||||
pthread_gotcha::stop()
|
||||
{
|
||||
get_bundle()->stop();
|
||||
get_bundle().reset();
|
||||
}
|
||||
} // namespace omnitrace
|
||||
|
||||
+2
-23
@@ -33,24 +33,6 @@ namespace omnitrace
|
||||
{
|
||||
struct pthread_gotcha : tim::component::base<pthread_gotcha, void>
|
||||
{
|
||||
struct wrapper
|
||||
{
|
||||
using routine_t = void* (*) (void*);
|
||||
using promise_t = std::promise<void>;
|
||||
|
||||
wrapper(routine_t _routine, void* _arg, bool, int64_t, promise_t*);
|
||||
void* operator()() const;
|
||||
|
||||
static void* wrap(void* _arg);
|
||||
|
||||
private:
|
||||
bool m_enable_sampling = false;
|
||||
int64_t m_parent_tid = 0;
|
||||
routine_t m_routine = nullptr;
|
||||
void* m_arg = nullptr;
|
||||
promise_t* m_promise = nullptr;
|
||||
};
|
||||
|
||||
TIMEMORY_DEFAULT_OBJECT(pthread_gotcha)
|
||||
|
||||
// string id for component
|
||||
@@ -72,13 +54,10 @@ struct pthread_gotcha : tim::component::base<pthread_gotcha, void>
|
||||
// make sure every newly created thead starts with this value
|
||||
static void set_sampling_on_all_future_threads(bool _v);
|
||||
|
||||
// pthread_create
|
||||
int operator()(pthread_t* thread, const pthread_attr_t* attr,
|
||||
void* (*start_routine)(void*), void* arg) const;
|
||||
static void start();
|
||||
static void stop();
|
||||
|
||||
private:
|
||||
static bool& sampling_on_child_threads();
|
||||
};
|
||||
|
||||
using pthread_gotcha_t = tim::component::gotcha<2, std::tuple<>, pthread_gotcha>;
|
||||
} // namespace omnitrace
|
||||
|
||||
+174
@@ -0,0 +1,174 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/pthread_mutex_gotcha.hpp"
|
||||
#include "library.hpp"
|
||||
#include "library/components/pthread_gotcha.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/critical_trace.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_sampler.hpp"
|
||||
#include "library/utility.hpp"
|
||||
#include "timemory/backends/threading.hpp"
|
||||
|
||||
#include <timemory/utility/signals.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
using Device = critical_trace::Device;
|
||||
using Phase = critical_trace::Phase;
|
||||
|
||||
pthread_mutex_gotcha::hash_array_t&
|
||||
pthread_mutex_gotcha::get_hashes()
|
||||
{
|
||||
// theoretically, this private function will NEVER be called until it
|
||||
// is called by a gotcha wrapper, which means the tool ids should be
|
||||
// fully populated. If that fails to be the case for some reason,
|
||||
// we could see weird results.
|
||||
static auto _v = []() {
|
||||
const auto& _data = pthread_mutex_gotcha_t::get_gotcha_data();
|
||||
hash_array_t _init{};
|
||||
for(size_t i = 0; i < gotcha_capacity; ++i)
|
||||
{
|
||||
auto&& _id = _data.at(i).tool_id;
|
||||
if(!_id.empty())
|
||||
_init.at(i) = critical_trace::add_hash_id(_id.c_str());
|
||||
else
|
||||
{
|
||||
OMNITRACE_VERBOSE(
|
||||
1,
|
||||
"WARNING!!! pthread_mutex_gotcha tool id at index %zu was empty!\n",
|
||||
i);
|
||||
}
|
||||
OMNITRACE_CI_FAIL(
|
||||
_id.empty() || _init.at(i) == 0,
|
||||
"pthread_mutex_gotcha tool id at index %zu has no hash value\n", i);
|
||||
}
|
||||
return _init;
|
||||
}();
|
||||
return _v;
|
||||
}
|
||||
|
||||
void
|
||||
pthread_mutex_gotcha::configure()
|
||||
{
|
||||
pthread_mutex_gotcha_t::get_initializer() = []() {
|
||||
if(config::get_trace_thread_locks())
|
||||
{
|
||||
pthread_mutex_gotcha::validate();
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<1, int, pthread_mutex_t*>{ "pthread_mutex_unlock" });
|
||||
|
||||
pthread_mutex_gotcha_t::configure(
|
||||
comp::gotcha_config<2, int, pthread_mutex_t*>{ "pthread_mutex_trylock" });
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
void
|
||||
pthread_mutex_gotcha::shutdown()
|
||||
{}
|
||||
|
||||
void
|
||||
pthread_mutex_gotcha::validate()
|
||||
{
|
||||
if(config::get_trace_thread_locks() && config::get_use_perfetto())
|
||||
{
|
||||
OMNITRACE_PRINT_F("\n");
|
||||
OMNITRACE_PRINT_F("\n");
|
||||
OMNITRACE_PRINT_F("\n");
|
||||
OMNITRACE_PRINT_F(
|
||||
"The overhead of all the mutex locking internally by perfetto is\n")
|
||||
OMNITRACE_PRINT_F(
|
||||
"so significant that all timing data is rendered meaningless.\n");
|
||||
OMNITRACE_PRINT_F(
|
||||
"However, mutex locking is effectively non-existant in timemory.\n");
|
||||
OMNITRACE_PRINT_F("If you want to trace the mutex locking:\n")
|
||||
OMNITRACE_PRINT_F(" OMNITRACE_USE_TIMEMORY=ON\n");
|
||||
OMNITRACE_PRINT_F(" OMNITRACE_USE_PERFETTO=OFF\n");
|
||||
OMNITRACE_PRINT_F("\n");
|
||||
OMNITRACE_PRINT_F("\n");
|
||||
OMNITRACE_PRINT_F("\n");
|
||||
OMNITRACE_FAIL_F("OMNITRACE_USE_PERFETTO and OMNITRACE_TRACE_THREAD_LOCKS cannot "
|
||||
"both be enabled.\n");
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
pthread_mutex_gotcha::operator()(const gotcha_data_t& _data,
|
||||
int (*_callee)(pthread_mutex_t*),
|
||||
pthread_mutex_t* _mutex)
|
||||
{
|
||||
if(is_disabled())
|
||||
{
|
||||
if(!_callee)
|
||||
{
|
||||
OMNITRACE_PRINT("Warning! nullptr to %s\n", _data.tool_id.c_str());
|
||||
return EINVAL;
|
||||
}
|
||||
return (*_callee)(_mutex);
|
||||
}
|
||||
|
||||
uint64_t _cid = 0;
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
int64_t _ts = 0;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
if(get_use_critical_trace())
|
||||
{
|
||||
std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry();
|
||||
_ts = comp::wall_clock::record();
|
||||
}
|
||||
|
||||
omnitrace_push_region(_data.tool_id.c_str());
|
||||
auto _ret = (*_callee)(_mutex);
|
||||
omnitrace_pop_region(_data.tool_id.c_str());
|
||||
|
||||
if(get_use_critical_trace())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::DELTA>(
|
||||
threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(),
|
||||
reinterpret_cast<uintptr_t>(_mutex), get_hashes().at(_data.index), _depth);
|
||||
}
|
||||
|
||||
return _ret;
|
||||
}
|
||||
|
||||
bool
|
||||
pthread_mutex_gotcha::is_disabled()
|
||||
{
|
||||
return (omnitrace::get_state() != omnitrace::State::Active ||
|
||||
omnitrace::get_thread_state() != omnitrace::ThreadState::Enabled ||
|
||||
(get_use_sampling() && !pthread_gotcha::sampling_enabled_on_child_threads()));
|
||||
}
|
||||
} // namespace omnitrace
|
||||
+61
@@ -0,0 +1,61 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "library/common.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
// this is used to wrap pthread_mutex()
|
||||
struct pthread_mutex_gotcha : comp::base<pthread_mutex_gotcha, void>
|
||||
{
|
||||
static constexpr size_t gotcha_capacity = 3;
|
||||
using hash_array_t = std::array<size_t, gotcha_capacity>;
|
||||
using gotcha_data_t = comp::gotcha_data;
|
||||
|
||||
TIMEMORY_DEFAULT_OBJECT(pthread_mutex_gotcha)
|
||||
|
||||
// string id for component
|
||||
static std::string label() { return "pthread_mutex_gotcha"; }
|
||||
|
||||
// generate the gotcha wrappers
|
||||
static void configure();
|
||||
static void shutdown();
|
||||
static void validate();
|
||||
|
||||
int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*), pthread_mutex_t*);
|
||||
|
||||
private:
|
||||
static bool is_disabled();
|
||||
static hash_array_t& get_hashes();
|
||||
};
|
||||
|
||||
using pthread_mutex_gotcha_t = comp::gotcha<pthread_mutex_gotcha::gotcha_capacity,
|
||||
quirk::fast, pthread_mutex_gotcha>;
|
||||
} // namespace omnitrace
|
||||
+110
-12
@@ -25,11 +25,13 @@
|
||||
#include "library/config.hpp"
|
||||
#include "library/critical_trace.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
#include <timemory/backends/cpu.hpp>
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
@@ -171,6 +173,8 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
(void) arg;
|
||||
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
|
||||
OMNITRACE_CONDITIONAL_PRINT_F(
|
||||
@@ -279,6 +283,8 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
sampling::block_signals();
|
||||
|
||||
static const char* copy_op_name = "hsa_async_copy";
|
||||
@@ -359,6 +365,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
using Device = critical_trace::Device;
|
||||
using Phase = critical_trace::Phase;
|
||||
|
||||
@@ -388,12 +396,86 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
op_name, cid, data->correlation_id,
|
||||
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
|
||||
|
||||
int64_t _ts = comp::wall_clock::record();
|
||||
auto _tid = threading::get_id();
|
||||
uint64_t _cid = 0;
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
auto _corr_id = data->correlation_id;
|
||||
int64_t _ts = comp::wall_clock::record();
|
||||
auto _tid = threading::get_id();
|
||||
uint64_t _cid = 0;
|
||||
uint64_t _parent_cid = 0;
|
||||
uint16_t _depth = 0;
|
||||
uintptr_t _queue = 0;
|
||||
auto _corr_id = data->correlation_id;
|
||||
|
||||
#define OMNITRACE_HIP_API_QUEUE_CASE(API_FUNC, VARIABLE) \
|
||||
case HIP_API_ID_##API_FUNC: \
|
||||
_queue = reinterpret_cast<uintptr_t>(data->args.API_FUNC.VARIABLE); \
|
||||
break;
|
||||
|
||||
#define OMNITRACE_HIP_API_QUEUE_CASE_ALT(API_FUNC, UNION, VARIABLE) \
|
||||
case HIP_API_ID_##API_FUNC: \
|
||||
_queue = reinterpret_cast<uintptr_t>(data->args.UNION.VARIABLE); \
|
||||
break;
|
||||
|
||||
switch(cid)
|
||||
{
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchKernel, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipModuleLaunchKernel, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipHccModuleLaunchKernel, hStream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchCooperativeKernel, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipExtLaunchKernel, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipExtModuleLaunchKernel, hStream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamCreateWithCUMask, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamGetCUMask, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamSynchronize, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipConfigureCall, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipDrvMemcpy3DAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipEventRecord, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemPrefetchAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DFromArrayAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DToArrayAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy3DAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoDAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoHAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyFromSymbolAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyHtoDAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyParam2DAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyPeerAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyToSymbolAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyWithStream, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemset2DAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemset3DAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD16Async, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD32Async, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD8Async, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAddCallback, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAttachMemAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamDestroy, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetFlags, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetPriority, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamQuery, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitEvent, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue32, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue64, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue32, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue64, stream)
|
||||
#if OMNITRACE_HIP_VERSION_MAJOR >= 4 && OMNITRACE_HIP_VERSION_MINOR >= 5
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphLaunch, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsMapResources, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsUnmapResources, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipSignalExternalSemaphoresAsync, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamBeginCapture, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamEndCapture, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipWaitExternalSemaphoresAsync, stream)
|
||||
#endif
|
||||
#if OMNITRACE_HIP_VERSION_MAJOR >= 5
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamIsCapturing, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo_v2, stream)
|
||||
OMNITRACE_HIP_API_QUEUE_CASE(hipStreamUpdateCaptureDependencies, stream)
|
||||
#endif
|
||||
default: break;
|
||||
}
|
||||
|
||||
if(data->phase == ACTIVITY_API_PHASE_ENTER)
|
||||
{
|
||||
@@ -401,12 +483,24 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
switch(cid)
|
||||
{
|
||||
case HIP_API_ID_hipLaunchKernel:
|
||||
case HIP_API_ID_hipLaunchCooperativeKernel:
|
||||
{
|
||||
_name = hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
|
||||
data->args.hipLaunchKernel.stream);
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipLaunchCooperativeKernel:
|
||||
{
|
||||
_name =
|
||||
hipKernelNameRefByPtr(data->args.hipLaunchCooperativeKernel.f,
|
||||
data->args.hipLaunchCooperativeKernel.stream);
|
||||
if(!_name)
|
||||
{
|
||||
_name =
|
||||
hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address,
|
||||
data->args.hipLaunchKernel.stream);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case HIP_API_ID_hipHccModuleLaunchKernel:
|
||||
{
|
||||
_name = hipKernelNameRef(data->args.hipHccModuleLaunchKernel.f);
|
||||
@@ -468,7 +562,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
if(get_use_critical_trace() || get_use_rocm_smi())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::BEGIN>(
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, 0,
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, 0, _queue,
|
||||
critical_trace::add_hash_id(op_name), _depth);
|
||||
}
|
||||
|
||||
@@ -517,7 +611,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
|
||||
if(get_use_critical_trace() || get_use_rocm_smi())
|
||||
{
|
||||
add_critical_trace<Device::CPU, Phase::END>(
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, _ts,
|
||||
_tid, _cid, _corr_id, _parent_cid, _ts, _ts, _queue,
|
||||
critical_trace::add_hash_id(op_name), _depth);
|
||||
}
|
||||
}
|
||||
@@ -531,6 +625,8 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
if(get_state() != State::Active || !trait::runtime_enabled<comp::roctracer>::get())
|
||||
return;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
sampling::block_signals();
|
||||
|
||||
static thread_local auto _once = (threading::set_thread_name("omni.roctracer"), true);
|
||||
@@ -650,13 +746,13 @@ hip_activity_callback(const char* begin, const char* end, void*)
|
||||
auto _hash = critical_trace::add_hash_id(_name);
|
||||
uint16_t _prio = _laps + 1; // priority
|
||||
add_critical_trace<Device::GPU, Phase::DELTA, false>(
|
||||
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1, _prio);
|
||||
_tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, record->queue_id, _hash,
|
||||
_depth + 1, _prio);
|
||||
}
|
||||
|
||||
if(_found && _name != nullptr && get_use_timemory())
|
||||
{
|
||||
auto _func = [_depth, _tid, _cid, _laps, _beg_ns, _end_ns, _corr_id,
|
||||
_name]() {
|
||||
auto _func = [_beg_ns, _end_ns, _name]() {
|
||||
roctracer_bundle_t _bundle{ _name, _scope };
|
||||
_bundle.start()
|
||||
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
|
||||
@@ -725,6 +821,8 @@ extern "C"
|
||||
if(!config::settings_are_configured() && get_state() < State::Active)
|
||||
omnitrace_init_tooling_hidden();
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
static auto _setup = [=]() {
|
||||
try
|
||||
{
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <timemory/backends/process.hpp>
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/environment.hpp>
|
||||
#include <timemory/environment/types.hpp>
|
||||
#include <timemory/sampling/allocator.hpp>
|
||||
#include <timemory/settings.hpp>
|
||||
#include <timemory/settings/types.hpp>
|
||||
@@ -74,7 +75,7 @@ get_setting_name(std::string _v)
|
||||
{ \
|
||||
auto _ret = _config->insert<TYPE, TYPE>( \
|
||||
ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, INITIAL_VALUE, \
|
||||
std::set<std::string>{ "custom", "omnitrace", "omnitrace-library", \
|
||||
std::set<std::string>{ "custom", "omnitrace", "omnitrace_library", \
|
||||
__VA_ARGS__ }); \
|
||||
if(!_ret.second) \
|
||||
OMNITRACE_PRINT("Warning! Duplicate setting: %s / %s\n", \
|
||||
@@ -87,7 +88,7 @@ get_setting_name(std::string _v)
|
||||
{ \
|
||||
auto _ret = _config->insert<TYPE, TYPE>( \
|
||||
ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, INITIAL_VALUE, \
|
||||
std::set<std::string>{ "custom", "omnitrace", "omnitrace-library", \
|
||||
std::set<std::string>{ "custom", "omnitrace", "omnitrace_library", \
|
||||
__VA_ARGS__ }, \
|
||||
std::vector<std::string>{ CMD_LINE }); \
|
||||
if(!_ret.second) \
|
||||
@@ -147,28 +148,23 @@ configure_settings(bool _init)
|
||||
if(_omnitrace_debug) tim::set_env("TIMEMORY_DEBUG_SETTINGS", "1", 0);
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_PERFETTO", "Enable perfetto backend",
|
||||
_default_perfetto_v, "backend", "perfetto",
|
||||
"instrumentation");
|
||||
_default_perfetto_v, "backend", "perfetto");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_TIMEMORY", "Enable timemory backend",
|
||||
!_config->get<bool>("OMNITRACE_USE_PERFETTO"), "backend",
|
||||
"timemory", "instrumentation", "sampling");
|
||||
"timemory");
|
||||
|
||||
#if defined(OMNITRACE_USE_ROCTRACER) && OMNITRACE_USE_ROCTRACER > 0
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_ROCTRACER", "Enable ROCM tracing", true,
|
||||
"backend", "roctracer", "rocm");
|
||||
#endif
|
||||
|
||||
#if defined(OMNITRACE_USE_ROCM_SMI) && OMNITRACE_USE_ROCM_SMI > 0
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_USE_ROCM_SMI",
|
||||
"Enable sampling GPU power, temp, utilization, and memory usage", true, "backend",
|
||||
"rocm-smi", "rocm");
|
||||
"rocm_smi", "rocm");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_ROCM_SMI_DEVICES",
|
||||
"Devices to query when OMNITRACE_USE_ROCM_SMI=ON", "all",
|
||||
"backend", "rocm-smi", "rocm");
|
||||
#endif
|
||||
"backend", "rocm_smi", "rocm");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_SAMPLING",
|
||||
"Enable statistical sampling of call-stack", false,
|
||||
@@ -177,12 +173,12 @@ configure_settings(bool _init)
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_THREAD_SAMPLING",
|
||||
"Enable a background thread which samples system metrics "
|
||||
"such as the CPU/GPU freq, power, etc.",
|
||||
true, "backend", "sampling");
|
||||
true, "backend", "sampling", "thread_sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_USE_PID",
|
||||
"Enable tagging filenames with process identifier (either MPI rank or pid)", true,
|
||||
"io");
|
||||
"io", "filename");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_KOKKOSP",
|
||||
"Enable support for Kokkos Tools", false, "kokkos",
|
||||
@@ -192,11 +188,9 @@ configure_settings(bool _init)
|
||||
bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false,
|
||||
"--omnitrace-kokkos-kernel-logger", "kokkos", "debugging");
|
||||
|
||||
#if defined(OMNITRACE_USE_OMPT) && OMNITRACE_USE_OMPT > 0
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_OMPT",
|
||||
"Enable support for OpenMP-Tools", false, "openmp", "ompt",
|
||||
"backend");
|
||||
#endif
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_CODE_COVERAGE",
|
||||
"Enable support for code coverage", false, "coverage",
|
||||
@@ -205,25 +199,25 @@ configure_settings(bool _init)
|
||||
OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_INSTRUMENTATION_INTERVAL",
|
||||
"Instrumentation only takes measurements once every N "
|
||||
"function calls (not statistical)",
|
||||
1, "instrumentation");
|
||||
1, "instrumentation", "data_sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
double, "OMNITRACE_SAMPLING_FREQ",
|
||||
"Number of software interrupts per second when OMNITTRACE_USE_SAMPLING=ON", 10.0,
|
||||
"sampling");
|
||||
"sampling", "thread_sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
double, "OMNITRACE_SAMPLING_DELAY",
|
||||
"Number of seconds to wait before the first sampling signal is delivered, "
|
||||
"increasing this value can fix deadlocks during init",
|
||||
0.5, "sampling");
|
||||
0.5, "sampling", "thread_sampling");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
std::string, "OMNITRACE_SAMPLING_CPUS",
|
||||
"CPUs to collect frequency information for. Values should be separated by commas "
|
||||
"and can be explicit or ranges, e.g. 0,1,5-8. An empty value implies 'all' and "
|
||||
"'none' suppresses all CPU frequency sampling",
|
||||
"", "sampling");
|
||||
"", "thread_sampling");
|
||||
|
||||
auto _backend = tim::get_env_choice<std::string>(
|
||||
"OMNITRACE_BACKEND",
|
||||
@@ -235,30 +229,36 @@ configure_settings(bool _init)
|
||||
OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_BACKEND",
|
||||
"Specify the perfetto backend to activate. Options are: "
|
||||
"'inprocess', 'system', or 'all'",
|
||||
_backend, "perfetto", "instrumentation", "sampling");
|
||||
_backend, "perfetto");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE",
|
||||
"Enable generation of the critical trace", false, "backend",
|
||||
"critical-trace", "instrumentation");
|
||||
"critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_TRACE_THREAD_LOCKS",
|
||||
"Enable tracking calls to pthread_mutex_lock, "
|
||||
"pthread_mutex_unlock, pthread_mutex_trylock",
|
||||
false, "backend", "parallelism", "gotcha");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_FLAT_SAMPLING",
|
||||
"Ignore hierarchy in all statistical sampling entries",
|
||||
_config->get_flat_profile(), "sampling", "data_layout");
|
||||
_config->get_flat_profile(), "timemory", "sampling",
|
||||
"data_layout");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_TIMELINE_SAMPLING",
|
||||
"Create unique entries for every sample when statistical sampling is enabled",
|
||||
_config->get_timeline_profile(), "sampling", "data_layout");
|
||||
_config->get_timeline_profile(), "timemory", "sampling", "data_layout");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_ROCTRACER_FLAT_PROFILE",
|
||||
"Ignore hierarchy in all kernels entries with timemory backend",
|
||||
_config->get_flat_profile(), "roctracer", "data_layout", "rocm");
|
||||
_config->get_flat_profile(), "timemory", "roctracer", "data_layout", "rocm");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_ROCTRACER_TIMELINE_PROFILE",
|
||||
"Create unique entries for every kernel with timemory backend",
|
||||
_config->get_timeline_profile(), "roctracer", "data_layout", "rocm");
|
||||
_config->get_timeline_profile(), "timemory", "roctracer", "data_layout", "rocm");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HSA_ACTIVITY",
|
||||
"Enable HSA activity tracing support", false, "roctracer",
|
||||
@@ -273,12 +273,12 @@ configure_settings(bool _init)
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE_DEBUG",
|
||||
"Enable debugging for critical trace", _omnitrace_debug,
|
||||
"debugging", "critical-trace");
|
||||
"debugging", "critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
bool, "OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES",
|
||||
"Include names in serialization of critical trace (mainly for debugging)",
|
||||
_omnitrace_debug, "debugging", "critical-trace");
|
||||
_omnitrace_debug, "debugging", "critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_SHMEM_SIZE_HINT_KB",
|
||||
"Hint for shared-memory buffer size in perfetto (in KB)",
|
||||
@@ -288,37 +288,36 @@ configure_settings(bool _init)
|
||||
"Size of perfetto buffer (in KB)", 1024000, "perfetto",
|
||||
"data");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_COMBINE_PERFETTO_TRACES",
|
||||
"Combine Perfetto traces", true, "perfetto", "data");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(int64_t, "OMNITRACE_CRITICAL_TRACE_COUNT",
|
||||
"Number of critical trace to export (0 == all)", 0, "data",
|
||||
"critical-trace");
|
||||
"critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(uint64_t, "OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT",
|
||||
"Number of critical trace records to store in thread-local "
|
||||
"memory before submitting to shared buffer",
|
||||
2000, "data", "critical-trace");
|
||||
2000, "data", "critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
uint64_t, "OMNITRACE_CRITICAL_TRACE_NUM_THREADS",
|
||||
"Number of threads to use when generating the critical trace",
|
||||
std::min<uint64_t>(8, std::thread::hardware_concurrency()), "parallelism",
|
||||
"critical-trace");
|
||||
"critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
int64_t, "OMNITRACE_CRITICAL_TRACE_PER_ROW",
|
||||
"How many critical traces per row in perfetto (0 == all in one row)", 0, "io",
|
||||
"critical-trace");
|
||||
"critical_trace");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(
|
||||
std::string, "OMNITRACE_TIMEMORY_COMPONENTS",
|
||||
"List of components to collect via timemory (see timemory-avail)", "wall_clock",
|
||||
"timemory", "component", "instrumentation");
|
||||
"timemory", "component");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_OUTPUT_FILE", "Perfetto filename",
|
||||
"", "perfetto", "io");
|
||||
|
||||
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_SETTINGS_DESC",
|
||||
"Provide descriptions when printing settings", false,
|
||||
"debugging");
|
||||
"", "perfetto", "io", "filename");
|
||||
|
||||
_config->get_flamegraph_output() = false;
|
||||
_config->get_cout_output() = false;
|
||||
@@ -334,7 +333,7 @@ configure_settings(bool _init)
|
||||
_config->get_max_thread_bookmarks() = 1;
|
||||
_config->get_timing_units() = "sec";
|
||||
_config->get_memory_units() = "MB";
|
||||
_config->get_papi_events() = "PAPI_TOT_CYC, PAPI_TOT_INS";
|
||||
_config->get_papi_events() = "PAPI_TOT_CYC";
|
||||
|
||||
// settings native to timemory but critically and/or extensively used by omnitrace
|
||||
auto _add_omnitrace_category = [](auto itr) {
|
||||
@@ -342,7 +341,7 @@ configure_settings(bool _init)
|
||||
{
|
||||
auto _categories = itr->second->get_categories();
|
||||
_categories.emplace("omnitrace");
|
||||
_categories.emplace("omnitrace-library");
|
||||
_categories.emplace("omnitrace_library");
|
||||
itr->second->set_categories(_categories);
|
||||
}
|
||||
};
|
||||
@@ -396,8 +395,7 @@ configure_settings(bool _init)
|
||||
_config->read(itr);
|
||||
}
|
||||
|
||||
_config->get_global_components() =
|
||||
_config->get<std::string>("OMNITRACE_TIMEMORY_COMPONENTS");
|
||||
settings::suppress_config() = true;
|
||||
|
||||
// always initialize timemory because gotcha wrappers are always used
|
||||
auto _cmd = tim::read_command_line(process::get_id());
|
||||
@@ -408,9 +406,6 @@ configure_settings(bool _init)
|
||||
if(_pos < _exe.length() - 1) _exe = _exe.substr(_pos + 1);
|
||||
get_exe_name() = _exe;
|
||||
|
||||
scope::get_fields()[scope::flat::value] = tim::settings::flat_profile();
|
||||
scope::get_fields()[scope::timeline::value] = tim::settings::timeline_profile();
|
||||
|
||||
bool _found_sep = false;
|
||||
for(const auto& itr : _cmd)
|
||||
{
|
||||
@@ -425,8 +420,13 @@ configure_settings(bool _init)
|
||||
tim::timemory_init(_cmd, _parser, "omnitrace-");
|
||||
}
|
||||
|
||||
_config->get_global_components() =
|
||||
_config->get<std::string>("OMNITRACE_TIMEMORY_COMPONENTS");
|
||||
|
||||
scope::get_fields()[scope::flat::value] = _config->get_flat_profile();
|
||||
scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile();
|
||||
|
||||
settings::suppress_parsing() = true;
|
||||
settings::suppress_config() = true;
|
||||
settings::use_output_suffix() = _config->get<bool>("OMNITRACE_USE_PID");
|
||||
#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS)
|
||||
if(tim::dmp::is_initialized()) settings::default_process_suffix() = tim::dmp::rank();
|
||||
@@ -485,6 +485,60 @@ configure_settings(bool _init)
|
||||
_old_handler = signal(_dyninst_trampoline_signal,
|
||||
static_cast<signal_handler_t>(_trampoline_handler));
|
||||
}
|
||||
|
||||
auto _handle_use_option = [](const std::string& _opt, const std::string& _category) {
|
||||
if(!_config->get<bool>(_opt))
|
||||
{
|
||||
auto _disabled = _config->disable_category(_category);
|
||||
_config->enable(_opt);
|
||||
for(auto&& itr : _disabled)
|
||||
OMNITRACE_BASIC_VERBOSE(3, "[%s=OFF] disabled option :: '%s'\n",
|
||||
_opt.c_str(), itr.c_str());
|
||||
return false;
|
||||
}
|
||||
auto _enabled = _config->enable_category(_category);
|
||||
for(auto&& itr : _enabled)
|
||||
OMNITRACE_BASIC_VERBOSE(3, "[%s=ON] enabled option :: '%s'\n",
|
||||
_opt.c_str(), itr.c_str());
|
||||
return true;
|
||||
};
|
||||
|
||||
_handle_use_option("OMNITRACE_USE_SAMPLING", "sampling");
|
||||
_handle_use_option("OMNITRACE_USE_THREAD_SAMPLING", "thread_sampling");
|
||||
_handle_use_option("OMNITRACE_USE_KOKKOSP", "kokkos");
|
||||
_handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto");
|
||||
_handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory");
|
||||
_handle_use_option("OMNITRACE_CRITICAL_TRACE", "critical_trace");
|
||||
_handle_use_option("OMNITRACE_USE_OMPT", "ompt");
|
||||
_handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi");
|
||||
_handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer");
|
||||
|
||||
#if !defined(OMNITRACE_USE_ROCTRACER) || OMNITRACE_USE_ROCTRACER == 0
|
||||
_config->disable_category("roctracer");
|
||||
#endif
|
||||
|
||||
#if !defined(OMNITRACE_USE_ROCM_SMI) || OMNITRACE_USE_ROCM_SMI == 0
|
||||
_config->disable_category("rocm_smi");
|
||||
#endif
|
||||
|
||||
#if defined(OMNITRACE_USE_OMPT) || OMNITRACE_USE_OMPT == 0
|
||||
_config->disable_category("ompt");
|
||||
#endif
|
||||
|
||||
// user bundle components
|
||||
_config->disable_category("throttle");
|
||||
_config->disable("components");
|
||||
_config->disable("global_components");
|
||||
_config->disable("ompt_components");
|
||||
_config->disable("kokkos_components");
|
||||
_config->disable("trace_components");
|
||||
_config->disable("profiler_components");
|
||||
_config->disable("destructor_report");
|
||||
_config->disable("stack_clearing");
|
||||
|
||||
#if !defined(TIMEMORY_USE_MPI) || TIMEMORY_USE_MPI == 0
|
||||
_config->disable("OMNITRACE_COMBINE_PERFETTO_TRACES");
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
@@ -512,7 +566,8 @@ print_settings(
|
||||
|
||||
std::stringstream _os{};
|
||||
|
||||
bool _md = tim::get_env<bool>("OMNITRACE_SETTINGS_DESC_MARKDOWN", false);
|
||||
bool _print_desc = get_debug() || tim::get_env("OMNITRACE_SETTINGS_DESC", false);
|
||||
bool _md = tim::get_env<bool>("OMNITRACE_SETTINGS_DESC_MARKDOWN", false);
|
||||
|
||||
constexpr size_t nfields = 3;
|
||||
using str_array_t = std::array<std::string, nfields>;
|
||||
@@ -521,6 +576,7 @@ print_settings(
|
||||
_widths.fill(0);
|
||||
for(const auto& itr : *get_config())
|
||||
{
|
||||
if(!itr.second->get_enabled()) continue;
|
||||
if(_filter(itr.first, itr.second->get_categories()))
|
||||
{
|
||||
auto _disp = itr.second->get_display(std::ios::boolalpha);
|
||||
@@ -549,8 +605,6 @@ print_settings(
|
||||
return lhs.at(0) < rhs.at(0);
|
||||
});
|
||||
|
||||
bool _print_desc = get_debug() || get_config()->get<bool>("OMNITRACE_SETTINGS_DESC");
|
||||
|
||||
auto tot_width = std::accumulate(_widths.begin(), _widths.end(), 0);
|
||||
if(!_print_desc) tot_width -= _widths.back() + 4;
|
||||
|
||||
@@ -945,6 +999,17 @@ get_perfetto_buffer_size()
|
||||
return static_cast<tim::tsettings<size_t>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_perfetto_combined_traces()
|
||||
{
|
||||
#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0
|
||||
static auto _v = get_config()->find("OMNITRACE_COMBINE_PERFETTO_TRACES");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t
|
||||
get_critical_trace_update_freq()
|
||||
{
|
||||
@@ -1062,6 +1127,13 @@ get_rocm_smi_devices()
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
get_trace_thread_locks()
|
||||
{
|
||||
static auto _v = get_config()->find("OMNITRACE_TRACE_THREAD_LOCKS");
|
||||
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
|
||||
}
|
||||
|
||||
bool
|
||||
get_debug_tid()
|
||||
{
|
||||
|
||||
@@ -199,6 +199,9 @@ get_perfetto_shmem_size_hint();
|
||||
size_t
|
||||
get_perfetto_buffer_size();
|
||||
|
||||
bool
|
||||
get_perfetto_combined_traces();
|
||||
|
||||
uint64_t
|
||||
get_critical_trace_update_freq();
|
||||
|
||||
@@ -238,6 +241,9 @@ get_rocm_smi_devices();
|
||||
|
||||
int64_t
|
||||
get_critical_trace_per_row();
|
||||
|
||||
bool
|
||||
get_trace_thread_locks();
|
||||
} // namespace config
|
||||
|
||||
//
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "library/defines.hpp"
|
||||
#include "library/perfetto.hpp"
|
||||
#include "library/ptl.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
#include <PTL/ThreadPool.hh>
|
||||
@@ -109,9 +110,9 @@ get_combined_hash(Arg0&& _zero, Arg1&& _one, Args&&... _args)
|
||||
bool
|
||||
entry::operator==(const entry& rhs) const
|
||||
{
|
||||
return (device == rhs.device && depth == rhs.depth && priority == rhs.priority &&
|
||||
tid == rhs.tid && cpu_cid == rhs.cpu_cid && gpu_cid == rhs.gpu_cid &&
|
||||
hash == rhs.hash);
|
||||
return std::tie(device, depth, priority, tid, cpu_cid, gpu_cid, queue_id, hash) ==
|
||||
std::tie(rhs.device, rhs.depth, rhs.priority, rhs.tid, rhs.cpu_cid,
|
||||
rhs.gpu_cid, rhs.queue_id, rhs.hash);
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -132,6 +133,10 @@ entry::operator<(const entry& rhs) const
|
||||
auto _par_eq = (parent_cid == rhs.parent_cid);
|
||||
if(!_par_eq) return (parent_cid < rhs.parent_cid);
|
||||
|
||||
// sort by queue ids
|
||||
auto _queue_eq = (queue_id == rhs.queue_id);
|
||||
if(!_queue_eq) return (queue_id < rhs.queue_id);
|
||||
|
||||
// sort by priority
|
||||
auto _prio_eq = (priority == rhs.priority);
|
||||
if(!_prio_eq) return (priority < rhs.priority);
|
||||
@@ -143,8 +148,8 @@ entry::operator<(const entry& rhs) const
|
||||
bool
|
||||
entry::operator>(const entry& rhs) const
|
||||
{
|
||||
return (!(*this < rhs) && begin_ns != rhs.begin_ns && cpu_cid != rhs.cpu_cid &&
|
||||
gpu_cid != rhs.gpu_cid);
|
||||
return (!(*this < rhs) && std::tie(begin_ns, cpu_cid, gpu_cid) !=
|
||||
std::tie(rhs.begin_ns, rhs.cpu_cid, rhs.gpu_cid));
|
||||
}
|
||||
|
||||
entry&
|
||||
@@ -171,7 +176,7 @@ size_t
|
||||
entry::get_hash() const
|
||||
{
|
||||
return get_combined_hash(hash, static_cast<short>(device), static_cast<short>(phase),
|
||||
tid, cpu_cid, gpu_cid, priority);
|
||||
tid, cpu_cid, gpu_cid, queue_id, priority);
|
||||
}
|
||||
|
||||
int64_t
|
||||
@@ -293,6 +298,7 @@ entry::write(std::ostream& _os) const
|
||||
_os << " parent: " << static_cast<int64_t>(parent_cid);
|
||||
_os << ", tid: " << tid;
|
||||
_os << ", depth: " << depth;
|
||||
_os << ", queue: " << queue_id;
|
||||
_os << ", priority: " << priority;
|
||||
if(phase == Phase::DELTA)
|
||||
{
|
||||
@@ -423,6 +429,7 @@ template <>
|
||||
void
|
||||
call_chain::generate_perfetto<Device::CPU>(std::set<entry>& _used) const
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
static std::set<std::string> _static_strings{};
|
||||
static std::mutex _static_mutex{};
|
||||
for(const auto& itr : *this)
|
||||
@@ -444,6 +451,7 @@ template <>
|
||||
void
|
||||
call_chain::generate_perfetto<Device::GPU>(std::set<entry>& _used) const
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
static std::set<std::string> _static_strings{};
|
||||
static std::mutex _static_mutex{};
|
||||
for(const auto& itr : *this)
|
||||
@@ -465,6 +473,7 @@ template <>
|
||||
void
|
||||
call_chain::generate_perfetto<Device::ANY>(std::set<entry>& _used) const
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
static std::set<std::string> _static_strings{};
|
||||
static std::mutex _static_mutex{};
|
||||
for(const auto& itr : *this)
|
||||
@@ -509,6 +518,7 @@ get(int64_t _tid)
|
||||
void
|
||||
add_hash_id(const hash_ids& _labels)
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
|
||||
if(!tasking::critical_trace::get_task_group().pool()) return;
|
||||
tasking::critical_trace::get_task_group().exec([_labels]() {
|
||||
@@ -539,6 +549,7 @@ void
|
||||
update(int64_t _tid)
|
||||
{
|
||||
if(!get_use_critical_trace() && !get_use_rocm_smi()) return;
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
|
||||
if(!tasking::critical_trace::get_task_group().pool()) return;
|
||||
call_chain _data{};
|
||||
@@ -550,6 +561,7 @@ void
|
||||
compute(int64_t _tid)
|
||||
{
|
||||
update(_tid);
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
|
||||
if(!tasking::critical_trace::get_task_group().pool()) return;
|
||||
tasking::critical_trace::get_task_group().exec(compute_critical_trace);
|
||||
@@ -723,6 +735,7 @@ combine_critical_path(call_chain& _targ, call_chain _chain)
|
||||
_combined.emplace_back(itr);
|
||||
std::sort(_combined.begin(), _combined.end());
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
std::unique_lock<std::mutex> _lk{ complete_call_mutex };
|
||||
for(auto& itr : _combined)
|
||||
_targ.emplace_back(itr);
|
||||
@@ -751,6 +764,8 @@ update_critical_path(call_chain _chain, int64_t)
|
||||
void
|
||||
compute_critical_trace()
|
||||
{
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
static bool _computed = false;
|
||||
std::unique_lock<std::mutex> _lk{ complete_call_mutex };
|
||||
|
||||
@@ -825,6 +840,7 @@ get_entries(int64_t _ts, const std::function<bool(const entry&)>& _eval)
|
||||
}
|
||||
*_targ = _v;
|
||||
};
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
std::unique_lock<std::mutex> _lk{ tasking::critical_trace::get_mutex() };
|
||||
size_t _n = 0;
|
||||
std::vector<std::pair<std::string, entry>> _v{};
|
||||
|
||||
@@ -65,17 +65,18 @@ struct entry
|
||||
entry& operator=(const entry&) = default;
|
||||
entry& operator=(entry&&) noexcept = default;
|
||||
|
||||
uint16_t priority = 0; // priority value (for sorting)
|
||||
Device device = Device::CPU; // which device it executed on
|
||||
Phase phase = Phase::NONE; // start / stop / unspecified
|
||||
uint16_t depth = 0; // call-stack depth
|
||||
int64_t tid = 0; // thread id it was registered on
|
||||
uint64_t cpu_cid = 0; // CPU correlation id
|
||||
uint64_t gpu_cid = 0; // GPU correlation id
|
||||
uint64_t parent_cid = 0; // parent CPU correlation id
|
||||
int64_t begin_ns = 0; // timestamp of start
|
||||
int64_t end_ns = 0; // timestamp of end
|
||||
size_t hash = 0; // hash for name
|
||||
uint16_t priority = 0; /// priority value (for sorting)
|
||||
Device device = Device::CPU; /// which device it executed on
|
||||
Phase phase = Phase::NONE; /// start / stop / unspecified
|
||||
uint16_t depth = 0; /// call-stack depth
|
||||
int64_t tid = 0; /// thread id it was registered on
|
||||
uint64_t cpu_cid = 0; /// CPU correlation id
|
||||
uint64_t gpu_cid = 0; /// GPU correlation id
|
||||
uint64_t parent_cid = 0; /// parent CPU correlation id
|
||||
int64_t begin_ns = 0; /// timestamp of start
|
||||
int64_t end_ns = 0; /// timestamp of end
|
||||
uintptr_t queue_id = 0; /// stream id (GPU) or mutex id
|
||||
size_t hash = 0; /// hash for name
|
||||
|
||||
bool operator==(const entry& rhs) const;
|
||||
bool operator!=(const entry& rhs) const { return !(*this == rhs); }
|
||||
@@ -127,7 +128,8 @@ entry::save(Archive& ar, unsigned int) const
|
||||
cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid),
|
||||
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
|
||||
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
|
||||
cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name),
|
||||
cereal::make_nvp("queue", queue_id), cereal::make_nvp("hash", hash),
|
||||
cereal::make_nvp("name", _name),
|
||||
cereal::make_nvp("demangled_name", tim::demangle(_name)));
|
||||
}
|
||||
|
||||
@@ -144,6 +146,7 @@ entry::load(Archive& ar, unsigned int)
|
||||
cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid),
|
||||
cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns),
|
||||
cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name),
|
||||
cereal::make_nvp("queue", queue_id),
|
||||
cereal::make_nvp("demangled_name", _demangled_name));
|
||||
|
||||
tim::get_hash_ids()->emplace(hash, _name);
|
||||
|
||||
@@ -0,0 +1,50 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/debug.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/state.hpp"
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace debug
|
||||
{
|
||||
lock::lock()
|
||||
: m_lk{ tim::type_mutex<decltype(std::cerr)>(), std::defer_lock }
|
||||
{
|
||||
if(!m_lk.owns_lock())
|
||||
{
|
||||
push_thread_state(ThreadState::Internal);
|
||||
m_lk.lock();
|
||||
}
|
||||
}
|
||||
|
||||
lock::~lock()
|
||||
{
|
||||
if(m_lk.owns_lock())
|
||||
{
|
||||
m_lk.unlock();
|
||||
pop_thread_state();
|
||||
}
|
||||
}
|
||||
} // namespace debug
|
||||
} // namespace omnitrace
|
||||
@@ -27,6 +27,10 @@
|
||||
#include <timemory/api.hpp>
|
||||
#include <timemory/backends/dmp.hpp>
|
||||
#include <timemory/backends/process.hpp>
|
||||
#include <timemory/backends/threading.hpp>
|
||||
#include <timemory/mpl/concepts.hpp>
|
||||
#include <timemory/utility/backtrace.hpp>
|
||||
#include <timemory/utility/locking.hpp>
|
||||
#include <timemory/utility/utility.hpp>
|
||||
|
||||
#include <array>
|
||||
@@ -66,6 +70,25 @@ get_critical_trace_debug();
|
||||
|
||||
namespace debug
|
||||
{
|
||||
struct lock
|
||||
{
|
||||
lock();
|
||||
~lock();
|
||||
|
||||
private:
|
||||
tim::auto_lock_t m_lk;
|
||||
};
|
||||
//
|
||||
template <typename Arg, typename... Args>
|
||||
bool
|
||||
is_bracket(Arg&& _arg, Args&&...)
|
||||
{
|
||||
if constexpr(::tim::concepts::is_string_type<Arg>::value)
|
||||
return (::std::string_view{ _arg }.empty()) ? false : _arg[0] == '[';
|
||||
else
|
||||
return false;
|
||||
}
|
||||
//
|
||||
namespace
|
||||
{
|
||||
template <typename T, size_t... Idx>
|
||||
@@ -79,10 +102,6 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
} // namespace debug
|
||||
} // namespace omnitrace
|
||||
|
||||
#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y
|
||||
#define OMNITRACE_LINESTR TIMEMORY_STRINGIZE(__LINE__)
|
||||
#define OMNITRACE_VARIABLE(LABEL) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, LABEL)
|
||||
|
||||
#if !defined(OMNITRACE_DEBUG_BUFFER_LEN)
|
||||
# define OMNITRACE_DEBUG_BUFFER_LEN 2048
|
||||
#endif
|
||||
@@ -129,14 +148,17 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
.data()
|
||||
#endif
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
#define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \
|
||||
if((COND) && ::omnitrace::config::get_debug_tid() && \
|
||||
::omnitrace::config::get_debug_pid()) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
|
||||
fprintf(stderr, "[omnitrace][%i][%li] ", OMNITRACE_PROCESS_IDENTIFIER, \
|
||||
OMNITRACE_THREAD_IDENTIFIER); \
|
||||
::omnitrace::debug::lock _lk{}; \
|
||||
fprintf(stderr, "[omnitrace][%i][%li]%s", OMNITRACE_PROCESS_IDENTIFIER, \
|
||||
OMNITRACE_THREAD_IDENTIFIER, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
fflush(stderr); \
|
||||
}
|
||||
@@ -146,8 +168,9 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
::omnitrace::config::get_debug_pid()) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
|
||||
fprintf(stderr, "[omnitrace] "); \
|
||||
::omnitrace::debug::lock _lk{}; \
|
||||
fprintf(stderr, "[omnitrace]%s", \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
fflush(stderr); \
|
||||
}
|
||||
@@ -157,9 +180,10 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
::omnitrace::config::get_debug_pid()) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
|
||||
fprintf(stderr, "[omnitrace][%i][%li][%s] ", OMNITRACE_PROCESS_IDENTIFIER, \
|
||||
OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION); \
|
||||
::omnitrace::debug::lock _lk{}; \
|
||||
fprintf(stderr, "[omnitrace][%i][%li][%s]%s", OMNITRACE_PROCESS_IDENTIFIER, \
|
||||
OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
fflush(stderr); \
|
||||
}
|
||||
@@ -169,19 +193,23 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
::omnitrace::config::get_debug_pid()) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
tim::auto_lock_t _lk{ tim::type_mutex<decltype(std::cerr)>() }; \
|
||||
fprintf(stderr, "[omnitrace][%s] ", OMNITRACE_FUNCTION); \
|
||||
::omnitrace::debug::lock _lk{}; \
|
||||
fprintf(stderr, "[omnitrace][%s]%s", OMNITRACE_FUNCTION, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
fflush(stderr); \
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
#define OMNITRACE_CONDITIONAL_THROW(COND, ...) \
|
||||
if(COND) \
|
||||
{ \
|
||||
char _msg_buffer[OMNITRACE_DEBUG_BUFFER_LEN]; \
|
||||
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%i][%li][%s] ", \
|
||||
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%i][%li][%s]%s", \
|
||||
OMNITRACE_PROCESS_IDENTIFIER, OMNITRACE_THREAD_IDENTIFIER, \
|
||||
OMNITRACE_FUNCTION); \
|
||||
OMNITRACE_FUNCTION, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
auto len = strlen(_msg_buffer); \
|
||||
snprintf(_msg_buffer + len, OMNITRACE_DEBUG_BUFFER_LEN - len, __VA_ARGS__); \
|
||||
throw std::runtime_error(_msg_buffer); \
|
||||
@@ -191,8 +219,9 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
if(COND) \
|
||||
{ \
|
||||
char _msg_buffer[OMNITRACE_DEBUG_BUFFER_LEN]; \
|
||||
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%s] ", \
|
||||
OMNITRACE_FUNCTION); \
|
||||
snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%s]%s", \
|
||||
OMNITRACE_FUNCTION, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
auto len = strlen(_msg_buffer); \
|
||||
snprintf(_msg_buffer + len, OMNITRACE_DEBUG_BUFFER_LEN - len, __VA_ARGS__); \
|
||||
throw std::runtime_error(_msg_buffer); \
|
||||
@@ -206,8 +235,71 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
OMNITRACE_CONDITIONAL_BASIC_THROW( \
|
||||
::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_STRINGIZE(...) #__VA_ARGS__
|
||||
#define OMNITRACE_ESC(...) __VA_ARGS__
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
#define OMNITRACE_CONDITIONAL_FAIL(COND, ...) \
|
||||
if(COND) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
fprintf(stderr, "[omnitrace][%i][%li]%s", OMNITRACE_PROCESS_IDENTIFIER, \
|
||||
OMNITRACE_THREAD_IDENTIFIER, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
::omnitrace::set_state(::omnitrace::State::Finalized); \
|
||||
::tim::disable_signal_detection(); \
|
||||
::tim::print_demangled_backtrace<64>(); \
|
||||
::std::exit(EXIT_FAILURE); \
|
||||
}
|
||||
|
||||
#define OMNITRACE_CONDITIONAL_BASIC_FAIL(COND, ...) \
|
||||
if(COND) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
fprintf(stderr, "[omnitrace]%s", \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
::omnitrace::set_state(::omnitrace::State::Finalized); \
|
||||
::tim::disable_signal_detection(); \
|
||||
::tim::print_demangled_backtrace<64>(); \
|
||||
::std::exit(EXIT_FAILURE); \
|
||||
}
|
||||
|
||||
#define OMNITRACE_CONDITIONAL_FAIL_F(COND, ...) \
|
||||
if(COND) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
fprintf(stderr, "[omnitrace][%i][%li][%s]%s", OMNITRACE_PROCESS_IDENTIFIER, \
|
||||
OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
::omnitrace::set_state(::omnitrace::State::Finalized); \
|
||||
::tim::disable_signal_detection(); \
|
||||
::tim::print_demangled_backtrace<64>(); \
|
||||
::std::exit(EXIT_FAILURE); \
|
||||
}
|
||||
|
||||
#define OMNITRACE_CONDITIONAL_BASIC_FAIL_F(COND, ...) \
|
||||
if(COND) \
|
||||
{ \
|
||||
fflush(stderr); \
|
||||
fprintf(stderr, "[omnitrace][%s]%s", OMNITRACE_FUNCTION, \
|
||||
::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \
|
||||
fprintf(stderr, __VA_ARGS__); \
|
||||
::omnitrace::set_state(::omnitrace::State::Finalized); \
|
||||
::tim::disable_signal_detection(); \
|
||||
::tim::print_demangled_backtrace<64>(); \
|
||||
::std::exit(EXIT_FAILURE); \
|
||||
}
|
||||
|
||||
#define OMNITRACE_CI_FAIL(COND, ...) \
|
||||
OMNITRACE_CONDITIONAL_FAIL(::omnitrace::get_is_continuous_integration() && (COND), \
|
||||
__VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_CI_BASIC_FAIL(COND, ...) \
|
||||
OMNITRACE_CONDITIONAL_BASIC_FAIL( \
|
||||
::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__)
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
//
|
||||
@@ -287,6 +379,20 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
|
||||
|
||||
#define OMNITRACE_BASIC_THROW(...) OMNITRACE_CONDITIONAL_BASIC_THROW(true, __VA_ARGS__)
|
||||
|
||||
//--------------------------------------------------------------------------------------//
|
||||
//
|
||||
// Fail macros
|
||||
//
|
||||
//--------------------------------------------------------------------------------------//
|
||||
|
||||
#define OMNITRACE_FAIL(...) OMNITRACE_CONDITIONAL_FAIL(true, __VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_FAIL_F(...) OMNITRACE_CONDITIONAL_FAIL_F(true, __VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_BASIC_FAIL(...) OMNITRACE_CONDITIONAL_BASIC_FAIL(true, __VA_ARGS__)
|
||||
|
||||
#define OMNITRACE_BASIC_FAIL_F(...) OMNITRACE_CONDITIONAL_BASIC_FAIL_F(true, __VA_ARGS__)
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace std
|
||||
|
||||
@@ -67,3 +67,10 @@
|
||||
# endif
|
||||
# include <cassert>
|
||||
#endif
|
||||
|
||||
#define OMNITRACE_STRINGIZE(X) OMNITRACE_STRINGIZE2(X)
|
||||
#define OMNITRACE_STRINGIZE2(X) #X
|
||||
#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y
|
||||
#define OMNITRACE_VARIABLE(Y) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, Y)
|
||||
#define OMNITRACE_LINESTR OMNITRACE_STRINGIZE(__LINE__)
|
||||
#define OMNITRACE_ESC(...) __VA_ARGS__
|
||||
|
||||
@@ -24,9 +24,11 @@
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/sampling.hpp"
|
||||
|
||||
#include <PTL/ThreadPool.hh>
|
||||
|
||||
#include <timemory/utility/declaration.hpp>
|
||||
|
||||
namespace omnitrace
|
||||
@@ -42,9 +44,10 @@ auto _thread_pool_cfg = []() {
|
||||
_v.use_tbb = false;
|
||||
_v.verbose = -1;
|
||||
_v.initializer = []() {
|
||||
set_thread_state(ThreadState::Internal);
|
||||
sampling::block_signals();
|
||||
threading::set_thread_name(
|
||||
TIMEMORY_JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str());
|
||||
JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str());
|
||||
};
|
||||
_v.finalizer = []() {};
|
||||
_v.priority = 5;
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "library/debug.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/utility.hpp"
|
||||
|
||||
#include <timemory/backends/dmp.hpp>
|
||||
#include <timemory/backends/mpi.hpp>
|
||||
@@ -98,6 +99,8 @@ create_cpu_cid_entry(int64_t _tid)
|
||||
{
|
||||
using tim::auto_lock_t;
|
||||
|
||||
OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
// unique lock for _tid
|
||||
auto& _mtx = get_cpu_cid_stack_lock(_tid);
|
||||
auto_lock_t _lk{ _mtx, std::defer_lock };
|
||||
@@ -166,4 +169,50 @@ get_gotcha_bundle()
|
||||
"omnitrace", quirk::config<quirk::auto_start>{}));
|
||||
return _v;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
auto&
|
||||
get_thread_state_history(int64_t _idx = utility::get_thread_index())
|
||||
{
|
||||
static auto _v = utility::get_filled_array<OMNITRACE_MAX_THREADS>(
|
||||
[]() { return utility::get_reserved_vector<ThreadState>(32); });
|
||||
|
||||
return _v.at(_idx);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ThreadState&
|
||||
get_thread_state()
|
||||
{
|
||||
static thread_local ThreadState _v{ ThreadState::Enabled };
|
||||
return _v;
|
||||
}
|
||||
|
||||
ThreadState
|
||||
set_thread_state(ThreadState _n)
|
||||
{
|
||||
auto _o = get_thread_state();
|
||||
get_thread_state() = _n;
|
||||
return _o;
|
||||
}
|
||||
|
||||
ThreadState
|
||||
push_thread_state(ThreadState _v)
|
||||
{
|
||||
return get_thread_state_history().emplace_back(set_thread_state(_v));
|
||||
}
|
||||
|
||||
ThreadState
|
||||
pop_thread_state()
|
||||
{
|
||||
auto& _hist = get_thread_state_history();
|
||||
if(!_hist.empty())
|
||||
{
|
||||
set_thread_state(_hist.back());
|
||||
_hist.pop_back();
|
||||
}
|
||||
return get_thread_state();
|
||||
}
|
||||
|
||||
} // namespace omnitrace
|
||||
|
||||
@@ -46,7 +46,7 @@ namespace omnitrace
|
||||
// bundle of components around omnitrace_init and omnitrace_finalize
|
||||
using main_bundle_t =
|
||||
tim::lightweight_tuple<comp::wall_clock, comp::peak_rss, comp::cpu_clock,
|
||||
comp::cpu_util, pthread_gotcha_t>;
|
||||
comp::cpu_util, pthread_gotcha>;
|
||||
|
||||
using gotcha_bundle_t = tim::lightweight_tuple<fork_gotcha_t, mpi_gotcha_t>;
|
||||
|
||||
@@ -89,4 +89,27 @@ get_cpu_cid_entry(uint64_t _cid, int64_t _tid = threading::get_id());
|
||||
tim::mutex_t&
|
||||
get_cpu_cid_stack_lock(int64_t _tid = threading::get_id());
|
||||
|
||||
ThreadState&
|
||||
get_thread_state();
|
||||
|
||||
/// returns old state
|
||||
ThreadState set_thread_state(ThreadState);
|
||||
|
||||
ThreadState push_thread_state(ThreadState);
|
||||
|
||||
ThreadState
|
||||
pop_thread_state();
|
||||
|
||||
struct scoped_thread_state
|
||||
{
|
||||
scoped_thread_state(ThreadState _v) { push_thread_state(_v); }
|
||||
~scoped_thread_state() { pop_thread_state(); }
|
||||
};
|
||||
} // namespace omnitrace
|
||||
|
||||
#define OMNITRACE_SCOPED_THREAD_STATE(STATE) \
|
||||
::omnitrace::scoped_thread_state OMNITRACE_VARIABLE( \
|
||||
OMNITRACE_VAR_NAME_COMBINE(scoped_thread_state_, __LINE__)) \
|
||||
{ \
|
||||
::omnitrace::STATE \
|
||||
}
|
||||
|
||||
@@ -40,6 +40,19 @@ to_string(omnitrace::State _v)
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string
|
||||
to_string(omnitrace::ThreadState _v)
|
||||
{
|
||||
switch(_v)
|
||||
{
|
||||
case omnitrace::ThreadState::Enabled: return "Enabled";
|
||||
case omnitrace::ThreadState::Internal: return "Internal";
|
||||
case omnitrace::ThreadState::Disabled: return "Disabled";
|
||||
case omnitrace::ThreadState::Completed: return "Completed";
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string
|
||||
to_string(omnitrace::Mode _v)
|
||||
{
|
||||
|
||||
@@ -36,6 +36,15 @@ enum class State : unsigned short
|
||||
Finalized
|
||||
};
|
||||
|
||||
// used for specifying the state of omnitrace
|
||||
enum class ThreadState : unsigned short
|
||||
{
|
||||
Enabled = 0,
|
||||
Internal,
|
||||
Disabled,
|
||||
Completed,
|
||||
};
|
||||
|
||||
enum class Mode : unsigned short
|
||||
{
|
||||
Trace = 0,
|
||||
@@ -51,6 +60,9 @@ namespace std
|
||||
std::string
|
||||
to_string(omnitrace::State _v);
|
||||
|
||||
std::string
|
||||
to_string(omnitrace::ThreadState _v);
|
||||
|
||||
std::string
|
||||
to_string(omnitrace::Mode _v);
|
||||
} // namespace std
|
||||
|
||||
@@ -21,6 +21,10 @@
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/thread_data.hpp"
|
||||
#include "library/components/pthread_create_gotcha.hpp"
|
||||
#include "library/utility.hpp"
|
||||
|
||||
#include <timemory/backends/threading.hpp>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
@@ -30,4 +34,15 @@ instrumentation_bundles::instances()
|
||||
static auto _v = instance_array_t{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
void
|
||||
thread_deleter<void>::operator()() const
|
||||
{
|
||||
pthread_create_gotcha::shutdown(threading::get_id());
|
||||
set_thread_state(ThreadState::Completed);
|
||||
if(get_state() != State::Finalized && threading::get_id() == 0)
|
||||
omnitrace_finalize_hidden();
|
||||
}
|
||||
|
||||
template struct thread_deleter<void>;
|
||||
} // namespace omnitrace
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "library/common.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/state.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <array>
|
||||
@@ -40,6 +41,8 @@
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
ThreadState set_thread_state(ThreadState);
|
||||
|
||||
// bundle of components used in instrumentation
|
||||
using instrumentation_bundle_t =
|
||||
tim::component_bundle<api::omnitrace, comp::wall_clock*, comp::user_global_bundle*>;
|
||||
@@ -56,12 +59,20 @@ using unique_ptr_t = std::unique_ptr<Tp, thread_deleter<Tp>>;
|
||||
|
||||
static constexpr size_t max_supported_threads = OMNITRACE_MAX_THREADS;
|
||||
|
||||
template <>
|
||||
struct thread_deleter<void>
|
||||
{
|
||||
void operator()() const;
|
||||
};
|
||||
|
||||
extern template struct thread_deleter<void>;
|
||||
|
||||
template <typename Tp>
|
||||
struct thread_deleter
|
||||
{
|
||||
void operator()(Tp* ptr) const
|
||||
{
|
||||
if(get_state() != State::Finalized) omnitrace_finalize_hidden();
|
||||
thread_deleter<void>{}();
|
||||
delete ptr;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -74,6 +74,7 @@ get_sampler_is_sampling()
|
||||
void
|
||||
sampler::poll(std::atomic<State>* _state, nsec_t _interval, promise_t* _ready)
|
||||
{
|
||||
set_thread_state(ThreadState::Internal);
|
||||
threading::set_thread_name("omni.sampler");
|
||||
|
||||
// notify thread started
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace utility
|
||||
{
|
||||
namespace
|
||||
{
|
||||
/// provides an alternative thread index for when using threading::get_id() is not
|
||||
/// desirable
|
||||
inline auto
|
||||
get_thread_index()
|
||||
{
|
||||
static std::atomic<int64_t> _c{ 0 };
|
||||
static thread_local int64_t _v = _c++;
|
||||
return _v;
|
||||
}
|
||||
|
||||
/// fills any array with the result of the functor
|
||||
template <size_t N, typename FuncT>
|
||||
inline auto
|
||||
get_filled_array(FuncT&& _func)
|
||||
{
|
||||
using Tp = std::decay_t<decltype(_func())>;
|
||||
std::array<Tp, N> _v{};
|
||||
for(auto& itr : _v)
|
||||
itr = std::move(_func());
|
||||
return _v;
|
||||
}
|
||||
|
||||
/// returns a vector with a preallocated buffer
|
||||
template <typename... Tp>
|
||||
inline auto
|
||||
get_reserved_vector(size_t _n)
|
||||
{
|
||||
std::vector<Tp...> _v{};
|
||||
_v.reserve(_n);
|
||||
return _v;
|
||||
}
|
||||
} // namespace
|
||||
} // namespace utility
|
||||
} // namespace omnitrace
|
||||
@@ -63,12 +63,10 @@ endfunction()
|
||||
add_library(omnitrace-python-compile-options INTERFACE)
|
||||
add_library(omnitrace::omnitrace-python-compile-options ALIAS
|
||||
omnitrace-python-compile-options)
|
||||
add_cxx_flag_if_avail("-frtti" omnitrace-python-compile-options)
|
||||
add_cxx_flag_if_avail("-Wno-unused-value" omnitrace-python-compile-options)
|
||||
add_cxx_flag_if_avail("-Wno-range-loop-analysis" omnitrace-python-compile-options)
|
||||
add_cxx_flag_if_avail("-ftls-model=global-dynamic" omnitrace-python-compile-options)
|
||||
add_cxx_flag_if_avail("-Wno-deprecated-declarations" omnitrace-python-compile-options)
|
||||
add_cxx_flag_if_avail("-Wno-unused-but-set-parameter" omnitrace-python-compile-options)
|
||||
add_target_cxx_flag_if_avail(
|
||||
omnitrace-python-compile-options "-Wno-unused-value" "-Wno-range-loop-analysis"
|
||||
"-Wno-deprecated-declarations" "-Wno-unused-but-set-parameter"
|
||||
"-ftls-model=global-dynamic")
|
||||
|
||||
file(GLOB pyheaders ${CMAKE_CURRENT_LIST_DIR}/libpyomnitrace*.hpp)
|
||||
set(pysources ${CMAKE_CURRENT_LIST_DIR}/libpyomnitrace.cpp)
|
||||
|
||||
@@ -34,6 +34,18 @@ set(_base_environment
|
||||
"LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}"
|
||||
)
|
||||
|
||||
set(_lock_environment
|
||||
"OMNITRACE_USE_SAMPLING=OFF"
|
||||
"OMNITRACE_CRITICAL_TRACE=ON"
|
||||
"OMNITRACE_COLLAPSE_THREADS=ON"
|
||||
"OMNITRACE_TRACE_THREAD_LOCKS=ON"
|
||||
"OMNITRACE_COUT_OUTPUT=ON"
|
||||
"OMNITRACE_TIME_OUTPUT=OFF"
|
||||
"OMNITRACE_FLAT_PROFILE=ON"
|
||||
"OMNITRACE_TIMELINE_PROFILE=OFF"
|
||||
"LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}"
|
||||
)
|
||||
|
||||
set(_ompt_environment
|
||||
"OMNITRACE_USE_PERFETTO=ON"
|
||||
"OMNITRACE_USE_TIMEMORY=ON"
|
||||
@@ -144,7 +156,7 @@ function(OMNITRACE_ADD_TEST)
|
||||
list(APPEND TEST_ENVIRONMENT "OMNITRACE_USE_PID=OFF")
|
||||
endif()
|
||||
|
||||
if(NOT SKIP_BASELINE)
|
||||
if(NOT TEST_SKIP_BASELINE)
|
||||
add_test(
|
||||
NAME ${TEST_NAME}-baseline
|
||||
COMMAND ${COMMAND_PREFIX} $<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
|
||||
@@ -389,7 +401,7 @@ omnitrace_add_test(
|
||||
TARGET transpose
|
||||
MPI ${TRANSPOSE_USE_MPI}
|
||||
NUM_PROCS ${NUM_PROCS}
|
||||
REWRITE_ARGS -e -v 2
|
||||
REWRITE_ARGS -e -v 2 --print-instructions
|
||||
RUNTIME_ARGS -e -v 1 --label file line return args
|
||||
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON")
|
||||
|
||||
@@ -410,6 +422,30 @@ omnitrace_add_test(
|
||||
RUN_ARGS 10 ${NUM_THREADS} 1000
|
||||
ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF")
|
||||
|
||||
omnitrace_add_test(
|
||||
SKIP_RUNTIME SKIP_SAMPLING
|
||||
NAME parallel-overhead-locks-timemory
|
||||
TARGET parallel-overhead-locks
|
||||
LABELS "locks"
|
||||
REWRITE_ARGS -e -v 2 --min-instructions=4
|
||||
RUN_ARGS 10 4 1000
|
||||
ENVIRONMENT
|
||||
"${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF"
|
||||
REWRITE_RUN_PASS_REGEX
|
||||
"start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000"
|
||||
)
|
||||
|
||||
omnitrace_add_test(
|
||||
SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING
|
||||
NAME parallel-overhead-locks-perfetto
|
||||
TARGET parallel-overhead-locks
|
||||
LABELS "locks"
|
||||
REWRITE_ARGS -e -v 2 --min-instructions=8
|
||||
RUN_ARGS 10 4 1000
|
||||
ENVIRONMENT
|
||||
"${_lock_environment};OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON"
|
||||
PROPERTIES WILL_FAIL ON)
|
||||
|
||||
omnitrace_add_test(
|
||||
NAME user-api
|
||||
TARGET user-api
|
||||
|
||||
Reference in New Issue
Block a user