diff --git a/projects/rocprofiler-systems/.cmake-format.yaml b/projects/rocprofiler-systems/.cmake-format.yaml index 4c15e28863..6d98d08509 100644 --- a/projects/rocprofiler-systems/.cmake-format.yaml +++ b/projects/rocprofiler-systems/.cmake-format.yaml @@ -132,6 +132,44 @@ parse: kwargs: RESULT_VARIABLE: '*' OUTPUT_VARIABLE: '*' + omnitrace_find_static_library: + flags: + - NO_CACHE + - REQUIRED + - NO_DEFAULT_PATH + - NO_PACKAGE_ROOT_PATH + - NO_CMAKE_PATH + - NO_CMAKE_ENVIRONMENT_PATH + - NO_SYSTEM_ENVIRONMENT_PATH + - CMAKE_FIND_ROOT_PATH_BOTH + - ONLY_CMAKE_FIND_ROOT_PATH + - NO_CMAKE_FIND_ROOT_PATH + kwargs: + NAMES: '*' + NAMES_PER_DIR: '*' + HINTS: '*' + PATHS: '*' + PATH_SUFFIXES: '*' + DOC: '*' + omnitrace_find_shared_library: + flags: + - NO_CACHE + - REQUIRED + - NO_DEFAULT_PATH + - NO_PACKAGE_ROOT_PATH + - NO_CMAKE_PATH + - NO_CMAKE_ENVIRONMENT_PATH + - NO_SYSTEM_ENVIRONMENT_PATH + - CMAKE_FIND_ROOT_PATH_BOTH + - ONLY_CMAKE_FIND_ROOT_PATH + - NO_CMAKE_FIND_ROOT_PATH + kwargs: + NAMES: '*' + NAMES_PER_DIR: '*' + HINTS: '*' + PATHS: '*' + PATH_SUFFIXES: '*' + DOC: '*' override_spec: {} vartags: [] proptags: [] diff --git a/projects/rocprofiler-systems/cmake/BuildSettings.cmake b/projects/rocprofiler-systems/cmake/BuildSettings.cmake index 92b1166480..654676fd50 100644 --- a/projects/rocprofiler-systems/cmake/BuildSettings.cmake +++ b/projects/rocprofiler-systems/cmake/BuildSettings.cmake @@ -301,9 +301,18 @@ target_compile_options( omnitrace-static-libgcc INTERFACE $<$:$<$:-static-libgcc>> $<$:$<$:-static-libgcc>>) +target_link_options( + omnitrace-static-libgcc INTERFACE + $<$:$<$:-static-libgcc>> + $<$:$<$:-static-libgcc>>) + target_compile_options( omnitrace-static-libstdcxx - INTERFACE $<$:$<$:-static-libstdc++>>) + INTERFACE $<$:$<$:-static-libstdc++>> + ) +target_link_options( + omnitrace-static-libstdcxx INTERFACE + $<$:$<$:-static-libstdc++>>) # ----------------------------------------------------------------------------------------# # user customization diff --git a/projects/rocprofiler-systems/cmake/MacroUtilities.cmake b/projects/rocprofiler-systems/cmake/MacroUtilities.cmake index 488b5fccc0..e6976f5688 100644 --- a/projects/rocprofiler-systems/cmake/MacroUtilities.cmake +++ b/projects/rocprofiler-systems/cmake/MacroUtilities.cmake @@ -777,4 +777,14 @@ function(OMNITRACE_PYTHON_CONSOLE_SCRIPT SCRIPT_NAME SCRIPT_SUBMODULE) endif() endfunction() +function(OMNITRACE_FIND_STATIC_LIBRARY) + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX}) + find_library(${ARGN}) +endfunction() + +function(OMNITRACE_FIND_SHARED_LIBRARY) + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_SHARED_LIBRARY_SUFFIX}) + find_library(${ARGN}) +endfunction() + cmake_policy(POP) diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt index d28346e70e..700055ae69 100644 --- a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt @@ -7,7 +7,13 @@ find_package(Threads REQUIRED) add_executable(parallel-overhead parallel-overhead.cpp) target_link_libraries(parallel-overhead Threads::Threads) +add_executable(parallel-overhead-locks parallel-overhead.cpp) +target_link_libraries(parallel-overhead-locks Threads::Threads) +target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1) + if(NOT CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) set_target_properties(parallel-overhead PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + set_target_properties(parallel-overhead-locks PROPERTIES RUNTIME_OUTPUT_DIRECTORY + ${CMAKE_BINARY_DIR}) endif() diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp index 19c83f19a6..87a012dd19 100644 --- a/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp +++ b/projects/rocprofiler-systems/examples/parallel-overhead/parallel-overhead.cpp @@ -1,14 +1,23 @@ -#include #include #include #include #include #include +#if defined(USE_LOCKS) +# include +using auto_lock_t = std::unique_lock; +long total = 0; +std::mutex mtx{}; +#else +# include std::atomic total{ 0 }; +#endif + long fib(long n) __attribute__((noinline)); + void run(size_t nitr, long) __attribute__((noinline)); @@ -21,10 +30,19 @@ fib(long n) void run(size_t nitr, long n) { +#if defined(USE_LOCKS) + for(size_t i = 0; i < nitr; ++i) + { + auto _v = fib(n); + auto_lock_t _lk{ mtx }; + total += _v; + } +#else long local = 0; for(size_t i = 0; i < nitr; ++i) local += fib(n); total += local; +#endif } int @@ -42,7 +60,7 @@ main(int argc, char** argv) if(argc > 2) nthread = atol(argv[2]); if(argc > 3) nitr = atol(argv[3]); - printf("[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", + printf("\n[%s] Threads: %zu\n[%s] Iterations: %zu\n[%s] fibonacci(%li)...\n", _name.c_str(), nthread, _name.c_str(), nitr, _name.c_str(), nfib); std::vector threads{}; @@ -53,13 +71,16 @@ main(int argc, char** argv) threads.emplace_back(&run, _nitr, nfib); } +#if !defined(USE_LOCKS) auto _nitr = std::max(nitr - 0.25 * nitr, 1); run(_nitr, nfib - 0.1 * nfib); +#endif + for(auto& itr : threads) itr.join(); printf("[%s] fibonacci(%li) x %lu = %li\n", _name.c_str(), nfib, nthread, - total.load()); + static_cast(total)); return 0; } diff --git a/projects/rocprofiler-systems/examples/transpose/transpose.cpp b/projects/rocprofiler-systems/examples/transpose/transpose.cpp index 1b0495599b..e5a82f9752 100644 --- a/projects/rocprofiler-systems/examples/transpose/transpose.cpp +++ b/projects/rocprofiler-systems/examples/transpose/transpose.cpp @@ -95,10 +95,12 @@ transpose_a(int* in, int* out, int M, int N) void run(int rank, int tid, hipStream_t stream, int argc, char** argv) { - size_t nitr = 500; - unsigned int M = 4960 * 2; - unsigned int N = 4960 * 2; + size_t nitr = 500; + size_t nsync = 10; + unsigned int M = 4960 * 2; + unsigned int N = 4960 * 2; if(argc > 2) nitr = atoll(argv[2]); + if(argc > 3) nsync = atoll(argv[3]); auto_lock_t _lk{ print_lock }; std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; @@ -126,10 +128,11 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) dim3 block(32, 32, 1); // transpose_a auto t1 = std::chrono::high_resolution_clock::now(); - for(size_t i = 0; i < nitr; i++) + for(size_t i = 0; i < nitr; ++i) { transpose_a<<>>(in, out, M, N); check_hip_error(); + if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream)); } auto t2 = std::chrono::high_resolution_clock::now(); HIP_API_CALL(hipStreamSynchronize(stream)); @@ -179,15 +182,18 @@ do_a2a(int rank) int main(int argc, char** argv) { - int rank = 0; - int size = 1; - int nthreads = 2; - int nitr = 5000; + int rank = 0; + int size = 1; + int nthreads = 2; + int nitr = 5000; + size_t nsync = 10; if(argc > 1) nthreads = atoi(argv[1]); if(argc > 2) nitr = atoi(argv[2]); + if(argc > 3) nsync = atoll(argv[3]); printf("[transpose] Number of threads: %i\n", nthreads); printf("[transpose] Number of iterations: %i\n", nitr); + printf("[transpose] Syncing every %zu iterations\n", nsync); #if defined(USE_MPI) MPI_Init(&argc, &argv); diff --git a/projects/rocprofiler-systems/external/PTL b/projects/rocprofiler-systems/external/PTL index 4afd2bdeb9..1451b6c279 160000 --- a/projects/rocprofiler-systems/external/PTL +++ b/projects/rocprofiler-systems/external/PTL @@ -1 +1 @@ -Subproject commit 4afd2bdeb9ba0ff5c988a31f901f1629134cb109 +Subproject commit 1451b6c279558856c0bec89870b469e8fa4be308 diff --git a/projects/rocprofiler-systems/external/timemory b/projects/rocprofiler-systems/external/timemory index 9ccf9ec9f6..52e7034fd4 160000 --- a/projects/rocprofiler-systems/external/timemory +++ b/projects/rocprofiler-systems/external/timemory @@ -1 +1 @@ -Subproject commit 9ccf9ec9f648a7a0bb7d0753df44f56d77c4a9a9 +Subproject commit 52e7034fd419ff296506cdef43084f6071dbaba1 diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-avail/CMakeLists.txt b/projects/rocprofiler-systems/source/bin/omnitrace-avail/CMakeLists.txt index 01a5cce482..4fd5e02acc 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-avail/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/bin/omnitrace-avail/CMakeLists.txt @@ -9,7 +9,7 @@ add_executable( ${CMAKE_CURRENT_LIST_DIR}/avail.cpp ${CMAKE_CURRENT_LIST_DIR}/avail.hpp $) -target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include) +target_include_directories(omnitrace-avail PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_compile_definitions(omnitrace-avail PRIVATE OMNITRACE_EXTERN_COMPONENTS=0) target_link_libraries(omnitrace-avail PRIVATE omnitrace::omnitrace-compile-definitions omnitrace::omnitrace-interface-library) diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp b/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp index 2d5c1478a4..18d0767621 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace-avail/avail.cpp @@ -439,7 +439,9 @@ main(int argc, char** argv) parser.add_argument({ "" }, ""); parser.add_argument({ "[VIEW OPTIONS]" }, ""); - parser.add_argument({ "-A", "--available" }, "Only display available components") + parser + .add_argument({ "-A", "--available" }, + "Only display available components/settings/hw-counters") .max_count(1) .action([](parser_t& p) { available_only = p.get("available"); }); parser @@ -892,8 +894,8 @@ write_settings_info(std::ostream& os, const array_t& opts, _setting_output.end()); // patch up the categories - str_set_t _not_in_category_view{}; - auto _settings = tim::settings::shared_instance(); + auto _not_in_category_view = str_set_t{}; + auto _settings = tim::settings::shared_instance(); for(auto& itr : _setting_output) { auto _name = itr.find("environ")->second; @@ -942,6 +944,19 @@ write_settings_info(std::ostream& os, const array_t& opts, }), _setting_output.end()); + if(available_only) + { + _setting_output.erase( + std::remove_if(_setting_output.begin(), _setting_output.end(), + [&_settings](const auto& itr) { + auto iitr = _settings->find(itr.at("environ")); + if(iitr != _settings->end()) + return (iitr->second->get_enabled() == false); + return true; + }), + _setting_output.end()); + } + if(alphabetical) { std::sort(_setting_output.begin(), _setting_output.end(), diff --git a/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/CMakeLists.txt b/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/CMakeLists.txt index bce9129ad7..b9c67444e8 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/bin/omnitrace-critical-trace/CMakeLists.txt @@ -10,12 +10,13 @@ add_executable( ${CMAKE_CURRENT_LIST_DIR}/critical-trace.hpp $) -target_include_directories(omnitrace-critical-trace - PRIVATE ${CMAKE_CURRENT_LIST_DIR}/include) +target_include_directories(omnitrace-critical-trace PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_compile_definitions(omnitrace-critical-trace PRIVATE OMNITRACE_EXTERN_COMPONENTS=0) target_link_libraries( - omnitrace-critical-trace PRIVATE omnitrace::omnitrace-compile-definitions - omnitrace::omnitrace-interface-library) + omnitrace-critical-trace + PRIVATE omnitrace::omnitrace-compile-definitions + omnitrace::omnitrace-interface-library omnitrace::omnitrace-headers + omnitrace::omnitrace-timemory) set_target_properties( omnitrace-critical-trace PROPERTIES BUILD_RPATH "\$ORIGIN:${PROJECT_BINARY_DIR}:${CMAKE_BINARY_DIR}" diff --git a/projects/rocprofiler-systems/source/bin/omnitrace/module_function.hpp b/projects/rocprofiler-systems/source/bin/omnitrace/module_function.hpp index 3ec55eed9b..3b07121559 100644 --- a/projects/rocprofiler-systems/source/bin/omnitrace/module_function.hpp +++ b/projects/rocprofiler-systems/source/bin/omnitrace/module_function.hpp @@ -176,7 +176,7 @@ module_function::serialize(ArchiveT& ar, const unsigned) } ar(cereal::make_nvp("address_range", address_range), - cereal::make_nvp("instructions", num_instructions), + cereal::make_nvp("num_instructions", num_instructions), cereal::make_nvp("module", module_name), cereal::make_nvp("function", function_name), cereal::make_nvp("signature", signature)); diff --git a/projects/rocprofiler-systems/source/lib/common/delimit.hpp b/projects/rocprofiler-systems/source/lib/common/delimit.hpp index c489f24e7e..1c6834506a 100644 --- a/projects/rocprofiler-systems/source/lib/common/delimit.hpp +++ b/projects/rocprofiler-systems/source/lib/common/delimit.hpp @@ -22,6 +22,7 @@ #pragma once +#include #include #include @@ -31,6 +32,51 @@ inline namespace common { namespace { +template +inline auto +emplace_impl(ContainerT& _c, int, Args&&... _args) + -> decltype(_c.emplace_back(std::forward(_args)...)) +{ + return _c.emplace_back(std::forward(_args)...); +} + +template +inline auto +emplace_impl(ContainerT& _c, long, Args&&... _args) + -> decltype(_c.emplace(std::forward(_args)...)) +{ + return _c.emplace(std::forward(_args)...); +} + +template +inline auto +emplace(ContainerT& _c, Args&&... _args) +{ + return emplace_impl(_c, 0, std::forward(_args)...); +} + +template +inline auto +reserve_impl(ContainerT& _c, int, ArgT _arg) -> decltype(_c.reserve(_arg), bool()) +{ + _c.reserve(_arg); + return true; +} + +template +inline auto +reserve_impl(ContainerT&, long, ArgT) +{ + return false; +} + +template +inline auto +reserve(ContainerT& _c, ArgT _arg) +{ + return reserve_impl(_c, 0, _arg); +} + template > inline ContainerT delimit(const std::string& line, const char* delimiters = "\"',;: "); @@ -42,6 +88,18 @@ delimit(const std::string& line, const char* delimiters) ContainerT _result{}; size_t _beginp = 0; // position that is the beginning of the new string size_t _delimp = 0; // position of the delimiter in the string + if(reserve(_result, 0)) + { + size_t _nmax = 0; + for(char itr : line) + { + for(size_t j = 0; j < strlen(delimiters); ++j) + { + if(itr == delimiters[j]) ++_nmax; + } + } + reserve(_result, _nmax); + } while(_beginp < line.length() && _delimp < line.length()) { // find the first character (starting at _delimp) that is not a delimiter @@ -56,7 +114,7 @@ delimit(const std::string& line, const char* delimiters) // between this position and the next delimiter _tmp = line.substr(_beginp, _delimp - _beginp); // don't add empty strings - if(!_tmp.empty()) _result.emplace(_result.end(), _tmp); + if(!_tmp.empty()) emplace(_result, _tmp); } return _result; } diff --git a/projects/rocprofiler-systems/source/lib/common/invoke.hpp b/projects/rocprofiler-systems/source/lib/common/invoke.hpp index 7a3e689906..dfcfbd35f9 100644 --- a/projects/rocprofiler-systems/source/lib/common/invoke.hpp +++ b/projects/rocprofiler-systems/source/lib/common/invoke.hpp @@ -60,6 +60,21 @@ get_thread_index() return _v; } +template +auto +ignore(const char* _name, int _verbose, int _value, const char* _reason, Args... _args) +{ + if(_verbose >= _value) + { + fflush(stderr); + fprintf(stderr, + "[omnitrace][" OMNITRACE_COMMON_LIBRARY_NAME + "][%li] %s(%s) was ignored :: %s\n", + get_thread_index(), _name, join(", ", _args...).c_str(), _reason); + fflush(stderr); + } +} + template auto invoke(const char* _name, int _verbose, bool& _toggle, FuncT&& _func, Args... _args) diff --git a/projects/rocprofiler-systems/source/lib/common/join.hpp b/projects/rocprofiler-systems/source/lib/common/join.hpp index f0c829125c..0924469927 100644 --- a/projects/rocprofiler-systems/source/lib/common/join.hpp +++ b/projects/rocprofiler-systems/source/lib/common/join.hpp @@ -22,8 +22,10 @@ #pragma once +#include #include #include +#include #if !defined(OMNITRACE_FOLD_EXPRESSION) # define OMNITRACE_FOLD_EXPRESSION(...) ((__VA_ARGS__), ...) @@ -35,12 +37,51 @@ inline namespace common { namespace { +template +struct is_string_impl : std::false_type +{}; + +template <> +struct is_string_impl : std::true_type +{}; + +template <> +struct is_string_impl : std::true_type +{}; + +template <> +struct is_string_impl : std::true_type +{}; + +template <> +struct is_string_impl : std::true_type +{}; + +template +struct is_string : is_string_impl>> +{}; + +template +auto +as_string(ArgT&& _v, std::enable_if_t::value, int> = 0) +{ + return std::string{ "\"" } + _v + std::string{ "\"" }; +} + +template +auto +as_string(ArgT&& _v, std::enable_if_t::value, long> = 0) +{ + return _v; +} + template auto join(DelimT&& _delim, Args&&... _args) { std::stringstream _ss{}; - OMNITRACE_FOLD_EXPRESSION(_ss << _delim << _args); + _ss << std::boolalpha; + OMNITRACE_FOLD_EXPRESSION(_ss << _delim << as_string(_args)); auto _ret = _ss.str(); if constexpr(std::is_same::value) { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace-dl/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/omnitrace-dl/CMakeLists.txt index a5d36c9e38..c2f610fba4 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace-dl/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/omnitrace-dl/CMakeLists.txt @@ -6,43 +6,37 @@ set(CMAKE_BUILD_TYPE "Release") set(CMAKE_SKIP_RPATH OFF) -set(BUILD_RPATH_USE_ORIGIN ON) +set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) +set(CMAKE_CXX_VISIBILITY_PRESET "internal") +set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_library(omnitrace-dl-library SHARED) add_library(omnitrace::omnitrace-dl-library ALIAS omnitrace-dl-library) target_sources(omnitrace-dl-library PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/dl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dl.hpp) -target_link_libraries( - omnitrace-dl-library - PUBLIC ${dl_LIBRARY} $ - $) target_include_directories( omnitrace-dl-library PUBLIC $ $) +target_link_libraries( + omnitrace-dl-library + PUBLIC + ${dl_LIBRARY} + $ + $ + $,$,> + $,$,> + ) -check_cxx_compiler_flag("-fno-exceptions" omnitrace_dl_library_fno_exceptions) -check_cxx_compiler_flag("-ftls-model=local-dynamic" - omnitrace_dl_library_ftls_module_local_dynamic) - -if(OMNITRACE_BUILD_DEVELOPER) - if(omnitrace_dl_library_fno_exceptions) - target_compile_options(omnitrace-dl-library PRIVATE -fno-exceptions) - endif() - - if(omnitrace_dl_library_ftls_module_local_dynamic) - target_compile_options(omnitrace-dl-library PRIVATE -ftls-model=local-dynamic) - endif() -endif() +add_target_cxx_flag_if_avail(omnitrace-dl-library "-ftls-model=global-dynamic") +add_target_cxx_flag_if_avail(omnitrace-dl-library "-g") set_target_properties( omnitrace-dl-library PROPERTIES OUTPUT_NAME omnitrace-dl - CXX_VISIBILITY_PRESET "internal" VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR} - POSITION_INDEPENDENT_CODE ON BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN") diff --git a/projects/rocprofiler-systems/source/lib/omnitrace-dl/dl.cpp b/projects/rocprofiler-systems/source/lib/omnitrace-dl/dl.cpp index 75f3b4843f..f2a640b2aa 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace-dl/dl.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace-dl/dl.cpp @@ -122,9 +122,9 @@ const char* _omnitrace_dl_dlopen_descr = "RTLD_LAZY | RTLD_LOCAL"; /// This class contains function pointers for omnitrace's instrumentation functions struct OMNITRACE_HIDDEN_API indirect { - OMNITRACE_INLINE indirect(std::string omnilib, std::string userlib) - : m_omnilib{ find_path(std::move(omnilib)) } - , m_userlib{ find_path(std::move(userlib)) } + OMNITRACE_INLINE indirect(const std::string& omnilib, const std::string& userlib) + : m_omnilib{ find_path(omnilib) } + , m_userlib{ find_path(userlib) } { if(_omnitrace_dl_verbose >= 1) { @@ -218,19 +218,20 @@ struct OMNITRACE_HIDDEN_API indirect } } - static OMNITRACE_INLINE std::string find_path(std::string&& _path) + static OMNITRACE_INLINE std::string find_path(const std::string& _path) { - auto _paths = - delimit(join(":", get_env("OMNITRACE_PATH", ""), - get_env("LD_LIBRARY_PATH", ""), get_env("LIBRARY_PATH", "")), - ":"); + auto _paths_search = + join(":", get_env("OMNITRACE_PATH", ""), get_env("LD_LIBRARY_PATH", ""), + get_env("LIBRARY_PATH", "")); + + auto _paths = delimit(_paths_search, ":"); auto file_exists = [](const std::string& name) { struct stat buffer; return (stat(name.c_str(), &buffer) == 0); }; - for(auto&& itr : _paths) + for(const auto& itr : _paths) { auto _f = join('/', itr, _path); if(file_exists(_f)) return _f; @@ -269,10 +270,10 @@ get_indirect() OMNITRACE_HIDDEN_API; indirect& get_indirect() { - static auto _v = - indirect{ get_env("OMNITRACE_LIBRARY", "libomnitrace.so"), - get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so") }; - return _v; + static auto _libomni = get_env("OMNITRACE_LIBRARY", "libomnitrace.so"); + static auto _libuser = get_env("OMNITRACE_USER_LIBRARY", "libomnitrace-user.so"); + static auto* _v = new indirect{ _libomni, _libuser }; + return *_v; } auto& @@ -340,6 +341,10 @@ bool _omnitrace_dl_fini = (std::atexit([]() { (::omnitrace::dl::get_thread_status() = false), \ __VA_ARGS__) +#define OMNITRACE_DL_IGNORE(...) \ + ::omnitrace::common::ignore(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \ + __VA_ARGS__) + #define OMNITRACE_DL_INVOKE_STATUS(STATUS, ...) \ ::omnitrace::common::invoke(__FUNCTION__, ::omnitrace::dl::_omnitrace_dl_verbose, \ STATUS, __VA_ARGS__) @@ -464,18 +469,30 @@ extern "C" void omnitrace_set_env(const char* a, const char* b) { + if(dl::get_inited() && dl::get_active()) + { + OMNITRACE_DL_IGNORE(2, "already initialized and active", a, b); + return; + } setenv(a, b, 0); OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_env_f, a, b); } void omnitrace_set_mpi(bool a, bool b) { + if(dl::get_inited() && dl::get_active()) + { + OMNITRACE_DL_IGNORE(2, "already initialized and active", a, b); + return; + } OMNITRACE_DL_INVOKE(get_indirect().omnitrace_set_mpi_f, a, b); } void omnitrace_register_source(const char* file, const char* func, size_t line, size_t address, const char* source) { + OMNITRACE_DL_LOG(3, "%s(\"%s\", \"%s\", %zu, %zu, \"%s\")\n", __FUNCTION__, file, + func, line, address, source); OMNITRACE_DL_INVOKE(get_indirect().omnitrace_register_source_f, file, func, line, address, source); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt index 92fcfa674c..1d6a9f717f 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/omnitrace/CMakeLists.txt @@ -8,8 +8,8 @@ add_library(omnitrace-interface-library INTERFACE) add_library(omnitrace::omnitrace-interface-library ALIAS omnitrace-interface-library) target_include_directories( - omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_BINARY_DIR}/include) + omnitrace-interface-library INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(omnitrace-interface-library SYSTEM INTERFACE ${perfetto_DIR}/sdk) @@ -54,6 +54,7 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/library/coverage.cpp ${CMAKE_CURRENT_LIST_DIR}/library/cpu_freq.cpp ${CMAKE_CURRENT_LIST_DIR}/library/critical_trace.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/debug.cpp ${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp ${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp ${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp @@ -70,6 +71,8 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/omnitrace.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_create_gotcha.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_mutex_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/user_region.cpp ${perfetto_DIR}/sdk/perfetto.cc) @@ -92,6 +95,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/thread_data.hpp ${CMAKE_CURRENT_LIST_DIR}/library/thread_sampler.hpp ${CMAKE_CURRENT_LIST_DIR}/library/timemory.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/utility.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/fwd.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.hpp @@ -102,6 +106,8 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/roctracer_callbacks.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_create_gotcha.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/pthread_mutex_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/user_region.hpp ${perfetto_DIR}/sdk/perfetto.h) @@ -119,7 +125,8 @@ if(OMNITRACE_USE_ROCM_SMI) PRIVATE ${CMAKE_CURRENT_LIST_DIR}/library/components/rocm_smi.cpp) endif() -target_link_libraries(omnitrace-object-library PRIVATE omnitrace-interface-library) +target_link_libraries(omnitrace-object-library + PRIVATE omnitrace::omnitrace-interface-library) if(OMNITRACE_DYNINST_API_RT) get_filename_component(OMNITRACE_DYNINST_API_RT_DIR "${OMNITRACE_DYNINST_API_RT}" diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp index 24dedaf322..2877ef041c 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library.cpp @@ -27,6 +27,7 @@ #include "library/components/fwd.hpp" #include "library/components/mpi_gotcha.hpp" #include "library/components/pthread_gotcha.hpp" +#include "library/components/pthread_mutex_gotcha.hpp" #include "library/config.hpp" #include "library/coverage.hpp" #include "library/critical_trace.hpp" @@ -221,7 +222,7 @@ omnitrace_push_trace_hidden(const char* name) std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); auto _ts = comp::wall_clock::record(); add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, 0, + threading::get_id(), _cid, 0, _parent_cid, _ts, 0, 0, critical_trace::add_hash_id(name), _depth); } } @@ -263,7 +264,7 @@ omnitrace_pop_trace_hidden(const char* name) auto _ts = comp::wall_clock::record(); std::tie(_parent_cid, _depth) = get_cpu_cid_parents()->at(_cid); add_critical_trace( - threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, + threading::get_id(), _cid, 0, _parent_cid, _ts, _ts, 0, critical_trace::add_hash_id(name), _depth); } } @@ -436,6 +437,8 @@ omnitrace_init_library_hidden() if(get_state() != State::PreInit || get_state() == State::Init || _once) return; _once = true; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + OMNITRACE_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s. Setting to %s...\n", std::to_string(get_state()).c_str(), std::to_string(State::Init).c_str()); @@ -445,7 +448,7 @@ omnitrace_init_library_hidden() "glibc's backtrace() occurs...\n"); { std::stringstream _ss{}; - tim::print_backtrace<64>(_ss); + tim::print_backtrace<16>(_ss); (void) _ss; } @@ -471,7 +474,7 @@ omnitrace_init_library_hidden() // below will effectively do: // get_cpu_cid_stack(0)->emplace_back(-1); // plus query some env variables - add_critical_trace(0, -1, 0, 0, 0, 0, 0, 0); + add_critical_trace(0, -1, 0, 0, 0, 0, 0, 0, 0); if(gpu::device_count() == 0 && get_state() != State::Active) { @@ -550,6 +553,8 @@ omnitrace_init_tooling_hidden() if(get_state() != State::PreInit || get_state() == State::Init || _once) return false; _once = true; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + OMNITRACE_CONDITIONAL_THROW( get_state() == State::Init, "%s called after omnitrace_init_library() was explicitly called", @@ -571,6 +576,9 @@ omnitrace_init_tooling_hidden() OMNITRACE_DEBUG_F("\n"); auto _dtor = scope::destructor{ []() { + // if set to finalized, don't continue + if(get_state() > State::Active) return; + if(config::get_trace_thread_locks()) pthread_mutex_gotcha::validate(); if(get_use_thread_sampling()) { pthread_gotcha::push_enable_sampling_on_child_threads(false); @@ -595,6 +603,12 @@ omnitrace_init_tooling_hidden() sampling::block_signals(); } + if(get_use_critical_trace()) + { + // initialize the thread pool + (void) tasking::critical_trace::get_task_group(); + } + if(get_use_timemory()) { comp::user_global_bundle::global_init(); @@ -892,8 +906,8 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a }); std::atexit([]() { - // if not already finalized then we should finalize - if(get_state() != State::Finalized) omnitrace_finalize_hidden(); + // if active (not already finalized) then we should finalize + if(get_state() == State::Active) omnitrace_finalize_hidden(); }); OMNITRACE_CONDITIONAL_BASIC_PRINT_F( @@ -940,6 +954,8 @@ omnitrace_finalize_hidden(void) // disable thread id recycling during finalization threading::recycle_ids() = false; + set_thread_state(ThreadState::Completed); + // return if not active if(get_state() != State::Active) { @@ -1193,25 +1209,33 @@ omnitrace_finalize_hidden(void) using char_vec_t = std::vector; OMNITRACE_VERBOSE_F(3, "Getting the trace data...\n"); -#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0 - using perfetto_mpi_get_t = tim::operation::finalize::mpi_get; - - char_vec_t _trace_data{ tracing_session->ReadTraceBlocking() }; - std::vector _rank_data = {}; - auto _combine = [](char_vec_t& _dst, const char_vec_t& _src) -> char_vec_t& { - _dst.reserve(_dst.size() + _src.size()); - for(auto&& itr : _src) - _dst.emplace_back(itr); - return _dst; - }; - - perfetto_mpi_get_t{ _rank_data, _trace_data, _combine }; auto trace_data = char_vec_t{}; - for(auto& itr : _rank_data) - trace_data = - (trace_data.empty()) ? std::move(itr) : _combine(trace_data, itr); +#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0 + if(get_perfetto_combined_traces()) + { + using perfetto_mpi_get_t = + tim::operation::finalize::mpi_get; + + char_vec_t _trace_data{ tracing_session->ReadTraceBlocking() }; + std::vector _rank_data = {}; + auto _combine = [](char_vec_t& _dst, const char_vec_t& _src) -> char_vec_t& { + _dst.reserve(_dst.size() + _src.size()); + for(auto&& itr : _src) + _dst.emplace_back(itr); + return _dst; + }; + + perfetto_mpi_get_t{ _rank_data, _trace_data, _combine }; + for(auto& itr : _rank_data) + trace_data = + (trace_data.empty()) ? std::move(itr) : _combine(trace_data, itr); + } + else + { + trace_data = tracing_session->ReadTraceBlocking(); + } #else - char_vec_t trace_data{ tracing_session->ReadTraceBlocking() }; + trace_data = tracing_session->ReadTraceBlocking(); #endif if(!trace_data.empty()) diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp index 6b5d85f8fa..b459a40ed9 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library.hpp @@ -53,8 +53,8 @@ template inline void add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, - size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, size_t _hash, - uint16_t _depth, uint16_t _prio = 0) + size_t _parent_cid, int64_t _ts_beg, int64_t _ts_val, uintptr_t _queue, + size_t _hash, uint16_t _depth, uint16_t _prio = 0) { // clang-format off // these are used to create unique type mutexes @@ -62,6 +62,8 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, struct cpu_cid_stack {}; // clang-format on + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + using tim::type_mutex; using auto_lock_t = tim::auto_lock_t; static constexpr auto num_mutexes = max_supported_threads; @@ -80,9 +82,9 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, if(!_self_lk.owns_lock()) _self_lk.lock(); auto& _critical_trace = critical_trace::get(_self_tid); - _critical_trace->emplace_back( - critical_trace::entry{ _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, - _gpu_cid, _parent_cid, _ts_beg, _ts_val, _hash }); + _critical_trace->emplace_back(critical_trace::entry{ + _prio, DevID, PhaseID, _depth, _targ_tid, _cpu_cid, _gpu_cid, _parent_cid, + _ts_beg, _ts_val, _queue, _hash }); } if constexpr(UpdateStack) @@ -119,6 +121,6 @@ add_critical_trace(int64_t _targ_tid, size_t _cpu_cid, size_t _gpu_cid, } tim::consume_parameters(_targ_tid, _cpu_cid, _gpu_cid, _parent_cid, _ts_beg, _ts_val, - _hash, _depth, _prio); + _queue, _hash, _depth, _prio); } } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.cpp index 3db250bdf5..6a91c45a7b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.cpp @@ -271,7 +271,7 @@ backtrace::sample(int signum) m_ts = clock_type::now(); m_thr_cpu_ts = tim::get_clock_thread_now(); m_mem_peak = tim::get_peak_rss(RUSAGE_THREAD); - m_data = tim::get_unw_backtrace<128, 4, false>(); + m_data = tim::get_unw_backtrace(); auto* itr = m_data.begin(); for(; itr != m_data.end(); ++itr, ++m_size) { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp index 0ba84547ef..ac9f93a620 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/backtrace.hpp @@ -52,8 +52,10 @@ struct backtrace , tim::concepts::component { static constexpr size_t num_hw_counters = TIMEMORY_PAPI_ARRAY_SIZE; + static constexpr size_t buffer_width = 512; + static constexpr size_t stack_depth = 128; - using data_t = std::array; + using data_t = std::array; using clock_type = std::chrono::steady_clock; using time_point_type = typename clock_type::time_point; using value_type = void; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/fork_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/fork_gotcha.cpp index 6b87494b67..1e61af50bd 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/fork_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/fork_gotcha.cpp @@ -23,6 +23,12 @@ #include "library/components/fork_gotcha.hpp" #include "library/config.hpp" #include "library/debug.hpp" +#include "library/state.hpp" + +#include +#include + +#include namespace omnitrace { @@ -37,6 +43,8 @@ fork_gotcha::configure() void fork_gotcha::audit(const gotcha_data_t&, audit::incoming) { + OMNITRACE_VERBOSE(1, "fork() called on PID %i (rank: %i), TID %li\n", + process::get_id(), dmp::rank(), threading::get_id()); OMNITRACE_CONDITIONAL_BASIC_PRINT( get_debug_env(), "Warning! Calling fork() within an OpenMPI application using libfabric " @@ -45,9 +53,13 @@ fork_gotcha::audit(const gotcha_data_t&, audit::incoming) } void -fork_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, pid_t _pid) +fork_gotcha::audit(const gotcha_data_t&, audit::outgoing, pid_t _pid) { - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), "%s() return PID %i\n", - _data.tool_id.c_str(), (int) _pid); + if(_pid != 0) + { + OMNITRACE_VERBOSE(1, "fork() called on PID %i created PID %i\n", getppid(), _pid); + tim::settings::use_output_suffix() = true; + tim::settings::default_process_suffix() = process::get_id(); + } } } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/functors.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/functors.hpp index ff15643dab..880c47b54b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/functors.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/functors.hpp @@ -24,10 +24,13 @@ #include "library/components/fwd.hpp" #include "library/defines.hpp" +#include "library/runtime.hpp" +#include "library/state.hpp" #include "library/timemory.hpp" -#include "timemory/mpl/concepts.hpp" -#include "timemory/mpl/function_traits.hpp" -#include "timemory/utility/macros.hpp" + +#include +#include +#include #include #include @@ -71,6 +74,7 @@ struct functors : comp::base, void> int> = 0> static auto start(Args&&... _args) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); get_functors().first(std::forward(_args)...); } @@ -79,6 +83,7 @@ struct functors : comp::base, void> int> = 0> static auto stop(Args&&... _args) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); get_functors().second(std::forward(_args)...); } @@ -87,24 +92,28 @@ struct functors : comp::base, void> template = 0> void start() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); get_functors().first(m_prefix); } template = 0> void stop() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); get_functors().second(m_prefix); } template = 0> void start() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); get_functors().first(); } template = 0> void stop() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); get_functors().second(); } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/mpi_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/mpi_gotcha.cpp index b50a143b29..f401e4ebe2 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/mpi_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/mpi_gotcha.cpp @@ -46,8 +46,7 @@ omnitrace_mpi_set_attr() return MPI_SUCCESS; }; static auto _mpi_fini = [](MPI_Comm, int, void*, void*) { - OMNITRACE_CONDITIONAL_BASIC_PRINT(get_debug_env(), - "MPI Comm attribute finalize\n"); + OMNITRACE_DEBUG("MPI Comm attribute finalize\n"); if(mpip_index != std::numeric_limits::max()) comp::deactivate_mpip, api::omnitrace>(mpip_index); @@ -83,8 +82,7 @@ mpi_gotcha::configure() void mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***) { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s(int*, char***)\n", - _data.tool_id.c_str()); + OMNITRACE_BASIC_DEBUG_F("%s(int*, char***)\n", _data.tool_id.c_str()); if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit); @@ -98,8 +96,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***) void mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, int, int*) { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s(int*, char***, int, int*)\n", - _data.tool_id.c_str()); + OMNITRACE_BASIC_DEBUG_F("%s(int*, char***, int, int*)\n", _data.tool_id.c_str()); if(get_state() < ::omnitrace::State::Init) set_state(::omnitrace::State::PreInit); @@ -113,7 +110,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, int*, char***, in void mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming) { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s()\n", _data.tool_id.c_str()); + OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str()); if(mpip_index != std::numeric_limits::max()) comp::deactivate_mpip, @@ -130,7 +127,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming) void mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t, int* _val) { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s()\n", _data.tool_id.c_str()); + OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str()); omnitrace_push_trace_hidden(_data.tool_id.c_str()); if(_data.tool_id == "MPI_Comm_rank") @@ -151,8 +148,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming, comm_t, int* _val void mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env(), "%s() returned %i\n", - _data.tool_id.c_str(), (int) _retval); + OMNITRACE_BASIC_DEBUG_F("%s() returned %i\n", _data.tool_id.c_str(), (int) _retval); if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Init") == 0) { @@ -164,8 +160,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) // were excluded via a regex expression) if(get_use_mpip()) { - OMNITRACE_CONDITIONAL_BASIC_PRINT_F(get_debug_env() || get_verbose_env() > 0, - "Activating MPI wrappers...\n"); + OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n"); // use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST // to control the gotcha bindings at runtime @@ -186,13 +181,12 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) tim::mpi::set_rank(m_rank); tim::settings::default_process_suffix() = m_rank; get_perfetto_output_filename().clear(); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - get_debug() || get_verbose() > 0, "[pid=%i] MPI rank: %i (%i)\n", - process::get_id(), tim::mpi::rank(), m_rank); + OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n", + process::get_id(), tim::mpi::rank(), m_rank); } else { - OMNITRACE_BASIC_PRINT_F("%s() returned %i :: nullptr to rank\n", + OMNITRACE_BASIC_VERBOSE(0, "%s() returned %i :: nullptr to rank\n", _data.tool_id.c_str(), (int) _retval); } } @@ -202,19 +196,19 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) { m_size = std::max(*m_size_ptr, m_size); tim::mpi::set_size(m_size); - OMNITRACE_CONDITIONAL_BASIC_PRINT( - get_debug() || get_verbose() > 0, "[pid=%i] MPI size: %i (%i)\n", - process::get_id(), tim::mpi::size(), m_size); + OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n", + process::get_id(), tim::mpi::size(), m_size); } else { - OMNITRACE_BASIC_PRINT_F("%s() returned %i :: nullptr to size\n", + OMNITRACE_BASIC_VERBOSE(0, "%s() returned %i :: nullptr to size\n", _data.tool_id.c_str(), (int) _retval); } } else { - OMNITRACE_BASIC_PRINT_F("%s() returned %i :: unexpected function wrapper\n", + OMNITRACE_BASIC_VERBOSE(0, + "%s() returned %i :: unexpected function wrapper\n", _data.tool_id.c_str(), (int) _retval); } } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp new file mode 100644 index 0000000000..aaed456bb2 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.cpp @@ -0,0 +1,301 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/pthread_create_gotcha.hpp" +#include "library/components/omnitrace.hpp" +#include "library/components/pthread_gotcha.hpp" +#include "library/components/roctracer.hpp" +#include "library/config.hpp" +#include "library/debug.hpp" +#include "library/runtime.hpp" +#include "library/sampling.hpp" +#include "library/thread_data.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace omnitrace +{ +namespace sampling +{ +std::set +setup(); +std::set +shutdown(); +} // namespace sampling + +namespace mpl = tim::mpl; + +using bundle_t = tim::lightweight_tuple; +using wall_pw_t = mpl::piecewise_select; // only wall-clock +using main_pw_t = mpl::piecewise_ignore; // exclude wall-clock + +namespace +{ +auto* is_shutdown = new bool{ false }; // intentional data leak +auto* bundles = new std::map>{}; +auto* bundles_mutex = new std::mutex{}; +auto bundles_dtor = scope::destructor{ []() { + omnitrace::pthread_create_gotcha::shutdown(); + delete bundles; + delete bundles_mutex; + bundles = nullptr; + bundles_mutex = nullptr; +} }; + +inline void +start_bundle(bundle_t& _bundle) +{ + if(!get_use_timemory()) return; + OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str()); + _bundle.push(); + _bundle.start(); +} + +inline void +stop_bundle(bundle_t& _bundle, int64_t _tid) +{ + if(!get_use_timemory()) return; + OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n", + _bundle.key().c_str(), _tid); + _bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value + // update roctracer_data + _bundle.store(std::plus{}, + _bundle.get()->get() * units::sec); + // stop all other components including roctracer_data after update + _bundle.stop(main_pw_t{}); + // exclude popping wall-clock + _bundle.pop(_tid); +} +} // namespace + +//--------------------------------------------------------------------------------------// + +pthread_create_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, + bool _enable_sampling, int64_t _parent, + promise_t* _p) +: m_enable_sampling{ _enable_sampling } +, m_parent_tid{ _parent } +, m_routine{ _routine } +, m_arg{ _arg } +, m_promise{ _p } +{} + +void* +pthread_create_gotcha::wrapper::operator()() const +{ + if(is_shutdown && *is_shutdown) + { + if(m_promise) m_promise->set_value(); + // execute the original function + return m_routine(m_arg); + } + + set_thread_state(ThreadState::Internal); + + int64_t _tid = -1; + auto _is_sampling = false; + auto _bundle = std::shared_ptr{}; + auto _signals = std::set{}; + auto _coverage = (get_mode() == omnitrace::Mode::Coverage); + auto _dtor = scope::destructor{ [&]() { + set_thread_state(ThreadState::Internal); + if(_is_sampling) + { + sampling::block_signals(_signals); + sampling::shutdown(); + } + + pthread_create_gotcha::shutdown(_tid); + set_thread_state(ThreadState::Completed); + } }; + + auto _active = (get_state() == omnitrace::State::Active && bundles != nullptr && + bundles_mutex != nullptr); + + if(_active && !_coverage) + { + _tid = threading::get_id(); + threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str()); + if(bundles && bundles_mutex) + { + std::unique_lock _lk{ *bundles_mutex }; + _bundle = bundles->emplace(_tid, std::make_shared("start_thread")) + .first->second; + } + if(_bundle) start_bundle(*_bundle); + get_cpu_cid_stack(threading::get_id(), m_parent_tid); + if(m_enable_sampling) + { + // initialize thread-local statics + (void) tim::get_unw_backtrace<12, 1, false>(); + _is_sampling = true; + pthread_gotcha::push_enable_sampling_on_child_threads(false); + _signals = sampling::setup(); + pthread_gotcha::pop_enable_sampling_on_child_threads(); + sampling::unblock_signals(); + } + } + + if(m_promise) m_promise->set_value(); + + set_thread_state(ThreadState::Enabled); + // execute the original function + return m_routine(m_arg); +} + +void* +pthread_create_gotcha::wrapper::wrap(void* _arg) +{ + if(_arg == nullptr) return nullptr; + + // convert the argument + wrapper* _wrapper = static_cast(_arg); + + // execute the original function + return (*_wrapper)(); +} + +void +pthread_create_gotcha::configure() +{ + pthread_create_gotcha_t::get_initializer() = []() { + pthread_create_gotcha_t::template configure< + 0, int, pthread_t*, const pthread_attr_t*, void* (*) (void*), void*>( + "pthread_create"); + }; +} + +void +pthread_create_gotcha::shutdown() +{ + if(is_shutdown) + { + if(*is_shutdown) return; + *is_shutdown = true; + } + + if(!bundles_mutex || !bundles) return; + + std::unique_lock _lk{ *bundles_mutex }; + unsigned long _ndangling = 0; + for(auto itr : *bundles) + { + if(itr.second) + { + stop_bundle(*itr.second, itr.first); + ++_ndangling; + } + itr.second.reset(); + } + + bundles->clear(); + + OMNITRACE_CONDITIONAL_BASIC_PRINT( + (get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0, + "[pthread_create_gotcha::shutdown] cleaned up %lu dangling bundles\n", + _ndangling); +} + +void +pthread_create_gotcha::shutdown(int64_t _tid) +{ + if(is_shutdown && *is_shutdown) return; + + if(!bundles_mutex || !bundles) return; + + std::unique_lock _lk{ *bundles_mutex }; + auto itr = bundles->find(_tid); + if(itr != bundles->end()) + { + if(itr->second) stop_bundle(*itr->second, itr->first); + itr->second.reset(); + bundles->erase(itr); + } +} + +// pthread_create +int +pthread_create_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, + void* (*start_routine)(void*), void* arg) const +{ + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + bundle_t _bundle{ "pthread_create" }; + auto _enable_sampling = pthread_gotcha::sampling_enabled_on_child_threads(); + auto _coverage = (get_mode() == omnitrace::Mode::Coverage); + auto _active = (get_state() == omnitrace::State::Active); + int64_t _tid = (_active) ? threading::get_id() : 0; + + if(_active) + { + OMNITRACE_VERBOSE(1, "Creating new thread on PID %i (rank: %i), TID %li\n", + process::get_id(), dmp::rank(), _tid); + } + + // ensure that cpu cid stack exists on the parent thread if active + if(!_coverage && _active) get_cpu_cid_stack(); + + if(!get_use_sampling() || !_enable_sampling) + { + auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr); + // create the thread + auto _ret = + ::pthread_create(thread, attr, &wrapper::wrap, static_cast(_obj)); + return _ret; + } + + // block the signals in entire process + OMNITRACE_DEBUG("blocking signals...\n"); + tim::sampling::block_signals({ SIGALRM, SIGPROF }, + tim::sampling::sigmask_scope::process); + + start_bundle(_bundle); + + // promise set by thread when signal handler is configured + auto _promise = std::promise{}; + auto _fut = _promise.get_future(); + auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise); + + // create the thread + auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast(_wrap)); + + // wait for thread to set promise + OMNITRACE_DEBUG("waiting for child to signal it is setup...\n"); + _fut.wait(); + + stop_bundle(_bundle, threading::get_id()); + + // unblock the signals in the entire process + OMNITRACE_DEBUG("unblocking signals...\n"); + tim::sampling::unblock_signals({ SIGALRM, SIGPROF }, + tim::sampling::sigmask_scope::process); + + OMNITRACE_DEBUG("returning success...\n"); + return _ret; +} + +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp new file mode 100644 index 0000000000..f6e20e6d3b --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_create_gotcha.hpp @@ -0,0 +1,71 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "library/common.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include +#include + +namespace omnitrace +{ +struct pthread_create_gotcha : tim::component::base +{ + struct wrapper + { + using routine_t = void* (*) (void*); + using promise_t = std::promise; + + wrapper(routine_t _routine, void* _arg, bool, int64_t, promise_t*); + void* operator()() const; + + static void* wrap(void* _arg); + + private: + bool m_enable_sampling = false; + int64_t m_parent_tid = 0; + routine_t m_routine = nullptr; + void* m_arg = nullptr; + promise_t* m_promise = nullptr; + }; + + TIMEMORY_DEFAULT_OBJECT(pthread_create_gotcha) + + // string id for component + static std::string label() { return "pthread_create_gotcha"; } + + // generate the gotcha wrappers + static void configure(); + static void shutdown(); + static void shutdown(int64_t); + + // pthread_create + int operator()(pthread_t* thread, const pthread_attr_t* attr, + void* (*start_routine)(void*), void* arg) const; +}; + +using pthread_create_gotcha_t = + tim::component::gotcha<2, std::tuple<>, pthread_create_gotcha>; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.cpp index c33fb4140b..da79a5d0b9 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.cpp @@ -22,224 +22,62 @@ #include "library/components/pthread_gotcha.hpp" #include "library/components/omnitrace.hpp" +#include "library/components/pthread_create_gotcha.hpp" +#include "library/components/pthread_mutex_gotcha.hpp" #include "library/components/roctracer.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/runtime.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" +#include "library/utility.hpp" #include #include #include -#include #include +#include +#include + namespace omnitrace { -namespace sampling -{ -std::set -setup(); -std::set -shutdown(); -} // namespace sampling - -namespace mpl = tim::mpl; - -using bundle_t = tim::lightweight_tuple; -using wall_pw_t = mpl::piecewise_select; // only wall-clock -using main_pw_t = mpl::piecewise_ignore; // exclude wall-clock - namespace { -auto* is_shutdown = new bool{ false }; // intentional data leak -auto* bundles = new std::map>{}; -auto* bundles_mutex = new std::mutex{}; -auto bundles_dtor = scope::destructor{ []() { - omnitrace::pthread_gotcha::shutdown(); - delete bundles; - delete bundles_mutex; - bundles = nullptr; - bundles_mutex = nullptr; -} }; +using bundle_t = tim::lightweight_tuple; -inline void -start_bundle(bundle_t& _bundle) +auto& +get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index()) { - if(!get_use_timemory()) return; - OMNITRACE_BASIC_VERBOSE_F(3, "starting bundle '%s'...\n", _bundle.key().c_str()); - if(comp::roctracer::is_setup()) - { - _bundle.push(); - _bundle.start(); - } - else - { - _bundle.push(wall_pw_t{}); - _bundle.start(wall_pw_t{}); - } -} - -inline void -stop_bundle(bundle_t& _bundle, int64_t _tid) -{ - if(!get_use_timemory()) return; - OMNITRACE_BASIC_VERBOSE_F(3, "stopping bundle '%s' in thread %li...\n", - _bundle.key().c_str(), _tid); - _bundle.stop(wall_pw_t{}); // stop wall-clock so we can get the value - // update roctracer_data - _bundle.store(std::plus{}, - _bundle.get()->get() * units::sec); - // stop all other components including roctracer_data after update - _bundle.stop(main_pw_t{}); - // exclude popping wall-clock - _bundle.pop(_tid); -} - -auto -get_thread_index() -{ - static std::atomic _c{ 0 }; - static thread_local int64_t _v = _c++; - return _v; + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(32); }); + return _v.at(_idx); } auto& -get_sampling_on_child_threads_history(int64_t _idx = get_thread_index()) +get_bundle() { - static auto _v = std::array, OMNITRACE_MAX_THREADS>{}; - return _v.at(_idx); + static auto _v = std::unique_ptr{}; + if(!_v) _v = std::make_unique("pthread_gotcha"); + return _v; } } // namespace //--------------------------------------------------------------------------------------// -pthread_gotcha::wrapper::wrapper(routine_t _routine, void* _arg, bool _enable_sampling, - int64_t _parent, promise_t* _p) -: m_enable_sampling{ _enable_sampling } -, m_parent_tid{ _parent } -, m_routine{ _routine } -, m_arg{ _arg } -, m_promise{ _p } -{} - -void* -pthread_gotcha::wrapper::operator()() const -{ - if(is_shutdown && *is_shutdown) - { - if(m_promise) m_promise->set_value(); - // execute the original function - return m_routine(m_arg); - } - - int64_t _tid = -1; - auto _is_sampling = false; - auto _bundle = std::shared_ptr{}; - auto _signals = std::set{}; - auto _coverage = (get_mode() == omnitrace::Mode::Coverage); - auto _dtor = scope::destructor{ [&]() { - if(_is_sampling) - { - sampling::block_signals(_signals); - sampling::shutdown(); - } - - if(!bundles || !bundles_mutex) return; - if(_bundle && get_state() < omnitrace::State::Finalized) - { - std::unique_lock _lk{ *bundles_mutex }; - stop_bundle(*_bundle, _tid); - _bundle.reset(); - bundles->erase(_tid); - } - } }; - - auto _active = (get_state() == omnitrace::State::Active && bundles && bundles_mutex); - - if(_active && !_coverage) - { - _tid = threading::get_id(); - threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str()); - if(bundles && bundles_mutex) - { - std::unique_lock _lk{ *bundles_mutex }; - if(comp::roctracer::is_setup()) - _bundle = - bundles->emplace(_tid, std::make_shared("start_thread")) - .first->second; - } - if(_bundle) start_bundle(*_bundle); - get_cpu_cid_stack(threading::get_id(), m_parent_tid); - if(m_enable_sampling) - { - // initialize thread-local statics - (void) tim::get_unw_backtrace<12, 1, false>(); - _is_sampling = true; - push_enable_sampling_on_child_threads(false); - _signals = sampling::setup(); - pop_enable_sampling_on_child_threads(); - sampling::unblock_signals(); - } - } - - if(m_promise) m_promise->set_value(); - - // execute the original function - return m_routine(m_arg); -} - -void* -pthread_gotcha::wrapper::wrap(void* _arg) -{ - if(_arg == nullptr) return nullptr; - - // convert the argument - wrapper* _wrapper = static_cast(_arg); - - // execute the original function - return (*_wrapper)(); -} - void pthread_gotcha::configure() { - pthread_gotcha_t::get_initializer() = []() { - pthread_gotcha_t::template configure<0, int, pthread_t*, const pthread_attr_t*, - void* (*) (void*), void*>("pthread_create"); - }; + pthread_create_gotcha::configure(); + pthread_mutex_gotcha::configure(); } void pthread_gotcha::shutdown() { - if(is_shutdown) - { - if(*is_shutdown) return; - *is_shutdown = true; - } - - if(!bundles_mutex || !bundles) return; - - std::unique_lock _lk{ *bundles_mutex }; - unsigned long _ndangling = 0; - for(auto itr : *bundles) - { - if(itr.second) - { - stop_bundle(*itr.second, itr.first); - ++_ndangling; - } - itr.second.reset(); - } - - bundles->clear(); - - OMNITRACE_CONDITIONAL_BASIC_PRINT( - (get_verbose_env() >= 2 || get_debug_env()) && _ndangling > 0, - "[pthread_gotcha::shutdown] cleaned up %lu dangling bundles\n", _ndangling); + pthread_create_gotcha::shutdown(); + pthread_mutex_gotcha::shutdown(); } bool @@ -287,57 +125,16 @@ pthread_gotcha::sampling_on_child_threads() return _v; } -// pthread_create -int -pthread_gotcha::operator()(pthread_t* thread, const pthread_attr_t* attr, - void* (*start_routine)(void*), void* arg) const +void +pthread_gotcha::start() { - bundle_t _bundle{ "pthread_create" }; - auto _enable_sampling = sampling_enabled_on_child_threads(); - auto _coverage = (get_mode() == omnitrace::Mode::Coverage); - auto _active = (get_state() == omnitrace::State::Active); - int64_t _tid = (_active) ? threading::get_id() : 0; - - // ensure that cpu cid stack exists on the parent thread if active - if(!_coverage && _active) get_cpu_cid_stack(); - - if(!get_use_sampling() || !_enable_sampling) - { - auto* _obj = new wrapper(start_routine, arg, _enable_sampling, _tid, nullptr); - // create the thread - auto _ret = - ::pthread_create(thread, attr, &wrapper::wrap, static_cast(_obj)); - return _ret; - } - - // block the signals in entire process - OMNITRACE_DEBUG("blocking signals...\n"); - tim::sampling::block_signals({ SIGALRM, SIGPROF }, - tim::sampling::sigmask_scope::process); - - start_bundle(_bundle); - - // promise set by thread when signal handler is configured - auto _promise = std::promise{}; - auto _fut = _promise.get_future(); - auto* _wrap = new wrapper(start_routine, arg, _enable_sampling, _tid, &_promise); - - // create the thread - auto _ret = ::pthread_create(thread, attr, &wrapper::wrap, static_cast(_wrap)); - - // wait for thread to set promise - OMNITRACE_DEBUG("waiting for child to signal it is setup...\n"); - _fut.wait(); - - stop_bundle(_bundle, threading::get_id()); - - // unblock the signals in the entire process - OMNITRACE_DEBUG("unblocking signals...\n"); - tim::sampling::unblock_signals({ SIGALRM, SIGPROF }, - tim::sampling::sigmask_scope::process); - - OMNITRACE_DEBUG("returning success...\n"); - return _ret; + get_bundle()->start(); } +void +pthread_gotcha::stop() +{ + get_bundle()->stop(); + get_bundle().reset(); +} } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.hpp index e58e8a8494..0711812acd 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_gotcha.hpp @@ -33,24 +33,6 @@ namespace omnitrace { struct pthread_gotcha : tim::component::base { - struct wrapper - { - using routine_t = void* (*) (void*); - using promise_t = std::promise; - - wrapper(routine_t _routine, void* _arg, bool, int64_t, promise_t*); - void* operator()() const; - - static void* wrap(void* _arg); - - private: - bool m_enable_sampling = false; - int64_t m_parent_tid = 0; - routine_t m_routine = nullptr; - void* m_arg = nullptr; - promise_t* m_promise = nullptr; - }; - TIMEMORY_DEFAULT_OBJECT(pthread_gotcha) // string id for component @@ -72,13 +54,10 @@ struct pthread_gotcha : tim::component::base // make sure every newly created thead starts with this value static void set_sampling_on_all_future_threads(bool _v); - // pthread_create - int operator()(pthread_t* thread, const pthread_attr_t* attr, - void* (*start_routine)(void*), void* arg) const; + static void start(); + static void stop(); private: static bool& sampling_on_child_threads(); }; - -using pthread_gotcha_t = tim::component::gotcha<2, std::tuple<>, pthread_gotcha>; } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp new file mode 100644 index 0000000000..f400083241 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp @@ -0,0 +1,174 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/pthread_mutex_gotcha.hpp" +#include "library.hpp" +#include "library/components/pthread_gotcha.hpp" +#include "library/config.hpp" +#include "library/critical_trace.hpp" +#include "library/debug.hpp" +#include "library/runtime.hpp" +#include "library/sampling.hpp" +#include "library/thread_sampler.hpp" +#include "library/utility.hpp" +#include "timemory/backends/threading.hpp" + +#include +#include + +#include + +namespace omnitrace +{ +using Device = critical_trace::Device; +using Phase = critical_trace::Phase; + +pthread_mutex_gotcha::hash_array_t& +pthread_mutex_gotcha::get_hashes() +{ + // theoretically, this private function will NEVER be called until it + // is called by a gotcha wrapper, which means the tool ids should be + // fully populated. If that fails to be the case for some reason, + // we could see weird results. + static auto _v = []() { + const auto& _data = pthread_mutex_gotcha_t::get_gotcha_data(); + hash_array_t _init{}; + for(size_t i = 0; i < gotcha_capacity; ++i) + { + auto&& _id = _data.at(i).tool_id; + if(!_id.empty()) + _init.at(i) = critical_trace::add_hash_id(_id.c_str()); + else + { + OMNITRACE_VERBOSE( + 1, + "WARNING!!! pthread_mutex_gotcha tool id at index %zu was empty!\n", + i); + } + OMNITRACE_CI_FAIL( + _id.empty() || _init.at(i) == 0, + "pthread_mutex_gotcha tool id at index %zu has no hash value\n", i); + } + return _init; + }(); + return _v; +} + +void +pthread_mutex_gotcha::configure() +{ + pthread_mutex_gotcha_t::get_initializer() = []() { + if(config::get_trace_thread_locks()) + { + pthread_mutex_gotcha::validate(); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<1, int, pthread_mutex_t*>{ "pthread_mutex_unlock" }); + + pthread_mutex_gotcha_t::configure( + comp::gotcha_config<2, int, pthread_mutex_t*>{ "pthread_mutex_trylock" }); + } + }; +} + +void +pthread_mutex_gotcha::shutdown() +{} + +void +pthread_mutex_gotcha::validate() +{ + if(config::get_trace_thread_locks() && config::get_use_perfetto()) + { + OMNITRACE_PRINT_F("\n"); + OMNITRACE_PRINT_F("\n"); + OMNITRACE_PRINT_F("\n"); + OMNITRACE_PRINT_F( + "The overhead of all the mutex locking internally by perfetto is\n") + OMNITRACE_PRINT_F( + "so significant that all timing data is rendered meaningless.\n"); + OMNITRACE_PRINT_F( + "However, mutex locking is effectively non-existant in timemory.\n"); + OMNITRACE_PRINT_F("If you want to trace the mutex locking:\n") + OMNITRACE_PRINT_F(" OMNITRACE_USE_TIMEMORY=ON\n"); + OMNITRACE_PRINT_F(" OMNITRACE_USE_PERFETTO=OFF\n"); + OMNITRACE_PRINT_F("\n"); + OMNITRACE_PRINT_F("\n"); + OMNITRACE_PRINT_F("\n"); + OMNITRACE_FAIL_F("OMNITRACE_USE_PERFETTO and OMNITRACE_TRACE_THREAD_LOCKS cannot " + "both be enabled.\n"); + } +} + +int +pthread_mutex_gotcha::operator()(const gotcha_data_t& _data, + int (*_callee)(pthread_mutex_t*), + pthread_mutex_t* _mutex) +{ + if(is_disabled()) + { + if(!_callee) + { + OMNITRACE_PRINT("Warning! nullptr to %s\n", _data.tool_id.c_str()); + return EINVAL; + } + return (*_callee)(_mutex); + } + + uint64_t _cid = 0; + uint64_t _parent_cid = 0; + uint16_t _depth = 0; + int64_t _ts = 0; + + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + + if(get_use_critical_trace()) + { + std::tie(_cid, _parent_cid, _depth) = create_cpu_cid_entry(); + _ts = comp::wall_clock::record(); + } + + omnitrace_push_region(_data.tool_id.c_str()); + auto _ret = (*_callee)(_mutex); + omnitrace_pop_region(_data.tool_id.c_str()); + + if(get_use_critical_trace()) + { + add_critical_trace( + threading::get_id(), _cid, 0, _parent_cid, _ts, comp::wall_clock::record(), + reinterpret_cast(_mutex), get_hashes().at(_data.index), _depth); + } + + return _ret; +} + +bool +pthread_mutex_gotcha::is_disabled() +{ + return (omnitrace::get_state() != omnitrace::State::Active || + omnitrace::get_thread_state() != omnitrace::ThreadState::Enabled || + (get_use_sampling() && !pthread_gotcha::sampling_enabled_on_child_threads())); +} +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp new file mode 100644 index 0000000000..f6d8328953 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp @@ -0,0 +1,61 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "library/common.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include +#include +#include + +namespace omnitrace +{ +// this is used to wrap pthread_mutex() +struct pthread_mutex_gotcha : comp::base +{ + static constexpr size_t gotcha_capacity = 3; + using hash_array_t = std::array; + using gotcha_data_t = comp::gotcha_data; + + TIMEMORY_DEFAULT_OBJECT(pthread_mutex_gotcha) + + // string id for component + static std::string label() { return "pthread_mutex_gotcha"; } + + // generate the gotcha wrappers + static void configure(); + static void shutdown(); + static void validate(); + + int operator()(const gotcha_data_t&, int (*)(pthread_mutex_t*), pthread_mutex_t*); + +private: + static bool is_disabled(); + static hash_array_t& get_hashes(); +}; + +using pthread_mutex_gotcha_t = comp::gotcha; +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp index e242c7092d..006852fe8b 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/components/roctracer_callbacks.cpp @@ -25,11 +25,13 @@ #include "library/config.hpp" #include "library/critical_trace.hpp" #include "library/debug.hpp" +#include "library/runtime.hpp" #include "library/sampling.hpp" #include "library/thread_data.hpp" #include #include +#include #include #include @@ -171,6 +173,8 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_state() != State::Active || !trait::runtime_enabled::get()) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + (void) arg; const hsa_api_data_t* data = reinterpret_cast(callback_data); OMNITRACE_CONDITIONAL_PRINT_F( @@ -279,6 +283,8 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg) if(get_state() != State::Active || !trait::runtime_enabled::get()) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + sampling::block_signals(); static const char* copy_op_name = "hsa_async_copy"; @@ -359,6 +365,8 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_state() != State::Active || !trait::runtime_enabled::get()) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + using Device = critical_trace::Device; using Phase = critical_trace::Phase; @@ -388,12 +396,86 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* op_name, cid, data->correlation_id, (data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit"); - int64_t _ts = comp::wall_clock::record(); - auto _tid = threading::get_id(); - uint64_t _cid = 0; - uint64_t _parent_cid = 0; - uint16_t _depth = 0; - auto _corr_id = data->correlation_id; + int64_t _ts = comp::wall_clock::record(); + auto _tid = threading::get_id(); + uint64_t _cid = 0; + uint64_t _parent_cid = 0; + uint16_t _depth = 0; + uintptr_t _queue = 0; + auto _corr_id = data->correlation_id; + +#define OMNITRACE_HIP_API_QUEUE_CASE(API_FUNC, VARIABLE) \ + case HIP_API_ID_##API_FUNC: \ + _queue = reinterpret_cast(data->args.API_FUNC.VARIABLE); \ + break; + +#define OMNITRACE_HIP_API_QUEUE_CASE_ALT(API_FUNC, UNION, VARIABLE) \ + case HIP_API_ID_##API_FUNC: \ + _queue = reinterpret_cast(data->args.UNION.VARIABLE); \ + break; + + switch(cid) + { + OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchKernel, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipModuleLaunchKernel, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipHccModuleLaunchKernel, hStream) + OMNITRACE_HIP_API_QUEUE_CASE(hipLaunchCooperativeKernel, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipExtLaunchKernel, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipExtModuleLaunchKernel, hStream) + OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamCreateWithCUMask, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipExtStreamGetCUMask, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamSynchronize, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipConfigureCall, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipDrvMemcpy3DAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipEventRecord, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemPrefetchAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DFromArrayAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy2DToArrayAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpy3DAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoDAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyDtoHAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyFromSymbolAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyHtoDAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyParam2DAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyPeerAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyToSymbolAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemcpyWithStream, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemset2DAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemset3DAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD16Async, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD32Async, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipMemsetD8Async, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAddCallback, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamAttachMemAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamDestroy, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetFlags, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetPriority, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamQuery, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitEvent, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue32, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWaitValue64, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue32, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamWriteValue64, stream) +#if OMNITRACE_HIP_VERSION_MAJOR >= 4 && OMNITRACE_HIP_VERSION_MINOR >= 5 + OMNITRACE_HIP_API_QUEUE_CASE(hipGraphLaunch, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsMapResources, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipGraphicsUnmapResources, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipSignalExternalSemaphoresAsync, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamBeginCapture, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamEndCapture, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipWaitExternalSemaphoresAsync, stream) +#endif +#if OMNITRACE_HIP_VERSION_MAJOR >= 5 + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamIsCapturing, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamGetCaptureInfo_v2, stream) + OMNITRACE_HIP_API_QUEUE_CASE(hipStreamUpdateCaptureDependencies, stream) +#endif + default: break; + } if(data->phase == ACTIVITY_API_PHASE_ENTER) { @@ -401,12 +483,24 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* switch(cid) { case HIP_API_ID_hipLaunchKernel: - case HIP_API_ID_hipLaunchCooperativeKernel: { _name = hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address, data->args.hipLaunchKernel.stream); break; } + case HIP_API_ID_hipLaunchCooperativeKernel: + { + _name = + hipKernelNameRefByPtr(data->args.hipLaunchCooperativeKernel.f, + data->args.hipLaunchCooperativeKernel.stream); + if(!_name) + { + _name = + hipKernelNameRefByPtr(data->args.hipLaunchKernel.function_address, + data->args.hipLaunchKernel.stream); + } + break; + } case HIP_API_ID_hipHccModuleLaunchKernel: { _name = hipKernelNameRef(data->args.hipHccModuleLaunchKernel.f); @@ -468,7 +562,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_critical_trace() || get_use_rocm_smi()) { add_critical_trace( - _tid, _cid, _corr_id, _parent_cid, _ts, 0, + _tid, _cid, _corr_id, _parent_cid, _ts, 0, _queue, critical_trace::add_hash_id(op_name), _depth); } @@ -517,7 +611,7 @@ hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* if(get_use_critical_trace() || get_use_rocm_smi()) { add_critical_trace( - _tid, _cid, _corr_id, _parent_cid, _ts, _ts, + _tid, _cid, _corr_id, _parent_cid, _ts, _ts, _queue, critical_trace::add_hash_id(op_name), _depth); } } @@ -531,6 +625,8 @@ hip_activity_callback(const char* begin, const char* end, void*) if(get_state() != State::Active || !trait::runtime_enabled::get()) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + sampling::block_signals(); static thread_local auto _once = (threading::set_thread_name("omni.roctracer"), true); @@ -650,13 +746,13 @@ hip_activity_callback(const char* begin, const char* end, void*) auto _hash = critical_trace::add_hash_id(_name); uint16_t _prio = _laps + 1; // priority add_critical_trace( - _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, _hash, _depth + 1, _prio); + _tid, _cid, _corr_id, _cid, _beg_ns, _end_ns, record->queue_id, _hash, + _depth + 1, _prio); } if(_found && _name != nullptr && get_use_timemory()) { - auto _func = [_depth, _tid, _cid, _laps, _beg_ns, _end_ns, _corr_id, - _name]() { + auto _func = [_beg_ns, _end_ns, _name]() { roctracer_bundle_t _bundle{ _name, _scope }; _bundle.start() .store(std::plus{}, static_cast(_end_ns - _beg_ns)) @@ -725,6 +821,8 @@ extern "C" if(!config::settings_are_configured() && get_state() < State::Active) omnitrace_init_tooling_hidden(); + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + static auto _setup = [=]() { try { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp index 2e023b8726..2520a41f04 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -74,7 +75,7 @@ get_setting_name(std::string _v) { \ auto _ret = _config->insert( \ ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, INITIAL_VALUE, \ - std::set{ "custom", "omnitrace", "omnitrace-library", \ + std::set{ "custom", "omnitrace", "omnitrace_library", \ __VA_ARGS__ }); \ if(!_ret.second) \ OMNITRACE_PRINT("Warning! Duplicate setting: %s / %s\n", \ @@ -87,7 +88,7 @@ get_setting_name(std::string _v) { \ auto _ret = _config->insert( \ ENV_NAME, get_setting_name(ENV_NAME), DESCRIPTION, INITIAL_VALUE, \ - std::set{ "custom", "omnitrace", "omnitrace-library", \ + std::set{ "custom", "omnitrace", "omnitrace_library", \ __VA_ARGS__ }, \ std::vector{ CMD_LINE }); \ if(!_ret.second) \ @@ -147,28 +148,23 @@ configure_settings(bool _init) if(_omnitrace_debug) tim::set_env("TIMEMORY_DEBUG_SETTINGS", "1", 0); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_PERFETTO", "Enable perfetto backend", - _default_perfetto_v, "backend", "perfetto", - "instrumentation"); + _default_perfetto_v, "backend", "perfetto"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_TIMEMORY", "Enable timemory backend", !_config->get("OMNITRACE_USE_PERFETTO"), "backend", - "timemory", "instrumentation", "sampling"); + "timemory"); -#if defined(OMNITRACE_USE_ROCTRACER) && OMNITRACE_USE_ROCTRACER > 0 OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_ROCTRACER", "Enable ROCM tracing", true, "backend", "roctracer", "rocm"); -#endif -#if defined(OMNITRACE_USE_ROCM_SMI) && OMNITRACE_USE_ROCM_SMI > 0 OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_USE_ROCM_SMI", "Enable sampling GPU power, temp, utilization, and memory usage", true, "backend", - "rocm-smi", "rocm"); + "rocm_smi", "rocm"); OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_ROCM_SMI_DEVICES", "Devices to query when OMNITRACE_USE_ROCM_SMI=ON", "all", - "backend", "rocm-smi", "rocm"); -#endif + "backend", "rocm_smi", "rocm"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_SAMPLING", "Enable statistical sampling of call-stack", false, @@ -177,12 +173,12 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_THREAD_SAMPLING", "Enable a background thread which samples system metrics " "such as the CPU/GPU freq, power, etc.", - true, "backend", "sampling"); + true, "backend", "sampling", "thread_sampling"); OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_USE_PID", "Enable tagging filenames with process identifier (either MPI rank or pid)", true, - "io"); + "io", "filename"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_KOKKOSP", "Enable support for Kokkos Tools", false, "kokkos", @@ -192,11 +188,9 @@ configure_settings(bool _init) bool, "OMNITRACE_KOKKOS_KERNEL_LOGGER", "Enables kernel logging", false, "--omnitrace-kokkos-kernel-logger", "kokkos", "debugging"); -#if defined(OMNITRACE_USE_OMPT) && OMNITRACE_USE_OMPT > 0 OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_OMPT", "Enable support for OpenMP-Tools", false, "openmp", "ompt", "backend"); -#endif OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_USE_CODE_COVERAGE", "Enable support for code coverage", false, "coverage", @@ -205,25 +199,25 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_INSTRUMENTATION_INTERVAL", "Instrumentation only takes measurements once every N " "function calls (not statistical)", - 1, "instrumentation"); + 1, "instrumentation", "data_sampling"); OMNITRACE_CONFIG_SETTING( double, "OMNITRACE_SAMPLING_FREQ", "Number of software interrupts per second when OMNITTRACE_USE_SAMPLING=ON", 10.0, - "sampling"); + "sampling", "thread_sampling"); OMNITRACE_CONFIG_SETTING( double, "OMNITRACE_SAMPLING_DELAY", "Number of seconds to wait before the first sampling signal is delivered, " "increasing this value can fix deadlocks during init", - 0.5, "sampling"); + 0.5, "sampling", "thread_sampling"); OMNITRACE_CONFIG_SETTING( std::string, "OMNITRACE_SAMPLING_CPUS", "CPUs to collect frequency information for. Values should be separated by commas " "and can be explicit or ranges, e.g. 0,1,5-8. An empty value implies 'all' and " "'none' suppresses all CPU frequency sampling", - "", "sampling"); + "", "thread_sampling"); auto _backend = tim::get_env_choice( "OMNITRACE_BACKEND", @@ -235,30 +229,36 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_BACKEND", "Specify the perfetto backend to activate. Options are: " "'inprocess', 'system', or 'all'", - _backend, "perfetto", "instrumentation", "sampling"); + _backend, "perfetto"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE", "Enable generation of the critical trace", false, "backend", - "critical-trace", "instrumentation"); + "critical_trace"); + + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_TRACE_THREAD_LOCKS", + "Enable tracking calls to pthread_mutex_lock, " + "pthread_mutex_unlock, pthread_mutex_trylock", + false, "backend", "parallelism", "gotcha"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_FLAT_SAMPLING", "Ignore hierarchy in all statistical sampling entries", - _config->get_flat_profile(), "sampling", "data_layout"); + _config->get_flat_profile(), "timemory", "sampling", + "data_layout"); OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_TIMELINE_SAMPLING", "Create unique entries for every sample when statistical sampling is enabled", - _config->get_timeline_profile(), "sampling", "data_layout"); + _config->get_timeline_profile(), "timemory", "sampling", "data_layout"); OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_ROCTRACER_FLAT_PROFILE", "Ignore hierarchy in all kernels entries with timemory backend", - _config->get_flat_profile(), "roctracer", "data_layout", "rocm"); + _config->get_flat_profile(), "timemory", "roctracer", "data_layout", "rocm"); OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_ROCTRACER_TIMELINE_PROFILE", "Create unique entries for every kernel with timemory backend", - _config->get_timeline_profile(), "roctracer", "data_layout", "rocm"); + _config->get_timeline_profile(), "timemory", "roctracer", "data_layout", "rocm"); OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HSA_ACTIVITY", "Enable HSA activity tracing support", false, "roctracer", @@ -273,12 +273,12 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE_DEBUG", "Enable debugging for critical trace", _omnitrace_debug, - "debugging", "critical-trace"); + "debugging", "critical_trace"); OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES", "Include names in serialization of critical trace (mainly for debugging)", - _omnitrace_debug, "debugging", "critical-trace"); + _omnitrace_debug, "debugging", "critical_trace"); OMNITRACE_CONFIG_SETTING(size_t, "OMNITRACE_SHMEM_SIZE_HINT_KB", "Hint for shared-memory buffer size in perfetto (in KB)", @@ -288,37 +288,36 @@ configure_settings(bool _init) "Size of perfetto buffer (in KB)", 1024000, "perfetto", "data"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_COMBINE_PERFETTO_TRACES", + "Combine Perfetto traces", true, "perfetto", "data"); + OMNITRACE_CONFIG_SETTING(int64_t, "OMNITRACE_CRITICAL_TRACE_COUNT", "Number of critical trace to export (0 == all)", 0, "data", - "critical-trace"); + "critical_trace"); OMNITRACE_CONFIG_SETTING(uint64_t, "OMNITRACE_CRITICAL_TRACE_BUFFER_COUNT", "Number of critical trace records to store in thread-local " "memory before submitting to shared buffer", - 2000, "data", "critical-trace"); + 2000, "data", "critical_trace"); OMNITRACE_CONFIG_SETTING( uint64_t, "OMNITRACE_CRITICAL_TRACE_NUM_THREADS", "Number of threads to use when generating the critical trace", std::min(8, std::thread::hardware_concurrency()), "parallelism", - "critical-trace"); + "critical_trace"); OMNITRACE_CONFIG_SETTING( int64_t, "OMNITRACE_CRITICAL_TRACE_PER_ROW", "How many critical traces per row in perfetto (0 == all in one row)", 0, "io", - "critical-trace"); + "critical_trace"); OMNITRACE_CONFIG_SETTING( std::string, "OMNITRACE_TIMEMORY_COMPONENTS", "List of components to collect via timemory (see timemory-avail)", "wall_clock", - "timemory", "component", "instrumentation"); + "timemory", "component"); OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_OUTPUT_FILE", "Perfetto filename", - "", "perfetto", "io"); - - OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_SETTINGS_DESC", - "Provide descriptions when printing settings", false, - "debugging"); + "", "perfetto", "io", "filename"); _config->get_flamegraph_output() = false; _config->get_cout_output() = false; @@ -334,7 +333,7 @@ configure_settings(bool _init) _config->get_max_thread_bookmarks() = 1; _config->get_timing_units() = "sec"; _config->get_memory_units() = "MB"; - _config->get_papi_events() = "PAPI_TOT_CYC, PAPI_TOT_INS"; + _config->get_papi_events() = "PAPI_TOT_CYC"; // settings native to timemory but critically and/or extensively used by omnitrace auto _add_omnitrace_category = [](auto itr) { @@ -342,7 +341,7 @@ configure_settings(bool _init) { auto _categories = itr->second->get_categories(); _categories.emplace("omnitrace"); - _categories.emplace("omnitrace-library"); + _categories.emplace("omnitrace_library"); itr->second->set_categories(_categories); } }; @@ -396,8 +395,7 @@ configure_settings(bool _init) _config->read(itr); } - _config->get_global_components() = - _config->get("OMNITRACE_TIMEMORY_COMPONENTS"); + settings::suppress_config() = true; // always initialize timemory because gotcha wrappers are always used auto _cmd = tim::read_command_line(process::get_id()); @@ -408,9 +406,6 @@ configure_settings(bool _init) if(_pos < _exe.length() - 1) _exe = _exe.substr(_pos + 1); get_exe_name() = _exe; - scope::get_fields()[scope::flat::value] = tim::settings::flat_profile(); - scope::get_fields()[scope::timeline::value] = tim::settings::timeline_profile(); - bool _found_sep = false; for(const auto& itr : _cmd) { @@ -425,8 +420,13 @@ configure_settings(bool _init) tim::timemory_init(_cmd, _parser, "omnitrace-"); } + _config->get_global_components() = + _config->get("OMNITRACE_TIMEMORY_COMPONENTS"); + + scope::get_fields()[scope::flat::value] = _config->get_flat_profile(); + scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile(); + settings::suppress_parsing() = true; - settings::suppress_config() = true; settings::use_output_suffix() = _config->get("OMNITRACE_USE_PID"); #if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) if(tim::dmp::is_initialized()) settings::default_process_suffix() = tim::dmp::rank(); @@ -485,6 +485,60 @@ configure_settings(bool _init) _old_handler = signal(_dyninst_trampoline_signal, static_cast(_trampoline_handler)); } + + auto _handle_use_option = [](const std::string& _opt, const std::string& _category) { + if(!_config->get(_opt)) + { + auto _disabled = _config->disable_category(_category); + _config->enable(_opt); + for(auto&& itr : _disabled) + OMNITRACE_BASIC_VERBOSE(3, "[%s=OFF] disabled option :: '%s'\n", + _opt.c_str(), itr.c_str()); + return false; + } + auto _enabled = _config->enable_category(_category); + for(auto&& itr : _enabled) + OMNITRACE_BASIC_VERBOSE(3, "[%s=ON] enabled option :: '%s'\n", + _opt.c_str(), itr.c_str()); + return true; + }; + + _handle_use_option("OMNITRACE_USE_SAMPLING", "sampling"); + _handle_use_option("OMNITRACE_USE_THREAD_SAMPLING", "thread_sampling"); + _handle_use_option("OMNITRACE_USE_KOKKOSP", "kokkos"); + _handle_use_option("OMNITRACE_USE_PERFETTO", "perfetto"); + _handle_use_option("OMNITRACE_USE_TIMEMORY", "timemory"); + _handle_use_option("OMNITRACE_CRITICAL_TRACE", "critical_trace"); + _handle_use_option("OMNITRACE_USE_OMPT", "ompt"); + _handle_use_option("OMNITRACE_USE_ROCM_SMI", "rocm_smi"); + _handle_use_option("OMNITRACE_USE_ROCTRACER", "roctracer"); + +#if !defined(OMNITRACE_USE_ROCTRACER) || OMNITRACE_USE_ROCTRACER == 0 + _config->disable_category("roctracer"); +#endif + +#if !defined(OMNITRACE_USE_ROCM_SMI) || OMNITRACE_USE_ROCM_SMI == 0 + _config->disable_category("rocm_smi"); +#endif + +#if defined(OMNITRACE_USE_OMPT) || OMNITRACE_USE_OMPT == 0 + _config->disable_category("ompt"); +#endif + + // user bundle components + _config->disable_category("throttle"); + _config->disable("components"); + _config->disable("global_components"); + _config->disable("ompt_components"); + _config->disable("kokkos_components"); + _config->disable("trace_components"); + _config->disable("profiler_components"); + _config->disable("destructor_report"); + _config->disable("stack_clearing"); + +#if !defined(TIMEMORY_USE_MPI) || TIMEMORY_USE_MPI == 0 + _config->disable("OMNITRACE_COMBINE_PERFETTO_TRACES"); +#endif } void @@ -512,7 +566,8 @@ print_settings( std::stringstream _os{}; - bool _md = tim::get_env("OMNITRACE_SETTINGS_DESC_MARKDOWN", false); + bool _print_desc = get_debug() || tim::get_env("OMNITRACE_SETTINGS_DESC", false); + bool _md = tim::get_env("OMNITRACE_SETTINGS_DESC_MARKDOWN", false); constexpr size_t nfields = 3; using str_array_t = std::array; @@ -521,6 +576,7 @@ print_settings( _widths.fill(0); for(const auto& itr : *get_config()) { + if(!itr.second->get_enabled()) continue; if(_filter(itr.first, itr.second->get_categories())) { auto _disp = itr.second->get_display(std::ios::boolalpha); @@ -549,8 +605,6 @@ print_settings( return lhs.at(0) < rhs.at(0); }); - bool _print_desc = get_debug() || get_config()->get("OMNITRACE_SETTINGS_DESC"); - auto tot_width = std::accumulate(_widths.begin(), _widths.end(), 0); if(!_print_desc) tot_width -= _widths.back() + 4; @@ -945,6 +999,17 @@ get_perfetto_buffer_size() return static_cast&>(*_v->second).get(); } +bool +get_perfetto_combined_traces() +{ +#if defined(TIMEMORY_USE_MPI) && TIMEMORY_USE_MPI > 0 + static auto _v = get_config()->find("OMNITRACE_COMBINE_PERFETTO_TRACES"); + return static_cast&>(*_v->second).get(); +#else + return false; +#endif +} + uint64_t get_critical_trace_update_freq() { @@ -1062,6 +1127,13 @@ get_rocm_smi_devices() #endif } +bool +get_trace_thread_locks() +{ + static auto _v = get_config()->find("OMNITRACE_TRACE_THREAD_LOCKS"); + return static_cast&>(*_v->second).get(); +} + bool get_debug_tid() { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp index 2a82a29b5f..6a1136bee5 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/config.hpp @@ -199,6 +199,9 @@ get_perfetto_shmem_size_hint(); size_t get_perfetto_buffer_size(); +bool +get_perfetto_combined_traces(); + uint64_t get_critical_trace_update_freq(); @@ -238,6 +241,9 @@ get_rocm_smi_devices(); int64_t get_critical_trace_per_row(); + +bool +get_trace_thread_locks(); } // namespace config // diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp index a117e08469..710d850fc0 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.cpp @@ -26,6 +26,7 @@ #include "library/defines.hpp" #include "library/perfetto.hpp" #include "library/ptl.hpp" +#include "library/runtime.hpp" #include "library/thread_data.hpp" #include @@ -109,9 +110,9 @@ get_combined_hash(Arg0&& _zero, Arg1&& _one, Args&&... _args) bool entry::operator==(const entry& rhs) const { - return (device == rhs.device && depth == rhs.depth && priority == rhs.priority && - tid == rhs.tid && cpu_cid == rhs.cpu_cid && gpu_cid == rhs.gpu_cid && - hash == rhs.hash); + return std::tie(device, depth, priority, tid, cpu_cid, gpu_cid, queue_id, hash) == + std::tie(rhs.device, rhs.depth, rhs.priority, rhs.tid, rhs.cpu_cid, + rhs.gpu_cid, rhs.queue_id, rhs.hash); } bool @@ -132,6 +133,10 @@ entry::operator<(const entry& rhs) const auto _par_eq = (parent_cid == rhs.parent_cid); if(!_par_eq) return (parent_cid < rhs.parent_cid); + // sort by queue ids + auto _queue_eq = (queue_id == rhs.queue_id); + if(!_queue_eq) return (queue_id < rhs.queue_id); + // sort by priority auto _prio_eq = (priority == rhs.priority); if(!_prio_eq) return (priority < rhs.priority); @@ -143,8 +148,8 @@ entry::operator<(const entry& rhs) const bool entry::operator>(const entry& rhs) const { - return (!(*this < rhs) && begin_ns != rhs.begin_ns && cpu_cid != rhs.cpu_cid && - gpu_cid != rhs.gpu_cid); + return (!(*this < rhs) && std::tie(begin_ns, cpu_cid, gpu_cid) != + std::tie(rhs.begin_ns, rhs.cpu_cid, rhs.gpu_cid)); } entry& @@ -171,7 +176,7 @@ size_t entry::get_hash() const { return get_combined_hash(hash, static_cast(device), static_cast(phase), - tid, cpu_cid, gpu_cid, priority); + tid, cpu_cid, gpu_cid, queue_id, priority); } int64_t @@ -293,6 +298,7 @@ entry::write(std::ostream& _os) const _os << " parent: " << static_cast(parent_cid); _os << ", tid: " << tid; _os << ", depth: " << depth; + _os << ", queue: " << queue_id; _os << ", priority: " << priority; if(phase == Phase::DELTA) { @@ -423,6 +429,7 @@ template <> void call_chain::generate_perfetto(std::set& _used) const { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); static std::set _static_strings{}; static std::mutex _static_mutex{}; for(const auto& itr : *this) @@ -444,6 +451,7 @@ template <> void call_chain::generate_perfetto(std::set& _used) const { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); static std::set _static_strings{}; static std::mutex _static_mutex{}; for(const auto& itr : *this) @@ -465,6 +473,7 @@ template <> void call_chain::generate_perfetto(std::set& _used) const { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); static std::set _static_strings{}; static std::mutex _static_mutex{}; for(const auto& itr : *this) @@ -509,6 +518,7 @@ get(int64_t _tid) void add_hash_id(const hash_ids& _labels) { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; if(!tasking::critical_trace::get_task_group().pool()) return; tasking::critical_trace::get_task_group().exec([_labels]() { @@ -539,6 +549,7 @@ void update(int64_t _tid) { if(!get_use_critical_trace() && !get_use_rocm_smi()) return; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; if(!tasking::critical_trace::get_task_group().pool()) return; call_chain _data{}; @@ -550,6 +561,7 @@ void compute(int64_t _tid) { update(_tid); + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; if(!tasking::critical_trace::get_task_group().pool()) return; tasking::critical_trace::get_task_group().exec(compute_critical_trace); @@ -723,6 +735,7 @@ combine_critical_path(call_chain& _targ, call_chain _chain) _combined.emplace_back(itr); std::sort(_combined.begin(), _combined.end()); + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); std::unique_lock _lk{ complete_call_mutex }; for(auto& itr : _combined) _targ.emplace_back(itr); @@ -751,6 +764,8 @@ update_critical_path(call_chain _chain, int64_t) void compute_critical_trace() { + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + static bool _computed = false; std::unique_lock _lk{ complete_call_mutex }; @@ -825,6 +840,7 @@ get_entries(int64_t _ts, const std::function& _eval) } *_targ = _v; }; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); std::unique_lock _lk{ tasking::critical_trace::get_mutex() }; size_t _n = 0; std::vector> _v{}; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp index 9415290bb2..dbde43937e 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/critical_trace.hpp @@ -65,17 +65,18 @@ struct entry entry& operator=(const entry&) = default; entry& operator=(entry&&) noexcept = default; - uint16_t priority = 0; // priority value (for sorting) - Device device = Device::CPU; // which device it executed on - Phase phase = Phase::NONE; // start / stop / unspecified - uint16_t depth = 0; // call-stack depth - int64_t tid = 0; // thread id it was registered on - uint64_t cpu_cid = 0; // CPU correlation id - uint64_t gpu_cid = 0; // GPU correlation id - uint64_t parent_cid = 0; // parent CPU correlation id - int64_t begin_ns = 0; // timestamp of start - int64_t end_ns = 0; // timestamp of end - size_t hash = 0; // hash for name + uint16_t priority = 0; /// priority value (for sorting) + Device device = Device::CPU; /// which device it executed on + Phase phase = Phase::NONE; /// start / stop / unspecified + uint16_t depth = 0; /// call-stack depth + int64_t tid = 0; /// thread id it was registered on + uint64_t cpu_cid = 0; /// CPU correlation id + uint64_t gpu_cid = 0; /// GPU correlation id + uint64_t parent_cid = 0; /// parent CPU correlation id + int64_t begin_ns = 0; /// timestamp of start + int64_t end_ns = 0; /// timestamp of end + uintptr_t queue_id = 0; /// stream id (GPU) or mutex id + size_t hash = 0; /// hash for name bool operator==(const entry& rhs) const; bool operator!=(const entry& rhs) const { return !(*this == rhs); } @@ -127,7 +128,8 @@ entry::save(Archive& ar, unsigned int) const cereal::make_nvp("tid", tid), cereal::make_nvp("cpu_cid", cpu_cid), cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid), cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns), - cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name), + cereal::make_nvp("queue", queue_id), cereal::make_nvp("hash", hash), + cereal::make_nvp("name", _name), cereal::make_nvp("demangled_name", tim::demangle(_name))); } @@ -144,6 +146,7 @@ entry::load(Archive& ar, unsigned int) cereal::make_nvp("gpu_cid", gpu_cid), cereal::make_nvp("parent_cid", parent_cid), cereal::make_nvp("begin_ns", begin_ns), cereal::make_nvp("end_ns", end_ns), cereal::make_nvp("hash", hash), cereal::make_nvp("name", _name), + cereal::make_nvp("queue", queue_id), cereal::make_nvp("demangled_name", _demangled_name)); tim::get_hash_ids()->emplace(hash, _name); diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.cpp new file mode 100644 index 0000000000..9af50b877f --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.cpp @@ -0,0 +1,50 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/debug.hpp" +#include "library/runtime.hpp" +#include "library/state.hpp" + +namespace omnitrace +{ +namespace debug +{ +lock::lock() +: m_lk{ tim::type_mutex(), std::defer_lock } +{ + if(!m_lk.owns_lock()) + { + push_thread_state(ThreadState::Internal); + m_lk.lock(); + } +} + +lock::~lock() +{ + if(m_lk.owns_lock()) + { + m_lk.unlock(); + pop_thread_state(); + } +} +} // namespace debug +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.hpp index bb7a25ada3..9195ceb868 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/debug.hpp @@ -27,6 +27,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -66,6 +70,25 @@ get_critical_trace_debug(); namespace debug { +struct lock +{ + lock(); + ~lock(); + +private: + tim::auto_lock_t m_lk; +}; +// +template +bool +is_bracket(Arg&& _arg, Args&&...) +{ + if constexpr(::tim::concepts::is_string_type::value) + return (::std::string_view{ _arg }.empty()) ? false : _arg[0] == '['; + else + return false; +} +// namespace { template @@ -79,10 +102,6 @@ get_chars(T&& _c, std::index_sequence) } // namespace debug } // namespace omnitrace -#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y -#define OMNITRACE_LINESTR TIMEMORY_STRINGIZE(__LINE__) -#define OMNITRACE_VARIABLE(LABEL) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, LABEL) - #if !defined(OMNITRACE_DEBUG_BUFFER_LEN) # define OMNITRACE_DEBUG_BUFFER_LEN 2048 #endif @@ -129,14 +148,17 @@ get_chars(T&& _c, std::index_sequence) .data() #endif +//--------------------------------------------------------------------------------------// + #define OMNITRACE_CONDITIONAL_PRINT(COND, ...) \ if((COND) && ::omnitrace::config::get_debug_tid() && \ ::omnitrace::config::get_debug_pid()) \ { \ fflush(stderr); \ - tim::auto_lock_t _lk{ tim::type_mutex() }; \ - fprintf(stderr, "[omnitrace][%i][%li] ", OMNITRACE_PROCESS_IDENTIFIER, \ - OMNITRACE_THREAD_IDENTIFIER); \ + ::omnitrace::debug::lock _lk{}; \ + fprintf(stderr, "[omnitrace][%i][%li]%s", OMNITRACE_PROCESS_IDENTIFIER, \ + OMNITRACE_THREAD_IDENTIFIER, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } @@ -146,8 +168,9 @@ get_chars(T&& _c, std::index_sequence) ::omnitrace::config::get_debug_pid()) \ { \ fflush(stderr); \ - tim::auto_lock_t _lk{ tim::type_mutex() }; \ - fprintf(stderr, "[omnitrace] "); \ + ::omnitrace::debug::lock _lk{}; \ + fprintf(stderr, "[omnitrace]%s", \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } @@ -157,9 +180,10 @@ get_chars(T&& _c, std::index_sequence) ::omnitrace::config::get_debug_pid()) \ { \ fflush(stderr); \ - tim::auto_lock_t _lk{ tim::type_mutex() }; \ - fprintf(stderr, "[omnitrace][%i][%li][%s] ", OMNITRACE_PROCESS_IDENTIFIER, \ - OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION); \ + ::omnitrace::debug::lock _lk{}; \ + fprintf(stderr, "[omnitrace][%i][%li][%s]%s", OMNITRACE_PROCESS_IDENTIFIER, \ + OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } @@ -169,19 +193,23 @@ get_chars(T&& _c, std::index_sequence) ::omnitrace::config::get_debug_pid()) \ { \ fflush(stderr); \ - tim::auto_lock_t _lk{ tim::type_mutex() }; \ - fprintf(stderr, "[omnitrace][%s] ", OMNITRACE_FUNCTION); \ + ::omnitrace::debug::lock _lk{}; \ + fprintf(stderr, "[omnitrace][%s]%s", OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } +//--------------------------------------------------------------------------------------// + #define OMNITRACE_CONDITIONAL_THROW(COND, ...) \ if(COND) \ { \ char _msg_buffer[OMNITRACE_DEBUG_BUFFER_LEN]; \ - snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%i][%li][%s] ", \ + snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%i][%li][%s]%s", \ OMNITRACE_PROCESS_IDENTIFIER, OMNITRACE_THREAD_IDENTIFIER, \ - OMNITRACE_FUNCTION); \ + OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ auto len = strlen(_msg_buffer); \ snprintf(_msg_buffer + len, OMNITRACE_DEBUG_BUFFER_LEN - len, __VA_ARGS__); \ throw std::runtime_error(_msg_buffer); \ @@ -191,8 +219,9 @@ get_chars(T&& _c, std::index_sequence) if(COND) \ { \ char _msg_buffer[OMNITRACE_DEBUG_BUFFER_LEN]; \ - snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%s] ", \ - OMNITRACE_FUNCTION); \ + snprintf(_msg_buffer, OMNITRACE_DEBUG_BUFFER_LEN, "[omnitrace][%s]%s", \ + OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ auto len = strlen(_msg_buffer); \ snprintf(_msg_buffer + len, OMNITRACE_DEBUG_BUFFER_LEN - len, __VA_ARGS__); \ throw std::runtime_error(_msg_buffer); \ @@ -206,8 +235,71 @@ get_chars(T&& _c, std::index_sequence) OMNITRACE_CONDITIONAL_BASIC_THROW( \ ::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__) -#define OMNITRACE_STRINGIZE(...) #__VA_ARGS__ -#define OMNITRACE_ESC(...) __VA_ARGS__ +//--------------------------------------------------------------------------------------// + +#define OMNITRACE_CONDITIONAL_FAIL(COND, ...) \ + if(COND) \ + { \ + fflush(stderr); \ + fprintf(stderr, "[omnitrace][%i][%li]%s", OMNITRACE_PROCESS_IDENTIFIER, \ + OMNITRACE_THREAD_IDENTIFIER, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ + fprintf(stderr, __VA_ARGS__); \ + ::omnitrace::set_state(::omnitrace::State::Finalized); \ + ::tim::disable_signal_detection(); \ + ::tim::print_demangled_backtrace<64>(); \ + ::std::exit(EXIT_FAILURE); \ + } + +#define OMNITRACE_CONDITIONAL_BASIC_FAIL(COND, ...) \ + if(COND) \ + { \ + fflush(stderr); \ + fprintf(stderr, "[omnitrace]%s", \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ + fprintf(stderr, __VA_ARGS__); \ + ::omnitrace::set_state(::omnitrace::State::Finalized); \ + ::tim::disable_signal_detection(); \ + ::tim::print_demangled_backtrace<64>(); \ + ::std::exit(EXIT_FAILURE); \ + } + +#define OMNITRACE_CONDITIONAL_FAIL_F(COND, ...) \ + if(COND) \ + { \ + fflush(stderr); \ + fprintf(stderr, "[omnitrace][%i][%li][%s]%s", OMNITRACE_PROCESS_IDENTIFIER, \ + OMNITRACE_THREAD_IDENTIFIER, OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ + fprintf(stderr, __VA_ARGS__); \ + ::omnitrace::set_state(::omnitrace::State::Finalized); \ + ::tim::disable_signal_detection(); \ + ::tim::print_demangled_backtrace<64>(); \ + ::std::exit(EXIT_FAILURE); \ + } + +#define OMNITRACE_CONDITIONAL_BASIC_FAIL_F(COND, ...) \ + if(COND) \ + { \ + fflush(stderr); \ + fprintf(stderr, "[omnitrace][%s]%s", OMNITRACE_FUNCTION, \ + ::omnitrace::debug::is_bracket(__VA_ARGS__) ? "" : " "); \ + fprintf(stderr, __VA_ARGS__); \ + ::omnitrace::set_state(::omnitrace::State::Finalized); \ + ::tim::disable_signal_detection(); \ + ::tim::print_demangled_backtrace<64>(); \ + ::std::exit(EXIT_FAILURE); \ + } + +#define OMNITRACE_CI_FAIL(COND, ...) \ + OMNITRACE_CONDITIONAL_FAIL(::omnitrace::get_is_continuous_integration() && (COND), \ + __VA_ARGS__) + +#define OMNITRACE_CI_BASIC_FAIL(COND, ...) \ + OMNITRACE_CONDITIONAL_BASIC_FAIL( \ + ::omnitrace::get_is_continuous_integration() && (COND), __VA_ARGS__) + +//--------------------------------------------------------------------------------------// //--------------------------------------------------------------------------------------// // @@ -287,6 +379,20 @@ get_chars(T&& _c, std::index_sequence) #define OMNITRACE_BASIC_THROW(...) OMNITRACE_CONDITIONAL_BASIC_THROW(true, __VA_ARGS__) +//--------------------------------------------------------------------------------------// +// +// Fail macros +// +//--------------------------------------------------------------------------------------// + +#define OMNITRACE_FAIL(...) OMNITRACE_CONDITIONAL_FAIL(true, __VA_ARGS__) + +#define OMNITRACE_FAIL_F(...) OMNITRACE_CONDITIONAL_FAIL_F(true, __VA_ARGS__) + +#define OMNITRACE_BASIC_FAIL(...) OMNITRACE_CONDITIONAL_BASIC_FAIL(true, __VA_ARGS__) + +#define OMNITRACE_BASIC_FAIL_F(...) OMNITRACE_CONDITIONAL_BASIC_FAIL_F(true, __VA_ARGS__) + #include namespace std diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/defines.hpp.in b/projects/rocprofiler-systems/source/lib/omnitrace/library/defines.hpp.in index 2576badfde..f77651e88f 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/defines.hpp.in +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/defines.hpp.in @@ -67,3 +67,10 @@ # endif # include #endif + +#define OMNITRACE_STRINGIZE(X) OMNITRACE_STRINGIZE2(X) +#define OMNITRACE_STRINGIZE2(X) #X +#define OMNITRACE_VAR_NAME_COMBINE(X, Y) X##Y +#define OMNITRACE_VARIABLE(Y) OMNITRACE_VAR_NAME_COMBINE(_omni_var_, Y) +#define OMNITRACE_LINESTR OMNITRACE_STRINGIZE(__LINE__) +#define OMNITRACE_ESC(...) __VA_ARGS__ diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/ptl.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/ptl.cpp index 26dedca112..551d3284a3 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/ptl.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/ptl.cpp @@ -24,9 +24,11 @@ #include "library/config.hpp" #include "library/debug.hpp" #include "library/defines.hpp" +#include "library/runtime.hpp" #include "library/sampling.hpp" #include + #include namespace omnitrace @@ -42,9 +44,10 @@ auto _thread_pool_cfg = []() { _v.use_tbb = false; _v.verbose = -1; _v.initializer = []() { + set_thread_state(ThreadState::Internal); sampling::block_signals(); threading::set_thread_name( - TIMEMORY_JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str()); + JOIN('.', "ptl", PTL::Threading::GetThreadId()).c_str()); }; _v.finalizer = []() {}; _v.priority = 5; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp index 9cb8ae9952..12c63bc080 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.cpp @@ -26,6 +26,7 @@ #include "library/debug.hpp" #include "library/defines.hpp" #include "library/thread_data.hpp" +#include "library/utility.hpp" #include #include @@ -98,6 +99,8 @@ create_cpu_cid_entry(int64_t _tid) { using tim::auto_lock_t; + OMNITRACE_SCOPED_THREAD_STATE(ThreadState::Internal); + // unique lock for _tid auto& _mtx = get_cpu_cid_stack_lock(_tid); auto_lock_t _lk{ _mtx, std::defer_lock }; @@ -166,4 +169,50 @@ get_gotcha_bundle() "omnitrace", quirk::config{})); return _v; } + +namespace +{ +auto& +get_thread_state_history(int64_t _idx = utility::get_thread_index()) +{ + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(32); }); + + return _v.at(_idx); +} +} // namespace + +ThreadState& +get_thread_state() +{ + static thread_local ThreadState _v{ ThreadState::Enabled }; + return _v; +} + +ThreadState +set_thread_state(ThreadState _n) +{ + auto _o = get_thread_state(); + get_thread_state() = _n; + return _o; +} + +ThreadState +push_thread_state(ThreadState _v) +{ + return get_thread_state_history().emplace_back(set_thread_state(_v)); +} + +ThreadState +pop_thread_state() +{ + auto& _hist = get_thread_state_history(); + if(!_hist.empty()) + { + set_thread_state(_hist.back()); + _hist.pop_back(); + } + return get_thread_state(); +} + } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp index 9c0133e041..d90f12daf6 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/runtime.hpp @@ -46,7 +46,7 @@ namespace omnitrace // bundle of components around omnitrace_init and omnitrace_finalize using main_bundle_t = tim::lightweight_tuple; + comp::cpu_util, pthread_gotcha>; using gotcha_bundle_t = tim::lightweight_tuple; @@ -89,4 +89,27 @@ get_cpu_cid_entry(uint64_t _cid, int64_t _tid = threading::get_id()); tim::mutex_t& get_cpu_cid_stack_lock(int64_t _tid = threading::get_id()); +ThreadState& +get_thread_state(); + +/// returns old state +ThreadState set_thread_state(ThreadState); + +ThreadState push_thread_state(ThreadState); + +ThreadState +pop_thread_state(); + +struct scoped_thread_state +{ + scoped_thread_state(ThreadState _v) { push_thread_state(_v); } + ~scoped_thread_state() { pop_thread_state(); } +}; } // namespace omnitrace + +#define OMNITRACE_SCOPED_THREAD_STATE(STATE) \ + ::omnitrace::scoped_thread_state OMNITRACE_VARIABLE( \ + OMNITRACE_VAR_NAME_COMBINE(scoped_thread_state_, __LINE__)) \ + { \ + ::omnitrace::STATE \ + } diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/state.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/state.cpp index 067d93bea4..c6cfea3b4a 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/state.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/state.cpp @@ -40,6 +40,19 @@ to_string(omnitrace::State _v) return {}; } +std::string +to_string(omnitrace::ThreadState _v) +{ + switch(_v) + { + case omnitrace::ThreadState::Enabled: return "Enabled"; + case omnitrace::ThreadState::Internal: return "Internal"; + case omnitrace::ThreadState::Disabled: return "Disabled"; + case omnitrace::ThreadState::Completed: return "Completed"; + } + return {}; +} + std::string to_string(omnitrace::Mode _v) { diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/state.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/state.hpp index ed09058db0..1cc74bfb36 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/state.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/state.hpp @@ -36,6 +36,15 @@ enum class State : unsigned short Finalized }; +// used for specifying the state of omnitrace +enum class ThreadState : unsigned short +{ + Enabled = 0, + Internal, + Disabled, + Completed, +}; + enum class Mode : unsigned short { Trace = 0, @@ -51,6 +60,9 @@ namespace std std::string to_string(omnitrace::State _v); +std::string +to_string(omnitrace::ThreadState _v); + std::string to_string(omnitrace::Mode _v); } // namespace std diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.cpp index 802f183ed7..36457eedc8 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.cpp @@ -21,6 +21,10 @@ // SOFTWARE. #include "library/thread_data.hpp" +#include "library/components/pthread_create_gotcha.hpp" +#include "library/utility.hpp" + +#include namespace omnitrace { @@ -30,4 +34,15 @@ instrumentation_bundles::instances() static auto _v = instance_array_t{}; return _v; } + +void +thread_deleter::operator()() const +{ + pthread_create_gotcha::shutdown(threading::get_id()); + set_thread_state(ThreadState::Completed); + if(get_state() != State::Finalized && threading::get_id() == 0) + omnitrace_finalize_hidden(); +} + +template struct thread_deleter; } // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.hpp index dc40f5bac2..aca30a01c9 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.hpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_data.hpp @@ -26,6 +26,7 @@ #include "library/common.hpp" #include "library/config.hpp" #include "library/defines.hpp" +#include "library/state.hpp" #include "library/timemory.hpp" #include @@ -40,6 +41,8 @@ namespace omnitrace { +ThreadState set_thread_state(ThreadState); + // bundle of components used in instrumentation using instrumentation_bundle_t = tim::component_bundle; @@ -56,12 +59,20 @@ using unique_ptr_t = std::unique_ptr>; static constexpr size_t max_supported_threads = OMNITRACE_MAX_THREADS; +template <> +struct thread_deleter +{ + void operator()() const; +}; + +extern template struct thread_deleter; + template struct thread_deleter { void operator()(Tp* ptr) const { - if(get_state() != State::Finalized) omnitrace_finalize_hidden(); + thread_deleter{}(); delete ptr; } }; diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_sampler.cpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_sampler.cpp index 8022f2c75f..bf232c7f7a 100644 --- a/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_sampler.cpp +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/thread_sampler.cpp @@ -74,6 +74,7 @@ get_sampler_is_sampling() void sampler::poll(std::atomic* _state, nsec_t _interval, promise_t* _ready) { + set_thread_state(ThreadState::Internal); threading::set_thread_name("omni.sampler"); // notify thread started diff --git a/projects/rocprofiler-systems/source/lib/omnitrace/library/utility.hpp b/projects/rocprofiler-systems/source/lib/omnitrace/library/utility.hpp new file mode 100644 index 0000000000..8dd651f3b0 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/omnitrace/library/utility.hpp @@ -0,0 +1,70 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include + +namespace omnitrace +{ +namespace utility +{ +namespace +{ +/// provides an alternative thread index for when using threading::get_id() is not +/// desirable +inline auto +get_thread_index() +{ + static std::atomic _c{ 0 }; + static thread_local int64_t _v = _c++; + return _v; +} + +/// fills any array with the result of the functor +template +inline auto +get_filled_array(FuncT&& _func) +{ + using Tp = std::decay_t; + std::array _v{}; + for(auto& itr : _v) + itr = std::move(_func()); + return _v; +} + +/// returns a vector with a preallocated buffer +template +inline auto +get_reserved_vector(size_t _n) +{ + std::vector _v{}; + _v.reserve(_n); + return _v; +} +} // namespace +} // namespace utility +} // namespace omnitrace diff --git a/projects/rocprofiler-systems/source/python/CMakeLists.txt b/projects/rocprofiler-systems/source/python/CMakeLists.txt index 047896d600..5fb661e6b9 100644 --- a/projects/rocprofiler-systems/source/python/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/python/CMakeLists.txt @@ -63,12 +63,10 @@ endfunction() add_library(omnitrace-python-compile-options INTERFACE) add_library(omnitrace::omnitrace-python-compile-options ALIAS omnitrace-python-compile-options) -add_cxx_flag_if_avail("-frtti" omnitrace-python-compile-options) -add_cxx_flag_if_avail("-Wno-unused-value" omnitrace-python-compile-options) -add_cxx_flag_if_avail("-Wno-range-loop-analysis" omnitrace-python-compile-options) -add_cxx_flag_if_avail("-ftls-model=global-dynamic" omnitrace-python-compile-options) -add_cxx_flag_if_avail("-Wno-deprecated-declarations" omnitrace-python-compile-options) -add_cxx_flag_if_avail("-Wno-unused-but-set-parameter" omnitrace-python-compile-options) +add_target_cxx_flag_if_avail( + omnitrace-python-compile-options "-Wno-unused-value" "-Wno-range-loop-analysis" + "-Wno-deprecated-declarations" "-Wno-unused-but-set-parameter" + "-ftls-model=global-dynamic") file(GLOB pyheaders ${CMAKE_CURRENT_LIST_DIR}/libpyomnitrace*.hpp) set(pysources ${CMAKE_CURRENT_LIST_DIR}/libpyomnitrace.cpp) diff --git a/projects/rocprofiler-systems/tests/CMakeLists.txt b/projects/rocprofiler-systems/tests/CMakeLists.txt index 94fef5de91..50b1a1771b 100644 --- a/projects/rocprofiler-systems/tests/CMakeLists.txt +++ b/projects/rocprofiler-systems/tests/CMakeLists.txt @@ -34,6 +34,18 @@ set(_base_environment "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" ) +set(_lock_environment + "OMNITRACE_USE_SAMPLING=OFF" + "OMNITRACE_CRITICAL_TRACE=ON" + "OMNITRACE_COLLAPSE_THREADS=ON" + "OMNITRACE_TRACE_THREAD_LOCKS=ON" + "OMNITRACE_COUT_OUTPUT=ON" + "OMNITRACE_TIME_OUTPUT=OFF" + "OMNITRACE_FLAT_PROFILE=ON" + "OMNITRACE_TIMELINE_PROFILE=OFF" + "LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}:${OMNITRACE_DYNINST_API_RT_DIR}:$ENV{LD_LIBRARY_PATH}" + ) + set(_ompt_environment "OMNITRACE_USE_PERFETTO=ON" "OMNITRACE_USE_TIMEMORY=ON" @@ -144,7 +156,7 @@ function(OMNITRACE_ADD_TEST) list(APPEND TEST_ENVIRONMENT "OMNITRACE_USE_PID=OFF") endif() - if(NOT SKIP_BASELINE) + if(NOT TEST_SKIP_BASELINE) add_test( NAME ${TEST_NAME}-baseline COMMAND ${COMMAND_PREFIX} $ ${TEST_RUN_ARGS} @@ -389,7 +401,7 @@ omnitrace_add_test( TARGET transpose MPI ${TRANSPOSE_USE_MPI} NUM_PROCS ${NUM_PROCS} - REWRITE_ARGS -e -v 2 + REWRITE_ARGS -e -v 2 --print-instructions RUNTIME_ARGS -e -v 1 --label file line return args ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON") @@ -410,6 +422,30 @@ omnitrace_add_test( RUN_ARGS 10 ${NUM_THREADS} 1000 ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF") +omnitrace_add_test( + SKIP_RUNTIME SKIP_SAMPLING + NAME parallel-overhead-locks-timemory + TARGET parallel-overhead-locks + LABELS "locks" + REWRITE_ARGS -e -v 2 --min-instructions=4 + RUN_ARGS 10 4 1000 + ENVIRONMENT + "${_lock_environment};OMNITRACE_USE_TIMEMORY=ON;OMNITRACE_USE_PERFETTO=OFF" + REWRITE_RUN_PASS_REGEX + "start_thread (.*) 4 (.*) pthread_mutex_lock (.*) 4000 (.*) pthread_mutex_unlock (.*) 4000" + ) + +omnitrace_add_test( + SKIP_BASELINE SKIP_RUNTIME SKIP_SAMPLING + NAME parallel-overhead-locks-perfetto + TARGET parallel-overhead-locks + LABELS "locks" + REWRITE_ARGS -e -v 2 --min-instructions=8 + RUN_ARGS 10 4 1000 + ENVIRONMENT + "${_lock_environment};OMNITRACE_USE_TIMEMORY=OFF;OMNITRACE_USE_PERFETTO=ON" + PROPERTIES WILL_FAIL ON) + omnitrace_add_test( NAME user-api TARGET user-api