diff --git a/.github/workflows/opensuse.yml b/.github/workflows/opensuse.yml index 1bc056db30..ef3679206c 100644 --- a/.github/workflows/opensuse.yml +++ b/.github/workflows/opensuse.yml @@ -29,7 +29,7 @@ jobs: - uses: actions/checkout@v2 - name: Install Packages - timeout-minutes: 5 + timeout-minutes: 10 run: for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done @@ -66,7 +66,7 @@ jobs: cmake --build build --target all --parallel 2 -- VERBOSE=1 - name: Install - timeout-minutes: 5 + timeout-minutes: 10 run: cmake --build build --target install --parallel 2 @@ -124,3 +124,11 @@ jobs: build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json + + - name: Kill Perfetto + if: success() || failure() + continue-on-error: True + run: | + set +e + RUNNING_PROCS=$(pgrep -f trace_processor_shell) # -f: name is longer than the 15 chars pgrep matches by default + if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi diff --git a/.github/workflows/ubuntu-bionic.yml b/.github/workflows/ubuntu-bionic.yml index 8b2b8806d7..c71db63214 100644 --- a/.github/workflows/ubuntu-bionic.yml +++ b/.github/workflows/ubuntu-bionic.yml @@ -19,7 +19,7 @@ env: jobs: ubuntu-bionic: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest container: image: jrmadsen/omnitrace:ci-base-ubuntu-18.04 strategy: @@ -29,7 +29,7 @@ jobs: steps: - name: Patch Git - timeout-minutes: 5 + timeout-minutes: 10 run: | apt-get update apt-get install -y software-properties-common @@ -43,7 +43,7 @@ jobs: submodules: recursive - name: Install Packages - timeout-minutes: 5 + timeout-minutes: 10 run: apt-get update && apt-get upgrade -y && @@ -55,7 +55,7 @@ jobs: for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done - name: Install Kokkos - timeout-minutes: 5 + timeout-minutes: 10 run: cd examples/lulesh/external/kokkos && cmake -B build -DKokkos_ENABLE_OPENMP=ON -DKokkos_ENABLE_SERIAL=ON -DBUILD_SHARED_LIBS=ON 
-DCMAKE_CXX_STANDARD=17 . && @@ -153,3 +153,11 @@ jobs: build/omnitrace-tests-config/*.cfg build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json + + - name: Kill Perfetto + if: success() || failure() + continue-on-error: True + run: | + set +e + RUNNING_PROCS=$(pgrep -f trace_processor_shell) # -f: name is longer than the 15 chars pgrep matches by default + if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi diff --git a/.github/workflows/ubuntu-focal.yml b/.github/workflows/ubuntu-focal.yml index 48f4ae848c..130c5d70d7 100644 --- a/.github/workflows/ubuntu-focal.yml +++ b/.github/workflows/ubuntu-focal.yml @@ -59,7 +59,7 @@ jobs: - uses: actions/checkout@v2 - name: Install Packages - timeout-minutes: 5 + timeout-minutes: 10 run: apt-get update && apt-get install -y software-properties-common && @@ -111,7 +111,7 @@ jobs: cmake --build build --target all --parallel 2 -- VERBOSE=1 - name: Install - timeout-minutes: 5 + timeout-minutes: 10 run: cmake --build build --target install --parallel 2 @@ -170,6 +170,14 @@ jobs: build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json + - name: Kill Perfetto + if: success() || failure() + continue-on-error: True + run: | + set +e + RUNNING_PROCS=$(pgrep -f trace_processor_shell) # -f: name is longer than the 15 chars pgrep matches by default + if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi + ubuntu-focal-external-rocm: runs-on: ubuntu-20.04 container: @@ -199,7 +207,7 @@ jobs: - uses: actions/checkout@v2 - name: Install Packages - timeout-minutes: 5 + timeout-minutes: 10 run: apt-get update && apt-get install -y software-properties-common wget gnupg2 && @@ -213,7 +221,7 @@ jobs: - name: Install RCCL if: ${{ matrix.rocm_version != '4.3' }} - timeout-minutes: 5 + timeout-minutes: 10 run: apt-get install -y rccl-dev @@ -371,6 +379,14 @@ jobs: build/omnitrace-tests-output/**/*.txt build/omnitrace-tests-output/**/*-instr*.json + - name: Kill Perfetto + if: success() || failure() + continue-on-error: True + run: | + set +e + RUNNING_PROCS=$(pgrep -f trace_processor_shell) # -f: name is longer than the 15 chars pgrep matches by default + if [ 
-n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi + ubuntu-focal: runs-on: ubuntu-20.04 strategy: @@ -404,7 +420,7 @@ jobs: - uses: actions/checkout@v2 - name: Install Packages - timeout-minutes: 5 + timeout-minutes: 10 run: sudo apt-get update && sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.deps }} clang libomp-dev ${{ matrix.compiler }} ${{ matrix.mpi }} && @@ -523,3 +539,11 @@ jobs: ${{ github.workspace }}/build/omnitrace-tests-config/*.cfg ${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt ${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json + + - name: Kill Perfetto + if: success() || failure() + continue-on-error: True + run: | + set +e + RUNNING_PROCS=$(pgrep -f trace_processor_shell) # -f: name is longer than the 15 chars pgrep matches by default + if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 7625d25b77..0a30a61858 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -44,15 +44,21 @@ namespace auto _name = std::string{}; } // namespace -template -void -all2all(int _rank, MPI_Comm _comm) +template +auto +get_values_str(const Tp& _data) { - if(_comm == MPI_COMM_NULL) return; - static_assert(N > 0, "Error! 
N must be greater than zero!"); + std::stringstream _ss{}; + for(auto&& itr : _data) + _ss << ", " << std::setw(6) << std::setprecision(2) << std::fixed << itr; + return _ss.str().substr(1); +}; - auto _mt = std::mt19937_64{ size_t(_rank + 100) }; - auto _dist = []() { +template +auto +get_dist(std::mt19937_64& _mt) +{ + static auto _dist = []() { if constexpr(std::is_integral::value) { return std::uniform_int_distribution(1, N * N); @@ -62,23 +68,13 @@ all2all(int _rank, MPI_Comm _comm) return std::uniform_real_distribution(1.0, N * N); } }(); + return _dist(_mt); +} - auto _get_values_str = [](const auto& _data) { - std::stringstream _ss{}; - for(auto&& itr : _data) - _ss << ", " << std::setw(6) << std::setprecision(2) << std::fixed << itr; - return _ss.str().substr(1); - }; - - std::array values_sent = {}; - std::array values_recv = {}; - for(size_t i = 0; i < N; ++i) - values_sent[i] = _dist(_mt); - - if(_rank == 0) - printf("[%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), _rank, - values_sent.size(), _get_values_str(values_sent).c_str()); - +template +auto +get_dtype() +{ auto _dtype = MPI_INT; // NOLINT if(std::is_same::value) _dtype = MPI_LONG; @@ -86,12 +82,71 @@ all2all(int _rank, MPI_Comm _comm) _dtype = MPI_FLOAT; else if(std::is_same::value) _dtype = MPI_DOUBLE; + return _dtype; +} + +template +void +all2all(int _rank, MPI_Comm _comm) +{ + if(_comm == MPI_COMM_NULL) return; + static_assert(N > 0, "Error! 
N must be greater than zero!"); + + auto _dtype = get_dtype(); + auto _mt = std::mt19937_64{ size_t(_rank + 100) }; + auto values_sent = std::array{}; + auto values_recv = std::array{}; + for(size_t i = 0; i < N; ++i) + values_sent[i] = get_dist(_mt); + + if(_rank == 0) + printf("[%s][%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__, + _rank, values_sent.size(), get_values_str(values_sent).c_str()); MPI_Alltoall(&values_sent[_rank], 1, _dtype, &values_recv[_rank], 1, _dtype, _comm); if(_rank == 0) - printf("[%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), _rank, - values_sent.size(), _get_values_str(values_recv).c_str()); + printf("[%s][%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__, + _rank, values_sent.size(), get_values_str(values_recv).c_str()); +} + +template +void +send_recv(int _rank, MPI_Comm _comm) +{ + if(_comm == MPI_COMM_NULL) return; + static_assert(N > 0, "Error! N must be greater than zero!"); + int _size = 0; + MPI_Comm_size(_comm, &_size); + + auto _dtype = get_dtype(); + auto _mt = std::mt19937_64{ size_t(_rank + 100) }; + auto values_sent = std::array{}; + auto values_recv = std::array{}; + for(size_t i = 0; i < N; ++i) + values_sent[i] = get_dist(_mt); + + if(_rank == 0) + printf("[%s][%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__, + _rank, values_sent.size(), get_values_str(values_sent).c_str()); + + for(int i = 0; i < _size; ++i) + { + if(i != _rank) MPI_Send(&values_sent[_rank], 1, _dtype, i, N, _comm); + } + + for(int i = 0; i < _size; ++i) + { + if(i != _rank) + { + MPI_Status _status; + MPI_Recv(&values_recv[i], 1, _dtype, i, N, _comm, &_status); + } + } + + if(_rank == 0) + printf("[%s][%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__, + _rank, values_sent.size(), get_values_str(values_recv).c_str()); } void @@ -109,9 +164,13 @@ run(MPI_Comm _comm, int nitr) MPI_Barrier(_comm); for(int i = 0; i < nitr; ++i) { + send_recv(_rank, _comm); + 
send_recv(_rank, _comm); + send_recv(_rank, _comm); + send_recv(_rank, _comm); + MPI_Barrier(_comm); all2all(_rank, _comm); all2all(_rank, _comm); - MPI_Barrier(_comm); all2all(_rank, _comm); all2all(_rank, _comm); } @@ -259,6 +318,7 @@ run_main(int argc, char** argv) int main(int argc, char** argv) { + std::this_thread::sleep_for(std::chrono::seconds{ 2 }); int _mpi_thread_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &_mpi_thread_provided); diff --git a/external/timemory b/external/timemory index 97c7415498..48f4735fb7 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 97c74154988414ed8e0339bd1585829aa598dcdd +Subproject commit 48f4735fb7c8c452088c1103936423c4727e4884 diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt index f08a1279e3..2533d2fd49 100644 --- a/source/lib/omnitrace/CMakeLists.txt +++ b/source/lib/omnitrace/CMakeLists.txt @@ -76,6 +76,7 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/library/timemory.cpp ${CMAKE_CURRENT_LIST_DIR}/library/tracing.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/comm_data.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/exit_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.cpp @@ -115,6 +116,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/components/fwd.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/category_region.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/components/comm_data.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/exit_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.hpp ${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp diff --git a/source/lib/omnitrace/library/components/comm_data.cpp b/source/lib/omnitrace/library/components/comm_data.cpp new file mode 100644 index 
0000000000..27f22818c4 --- /dev/null +++ b/source/lib/omnitrace/library/components/comm_data.cpp @@ -0,0 +1,423 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "library/components/comm_data.hpp" +#include "library/components/fwd.hpp" +#include "library/config.hpp" +#include "library/perfetto.hpp" +#include "library/tracing.hpp" + +#include +#include +#include +#include + +namespace tim +{ +namespace component +{ +namespace +{ +template +void +write_perfetto_counter_track(uint64_t _val) +{ + using counter_track = omnitrace::perfetto_counter_track; + + if(omnitrace::get_use_perfetto() && + omnitrace::get_state() == omnitrace::State::Active) + { + auto _emplace = [](const size_t _idx) { + if(!counter_track::exists(_idx)) + { + std::string _label = (_idx > 0) + ? 
JOIN(" ", Tp::label, JOIN("", '[', _idx, ']')) + : Tp::label; + counter_track::emplace(_idx, _label, "bytes"); + } + }; + + const size_t _idx = 0; + static std::once_flag _once{}; + std::call_once(_once, _emplace, _idx); + + static std::mutex _mutex{}; + static uint64_t value = 0; + uint64_t _now = 0; + { + std::unique_lock _lk{ _mutex }; + _now = omnitrace::tracing::now(); + _val = (value += _val); + } + + TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _now, _val); + } +} +} // namespace + +void +comm_data::preinit() +{ + configure(); +} + +void +comm_data::global_finalize() +{ + configure(); +} + +void +comm_data::configure() +{ + static bool _once = false; + if(_once) return; + _once = true; + + comm_data_tracker_t::label() = "comm_data"; + comm_data_tracker_t::description() = "Tracks MPI/RCCL communication data sizes"; + comm_data_tracker_t::display_unit() = "MB"; + comm_data_tracker_t::unit() = units::megabyte; + + auto _fmt_flags = comm_data_tracker_t::get_format_flags(); + _fmt_flags &= (std::ios_base::fixed & std::ios_base::scientific); + _fmt_flags |= (std::ios_base::scientific); + comm_data_tracker_t::set_precision(3); + comm_data_tracker_t::set_format_flags(_fmt_flags); +} + +#if defined(OMNITRACE_USE_MPI) +// MPI_Send +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm) +{ + int _size = mpi_type_size(datatype); + if(_size == 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _a{ _name }; + add(_a, count * _size); + tracker_t _b{ JOIN('_', _name, "dst", dst) }; + add(_b, count * _size); + add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size); +} + +// MPI_Recv +void +comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Status*) +{ + int _size = 
mpi_type_size(datatype); + if(_size == 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _a{ _name }; + add(_a, count * _size); + tracker_t _b{ JOIN('_', _name, "dst", dst) }; + add(_b, count * _size); + add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size); +} + +// MPI_Isend +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*) +{ + int _size = mpi_type_size(datatype); + if(_size == 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _a{ _name }; + add(_a, count * _size); + tracker_t _b{ JOIN('_', _name, "dst", dst) }; + add(_b, count * _size); + add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size); +} + +// MPI_Irecv +void +comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*) +{ + int _size = mpi_type_size(datatype); + if(_size == 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _a{ _name }; + add(_a, count * _size); + tracker_t _b{ JOIN('_', _name, "dst", dst) }; + add(_b, count * _size); + add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size); +} + +// MPI_Bcast +void +comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count, + MPI_Datatype datatype, int root, MPI_Comm) +{ + int _size = mpi_type_size(datatype); + if(_size == 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _t{ _name }; + add(_t, count * _size); + add(JOIN('_', _name, "root", root), count * 
_size); +} + +// MPI_Allreduce +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, int count, + MPI_Datatype datatype, MPI_Op, MPI_Comm) +{ + int _size = mpi_type_size(datatype); + if(_size == 0) return; + + write_perfetto_counter_track(count * _size); + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + add(_data, count * _size); +} + +// MPI_Sendrecv +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount, + MPI_Datatype sendtype, int dst, int sendtag, void*, int recvcount, + MPI_Datatype recvtype, int src, int recvtag, MPI_Comm, MPI_Status*) +{ + int _send_size = mpi_type_size(sendtype); + int _recv_size = mpi_type_size(recvtype); + if(_send_size == 0 || _recv_size == 0) return; + + write_perfetto_counter_track(sendcount * _send_size); + write_perfetto_counter_track(recvcount * _recv_size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _t{ _name }; + add(_t, sendcount * _send_size + recvcount * _recv_size); + { + tracker_t _b{ JOIN('_', _name, "send") }; + add(_b, sendcount * _send_size); + tracker_t _c{ JOIN('_', _name, "send", dst) }; + add(_c, sendcount * _send_size); // per-destination tracker (was _b, which got counted twice) + add(JOIN('_', _name, "send", "tag", sendtag), sendcount * _send_size); + add(JOIN('_', _name, "send", dst, "tag", sendtag), sendcount * _send_size); + } + { + tracker_t _b{ JOIN('_', _name, "recv") }; + add(_b, recvcount * _recv_size); + tracker_t _c{ JOIN('_', _name, "recv", src) }; + add(_c, recvcount * _recv_size); // per-source tracker (was _b, which got counted twice) + add(JOIN('_', _name, "recv", "tag", recvtag), recvcount * _recv_size); + add(JOIN('_', _name, "recv", src, "tag", recvtag), recvcount * _recv_size); + } +} + +// MPI_Gather +// MPI_Scatter +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount, + MPI_Datatype sendtype, void*, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm) +{ + int _send_size = 
mpi_type_size(sendtype); + int _recv_size = mpi_type_size(recvtype); + if(_send_size == 0 || _recv_size == 0) return; + + write_perfetto_counter_track(sendcount * _send_size); + write_perfetto_counter_track(recvcount * _recv_size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _t{ _name }; + add(_t, sendcount * _send_size + recvcount * _recv_size); + tracker_t _r(JOIN('_', _name, "root", root)); + add(_r, sendcount * _send_size + recvcount * _recv_size); + add(JOIN('_', _name, "root", root, "send"), sendcount * _send_size); + add(JOIN('_', _name, "root", root, "recv"), recvcount * _recv_size); +} + +// MPI_Alltoall +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount, + MPI_Datatype sendtype, void*, int recvcount, MPI_Datatype recvtype, + MPI_Comm) +{ + int _send_size = mpi_type_size(sendtype); + int _recv_size = mpi_type_size(recvtype); + if(_send_size == 0 || _recv_size == 0) return; + + write_perfetto_counter_track(sendcount * _send_size); + write_perfetto_counter_track(recvcount * _recv_size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _t{ _name }; + add(_t, sendcount * _send_size + recvcount * _recv_size); + add(JOIN('_', _name, "send"), sendcount * _send_size); + add(JOIN('_', _name, "recv"), recvcount * _recv_size); +} +#endif + +#if defined(OMNITRACE_USE_RCCL) +// ncclReduce +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, ncclComm_t, + hipStream_t) +{ + int _size = rccl_type_size(datatype); + if(_size <= 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _t{ _name }; + add(_t, count * _size); + add(JOIN('_', _name, "root", root), count * _size); +} + +// ncclSend +// 
ncclGather +// ncclBcast +// ncclRecv +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, size_t count, + ncclDataType_t datatype, int peer, ncclComm_t, hipStream_t) +{ + int _size = rccl_type_size(datatype); + if(_size <= 0) return; + + static auto _send_types = std::unordered_set{ "ncclSend", "ncclBcast" }; + static auto _recv_types = std::unordered_set{ "ncclGather", "ncclRecv" }; + + if(_send_types.count(_data.tool_id) > 0) + { + write_perfetto_counter_track(count * _size); + } + else if(_recv_types.count(_data.tool_id) > 0) + { + write_perfetto_counter_track(count * _size); + } + else + { + OMNITRACE_CI_THROW(true, "RCCL function not handled: %s", _data.tool_id.c_str()); + } + + // bytes were already recorded by the send/recv branch above; recording them again here double-counted + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + std::string _label = "root"; + if(_name.find("Send") != std::string::npos) _label = "peer"; + + tracker_t _t{ _name }; + add(_t, count * _size); + add(JOIN('_', _name, _label, peer), count * _size); +} + +// ncclBroadcast +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, int root, ncclComm_t, hipStream_t) +{ + int _size = rccl_type_size(datatype); + if(_size <= 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + auto _name = std::string_view{ _data.tool_id }; + tracker_t _t{ _name }; + add(_t, count * _size); + add(JOIN('_', _data.tool_id, "root", root), count * _size); +} + +// ncclAllReduce +// ncclReduceScatter +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, + hipStream_t) +{ + int _size = rccl_type_size(datatype); + if(_size <= 0) return; + + static auto _recv_types = std::unordered_set{ "ncclAllReduce" }; + static auto _send_types = std::unordered_set{ 
"ncclReduceScatter" }; + + if(_send_types.count(_data.tool_id) > 0) + { + write_perfetto_counter_track(count * _size); + } + else if(_recv_types.count(_data.tool_id) > 0) + { + write_perfetto_counter_track(count * _size); + } + else + { + OMNITRACE_CI_THROW(true, "RCCL function not handled: %s", _data.tool_id.c_str()); + } + + if(!omnitrace::get_use_timemory()) return; + add(_data, count * _size); +} + +// ncclAllGather +void +comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t) +{ + int _size = rccl_type_size(datatype); + if(_size <= 0) return; + + write_perfetto_counter_track(count * _size); + + if(!omnitrace::get_use_timemory()) return; + add(_data, count * _size); +} +#endif +} // namespace component +} // namespace tim + +TIMEMORY_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, float) + +TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(comm_data, false, void) diff --git a/source/lib/omnitrace/library/components/comm_data.hpp b/source/lib/omnitrace/library/components/comm_data.hpp new file mode 100644 index 0000000000..4d34d0b58c --- /dev/null +++ b/source/lib/omnitrace/library/components/comm_data.hpp @@ -0,0 +1,247 @@ +// MIT License +// +// Copyright (c) 2020, The Regents of the University of California, +// through Lawrence Berkeley National Laboratory (subject to receipt of any +// required approvals from the U.S. Dept. of Energy). All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "common/join.hpp" +#include "library/common.hpp" +#include "library/components/category_region.hpp" +#include "library/components/fwd.hpp" +#include "library/defines.hpp" +#include "library/timemory.hpp" + +#include +#include +#include +#include + +#include + +#if defined(OMNITRACE_USE_RCCL) +# if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 +# include +# else +# include +# endif +#endif + +#if defined(OMNITRACE_USE_MPI) +# include +#endif + +#include +#include +#include +#include +#include +#include + +namespace tim +{ +namespace component +{ +using comm_data_tracker_t = data_tracker; + +struct comm_data : base +{ + using value_type = void; + using this_type = comm_data; + using base_type = base; + using tracker_t = tim::auto_tuple; + using data_type = float; + + struct mpi_recv + { + static constexpr auto value = "comm_data"; + static constexpr auto label = "MPI Comm Recv"; + }; + + struct mpi_send + { + static constexpr auto value = "comm_data"; + static constexpr auto label = "MPI Comm Send"; + }; + + struct rccl_recv + { + static constexpr auto value = "comm_data"; + static constexpr auto label = "RCCL Comm Recv"; + }; + + struct rccl_send + { + static constexpr auto value = "comm_data"; + static constexpr auto label = "RCCL Comm Send"; + }; + + TIMEMORY_DEFAULT_OBJECT(comm_data) + + static void preinit(); + static void configure(); + static void global_finalize(); + static void start() {} + static void stop() {} + +#if defined(OMNITRACE_USE_MPI) + static int mpi_type_size(MPI_Datatype _datatype) + { + int _size = 0; + PMPI_Type_size(_datatype, &_size); + return _size; + } + + // MPI_Send + static void audit(const gotcha_data& _data, audit::incoming, const void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm); + + // MPI_Recv + static void audit(const gotcha_data& _data, audit::incoming, void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Status*); + + // MPI_Isend + static void audit(const 
gotcha_data& _data, audit::incoming, const void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*); + + // MPI_Irecv + static void audit(const gotcha_data& _data, audit::incoming, void*, int count, + MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*); + + // MPI_Bcast + static void audit(const gotcha_data& _data, audit::incoming, void*, int count, + MPI_Datatype datatype, int root, MPI_Comm); + + // MPI_Allreduce + static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, + int count, MPI_Datatype datatype, MPI_Op, MPI_Comm); + + // MPI_Sendrecv + static void audit(const gotcha_data& _data, audit::incoming, const void*, + int sendcount, MPI_Datatype sendtype, int, int sendtag, void*, + int recvcount, MPI_Datatype recvtype, int, int recvtag, MPI_Comm, + MPI_Status*); + + // MPI_Gather + // MPI_Scatter + static void audit(const gotcha_data& _data, audit::incoming, const void*, + int sendcount, MPI_Datatype sendtype, void*, int recvcount, + MPI_Datatype recvtype, int root, MPI_Comm); + + // MPI_Alltoall + static void audit(const gotcha_data& _data, audit::incoming, const void*, + int sendcount, MPI_Datatype sendtype, void*, int recvcount, + MPI_Datatype recvtype, MPI_Comm); +#endif + +#if defined(OMNITRACE_USE_RCCL) + static auto rccl_type_size(ncclDataType_t datatype) + { + switch(datatype) + { + case ncclInt8: + case ncclUint8: return 1; + case ncclFloat16: return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: return 8; + default: return 0; + }; + } + + // ncclReduce + static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, + ncclComm_t, hipStream_t); + + // ncclSend + // ncclGather + // ncclBcast + // ncclRecv + static void audit(const gotcha_data& _data, audit::incoming, const void*, + size_t count, ncclDataType_t datatype, int 
peer, ncclComm_t, + hipStream_t); + + // ncclBroadcast + static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, int root, ncclComm_t, + hipStream_t); + + // ncclAllReduce + // ncclReduceScatter + static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, + hipStream_t); + + // ncclAllGather + // ncclAlltoAll + static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*, + size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t); + +#endif + +private: + static auto& add(tracker_t& _t, data_type value) + { + if(omnitrace::get_state() != omnitrace::State::Active) + { + _t.invoke(true); + return _t; + } + _t.store(std::plus{}, value); + return _t; + } + + static void add(const gotcha_data& _data, data_type value) // void: a reference to the local tracker would dangle + { + tracker_t _t{ std::string_view{ _data.tool_id.c_str() } }; + add(_t, value); + } + + static void add(std::string&& _name, data_type value) // void: a reference to the local tracker would dangle + { + tracker_t _t{ _name }; + add(_t, value); + } + + static void add(std::string_view _name, data_type value) // void: a reference to the local tracker would dangle + { + tracker_t _t{ _name }; + add(_t, value); + } +}; +} // namespace component +} // namespace tim + +#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \ + (defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0) + +# include +# include +# include + +TIMEMORY_DECLARE_EXTERN_COMPONENT(TIMEMORY_ESC(data_tracker), + true, float) + +TIMEMORY_DECLARE_EXTERN_COMPONENT(comm_data, false, void) +#endif diff --git a/source/lib/omnitrace/library/components/fwd.hpp b/source/lib/omnitrace/library/components/fwd.hpp index 26e578c088..300c664f4d 100644 --- a/source/lib/omnitrace/library/components/fwd.hpp +++ b/source/lib/omnitrace/library/components/fwd.hpp @@ -42,10 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling) TIMEMORY_DECLARE_COMPONENT(roctracer) 
TIMEMORY_DECLARE_COMPONENT(rocprofiler) -TIMEMORY_DECLARE_COMPONENT(rccl_comm_data) TIMEMORY_DECLARE_COMPONENT(rcclp_handle) TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl) -TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker) +TIMEMORY_COMPONENT_ALIAS(comm_data_tracker_t, data_tracker) +TIMEMORY_DECLARE_COMPONENT(comm_data) /// \struct tim::trait::name /// \brief provides a constexpr string in ::value @@ -160,11 +160,14 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type) #if !defined(OMNITRACE_USE_RCCL) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type) #endif +#if !defined(OMNITRACE_USE_RCCL) && !defined(OMNITRACE_USE_MPI) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data_tracker_t, false_type) +TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data, false_type) +#endif + #if !defined(TIMEMORY_USE_LIBUNWIND) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type) TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type) @@ -289,6 +292,7 @@ TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_busy, double) TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_temp, double) TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_power, double) TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_memory, double) +TIMEMORY_STATISTICS_TYPE(component::comm_data_tracker_t, float) // enable timing units TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category, diff --git a/source/lib/omnitrace/library/components/mpi_gotcha.cpp b/source/lib/omnitrace/library/components/mpi_gotcha.cpp index 01abbfba32..02eca94b08 100644 --- 
a/source/lib/omnitrace/library/components/mpi_gotcha.cpp +++ b/source/lib/omnitrace/library/components/mpi_gotcha.cpp @@ -23,12 +23,15 @@ #include "library/components/mpi_gotcha.hpp" #include "library/api.hpp" #include "library/components/category_region.hpp" +#include "library/components/comm_data.hpp" +#include "library/components/fwd.hpp" #include "library/config.hpp" #include "library/debug.hpp" #include "library/mproc.hpp" #include #include +#include #include #include @@ -40,6 +43,10 @@ namespace omnitrace { namespace { +using mpip_bundle_t = + tim::component_tuple, + comp::comm_data>; + struct comm_rank_data { int rank = -1; @@ -104,10 +111,7 @@ omnitrace_mpi_set_attr() static auto _mpi_fini = [](MPI_Comm, int, void*, void*) { OMNITRACE_DEBUG("MPI Comm attribute finalize\n"); if(mpip_index != std::numeric_limits::max()) - comp::deactivate_mpip< - tim::component_tuple< - omnitrace::component::category_region>, - api::omnitrace>(mpip_index); + comp::deactivate_mpip(mpip_index); omnitrace_finalize_hidden(); return MPI_SUCCESS; }; @@ -224,9 +228,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming) OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str()); if(mpip_index != std::numeric_limits::max()) - comp::deactivate_mpip< - tim::component_tuple>, - api::omnitrace>(mpip_index); + comp::deactivate_mpip(mpip_index); #if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS) tim::mpi::is_initialized_callback() = []() { return false; }; @@ -276,16 +278,15 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) { OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n"); + if(!get_use_timemory()) + { + trait::runtime_enabled::set(false); + trait::runtime_enabled::set(false); + } // use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST // to control the gotcha bindings at runtime - comp::configure_mpip< - tim::component_tuple< - omnitrace::component::category_region>, - api::omnitrace>(); - mpip_index = 
comp::activate_mpip< - tim::component_tuple< - omnitrace::component::category_region>, - api::omnitrace>(); + comp::configure_mpip(); + mpip_index = comp::activate_mpip(); } auto_lock_t _lk{ type_mutex() }; diff --git a/source/lib/omnitrace/library/components/rcclp.cpp b/source/lib/omnitrace/library/components/rcclp.cpp index 4f42487d1d..90611323b5 100644 --- a/source/lib/omnitrace/library/components/rcclp.cpp +++ b/source/lib/omnitrace/library/components/rcclp.cpp @@ -196,74 +196,5 @@ rcclp_handle::get_tool_count() { return get_persistent_data().m_count; } - -void -rccl_comm_data::preinit() -{ - omnitrace::rcclp::configure(); -} - -// ncclReduce -void -rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, - ncclComm_t, hipStream_t) -{ - int size = rccl_type_size(datatype); - add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); -} - -// ncclSend -void -rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, - size_t count, ncclDataType_t datatype, int peer, ncclComm_t, - hipStream_t) -{ - int size = rccl_type_size(datatype); - add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", peer)); -} - -// ncclBcast -// ncclRecv -void -rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count, - ncclDataType_t datatype, int root, ncclComm_t, hipStream_t) -{ - int size = rccl_type_size(datatype); - add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); -} - -// ncclBroadcast -void -rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, int root, ncclComm_t, - hipStream_t) -{ - int size = rccl_type_size(datatype); - add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root)); -} - -// ncclAllReduce -// ncclReduceScatter -void -rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const 
void*, void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, - hipStream_t) -{ - int size = rccl_type_size(datatype); - add(_data, count * size); -} - -// ncclAllGather -void -rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t) -{ - int size = rccl_type_size(datatype); - add(_data, count * size); -} - } // namespace component } // namespace tim - -TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t) diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp index 172ec97f0b..9f3c855b6a 100644 --- a/source/lib/omnitrace/library/components/rcclp.hpp +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -24,6 +24,7 @@ #include "library/common.hpp" #include "library/components/category_region.hpp" +#include "library/components/comm_data.hpp" #include "library/components/fwd.hpp" #include "library/defines.hpp" #include "library/timemory.hpp" @@ -51,7 +52,7 @@ TIMEMORY_COMPONENT_ALIAS( rccl_toolset_t, component_bundle, - rccl_comm_data*>) + comm_data>) TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t, gotcha) @@ -59,12 +60,6 @@ TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t, TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type) #endif -TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float) -TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t, - true_type) -TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t, - true_type) - namespace tim { namespace component @@ -110,111 +105,5 @@ private: static toolset_ptr_t& get_tool_instance(); static std::atomic& get_tool_count(); }; - -struct rccl_comm_data : base -{ - using value_type = void; - using this_type = rccl_comm_data; - using base_type = base; - using tracker_t = tim::auto_tuple; - using data_type = float; - - TIMEMORY_DEFAULT_OBJECT(rccl_comm_data) - - 
static void preinit(); - static void start() {} - static void stop() {} - - static auto rccl_type_size(ncclDataType_t datatype) - { - switch(datatype) - { - case ncclInt8: - case ncclUint8: return 1; - case ncclFloat16: return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: return 8; - default: return 0; - }; - } - - // ncclReduce - static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, - ncclComm_t, hipStream_t); - - // ncclSend - static void audit(const gotcha_data& _data, audit::incoming, const void*, - size_t count, ncclDataType_t datatype, int peer, ncclComm_t, - hipStream_t); - - // ncclBcast - // ncclRecv - static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count, - ncclDataType_t datatype, int root, ncclComm_t, hipStream_t); - - // ncclBroadcast - static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, int root, ncclComm_t, - hipStream_t); - - // ncclAllReduce - // ncclReduceScatter - static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t, - hipStream_t); - - // ncclAllGather - static void audit(const gotcha_data& _data, audit::incoming, const void*, void*, - size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t); - -private: - template - static void add(tracker_t& _t, data_type value, Args&&... args) - { - _t.store(std::plus{}, value); - TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward(args), value)); - } - - template - static void add(const gotcha_data& _data, data_type value, Args&&... 
args) - { - tracker_t _t{ std::string_view{ _data.tool_id.c_str() } }; - add(_t, value, std::forward(args)...); - } - - template - static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value, - Args&&... args) - { - // if(tim::settings::add_secondary()) - { - tracker_t _s{ std::string_view{ _data.tool_id.c_str() } }; - add(_s, _data, value, std::forward(args)...); - } - } - - template - static void add(std::string_view _name, data_type value, Args&&... args) - { - tracker_t _t{ _name }; - add(_t, value, std::forward(args)...); - } - - template - static void add_secondary(tracker_t&, std::string_view _name, data_type value, - Args&&... args) - { - // if(tim::settings::add_secondary()) - { - tracker_t _s{ _name }; - add(_s, value, std::forward(args)...); - } - } -}; } // namespace component } // namespace tim diff --git a/source/lib/omnitrace/library/perfetto.hpp b/source/lib/omnitrace/library/perfetto.hpp index f9d7c81b07..c8823d5429 100644 --- a/source/lib/omnitrace/library/perfetto.hpp +++ b/source/lib/omnitrace/library/perfetto.hpp @@ -85,6 +85,9 @@ perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \ perfetto::Category("rccl").SetDescription( \ "ROCm Communication Collectives Library (RCCL) regions"), \ + perfetto::Category("comm_data") \ + .SetDescription( \ + "MPI/RCCL counters for tracking amount of data sent or received"), \ perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \ perfetto::Category("host-critical-trace") \ .SetDescription("Host-side critical traces"), \ diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp index 420f5b72b9..4d92b7e8fd 100644 --- a/source/lib/omnitrace/library/rcclp.cpp +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -51,10 +51,7 @@ namespace rcclp { void configure() -{ - comp::rccl_data_tracker_t::label() = "rccl_comm_data"; - comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data"; -} +{} void setup() @@ 
-67,11 +64,17 @@ setup() if(!librccl_handle) fprintf(stderr, "%s\n", dlerror()); dlerror(); // Clear any existing error - auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true); - if(_data) - comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) { - cb.initialize(); - }; + auto _use_data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", get_use_timemory()); + if(!get_use_timemory()) + { + trait::runtime_enabled::set(false); + trait::runtime_enabled::set(false); + } + else + { + trait::runtime_enabled::set(_use_data); + trait::runtime_enabled::set(_use_data); + } comp::configure_rcclp(); global_id = comp::activate_rcclp(); diff --git a/source/lib/omnitrace/library/tracing.hpp b/source/lib/omnitrace/library/tracing.hpp index 9e7678c65d..0903682f3c 100644 --- a/source/lib/omnitrace/library/tracing.hpp +++ b/source/lib/omnitrace/library/tracing.hpp @@ -31,6 +31,8 @@ #include "library/timemory.hpp" #include "library/utility.hpp" +#include + namespace omnitrace { namespace tracing @@ -49,6 +51,13 @@ get_timemory_hash_ids(int64_t _tid = threading::get_id()); tim::hash_alias_ptr_t& get_timemory_hash_aliases(int64_t _tid = threading::get_id()); +template +OMNITRACE_INLINE auto +now() +{ + return ::tim::get_clock_real_now(); +} + namespace { bool debug_push = // NOLINT