Generic comm_data component (#132)

* Generic comm_data component

- moved rccl_comm_data to comm_data
- comm_data includes communication data for MPI

* fix timemory include with quotes

* Only support MPI comm data with full MPI support

* Increase timeouts + kill perfetto

* Update timemory submodule

* Fix missing command killall

* set +e in Kill Perfetto workflow step

* Updated MPI example to include MPI_Send and MPI_Recv calls

* Update timemory submodule with storage merge fix

* Perfetto comm data

- tracing::now<T>() function

* Fix timemory header include
This commit is contained in:
Jonathan R. Madsen
2022-08-25 19:48:10 -05:00
committato da GitHub
parent a1afd69a02
commit 0dd8f52292
15 ha cambiato i file con 860 aggiunte e 248 eliminazioni
+10 -2
Vedi File
@@ -29,7 +29,7 @@ jobs:
- uses: actions/checkout@v2
- name: Install Packages
timeout-minutes: 5
timeout-minutes: 10
run:
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
@@ -66,7 +66,7 @@ jobs:
cmake --build build --target all --parallel 2 -- VERBOSE=1
- name: Install
timeout-minutes: 5
timeout-minutes: 10
run:
cmake --build build --target install --parallel 2
@@ -124,3 +124,11 @@ jobs:
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
- name: Kill Perfetto
if: success() || failure()
continue-on-error: True
run: |
set +e
RUNNING_PROCS=$(pgrep trace_processor_shell)
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
+12 -4
Vedi File
@@ -19,7 +19,7 @@ env:
jobs:
ubuntu-bionic:
runs-on: ubuntu-18.04
runs-on: ubuntu-latest
container:
image: jrmadsen/omnitrace:ci-base-ubuntu-18.04
strategy:
@@ -29,7 +29,7 @@ jobs:
steps:
- name: Patch Git
timeout-minutes: 5
timeout-minutes: 10
run: |
apt-get update
apt-get install -y software-properties-common
@@ -43,7 +43,7 @@ jobs:
submodules: recursive
- name: Install Packages
timeout-minutes: 5
timeout-minutes: 10
run:
apt-get update &&
apt-get upgrade -y &&
@@ -55,7 +55,7 @@ jobs:
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
- name: Install Kokkos
timeout-minutes: 5
timeout-minutes: 10
run:
cd examples/lulesh/external/kokkos &&
cmake -B build -DKokkos_ENABLE_OPENMP=ON -DKokkos_ENABLE_SERIAL=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_STANDARD=17 . &&
@@ -153,3 +153,11 @@ jobs:
build/omnitrace-tests-config/*.cfg
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
- name: Kill Perfetto
if: success() || failure()
continue-on-error: True
run: |
set +e
RUNNING_PROCS=$(pgrep trace_processor_shell)
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
+29 -5
Vedi File
@@ -59,7 +59,7 @@ jobs:
- uses: actions/checkout@v2
- name: Install Packages
timeout-minutes: 5
timeout-minutes: 10
run:
apt-get update &&
apt-get install -y software-properties-common &&
@@ -111,7 +111,7 @@ jobs:
cmake --build build --target all --parallel 2 -- VERBOSE=1
- name: Install
timeout-minutes: 5
timeout-minutes: 10
run:
cmake --build build --target install --parallel 2
@@ -170,6 +170,14 @@ jobs:
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
- name: Kill Perfetto
if: success() || failure()
continue-on-error: True
run: |
set +e
RUNNING_PROCS=$(pgrep trace_processor_shell)
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
ubuntu-focal-external-rocm:
runs-on: ubuntu-20.04
container:
@@ -199,7 +207,7 @@ jobs:
- uses: actions/checkout@v2
- name: Install Packages
timeout-minutes: 5
timeout-minutes: 10
run:
apt-get update &&
apt-get install -y software-properties-common wget gnupg2 &&
@@ -213,7 +221,7 @@ jobs:
- name: Install RCCL
if: ${{ matrix.rocm_version != '4.3' }}
timeout-minutes: 5
timeout-minutes: 10
run:
apt-get install -y rccl-dev
@@ -371,6 +379,14 @@ jobs:
build/omnitrace-tests-output/**/*.txt
build/omnitrace-tests-output/**/*-instr*.json
- name: Kill Perfetto
if: success() || failure()
continue-on-error: True
run: |
set +e
RUNNING_PROCS=$(pgrep trace_processor_shell)
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
ubuntu-focal:
runs-on: ubuntu-20.04
strategy:
@@ -404,7 +420,7 @@ jobs:
- uses: actions/checkout@v2
- name: Install Packages
timeout-minutes: 5
timeout-minutes: 10
run:
sudo apt-get update &&
sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.deps }} clang libomp-dev ${{ matrix.compiler }} ${{ matrix.mpi }} &&
@@ -523,3 +539,11 @@ jobs:
${{ github.workspace }}/build/omnitrace-tests-config/*.cfg
${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt
${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json
- name: Kill Perfetto
if: success() || failure()
continue-on-error: True
run: |
set +e
RUNNING_PROCS=$(pgrep trace_processor_shell)
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
+86 -26
Vedi File
@@ -44,15 +44,21 @@ namespace
auto _name = std::string{};
} // namespace
template <typename Tp, size_t N>
void
all2all(int _rank, MPI_Comm _comm)
template <typename Tp>
auto
get_values_str(const Tp& _data)
{
if(_comm == MPI_COMM_NULL) return;
static_assert(N > 0, "Error! N must be greater than zero!");
std::stringstream _ss{};
for(auto&& itr : _data)
_ss << ", " << std::setw(6) << std::setprecision(2) << std::fixed << itr;
return _ss.str().substr(1);
};
auto _mt = std::mt19937_64{ size_t(_rank + 100) };
auto _dist = []() {
template <typename Tp, size_t N>
auto
get_dist(std::mt19937_64& _mt)
{
static auto _dist = []() {
if constexpr(std::is_integral<Tp>::value)
{
return std::uniform_int_distribution<Tp>(1, N * N);
@@ -62,23 +68,13 @@ all2all(int _rank, MPI_Comm _comm)
return std::uniform_real_distribution<Tp>(1.0, N * N);
}
}();
return _dist(_mt);
}
auto _get_values_str = [](const auto& _data) {
std::stringstream _ss{};
for(auto&& itr : _data)
_ss << ", " << std::setw(6) << std::setprecision(2) << std::fixed << itr;
return _ss.str().substr(1);
};
std::array<Tp, N> values_sent = {};
std::array<Tp, N> values_recv = {};
for(size_t i = 0; i < N; ++i)
values_sent[i] = _dist(_mt);
if(_rank == 0)
printf("[%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), _rank,
values_sent.size(), _get_values_str(values_sent).c_str());
template <typename Tp>
auto
get_dtype()
{
auto _dtype = MPI_INT; // NOLINT
if(std::is_same<Tp, long>::value)
_dtype = MPI_LONG;
@@ -86,12 +82,71 @@ all2all(int _rank, MPI_Comm _comm)
_dtype = MPI_FLOAT;
else if(std::is_same<Tp, double>::value)
_dtype = MPI_DOUBLE;
return _dtype;
}
template <typename Tp, size_t N>
void
all2all(int _rank, MPI_Comm _comm)
{
if(_comm == MPI_COMM_NULL) return;
static_assert(N > 0, "Error! N must be greater than zero!");
auto _dtype = get_dtype<Tp>();
auto _mt = std::mt19937_64{ size_t(_rank + 100) };
auto values_sent = std::array<Tp, N>{};
auto values_recv = std::array<Tp, N>{};
for(size_t i = 0; i < N; ++i)
values_sent[i] = get_dist<Tp, N>(_mt);
if(_rank == 0)
printf("[%s][%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
_rank, values_sent.size(), get_values_str(values_sent).c_str());
MPI_Alltoall(&values_sent[_rank], 1, _dtype, &values_recv[_rank], 1, _dtype, _comm);
if(_rank == 0)
printf("[%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), _rank,
values_sent.size(), _get_values_str(values_recv).c_str());
printf("[%s][%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
_rank, values_sent.size(), get_values_str(values_recv).c_str());
}
/// Point-to-point exchange used to exercise MPI_Send / MPI_Recv.
///
/// Every rank fills an N-element array with pseudo-random values (seeded from the
/// rank), sends a single element to each other rank, and then receives a single
/// element from each other rank. Rank 0 prints what it sent and what it received.
///
/// \tparam Tp    element type (mapped to an MPI datatype via get_dtype<Tp>())
/// \tparam N     number of elements in the local buffers; also used as message tag
/// \param _rank  rank of the calling process within \p _comm
/// \param _comm  communicator to use; the function is a no-op for MPI_COMM_NULL
template <typename Tp, size_t N>
void
send_recv(int _rank, MPI_Comm _comm)
{
    if(_comm == MPI_COMM_NULL) return;
    static_assert(N > 0, "Error! N must be greater than zero!");

    int _size = 0;
    MPI_Comm_size(_comm, &_size);

    auto _dtype      = get_dtype<Tp>();
    auto _mt         = std::mt19937_64{ size_t(_rank + 100) };
    auto values_sent = std::array<Tp, N>{};
    auto values_recv = std::array<Tp, N>{};

    for(size_t i = 0; i < N; ++i)
        values_sent[i] = get_dist<Tp, N>(_mt);

    if(_rank == 0)
        printf("[%s][%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
               _rank, values_sent.size(), get_values_str(values_sent).c_str());

    // The buffers hold only N elements while the communicator may hold more than N
    // ranks, so wrap the rank-derived indices instead of reading/writing past the end.
    const auto _send_idx = static_cast<size_t>(_rank) % N;
    for(int i = 0; i < _size; ++i)
    {
        if(i != _rank) MPI_Send(&values_sent[_send_idx], 1, _dtype, i, N, _comm);
    }

    for(int i = 0; i < _size; ++i)
    {
        if(i != _rank)
        {
            MPI_Status _status;
            MPI_Recv(&values_recv[static_cast<size_t>(i) % N], 1, _dtype, i, N, _comm,
                     &_status);
        }
    }

    if(_rank == 0)
        printf("[%s][%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
               _rank, values_sent.size(), get_values_str(values_recv).c_str());
}
void
@@ -109,9 +164,13 @@ run(MPI_Comm _comm, int nitr)
MPI_Barrier(_comm);
for(int i = 0; i < nitr; ++i)
{
send_recv<int, 3>(_rank, _comm);
send_recv<long, 4>(_rank, _comm);
send_recv<float, 5>(_rank, _comm);
send_recv<double, 6>(_rank, _comm);
MPI_Barrier(_comm);
all2all<int, 3>(_rank, _comm);
all2all<long, 4>(_rank, _comm);
MPI_Barrier(_comm);
all2all<float, 5>(_rank, _comm);
all2all<double, 6>(_rank, _comm);
}
@@ -259,6 +318,7 @@ run_main(int argc, char** argv)
int
main(int argc, char** argv)
{
std::this_thread::sleep_for(std::chrono::seconds{ 2 });
int _mpi_thread_provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &_mpi_thread_provided);
+2
Vedi File
@@ -76,6 +76,7 @@ set(library_sources
${CMAKE_CURRENT_LIST_DIR}/library/timemory.cpp
${CMAKE_CURRENT_LIST_DIR}/library/tracing.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/comm_data.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/exit_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.cpp
${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.cpp
@@ -115,6 +116,7 @@ set(library_headers
${CMAKE_CURRENT_LIST_DIR}/library/components/fwd.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/category_region.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/comm_data.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/exit_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.hpp
${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp
@@ -0,0 +1,423 @@
// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "library/components/comm_data.hpp"
#include "library/components/fwd.hpp"
#include "library/config.hpp"
#include "library/perfetto.hpp"
#include "library/tracing.hpp"
#include <timemory/backends/mpi.hpp>
#include <timemory/manager.hpp>
#include <timemory/units.hpp>
#include <timemory/utility/locking.hpp>
namespace tim
{
namespace component
{
namespace
{
// Adds _val to a running total kept per Tp instantiation and emits the new total
// on the perfetto counter track belonging to Tp. Nothing happens unless perfetto
// output is enabled and omnitrace is in the Active state.
//
//  Tp   : tag type providing Tp::label (track label) and Tp::value (passed to
//         TRACE_COUNTER as its first argument)
//  Args : unused
//  _val : amount to add to the running total
template <typename Tp, typename... Args>
void
write_perfetto_counter_track(uint64_t _val)
{
    using counter_track = omnitrace::perfetto_counter_track<Tp>;

    if(!omnitrace::get_use_perfetto()) return;
    if(omnitrace::get_state() != omnitrace::State::Active) return;

    // only track index zero is ever used here
    const size_t _track_idx = 0;

    // create the counter track the first time this instantiation is reached
    static std::once_flag _track_created{};
    std::call_once(
        _track_created,
        [](const size_t _n) {
            if(counter_track::exists(_n)) return;
            std::string _track_label = Tp::label;
            if(_n > 0) _track_label = JOIN(" ", Tp::label, JOIN("", '[', _n, ']'));
            counter_track::emplace(_n, _track_label, "bytes");
        },
        _track_idx);

    static std::mutex _total_mutex{};
    static uint64_t   _total     = 0;
    uint64_t          _timestamp = 0;
    {
        // the timestamp is taken and the total is updated under the same lock
        std::unique_lock<std::mutex> _guard{ _total_mutex };
        _timestamp = omnitrace::tracing::now<uint64_t>();
        _total += _val;
        _val = _total;
    }

    TRACE_COUNTER(Tp::value, counter_track::at(_track_idx, 0), _timestamp, _val);
}
} // namespace
// Pre-initialization hook: applies the tracker configuration.
void
comm_data::preinit()
{
    // configure() returns immediately on every call after its first
    comm_data::configure();
}
// Finalization hook: applies the tracker configuration if that has not happened yet.
void
comm_data::global_finalize()
{
    // configure() returns immediately on every call after its first
    comm_data::configure();
}
// One-time setup of the data tracker used for communication sizes: label,
// description, units (megabytes) and output formatting (scientific notation with
// a precision of 3). Every call after the first returns immediately.
void
comm_data::configure()
{
    static bool _once = false;
    if(_once) return;
    _once = true;

    comm_data_tracker_t::label()        = "comm_data";
    comm_data_tracker_t::description()  = "Tracks MPI/RCCL communication data sizes";
    comm_data_tracker_t::display_unit() = "MB";
    comm_data_tracker_t::unit()         = units::megabyte;

    auto _fmt_flags = comm_data_tracker_t::get_format_flags();
    // Clear only the float-field bits before selecting scientific notation. The
    // previous mask, (fixed & scientific), evaluates to zero and therefore wiped
    // every formatting flag instead of just these two.
    _fmt_flags &= ~(std::ios_base::fixed | std::ios_base::scientific);
    _fmt_flags |= (std::ios_base::scientific);
    comm_data_tracker_t::set_precision(3);
    comm_data_tracker_t::set_format_flags(_fmt_flags);
}
#if defined(OMNITRACE_USE_MPI)
// MPI_Send
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int count,
MPI_Datatype datatype, int dst, int tag, MPI_Comm)
{
int _size = mpi_type_size(datatype);
if(_size == 0) return;
write_perfetto_counter_track<mpi_send>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _a{ _name };
add(_a, count * _size);
tracker_t _b{ JOIN('_', _name, "dst", dst) };
add(_b, count * _size);
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
}
// MPI_Recv
void
comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count,
MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Status*)
{
int _size = mpi_type_size(datatype);
if(_size == 0) return;
write_perfetto_counter_track<mpi_recv>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _a{ _name };
add(_a, count * _size);
tracker_t _b{ JOIN('_', _name, "dst", dst) };
add(_b, count * _size);
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
}
// MPI_Isend
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int count,
MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*)
{
int _size = mpi_type_size(datatype);
if(_size == 0) return;
write_perfetto_counter_track<mpi_send>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _a{ _name };
add(_a, count * _size);
tracker_t _b{ JOIN('_', _name, "dst", dst) };
add(_b, count * _size);
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
}
// MPI_Irecv
void
comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count,
MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*)
{
int _size = mpi_type_size(datatype);
if(_size == 0) return;
write_perfetto_counter_track<mpi_recv>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _a{ _name };
add(_a, count * _size);
tracker_t _b{ JOIN('_', _name, "dst", dst) };
add(_b, count * _size);
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
}
// MPI_Bcast
void
comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count,
MPI_Datatype datatype, int root, MPI_Comm)
{
int _size = mpi_type_size(datatype);
if(_size == 0) return;
write_perfetto_counter_track<mpi_send>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _t{ _name };
add(_t, count * _size);
add(JOIN('_', _name, "root", root), count * _size);
}
// MPI_Allreduce
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, int count,
MPI_Datatype datatype, MPI_Op, MPI_Comm)
{
int _size = mpi_type_size(datatype);
if(_size == 0) return;
write_perfetto_counter_track<mpi_recv>(count * _size);
write_perfetto_counter_track<mpi_send>(count * _size);
if(!omnitrace::get_use_timemory()) return;
add(_data, count * _size);
}
// MPI_Sendrecv
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount,
MPI_Datatype sendtype, int dst, int sendtag, void*, int recvcount,
MPI_Datatype recvtype, int src, int recvtag, MPI_Comm, MPI_Status*)
{
int _send_size = mpi_type_size(sendtype);
int _recv_size = mpi_type_size(recvtype);
if(_send_size == 0 || _recv_size == 0) return;
write_perfetto_counter_track<mpi_send>(sendcount * _send_size);
write_perfetto_counter_track<mpi_recv>(recvcount * _recv_size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _t{ _name };
add(_t, sendcount * _send_size + recvcount * _recv_size);
{
tracker_t _b{ JOIN('_', _name, "send") };
add(_b, sendcount * _send_size);
tracker_t _c{ JOIN('_', _name, "send", dst) };
add(_b, sendcount * _send_size);
add(JOIN('_', _name, "send", "tag", sendtag), sendcount * _send_size);
add(JOIN('_', _name, "send", dst, "tag", sendtag), sendcount * _send_size);
}
{
tracker_t _b{ JOIN('_', _name, "recv") };
add(_b, recvcount * _recv_size);
tracker_t _c{ JOIN('_', _name, "recv", src) };
add(_b, recvcount * _recv_size);
add(JOIN('_', _name, "recv", "tag", recvtag), recvcount * _recv_size);
add(JOIN('_', _name, "recv", src, "tag", recvtag), recvcount * _recv_size);
}
}
// MPI_Gather
// MPI_Scatter
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount,
MPI_Datatype sendtype, void*, int recvcount, MPI_Datatype recvtype,
int root, MPI_Comm)
{
int _send_size = mpi_type_size(sendtype);
int _recv_size = mpi_type_size(recvtype);
if(_send_size == 0 || _recv_size == 0) return;
write_perfetto_counter_track<mpi_send>(sendcount * _send_size);
write_perfetto_counter_track<mpi_recv>(recvcount * _recv_size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _t{ _name };
add(_t, sendcount * _send_size + recvcount * _recv_size);
tracker_t _r(JOIN('_', _name, "root", root));
add(_r, sendcount * _send_size + recvcount * _recv_size);
add(JOIN('_', _name, "root", root, "send"), sendcount * _send_size);
add(JOIN('_', _name, "root", root, "recv"), recvcount * _recv_size);
}
// MPI_Alltoall
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount,
MPI_Datatype sendtype, void*, int recvcount, MPI_Datatype recvtype,
MPI_Comm)
{
int _send_size = mpi_type_size(sendtype);
int _recv_size = mpi_type_size(recvtype);
if(_send_size == 0 || _recv_size == 0) return;
write_perfetto_counter_track<mpi_send>(sendcount * _send_size);
write_perfetto_counter_track<mpi_recv>(recvcount * _recv_size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _t{ _name };
add(_t, sendcount * _send_size + recvcount * _recv_size);
add(JOIN('_', _name, "send"), sendcount * _send_size);
add(JOIN('_', _name, "recv"), recvcount * _recv_size);
}
#endif
#if defined(OMNITRACE_USE_RCCL)
// ncclReduce
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, ncclComm_t,
hipStream_t)
{
int _size = rccl_type_size(datatype);
if(_size <= 0) return;
write_perfetto_counter_track<rccl_recv>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _t{ _name };
add(_t, count * _size);
add(JOIN('_', _name, "root", root), count * _size);
}
// ncclSend
// ncclGather
// ncclBcast
// ncclRecv
// Records count * sizeof(datatype) bytes on the RCCL send counter track (ncclSend,
// ncclBcast) or on the RCCL recv counter track (ncclGather, ncclRecv); any other
// function name triggers OMNITRACE_CI_THROW. When timemory is enabled the size is
// also stored under the function name and under function + peer/root rank.
// Unknown datatypes are ignored.
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, size_t count,
                 ncclDataType_t datatype, int peer, ncclComm_t, hipStream_t)
{
    int _size = rccl_type_size(datatype);
    if(_size <= 0) return;

    static auto _send_types = std::unordered_set<std::string>{ "ncclSend", "ncclBcast" };
    static auto _recv_types = std::unordered_set<std::string>{ "ncclGather", "ncclRecv" };

    // Each call is counted exactly once, on the track matching its direction. (A
    // stray unconditional rccl_recv update used to follow this dispatch, which
    // double-counted receives and booked sends as receives.)
    if(_send_types.count(_data.tool_id) > 0)
    {
        write_perfetto_counter_track<rccl_send>(count * _size);
    }
    else if(_recv_types.count(_data.tool_id) > 0)
    {
        write_perfetto_counter_track<rccl_recv>(count * _size);
    }
    else
    {
        OMNITRACE_CI_THROW(true, "RCCL function not handled: %s", _data.tool_id.c_str());
    }

    if(!omnitrace::get_use_timemory()) return;

    auto _name = std::string_view{ _data.tool_id };
    // NOTE(review): only names containing "Send" get the "peer" label; ncclRecv is
    // therefore labelled "root" -- confirm whether that is intended.
    std::string _label = "root";
    if(_name.find("Send") != std::string::npos) _label = "peer";
    tracker_t _t{ _name };
    add(_t, count * _size);
    add(JOIN('_', _name, _label, peer), count * _size);
}
// ncclBroadcast
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, int root, ncclComm_t, hipStream_t)
{
int _size = rccl_type_size(datatype);
if(_size <= 0) return;
write_perfetto_counter_track<rccl_send>(count * _size);
if(!omnitrace::get_use_timemory()) return;
auto _name = std::string_view{ _data.tool_id };
tracker_t _t{ _name };
add(_t, count * _size);
add(JOIN('_', _data.tool_id, "root", root), count * _size);
}
// ncclAllReduce
// ncclReduceScatter
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
hipStream_t)
{
int _size = rccl_type_size(datatype);
if(_size <= 0) return;
static auto _recv_types = std::unordered_set<std::string>{ "ncclAllReduce" };
static auto _send_types = std::unordered_set<std::string>{ "ncclReduceScatter" };
if(_send_types.count(_data.tool_id) > 0)
{
write_perfetto_counter_track<rccl_send>(count * _size);
}
else if(_recv_types.count(_data.tool_id) > 0)
{
write_perfetto_counter_track<rccl_recv>(count * _size);
}
else
{
OMNITRACE_CI_THROW(true, "RCCL function not handled: %s", _data.tool_id.c_str());
}
if(!omnitrace::get_use_timemory()) return;
add(_data, count * _size);
}
// ncclAllGather
void
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
{
int _size = rccl_type_size(datatype);
if(_size <= 0) return;
write_perfetto_counter_track<rccl_recv>(count * _size);
if(!omnitrace::get_use_timemory()) return;
add(_data, count * _size);
}
#endif
} // namespace component
} // namespace tim
// NOTE(review): these two entries mirror the TIMEMORY_DECLARE_EXTERN_COMPONENT
// entries in comm_data.hpp (same component types and trailing arguments);
// presumably they provide the matching instantiations in this translation unit --
// confirm against timemory's macro definitions.
TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(
TIMEMORY_ESC(data_tracker<float, tim::api::omnitrace>), true, float)
TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(comm_data, false, void)
@@ -0,0 +1,247 @@
// MIT License
//
// Copyright (c) 2020, The Regents of the University of California,
// through Lawrence Berkeley National Laboratory (subject to receipt of any
// required approvals from the U.S. Dept. of Energy). All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "common/join.hpp"
#include "library/common.hpp"
#include "library/components/category_region.hpp"
#include "library/components/fwd.hpp"
#include "library/defines.hpp"
#include "library/timemory.hpp"
#include <timemory/api/macros.hpp>
#include <timemory/components/macros.hpp>
#include <timemory/operations/types/set.hpp>
#include <timemory/utility/types.hpp>
#include <optional>
#if defined(OMNITRACE_USE_RCCL)
# if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200
# include <rccl/rccl.h>
# else
# include <rccl.h>
# endif
#endif
#if defined(OMNITRACE_USE_MPI)
# include <mpi.h>
#endif
#include <atomic>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <utility>
namespace tim
{
namespace component
{
using comm_data_tracker_t = data_tracker<float, api::omnitrace>;
/// Component that records the amount of data moved by MPI and RCCL communication
/// calls. It stores no value itself (value_type is void): each audit() overload
/// receives the arguments of an intercepted call, computes the payload size in
/// bytes and forwards it to perfetto counter tracks and/or comm_data_tracker_t
/// instances (see comm_data.cpp).
struct comm_data : base<comm_data, void>
{
    using value_type = void;
    using this_type  = comm_data;
    using base_type  = base<this_type, value_type>;
    using tracker_t  = tim::auto_tuple<comm_data_tracker_t>;
    using data_type  = float;

    // Tag types selecting a perfetto counter track: `value` is handed to
    // TRACE_COUNTER as its first argument and `label` names the track.
    struct mpi_recv
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "MPI Comm Recv";
    };

    struct mpi_send
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "MPI Comm Send";
    };

    struct rccl_recv
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "RCCL Comm Recv";
    };

    struct rccl_send
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "RCCL Comm Send";
    };

    TIMEMORY_DEFAULT_OBJECT(comm_data)

    static void preinit();
    static void configure();
    static void global_finalize();

    // start/stop are intentionally empty: all work happens in audit()
    static void start() {}
    static void stop() {}

#if defined(OMNITRACE_USE_MPI)
    /// Size in bytes of one element of \p _datatype as reported by PMPI_Type_size.
    /// Returns zero if PMPI_Type_size does not write a size.
    static int mpi_type_size(MPI_Datatype _datatype)
    {
        int _size = 0;
        PMPI_Type_size(_datatype, &_size);
        return _size;
    }

    // MPI_Send
    static void audit(const gotcha_data& _data, audit::incoming, const void*, int count,
                      MPI_Datatype datatype, int dst, int tag, MPI_Comm);

    // MPI_Recv
    static void audit(const gotcha_data& _data, audit::incoming, void*, int count,
                      MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Status*);

    // MPI_Isend
    static void audit(const gotcha_data& _data, audit::incoming, const void*, int count,
                      MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*);

    // MPI_Irecv
    static void audit(const gotcha_data& _data, audit::incoming, void*, int count,
                      MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*);

    // MPI_Bcast
    static void audit(const gotcha_data& _data, audit::incoming, void*, int count,
                      MPI_Datatype datatype, int root, MPI_Comm);

    // MPI_Allreduce
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      int count, MPI_Datatype datatype, MPI_Op, MPI_Comm);

    // MPI_Sendrecv
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      int sendcount, MPI_Datatype sendtype, int, int sendtag, void*,
                      int recvcount, MPI_Datatype recvtype, int, int recvtag, MPI_Comm,
                      MPI_Status*);

    // MPI_Gather
    // MPI_Scatter
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      int sendcount, MPI_Datatype sendtype, void*, int recvcount,
                      MPI_Datatype recvtype, int root, MPI_Comm);

    // MPI_Alltoall
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      int sendcount, MPI_Datatype sendtype, void*, int recvcount,
                      MPI_Datatype recvtype, MPI_Comm);
#endif

#if defined(OMNITRACE_USE_RCCL)
    /// Size in bytes of one element of \p datatype; zero for any type not listed.
    static auto rccl_type_size(ncclDataType_t datatype)
    {
        switch(datatype)
        {
            case ncclInt8:
            case ncclUint8: return 1;
            case ncclFloat16: return 2;
            case ncclInt32:
            case ncclUint32:
            case ncclFloat32: return 4;
            case ncclInt64:
            case ncclUint64:
            case ncclFloat64: return 8;
            default: return 0;
        }
    }

    // ncclReduce
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
                      ncclComm_t, hipStream_t);

    // ncclSend
    // ncclGather
    // ncclBcast
    // ncclRecv
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
                      hipStream_t);

    // ncclBroadcast
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, int root, ncclComm_t,
                      hipStream_t);

    // ncclAllReduce
    // ncclReduceScatter
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
                      hipStream_t);

    // ncclAllGather
    // ncclAlltoAll
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);
#endif

private:
    /// Adds \p value to the caller-owned tracker \p _t, or marks the tracker as
    /// invalid when omnitrace is not in the Active state. Returns \p _t.
    static auto& add(tracker_t& _t, data_type value)
    {
        if(omnitrace::get_state() != omnitrace::State::Active)
        {
            _t.invoke<operation::set_is_invalid>(true);
            return _t;
        }
        _t.store(std::plus<data_type>{}, value);
        return _t;
    }

    // The overloads below create a short-lived tracker, store the value in it and
    // let it go out of scope. They return nothing: the tracker is a local object,
    // so handing out a reference to it (as these overloads previously did via
    // `auto&&`) would leave the caller with a dangling reference.

    /// Stores \p value in a temporary tracker named after \c _data.tool_id.
    static void add(const gotcha_data& _data, data_type value)
    {
        tracker_t _t{ std::string_view{ _data.tool_id.c_str() } };
        add(_t, value);
    }

    /// Stores \p value in a temporary tracker named \p _name.
    static void add(std::string&& _name, data_type value)
    {
        tracker_t _t{ _name };
        add(_t, value);
    }

    /// Stores \p value in a temporary tracker named \p _name.
    static void add(std::string_view _name, data_type value)
    {
        tracker_t _t{ _name };
        add(_t, value);
    }
};
} // namespace component
} // namespace tim
// The extern-component declarations below are emitted when
// OMNITRACE_EXTERN_COMPONENTS is either undefined or defined to a value > 0.
// NOTE(review): comm_data.cpp contains TIMEMORY_INSTANTIATE_EXTERN_COMPONENT
// entries with the same arguments; presumably the two sets must be kept in sync --
// confirm against timemory's macro definitions.
#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \
(defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0)
# include <timemory/components/base.hpp>
# include <timemory/components/data_tracker/components.hpp>
# include <timemory/operations.hpp>
TIMEMORY_DECLARE_EXTERN_COMPONENT(TIMEMORY_ESC(data_tracker<float, tim::api::omnitrace>),
true, float)
TIMEMORY_DECLARE_EXTERN_COMPONENT(comm_data, false, void)
#endif
@@ -42,10 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling)
TIMEMORY_DECLARE_COMPONENT(roctracer)
TIMEMORY_DECLARE_COMPONENT(rocprofiler)
TIMEMORY_DECLARE_COMPONENT(rccl_comm_data)
TIMEMORY_DECLARE_COMPONENT(rcclp_handle)
TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl)
TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker<float, rccl_api_t>)
TIMEMORY_COMPONENT_ALIAS(comm_data_tracker_t, data_tracker<float, api::omnitrace>)
TIMEMORY_DECLARE_COMPONENT(comm_data)
/// \struct tim::trait::name
/// \brief provides a constexpr string in ::value
@@ -160,11 +160,14 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type)
#if !defined(OMNITRACE_USE_RCCL)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type)
#endif
#if !defined(OMNITRACE_USE_RCCL) && !defined(OMNITRACE_USE_MPI)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data_tracker_t, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data, false_type)
#endif
#if !defined(TIMEMORY_USE_LIBUNWIND)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type)
@@ -289,6 +292,7 @@ TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_busy, double)
TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_temp, double)
TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_power, double)
TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_memory, double)
TIMEMORY_STATISTICS_TYPE(component::comm_data_tracker_t, float)
// enable timing units
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category,
@@ -23,12 +23,15 @@
#include "library/components/mpi_gotcha.hpp"
#include "library/api.hpp"
#include "library/components/category_region.hpp"
#include "library/components/comm_data.hpp"
#include "library/components/fwd.hpp"
#include "library/config.hpp"
#include "library/debug.hpp"
#include "library/mproc.hpp"
#include <timemory/backends/mpi.hpp>
#include <timemory/backends/process.hpp>
#include <timemory/mpl/types.hpp>
#include <timemory/utility/locking.hpp>
#include <cstdint>
@@ -40,6 +43,10 @@ namespace omnitrace
{
namespace
{
// Component bundle passed to comp::configure_mpip / activate_mpip / deactivate_mpip in
// this file: the MPI category region plus the comm_data component.
using mpip_bundle_t =
tim::component_tuple<omnitrace::component::category_region<category::mpi>,
comp::comm_data>;
struct comm_rank_data
{
int rank = -1;
@@ -104,10 +111,7 @@ omnitrace_mpi_set_attr()
static auto _mpi_fini = [](MPI_Comm, int, void*, void*) {
OMNITRACE_DEBUG("MPI Comm attribute finalize\n");
if(mpip_index != std::numeric_limits<uint64_t>::max())
comp::deactivate_mpip<
tim::component_tuple<
omnitrace::component::category_region<category::mpi>>,
api::omnitrace>(mpip_index);
comp::deactivate_mpip<mpip_bundle_t, api::omnitrace>(mpip_index);
omnitrace_finalize_hidden();
return MPI_SUCCESS;
};
@@ -224,9 +228,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming)
OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str());
if(mpip_index != std::numeric_limits<uint64_t>::max())
comp::deactivate_mpip<
tim::component_tuple<omnitrace::component::category_region<category::mpi>>,
api::omnitrace>(mpip_index);
comp::deactivate_mpip<mpip_bundle_t, api::omnitrace>(mpip_index);
#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS)
tim::mpi::is_initialized_callback() = []() { return false; };
@@ -276,16 +278,15 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
{
OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n");
if(!get_use_timemory())
{
trait::runtime_enabled<comp::comm_data>::set(false);
trait::runtime_enabled<comp::comm_data_tracker_t>::set(false);
}
// use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST
// to control the gotcha bindings at runtime
comp::configure_mpip<
tim::component_tuple<
omnitrace::component::category_region<category::mpi>>,
api::omnitrace>();
mpip_index = comp::activate_mpip<
tim::component_tuple<
omnitrace::component::category_region<category::mpi>>,
api::omnitrace>();
comp::configure_mpip<mpip_bundle_t, api::omnitrace>();
mpip_index = comp::activate_mpip<mpip_bundle_t, api::omnitrace>();
}
auto_lock_t _lk{ type_mutex<mpi_gotcha>() };
@@ -196,74 +196,5 @@ rcclp_handle::get_tool_count()
{
return get_persistent_data().m_count;
}
/// Pre-initialization hook of the component: forwards to omnitrace::rcclp::configure().
void
rccl_comm_data::preinit()
{
omnitrace::rcclp::configure();
}
// ncclReduce
//
// Records count * rccl_type_size(datatype) under the tool id and under a secondary
// label that JOIN builds from the tool id, the literal "root" and the root argument.
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
                      ncclComm_t, hipStream_t)
{
    const auto _payload = count * rccl_type_size(datatype);
    add(_data, _payload, JOIN('_', _data.tool_id.c_str(), "root", root));
}
// ncclSend
//
// Records count * rccl_type_size(datatype) under the tool id and under a secondary
// label that JOIN builds from the tool id, the literal "root" and the peer argument.
// NOTE(review): the label says "root" although the argument is a peer -- confirm
// whether this naming is intentional before changing it (it is part of the output).
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*,
                      size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
                      hipStream_t)
{
    const auto _payload = count * rccl_type_size(datatype);
    add(_data, _payload, JOIN('_', _data.tool_id.c_str(), "root", peer));
}
// ncclBcast
// ncclRecv
//
// Records count * rccl_type_size(datatype) under the tool id and under a secondary
// label that JOIN builds from the tool id, the literal "root" and the root argument.
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
                      ncclDataType_t datatype, int root, ncclComm_t, hipStream_t)
{
    const auto _payload = count * rccl_type_size(datatype);
    add(_data, _payload, JOIN('_', _data.tool_id.c_str(), "root", root));
}
// ncclBroadcast
//
// Records count * rccl_type_size(datatype) under the tool id and under a secondary
// label that JOIN builds from the tool id, the literal "root" and the root argument.
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, int root, ncclComm_t,
                      hipStream_t)
{
    const auto _payload = count * rccl_type_size(datatype);
    add(_data, _payload, JOIN('_', _data.tool_id.c_str(), "root", root));
}
// ncclAllReduce
// ncclReduceScatter
//
// Records count * rccl_type_size(datatype) under the tool id only (no secondary label).
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
                      hipStream_t)
{
    const auto _payload = count * rccl_type_size(datatype);
    add(_data, _payload);
}
// ncclAllGather
//
// Records count * rccl_type_size(datatype) under the tool id only (no secondary label).
void
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
{
    const auto _payload = count * rccl_type_size(datatype);
    add(_data, _payload);
}
} // namespace component
} // namespace tim
TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t)
@@ -24,6 +24,7 @@
#include "library/common.hpp"
#include "library/components/category_region.hpp"
#include "library/components/comm_data.hpp"
#include "library/components/fwd.hpp"
#include "library/defines.hpp"
#include "library/timemory.hpp"
@@ -51,7 +52,7 @@
TIMEMORY_COMPONENT_ALIAS(
rccl_toolset_t,
component_bundle<rccl_api_t, omnitrace::component::category_region<category::rccl>,
rccl_comm_data*>)
comm_data>)
TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t,
gotcha<OMNITRACE_NUM_RCCLP_WRAPPERS, rccl_toolset_t, rccl_api_t>)
@@ -59,12 +60,6 @@ TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t,
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type)
#endif
TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float)
TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t,
true_type)
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t,
true_type)
namespace tim
{
namespace component
@@ -110,111 +105,5 @@ private:
static toolset_ptr_t& get_tool_instance();
static std::atomic<int64_t>& get_tool_count();
};
/// \struct tim::component::rccl_comm_data
/// \brief Component with a void value type whose static audit overloads take the
/// arguments of the RCCL calls named in the comments below and store
/// `count * rccl_type_size(datatype)` into `rccl_data_tracker_t` entries.
struct rccl_comm_data : base<rccl_comm_data, void>
{
    using value_type = void;
    using this_type  = rccl_comm_data;
    using base_type  = base<this_type, value_type>;
    using tracker_t  = tim::auto_tuple<rccl_data_tracker_t>;
    using data_type  = float;

    TIMEMORY_DEFAULT_OBJECT(rccl_comm_data)

    /// Defined out-of-line.
    static void preinit();
    /// start/stop are intentionally empty: all recording happens in the audit overloads.
    static void start() {}
    static void stop() {}

    /// \brief Size associated with one element of \p datatype.
    /// \return 1, 2, 4 or 8 for the enumerators listed below; 0 for any other value.
    static auto rccl_type_size(ncclDataType_t datatype)
    {
        switch(datatype)
        {
            case ncclInt8:
            case ncclUint8: return 1;
            case ncclFloat16: return 2;
            case ncclInt32:
            case ncclUint32:
            case ncclFloat32: return 4;
            case ncclInt64:
            case ncclUint64:
            case ncclFloat64: return 8;
            default: return 0;
        }
    }

    // ncclReduce
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
                      ncclComm_t, hipStream_t);

    // ncclSend
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
                      hipStream_t);

    // ncclBcast
    // ncclRecv
    static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
                      ncclDataType_t datatype, int root, ncclComm_t, hipStream_t);

    // ncclBroadcast
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, int root, ncclComm_t,
                      hipStream_t);

    // ncclAllReduce
    // ncclReduceScatter
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
                      hipStream_t);

    // ncclAllGather
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);

private:
    /// Adds \p value to the tracker \p _t (combined with std::plus) and then invokes
    /// add_secondary once for every extra argument in \p args.
    template <typename... Args>
    static void add(tracker_t& _t, data_type value, Args&&... args)
    {
        _t.store(std::plus<data_type>{}, value);
        TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward<Args>(args), value));
    }

    /// Creates a tracker named after the gotcha tool id and records \p value in it.
    template <typename... Args>
    static void add(const gotcha_data& _data, data_type value, Args&&... args)
    {
        tracker_t _t{ std::string_view{ _data.tool_id.c_str() } };
        add(_t, value, std::forward<Args>(args)...);
    }

    /// Secondary entry named after the gotcha tool id.
    template <typename... Args>
    static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value,
                              Args&&... args)
    {
        // if(tim::settings::add_secondary())
        {
            tracker_t _s{ std::string_view{ _data.tool_id.c_str() } };
            // Forward only the value (as the string_view overload does). Passing
            // _data here would place a gotcha_data in the data_type parameter of
            // add(tracker_t&, data_type, ...) and fail to compile once instantiated.
            add(_s, value, std::forward<Args>(args)...);
        }
    }

    /// Creates a tracker named \p _name and records \p value in it.
    template <typename... Args>
    static void add(std::string_view _name, data_type value, Args&&... args)
    {
        tracker_t _t{ _name };
        add(_t, value, std::forward<Args>(args)...);
    }

    /// Secondary entry named \p _name.
    template <typename... Args>
    static void add_secondary(tracker_t&, std::string_view _name, data_type value,
                              Args&&... args)
    {
        // if(tim::settings::add_secondary())
        {
            tracker_t _s{ _name };
            add(_s, value, std::forward<Args>(args)...);
        }
    }
};
} // namespace component
} // namespace tim
@@ -85,6 +85,9 @@
perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \
perfetto::Category("rccl").SetDescription( \
"ROCm Communication Collectives Library (RCCL) regions"), \
perfetto::Category("comm_data") \
.SetDescription( \
"MPI/RCCL counters for tracking amount of data sent or received"), \
perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \
perfetto::Category("host-critical-trace") \
.SetDescription("Host-side critical traces"), \
+12 -9
Vedi File
@@ -51,10 +51,7 @@ namespace rcclp
{
// Assigns the label and description strings of the RCCL data tracker component.
void
configure()
{
comp::rccl_data_tracker_t::label() = "rccl_comm_data";
comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data";
}
{}
void
setup()
@@ -67,11 +64,17 @@ setup()
if(!librccl_handle) fprintf(stderr, "%s\n", dlerror());
dlerror(); // Clear any existing error
auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true);
if(_data)
comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) {
cb.initialize<comp::rccl_comm_data>();
};
auto _use_data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", get_use_timemory());
if(!get_use_timemory())
{
trait::runtime_enabled<comp::comm_data>::set(false);
trait::runtime_enabled<comp::comm_data_tracker_t>::set(false);
}
else
{
trait::runtime_enabled<comp::comm_data>::set(_use_data);
trait::runtime_enabled<comp::comm_data_tracker_t>::set(_use_data);
}
comp::configure_rcclp();
global_id = comp::activate_rcclp();
@@ -31,6 +31,8 @@
#include "library/timemory.hpp"
#include "library/utility.hpp"
#include <timemory/components/timing/backends.hpp>
namespace omnitrace
{
namespace tracing
@@ -49,6 +51,13 @@ get_timemory_hash_ids(int64_t _tid = threading::get_id());
tim::hash_alias_ptr_t&
get_timemory_hash_aliases(int64_t _tid = threading::get_id());
/// \brief Returns the value of ::tim::get_clock_real_now for the requested
/// representation type, using std::nano as the ratio argument.
/// \tparam Tp type forwarded as the first template argument (defaults to uint64_t)
template <typename Tp = uint64_t>
OMNITRACE_INLINE auto
now()
{
    using ratio_t = std::nano;
    return ::tim::get_clock_real_now<Tp, ratio_t>();
}
namespace
{
bool debug_push = // NOLINT