Generic comm_data component (#132)
* Generic comm_data component - moved rccl_comm_data to comm_data - comm_data includes communication data for MPI * fix timemory include with quotes * Only support MPI comm data with full MPI support * Increase timeouts + kill perfetto * Update timemory submodule * Fix missing command killall * set +e in Kill Perfetto workflow step * Updated MPI example to include MPI_Send and MPI_Recv calls * Update timemory submodule with storage merge fix * Perfetto comm data - tracing::now<T>() function * Fix timemory header include
This commit is contained in:
committato da
GitHub
parent
a1afd69a02
commit
0dd8f52292
@@ -29,7 +29,7 @@ jobs:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Install Packages
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
cmake --build build --target all --parallel 2 -- VERBOSE=1
|
||||
|
||||
- name: Install
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
cmake --build build --target install --parallel 2
|
||||
|
||||
@@ -124,3 +124,11 @@ jobs:
|
||||
build/omnitrace-tests-config/*.cfg
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
- name: Kill Perfetto
|
||||
if: success() || failure()
|
||||
continue-on-error: True
|
||||
run: |
|
||||
set +e
|
||||
RUNNING_PROCS=$(pgrep trace_processor_shell)
|
||||
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
|
||||
|
||||
@@ -19,7 +19,7 @@ env:
|
||||
|
||||
jobs:
|
||||
ubuntu-bionic:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: jrmadsen/omnitrace:ci-base-ubuntu-18.04
|
||||
strategy:
|
||||
@@ -29,7 +29,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Patch Git
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y software-properties-common
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
submodules: recursive
|
||||
|
||||
- name: Install Packages
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
apt-get update &&
|
||||
apt-get upgrade -y &&
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
for i in 6 7 8 9 10; do /opt/conda/envs/py3.${i}/bin/python -m pip install numpy perfetto dataclasses; done
|
||||
|
||||
- name: Install Kokkos
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
cd examples/lulesh/external/kokkos &&
|
||||
cmake -B build -DKokkos_ENABLE_OPENMP=ON -DKokkos_ENABLE_SERIAL=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_CXX_STANDARD=17 . &&
|
||||
@@ -153,3 +153,11 @@ jobs:
|
||||
build/omnitrace-tests-config/*.cfg
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
- name: Kill Perfetto
|
||||
if: success() || failure()
|
||||
continue-on-error: True
|
||||
run: |
|
||||
set +e
|
||||
RUNNING_PROCS=$(pgrep trace_processor_shell)
|
||||
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
|
||||
|
||||
@@ -59,7 +59,7 @@ jobs:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Install Packages
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
apt-get update &&
|
||||
apt-get install -y software-properties-common &&
|
||||
@@ -111,7 +111,7 @@ jobs:
|
||||
cmake --build build --target all --parallel 2 -- VERBOSE=1
|
||||
|
||||
- name: Install
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
cmake --build build --target install --parallel 2
|
||||
|
||||
@@ -170,6 +170,14 @@ jobs:
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
- name: Kill Perfetto
|
||||
if: success() || failure()
|
||||
continue-on-error: True
|
||||
run: |
|
||||
set +e
|
||||
RUNNING_PROCS=$(pgrep trace_processor_shell)
|
||||
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
|
||||
|
||||
ubuntu-focal-external-rocm:
|
||||
runs-on: ubuntu-20.04
|
||||
container:
|
||||
@@ -199,7 +207,7 @@ jobs:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Install Packages
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
apt-get update &&
|
||||
apt-get install -y software-properties-common wget gnupg2 &&
|
||||
@@ -213,7 +221,7 @@ jobs:
|
||||
|
||||
- name: Install RCCL
|
||||
if: ${{ matrix.rocm_version != '4.3' }}
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
apt-get install -y rccl-dev
|
||||
|
||||
@@ -371,6 +379,14 @@ jobs:
|
||||
build/omnitrace-tests-output/**/*.txt
|
||||
build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
- name: Kill Perfetto
|
||||
if: success() || failure()
|
||||
continue-on-error: True
|
||||
run: |
|
||||
set +e
|
||||
RUNNING_PROCS=$(pgrep trace_processor_shell)
|
||||
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
|
||||
|
||||
ubuntu-focal:
|
||||
runs-on: ubuntu-20.04
|
||||
strategy:
|
||||
@@ -404,7 +420,7 @@ jobs:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Install Packages
|
||||
timeout-minutes: 5
|
||||
timeout-minutes: 10
|
||||
run:
|
||||
sudo apt-get update &&
|
||||
sudo apt-get install -y build-essential m4 autoconf libtool python3-pip ${{ matrix.deps }} clang libomp-dev ${{ matrix.compiler }} ${{ matrix.mpi }} &&
|
||||
@@ -523,3 +539,11 @@ jobs:
|
||||
${{ github.workspace }}/build/omnitrace-tests-config/*.cfg
|
||||
${{ github.workspace }}/build/omnitrace-tests-output/**/*.txt
|
||||
${{ github.workspace }}/build/omnitrace-tests-output/**/*-instr*.json
|
||||
|
||||
- name: Kill Perfetto
|
||||
if: success() || failure()
|
||||
continue-on-error: True
|
||||
run: |
|
||||
set +e
|
||||
RUNNING_PROCS=$(pgrep trace_processor_shell)
|
||||
if [ -n "${RUNNING_PROCS}" ]; then kill -s 9 ${RUNNING_PROCS}; fi
|
||||
|
||||
+86
-26
@@ -44,15 +44,21 @@ namespace
|
||||
auto _name = std::string{};
|
||||
} // namespace
|
||||
|
||||
template <typename Tp, size_t N>
|
||||
void
|
||||
all2all(int _rank, MPI_Comm _comm)
|
||||
template <typename Tp>
|
||||
auto
|
||||
get_values_str(const Tp& _data)
|
||||
{
|
||||
if(_comm == MPI_COMM_NULL) return;
|
||||
static_assert(N > 0, "Error! N must be greater than zero!");
|
||||
std::stringstream _ss{};
|
||||
for(auto&& itr : _data)
|
||||
_ss << ", " << std::setw(6) << std::setprecision(2) << std::fixed << itr;
|
||||
return _ss.str().substr(1);
|
||||
};
|
||||
|
||||
auto _mt = std::mt19937_64{ size_t(_rank + 100) };
|
||||
auto _dist = []() {
|
||||
template <typename Tp, size_t N>
|
||||
auto
|
||||
get_dist(std::mt19937_64& _mt)
|
||||
{
|
||||
static auto _dist = []() {
|
||||
if constexpr(std::is_integral<Tp>::value)
|
||||
{
|
||||
return std::uniform_int_distribution<Tp>(1, N * N);
|
||||
@@ -62,23 +68,13 @@ all2all(int _rank, MPI_Comm _comm)
|
||||
return std::uniform_real_distribution<Tp>(1.0, N * N);
|
||||
}
|
||||
}();
|
||||
return _dist(_mt);
|
||||
}
|
||||
|
||||
auto _get_values_str = [](const auto& _data) {
|
||||
std::stringstream _ss{};
|
||||
for(auto&& itr : _data)
|
||||
_ss << ", " << std::setw(6) << std::setprecision(2) << std::fixed << itr;
|
||||
return _ss.str().substr(1);
|
||||
};
|
||||
|
||||
std::array<Tp, N> values_sent = {};
|
||||
std::array<Tp, N> values_recv = {};
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
values_sent[i] = _dist(_mt);
|
||||
|
||||
if(_rank == 0)
|
||||
printf("[%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), _rank,
|
||||
values_sent.size(), _get_values_str(values_sent).c_str());
|
||||
|
||||
template <typename Tp>
|
||||
auto
|
||||
get_dtype()
|
||||
{
|
||||
auto _dtype = MPI_INT; // NOLINT
|
||||
if(std::is_same<Tp, long>::value)
|
||||
_dtype = MPI_LONG;
|
||||
@@ -86,12 +82,71 @@ all2all(int _rank, MPI_Comm _comm)
|
||||
_dtype = MPI_FLOAT;
|
||||
else if(std::is_same<Tp, double>::value)
|
||||
_dtype = MPI_DOUBLE;
|
||||
return _dtype;
|
||||
}
|
||||
|
||||
template <typename Tp, size_t N>
|
||||
void
|
||||
all2all(int _rank, MPI_Comm _comm)
|
||||
{
|
||||
if(_comm == MPI_COMM_NULL) return;
|
||||
static_assert(N > 0, "Error! N must be greater than zero!");
|
||||
|
||||
auto _dtype = get_dtype<Tp>();
|
||||
auto _mt = std::mt19937_64{ size_t(_rank + 100) };
|
||||
auto values_sent = std::array<Tp, N>{};
|
||||
auto values_recv = std::array<Tp, N>{};
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
values_sent[i] = get_dist<Tp, N>(_mt);
|
||||
|
||||
if(_rank == 0)
|
||||
printf("[%s][%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
|
||||
_rank, values_sent.size(), get_values_str(values_sent).c_str());
|
||||
|
||||
MPI_Alltoall(&values_sent[_rank], 1, _dtype, &values_recv[_rank], 1, _dtype, _comm);
|
||||
|
||||
if(_rank == 0)
|
||||
printf("[%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), _rank,
|
||||
values_sent.size(), _get_values_str(values_recv).c_str());
|
||||
printf("[%s][%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
|
||||
_rank, values_sent.size(), get_values_str(values_recv).c_str());
|
||||
}
|
||||
|
||||
template <typename Tp, size_t N>
|
||||
void
|
||||
send_recv(int _rank, MPI_Comm _comm)
|
||||
{
|
||||
if(_comm == MPI_COMM_NULL) return;
|
||||
static_assert(N > 0, "Error! N must be greater than zero!");
|
||||
int _size = 0;
|
||||
MPI_Comm_size(_comm, &_size);
|
||||
|
||||
auto _dtype = get_dtype<Tp>();
|
||||
auto _mt = std::mt19937_64{ size_t(_rank + 100) };
|
||||
auto values_sent = std::array<Tp, N>{};
|
||||
auto values_recv = std::array<Tp, N>{};
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
values_sent[i] = get_dist<Tp, N>(_mt);
|
||||
|
||||
if(_rank == 0)
|
||||
printf("[%s][%s][%i] values sent (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
|
||||
_rank, values_sent.size(), get_values_str(values_sent).c_str());
|
||||
|
||||
for(int i = 0; i < _size; ++i)
|
||||
{
|
||||
if(i != _rank) MPI_Send(&values_sent[_rank], 1, _dtype, i, N, _comm);
|
||||
}
|
||||
|
||||
for(int i = 0; i < _size; ++i)
|
||||
{
|
||||
if(i != _rank)
|
||||
{
|
||||
MPI_Status _status;
|
||||
MPI_Recv(&values_recv[i], 1, _dtype, i, N, _comm, &_status);
|
||||
}
|
||||
}
|
||||
|
||||
if(_rank == 0)
|
||||
printf("[%s][%s][%i] values recv (# = %zu) :: %s.\n", _name.c_str(), __FUNCTION__,
|
||||
_rank, values_sent.size(), get_values_str(values_recv).c_str());
|
||||
}
|
||||
|
||||
void
|
||||
@@ -109,9 +164,13 @@ run(MPI_Comm _comm, int nitr)
|
||||
MPI_Barrier(_comm);
|
||||
for(int i = 0; i < nitr; ++i)
|
||||
{
|
||||
send_recv<int, 3>(_rank, _comm);
|
||||
send_recv<long, 4>(_rank, _comm);
|
||||
send_recv<float, 5>(_rank, _comm);
|
||||
send_recv<double, 6>(_rank, _comm);
|
||||
MPI_Barrier(_comm);
|
||||
all2all<int, 3>(_rank, _comm);
|
||||
all2all<long, 4>(_rank, _comm);
|
||||
MPI_Barrier(_comm);
|
||||
all2all<float, 5>(_rank, _comm);
|
||||
all2all<double, 6>(_rank, _comm);
|
||||
}
|
||||
@@ -259,6 +318,7 @@ run_main(int argc, char** argv)
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::seconds{ 2 });
|
||||
int _mpi_thread_provided;
|
||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &_mpi_thread_provided);
|
||||
|
||||
|
||||
esterno
+1
-1
Submodule external/timemory updated: 97c7415498...48f4735fb7
@@ -76,6 +76,7 @@ set(library_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/timemory.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/tracing.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/comm_data.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/exit_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/mpi_gotcha.cpp
|
||||
@@ -115,6 +116,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/fwd.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/backtrace.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/category_region.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/comm_data.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/exit_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/fork_gotcha.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/components/functors.hpp
|
||||
|
||||
@@ -0,0 +1,423 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/components/comm_data.hpp"
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/perfetto.hpp"
|
||||
#include "library/tracing.hpp"
|
||||
|
||||
#include <timemory/backends/mpi.hpp>
|
||||
#include <timemory/manager.hpp>
|
||||
#include <timemory/units.hpp>
|
||||
#include <timemory/utility/locking.hpp>
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
namespace
|
||||
{
|
||||
template <typename Tp, typename... Args>
|
||||
void
|
||||
write_perfetto_counter_track(uint64_t _val)
|
||||
{
|
||||
using counter_track = omnitrace::perfetto_counter_track<Tp>;
|
||||
|
||||
if(omnitrace::get_use_perfetto() &&
|
||||
omnitrace::get_state() == omnitrace::State::Active)
|
||||
{
|
||||
auto _emplace = [](const size_t _idx) {
|
||||
if(!counter_track::exists(_idx))
|
||||
{
|
||||
std::string _label = (_idx > 0)
|
||||
? JOIN(" ", Tp::label, JOIN("", '[', _idx, ']'))
|
||||
: Tp::label;
|
||||
counter_track::emplace(_idx, _label, "bytes");
|
||||
}
|
||||
};
|
||||
|
||||
const size_t _idx = 0;
|
||||
static std::once_flag _once{};
|
||||
std::call_once(_once, _emplace, _idx);
|
||||
|
||||
static std::mutex _mutex{};
|
||||
static uint64_t value = 0;
|
||||
uint64_t _now = 0;
|
||||
{
|
||||
std::unique_lock<std::mutex> _lk{ _mutex };
|
||||
_now = omnitrace::tracing::now<uint64_t>();
|
||||
_val = (value += _val);
|
||||
}
|
||||
|
||||
TRACE_COUNTER(Tp::value, counter_track::at(_idx, 0), _now, _val);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
comm_data::preinit()
|
||||
{
|
||||
configure();
|
||||
}
|
||||
|
||||
void
|
||||
comm_data::global_finalize()
|
||||
{
|
||||
configure();
|
||||
}
|
||||
|
||||
void
|
||||
comm_data::configure()
|
||||
{
|
||||
static bool _once = false;
|
||||
if(_once) return;
|
||||
_once = true;
|
||||
|
||||
comm_data_tracker_t::label() = "comm_data";
|
||||
comm_data_tracker_t::description() = "Tracks MPI/RCCL communication data sizes";
|
||||
comm_data_tracker_t::display_unit() = "MB";
|
||||
comm_data_tracker_t::unit() = units::megabyte;
|
||||
|
||||
auto _fmt_flags = comm_data_tracker_t::get_format_flags();
|
||||
_fmt_flags &= (std::ios_base::fixed & std::ios_base::scientific);
|
||||
_fmt_flags |= (std::ios_base::scientific);
|
||||
comm_data_tracker_t::set_precision(3);
|
||||
comm_data_tracker_t::set_format_flags(_fmt_flags);
|
||||
}
|
||||
|
||||
#if defined(OMNITRACE_USE_MPI)
|
||||
// MPI_Send
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int count,
|
||||
MPI_Datatype datatype, int dst, int tag, MPI_Comm)
|
||||
{
|
||||
int _size = mpi_type_size(datatype);
|
||||
if(_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_send>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _a{ _name };
|
||||
add(_a, count * _size);
|
||||
tracker_t _b{ JOIN('_', _name, "dst", dst) };
|
||||
add(_b, count * _size);
|
||||
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
|
||||
}
|
||||
|
||||
// MPI_Recv
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count,
|
||||
MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Status*)
|
||||
{
|
||||
int _size = mpi_type_size(datatype);
|
||||
if(_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_recv>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _a{ _name };
|
||||
add(_a, count * _size);
|
||||
tracker_t _b{ JOIN('_', _name, "dst", dst) };
|
||||
add(_b, count * _size);
|
||||
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
|
||||
}
|
||||
|
||||
// MPI_Isend
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int count,
|
||||
MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*)
|
||||
{
|
||||
int _size = mpi_type_size(datatype);
|
||||
if(_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_send>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _a{ _name };
|
||||
add(_a, count * _size);
|
||||
tracker_t _b{ JOIN('_', _name, "dst", dst) };
|
||||
add(_b, count * _size);
|
||||
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
|
||||
}
|
||||
|
||||
// MPI_Irecv
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count,
|
||||
MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*)
|
||||
{
|
||||
int _size = mpi_type_size(datatype);
|
||||
if(_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_recv>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _a{ _name };
|
||||
add(_a, count * _size);
|
||||
tracker_t _b{ JOIN('_', _name, "dst", dst) };
|
||||
add(_b, count * _size);
|
||||
add(JOIN('_', _name, "dst", dst, "tag", tag), count * _size);
|
||||
}
|
||||
|
||||
// MPI_Bcast
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, void*, int count,
|
||||
MPI_Datatype datatype, int root, MPI_Comm)
|
||||
{
|
||||
int _size = mpi_type_size(datatype);
|
||||
if(_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_send>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _t{ _name };
|
||||
add(_t, count * _size);
|
||||
add(JOIN('_', _name, "root", root), count * _size);
|
||||
}
|
||||
|
||||
// MPI_Allreduce
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*, int count,
|
||||
MPI_Datatype datatype, MPI_Op, MPI_Comm)
|
||||
{
|
||||
int _size = mpi_type_size(datatype);
|
||||
if(_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_recv>(count * _size);
|
||||
write_perfetto_counter_track<mpi_send>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
add(_data, count * _size);
|
||||
}
|
||||
|
||||
// MPI_Sendrecv
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount,
|
||||
MPI_Datatype sendtype, int dst, int sendtag, void*, int recvcount,
|
||||
MPI_Datatype recvtype, int src, int recvtag, MPI_Comm, MPI_Status*)
|
||||
{
|
||||
int _send_size = mpi_type_size(sendtype);
|
||||
int _recv_size = mpi_type_size(recvtype);
|
||||
if(_send_size == 0 || _recv_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_send>(sendcount * _send_size);
|
||||
write_perfetto_counter_track<mpi_recv>(recvcount * _recv_size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _t{ _name };
|
||||
add(_t, sendcount * _send_size + recvcount * _recv_size);
|
||||
{
|
||||
tracker_t _b{ JOIN('_', _name, "send") };
|
||||
add(_b, sendcount * _send_size);
|
||||
tracker_t _c{ JOIN('_', _name, "send", dst) };
|
||||
add(_b, sendcount * _send_size);
|
||||
add(JOIN('_', _name, "send", "tag", sendtag), sendcount * _send_size);
|
||||
add(JOIN('_', _name, "send", dst, "tag", sendtag), sendcount * _send_size);
|
||||
}
|
||||
{
|
||||
tracker_t _b{ JOIN('_', _name, "recv") };
|
||||
add(_b, recvcount * _recv_size);
|
||||
tracker_t _c{ JOIN('_', _name, "recv", src) };
|
||||
add(_b, recvcount * _recv_size);
|
||||
add(JOIN('_', _name, "recv", "tag", recvtag), recvcount * _recv_size);
|
||||
add(JOIN('_', _name, "recv", src, "tag", recvtag), recvcount * _recv_size);
|
||||
}
|
||||
}
|
||||
|
||||
// MPI_Gather
|
||||
// MPI_Scatter
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount,
|
||||
MPI_Datatype sendtype, void*, int recvcount, MPI_Datatype recvtype,
|
||||
int root, MPI_Comm)
|
||||
{
|
||||
int _send_size = mpi_type_size(sendtype);
|
||||
int _recv_size = mpi_type_size(recvtype);
|
||||
if(_send_size == 0 || _recv_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_send>(sendcount * _send_size);
|
||||
write_perfetto_counter_track<mpi_recv>(recvcount * _recv_size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _t{ _name };
|
||||
add(_t, sendcount * _send_size + recvcount * _recv_size);
|
||||
tracker_t _r(JOIN('_', _name, "root", root));
|
||||
add(_r, sendcount * _send_size + recvcount * _recv_size);
|
||||
add(JOIN('_', _name, "root", root, "send"), sendcount * _send_size);
|
||||
add(JOIN('_', _name, "root", root, "recv"), recvcount * _recv_size);
|
||||
}
|
||||
|
||||
// MPI_Alltoall
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, int sendcount,
|
||||
MPI_Datatype sendtype, void*, int recvcount, MPI_Datatype recvtype,
|
||||
MPI_Comm)
|
||||
{
|
||||
int _send_size = mpi_type_size(sendtype);
|
||||
int _recv_size = mpi_type_size(recvtype);
|
||||
if(_send_size == 0 || _recv_size == 0) return;
|
||||
|
||||
write_perfetto_counter_track<mpi_send>(sendcount * _send_size);
|
||||
write_perfetto_counter_track<mpi_recv>(recvcount * _recv_size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _t{ _name };
|
||||
add(_t, sendcount * _send_size + recvcount * _recv_size);
|
||||
add(JOIN('_', _name, "send"), sendcount * _send_size);
|
||||
add(JOIN('_', _name, "recv"), recvcount * _recv_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(OMNITRACE_USE_RCCL)
|
||||
// ncclReduce
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int _size = rccl_type_size(datatype);
|
||||
if(_size <= 0) return;
|
||||
|
||||
write_perfetto_counter_track<rccl_recv>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _t{ _name };
|
||||
add(_t, count * _size);
|
||||
add(JOIN('_', _name, "root", root), count * _size);
|
||||
}
|
||||
|
||||
// ncclSend
|
||||
// ncclGather
|
||||
// ncclBcast
|
||||
// ncclRecv
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, size_t count,
|
||||
ncclDataType_t datatype, int peer, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int _size = rccl_type_size(datatype);
|
||||
if(_size <= 0) return;
|
||||
|
||||
static auto _send_types = std::unordered_set<std::string>{ "ncclSend", "ncclBcast" };
|
||||
static auto _recv_types = std::unordered_set<std::string>{ "ncclGather", "ncclRecv" };
|
||||
|
||||
if(_send_types.count(_data.tool_id) > 0)
|
||||
{
|
||||
write_perfetto_counter_track<rccl_send>(count * _size);
|
||||
}
|
||||
else if(_recv_types.count(_data.tool_id) > 0)
|
||||
{
|
||||
write_perfetto_counter_track<rccl_recv>(count * _size);
|
||||
}
|
||||
else
|
||||
{
|
||||
OMNITRACE_CI_THROW(true, "RCCL function not handled: %s", _data.tool_id.c_str());
|
||||
}
|
||||
|
||||
write_perfetto_counter_track<rccl_recv>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
std::string _label = "root";
|
||||
if(_name.find("Send") != std::string::npos) _label = "peer";
|
||||
|
||||
tracker_t _t{ _name };
|
||||
add(_t, count * _size);
|
||||
add(JOIN('_', _name, _label, peer), count * _size);
|
||||
}
|
||||
|
||||
// ncclBroadcast
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, int root, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int _size = rccl_type_size(datatype);
|
||||
if(_size <= 0) return;
|
||||
|
||||
write_perfetto_counter_track<rccl_send>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
auto _name = std::string_view{ _data.tool_id };
|
||||
tracker_t _t{ _name };
|
||||
add(_t, count * _size);
|
||||
add(JOIN('_', _data.tool_id, "root", root), count * _size);
|
||||
}
|
||||
|
||||
// ncclAllReduce
|
||||
// ncclReduceScatter
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int _size = rccl_type_size(datatype);
|
||||
if(_size <= 0) return;
|
||||
|
||||
static auto _recv_types = std::unordered_set<std::string>{ "ncclAllReduce" };
|
||||
static auto _send_types = std::unordered_set<std::string>{ "ncclReduceScatter" };
|
||||
|
||||
if(_send_types.count(_data.tool_id) > 0)
|
||||
{
|
||||
write_perfetto_counter_track<rccl_send>(count * _size);
|
||||
}
|
||||
else if(_recv_types.count(_data.tool_id) > 0)
|
||||
{
|
||||
write_perfetto_counter_track<rccl_recv>(count * _size);
|
||||
}
|
||||
else
|
||||
{
|
||||
OMNITRACE_CI_THROW(true, "RCCL function not handled: %s", _data.tool_id.c_str());
|
||||
}
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
add(_data, count * _size);
|
||||
}
|
||||
|
||||
// ncclAllGather
|
||||
void
|
||||
comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
|
||||
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int _size = rccl_type_size(datatype);
|
||||
if(_size <= 0) return;
|
||||
|
||||
write_perfetto_counter_track<rccl_recv>(count * _size);
|
||||
|
||||
if(!omnitrace::get_use_timemory()) return;
|
||||
add(_data, count * _size);
|
||||
}
|
||||
#endif
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
|
||||
TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(
|
||||
TIMEMORY_ESC(data_tracker<float, tim::api::omnitrace>), true, float)
|
||||
|
||||
TIMEMORY_INSTANTIATE_EXTERN_COMPONENT(comm_data, false, void)
|
||||
@@ -0,0 +1,247 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2020, The Regents of the University of California,
|
||||
// through Lawrence Berkeley National Laboratory (subject to receipt of any
|
||||
// required approvals from the U.S. Dept. of Energy). All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "common/join.hpp"
|
||||
#include "library/common.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <timemory/api/macros.hpp>
|
||||
#include <timemory/components/macros.hpp>
|
||||
#include <timemory/operations/types/set.hpp>
|
||||
#include <timemory/utility/types.hpp>
|
||||
|
||||
#include <optional>
|
||||
|
||||
#if defined(OMNITRACE_USE_RCCL)
|
||||
# if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200
|
||||
# include <rccl/rccl.h>
|
||||
# else
|
||||
# include <rccl.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(OMNITRACE_USE_MPI)
|
||||
# include <mpi.h>
|
||||
#endif
|
||||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace component
|
||||
{
|
||||
using comm_data_tracker_t = data_tracker<float, api::omnitrace>;
|
||||
|
||||
/// \struct tim::component::comm_data
/// \brief Component exposing `audit` overloads for wrapped MPI and RCCL calls. Each
/// overload receives the arguments of the wrapped call (element count, datatype,
/// peer/root rank, ...) so the amount of data communicated can be accumulated into
/// a \ref comm_data_tracker_t entry. The component stores no value itself
/// (`value_type` is `void`); `start()` and `stop()` are intentionally no-ops.
struct comm_data : base<comm_data, void>
{
    using value_type = void;
    using this_type  = comm_data;
    using base_type  = base<this_type, value_type>;
    using tracker_t  = tim::auto_tuple<comm_data_tracker_t>;
    using data_type  = float;

    // Tag types carrying a `value`/`label` string pair for each direction and backend.
    // NOTE(review): `value` looks like the perfetto category name and `label` the
    // counter-track name -- confirm against the definitions in comm_data.cpp.
    struct mpi_recv
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "MPI Comm Recv";
    };

    struct mpi_send
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "MPI Comm Send";
    };

    struct rccl_recv
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "RCCL Comm Recv";
    };

    struct rccl_send
    {
        static constexpr auto value = "comm_data";
        static constexpr auto label = "RCCL Comm Send";
    };

    TIMEMORY_DEFAULT_OBJECT(comm_data)

    // Declared here, defined out-of-line.
    static void preinit();
    static void configure();
    static void global_finalize();

    // Nothing to measure on start/stop: all work happens in the audit overloads.
    static void start() {}
    static void stop() {}

#if defined(OMNITRACE_USE_MPI)
    /// Size in bytes of one element of \p _datatype as reported by PMPI_Type_size.
    /// Returns 0 when the query does not succeed so callers never multiply by an
    /// unspecified value.
    static int mpi_type_size(MPI_Datatype _datatype)
    {
        int _size = 0;
        if(PMPI_Type_size(_datatype, &_size) != MPI_SUCCESS) return 0;
        return _size;
    }

    // MPI_Send
    static void audit(const gotcha_data& _data, audit::incoming, const void*, int count,
                      MPI_Datatype datatype, int dst, int tag, MPI_Comm);

    // MPI_Recv (the rank argument of a receive is the source rank)
    static void audit(const gotcha_data& _data, audit::incoming, void*, int count,
                      MPI_Datatype datatype, int src, int tag, MPI_Comm, MPI_Status*);

    // MPI_Isend
    static void audit(const gotcha_data& _data, audit::incoming, const void*, int count,
                      MPI_Datatype datatype, int dst, int tag, MPI_Comm, MPI_Request*);

    // MPI_Irecv (the rank argument of a receive is the source rank)
    static void audit(const gotcha_data& _data, audit::incoming, void*, int count,
                      MPI_Datatype datatype, int src, int tag, MPI_Comm, MPI_Request*);

    // MPI_Bcast
    static void audit(const gotcha_data& _data, audit::incoming, void*, int count,
                      MPI_Datatype datatype, int root, MPI_Comm);

    // MPI_Allreduce
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      int count, MPI_Datatype datatype, MPI_Op, MPI_Comm);

    // MPI_Sendrecv
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      int sendcount, MPI_Datatype sendtype, int, int sendtag, void*,
                      int recvcount, MPI_Datatype recvtype, int, int recvtag, MPI_Comm,
                      MPI_Status*);

    // MPI_Gather
    // MPI_Scatter
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      int sendcount, MPI_Datatype sendtype, void*, int recvcount,
                      MPI_Datatype recvtype, int root, MPI_Comm);

    // MPI_Alltoall
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      int sendcount, MPI_Datatype sendtype, void*, int recvcount,
                      MPI_Datatype recvtype, MPI_Comm);
#endif

#if defined(OMNITRACE_USE_RCCL)
    /// Size in bytes of one element of \p datatype; 0 for any enumerator that is
    /// not listed below.
    static auto rccl_type_size(ncclDataType_t datatype)
    {
        switch(datatype)
        {
            case ncclInt8:
            case ncclUint8: return 1;
            case ncclFloat16: return 2;
            case ncclInt32:
            case ncclUint32:
            case ncclFloat32: return 4;
            case ncclInt64:
            case ncclUint64:
            case ncclFloat64: return 8;
            default: return 0;
        }
    }

    // ncclReduce
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
                      ncclComm_t, hipStream_t);

    // ncclSend
    // ncclGather
    // ncclBcast
    // ncclRecv
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
                      hipStream_t);

    // ncclBroadcast
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, int root, ncclComm_t,
                      hipStream_t);

    // ncclAllReduce
    // ncclReduceScatter
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
                      hipStream_t);

    // ncclAllGather
    // ncclAlltoAll
    static void audit(const gotcha_data& _data, audit::incoming, const void*, const void*,
                      size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);

#endif

private:
    /// Accumulates \p value into the caller-owned tracker \p _t and returns that same
    /// tracker. When omnitrace is not in the Active state nothing is stored and the
    /// tracker is flagged invalid instead.
    static auto& add(tracker_t& _t, data_type value)
    {
        if(omnitrace::get_state() != omnitrace::State::Active)
        {
            _t.invoke<operation::set_is_invalid>(true);
            return _t;
        }
        _t.store(std::plus<data_type>{}, value);
        return _t;
    }

    // The overloads below create a tracker that lives only for the duration of the
    // call. They return void on purpose: handing back a reference to the function-local
    // tracker (as `auto&&` deduced from the overload above would) leaves the caller
    // with a dangling reference.

    /// Accumulates \p value into a scoped tracker named after the wrapped function
    /// (`_data.tool_id`).
    static void add(const gotcha_data& _data, data_type value)
    {
        tracker_t _t{ std::string_view{ _data.tool_id.c_str() } };
        add(_t, value);
    }

    /// Accumulates \p value into a scoped tracker labeled \p _name.
    static void add(std::string&& _name, data_type value)
    {
        tracker_t _t{ _name };
        add(_t, value);
    }

    /// Accumulates \p value into a scoped tracker labeled \p _name.
    static void add(std::string_view _name, data_type value)
    {
        tracker_t _t{ _name };
        add(_t, value);
    }
};
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
|
||||
#if !defined(OMNITRACE_EXTERN_COMPONENTS) || \
|
||||
(defined(OMNITRACE_EXTERN_COMPONENTS) && OMNITRACE_EXTERN_COMPONENTS > 0)
|
||||
|
||||
# include <timemory/components/base.hpp>
|
||||
# include <timemory/components/data_tracker/components.hpp>
|
||||
# include <timemory/operations.hpp>
|
||||
|
||||
TIMEMORY_DECLARE_EXTERN_COMPONENT(TIMEMORY_ESC(data_tracker<float, tim::api::omnitrace>),
|
||||
true, float)
|
||||
|
||||
TIMEMORY_DECLARE_EXTERN_COMPONENT(comm_data, false, void)
|
||||
#endif
|
||||
@@ -42,10 +42,10 @@ TIMEMORY_DEFINE_NS_API(category, process_sampling)
|
||||
|
||||
TIMEMORY_DECLARE_COMPONENT(roctracer)
|
||||
TIMEMORY_DECLARE_COMPONENT(rocprofiler)
|
||||
TIMEMORY_DECLARE_COMPONENT(rccl_comm_data)
|
||||
TIMEMORY_DECLARE_COMPONENT(rcclp_handle)
|
||||
TIMEMORY_COMPONENT_ALIAS(rccl_api_t, api::rccl)
|
||||
TIMEMORY_COMPONENT_ALIAS(rccl_data_tracker_t, data_tracker<float, rccl_api_t>)
|
||||
TIMEMORY_COMPONENT_ALIAS(comm_data_tracker_t, data_tracker<float, api::omnitrace>)
|
||||
TIMEMORY_DECLARE_COMPONENT(comm_data)
|
||||
|
||||
/// \struct tim::trait::name
|
||||
/// \brief provides a constexpr string in ::value
|
||||
@@ -160,11 +160,14 @@ TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rocprofiler, false_type)
|
||||
|
||||
#if !defined(OMNITRACE_USE_RCCL)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, api::rccl, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_comm_data, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rccl_data_tracker_t, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_handle, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(OMNITRACE_USE_RCCL) && !defined(OMNITRACE_USE_MPI)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data_tracker_t, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::comm_data, false_type)
|
||||
#endif
|
||||
|
||||
#if !defined(TIMEMORY_USE_LIBUNWIND)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::api::sampling, false_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, omnitrace::component::backtrace, false_type)
|
||||
@@ -289,6 +292,7 @@ TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_busy, double)
|
||||
TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_temp, double)
|
||||
TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_power, double)
|
||||
TIMEMORY_STATISTICS_TYPE(omnitrace::component::sampling_gpu_memory, double)
|
||||
TIMEMORY_STATISTICS_TYPE(component::comm_data_tracker_t, float)
|
||||
|
||||
// enable timing units
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_timing_category,
|
||||
|
||||
@@ -23,12 +23,15 @@
|
||||
#include "library/components/mpi_gotcha.hpp"
|
||||
#include "library/api.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/comm_data.hpp"
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/mproc.hpp"
|
||||
|
||||
#include <timemory/backends/mpi.hpp>
|
||||
#include <timemory/backends/process.hpp>
|
||||
#include <timemory/mpl/types.hpp>
|
||||
#include <timemory/utility/locking.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
@@ -40,6 +43,10 @@ namespace omnitrace
|
||||
{
|
||||
namespace
|
||||
{
|
||||
using mpip_bundle_t =
|
||||
tim::component_tuple<omnitrace::component::category_region<category::mpi>,
|
||||
comp::comm_data>;
|
||||
|
||||
struct comm_rank_data
|
||||
{
|
||||
int rank = -1;
|
||||
@@ -104,10 +111,7 @@ omnitrace_mpi_set_attr()
|
||||
static auto _mpi_fini = [](MPI_Comm, int, void*, void*) {
|
||||
OMNITRACE_DEBUG("MPI Comm attribute finalize\n");
|
||||
if(mpip_index != std::numeric_limits<uint64_t>::max())
|
||||
comp::deactivate_mpip<
|
||||
tim::component_tuple<
|
||||
omnitrace::component::category_region<category::mpi>>,
|
||||
api::omnitrace>(mpip_index);
|
||||
comp::deactivate_mpip<mpip_bundle_t, api::omnitrace>(mpip_index);
|
||||
omnitrace_finalize_hidden();
|
||||
return MPI_SUCCESS;
|
||||
};
|
||||
@@ -224,9 +228,7 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::incoming)
|
||||
OMNITRACE_BASIC_DEBUG_F("%s()\n", _data.tool_id.c_str());
|
||||
|
||||
if(mpip_index != std::numeric_limits<uint64_t>::max())
|
||||
comp::deactivate_mpip<
|
||||
tim::component_tuple<omnitrace::component::category_region<category::mpi>>,
|
||||
api::omnitrace>(mpip_index);
|
||||
comp::deactivate_mpip<mpip_bundle_t, api::omnitrace>(mpip_index);
|
||||
|
||||
#if !defined(TIMEMORY_USE_MPI) && defined(TIMEMORY_USE_MPI_HEADERS)
|
||||
tim::mpi::is_initialized_callback() = []() { return false; };
|
||||
@@ -276,16 +278,15 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
{
|
||||
OMNITRACE_BASIC_VERBOSE_F(2, "Activating MPI wrappers...\n");
|
||||
|
||||
if(!get_use_timemory())
|
||||
{
|
||||
trait::runtime_enabled<comp::comm_data>::set(false);
|
||||
trait::runtime_enabled<comp::comm_data_tracker_t>::set(false);
|
||||
}
|
||||
// use env vars OMNITRACE_MPIP_PERMIT_LIST and OMNITRACE_MPIP_REJECT_LIST
|
||||
// to control the gotcha bindings at runtime
|
||||
comp::configure_mpip<
|
||||
tim::component_tuple<
|
||||
omnitrace::component::category_region<category::mpi>>,
|
||||
api::omnitrace>();
|
||||
mpip_index = comp::activate_mpip<
|
||||
tim::component_tuple<
|
||||
omnitrace::component::category_region<category::mpi>>,
|
||||
api::omnitrace>();
|
||||
comp::configure_mpip<mpip_bundle_t, api::omnitrace>();
|
||||
mpip_index = comp::activate_mpip<mpip_bundle_t, api::omnitrace>();
|
||||
}
|
||||
|
||||
auto_lock_t _lk{ type_mutex<mpi_gotcha>() };
|
||||
|
||||
@@ -196,74 +196,5 @@ rcclp_handle::get_tool_count()
|
||||
{
|
||||
return get_persistent_data().m_count;
|
||||
}
|
||||
|
||||
void
|
||||
rccl_comm_data::preinit()
|
||||
{
|
||||
omnitrace::rcclp::configure();
|
||||
}
|
||||
|
||||
// ncclReduce
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
|
||||
ncclComm_t, hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
|
||||
}
|
||||
|
||||
// ncclSend
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*,
|
||||
size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", peer));
|
||||
}
|
||||
|
||||
// ncclBcast
|
||||
// ncclRecv
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
|
||||
ncclDataType_t datatype, int root, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
|
||||
}
|
||||
|
||||
// ncclBroadcast
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, int root, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size, JOIN('_', _data.tool_id.c_str(), "root", root));
|
||||
}
|
||||
|
||||
// ncclAllReduce
|
||||
// ncclReduceScatter
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
|
||||
hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size);
|
||||
}
|
||||
|
||||
// ncclAllGather
|
||||
void
|
||||
rccl_comm_data::audit(const gotcha_data& _data, audit::incoming, const void*, void*,
|
||||
size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t)
|
||||
{
|
||||
int size = rccl_type_size(datatype);
|
||||
add(_data, count * size);
|
||||
}
|
||||
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
|
||||
TIMEMORY_INITIALIZE_STORAGE(rccl_comm_data, rccl_data_tracker_t)
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include "library/common.hpp"
|
||||
#include "library/components/category_region.hpp"
|
||||
#include "library/components/comm_data.hpp"
|
||||
#include "library/components/fwd.hpp"
|
||||
#include "library/defines.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
@@ -51,7 +52,7 @@
|
||||
TIMEMORY_COMPONENT_ALIAS(
|
||||
rccl_toolset_t,
|
||||
component_bundle<rccl_api_t, omnitrace::component::category_region<category::rccl>,
|
||||
rccl_comm_data*>)
|
||||
comm_data>)
|
||||
TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t,
|
||||
gotcha<OMNITRACE_NUM_RCCLP_WRAPPERS, rccl_toolset_t, rccl_api_t>)
|
||||
|
||||
@@ -59,12 +60,6 @@ TIMEMORY_COMPONENT_ALIAS(rcclp_gotcha_t,
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_available, component::rcclp_gotcha_t, false_type)
|
||||
#endif
|
||||
|
||||
TIMEMORY_STATISTICS_TYPE(component::rccl_data_tracker_t, float)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(uses_memory_units, component::rccl_data_tracker_t,
|
||||
true_type)
|
||||
TIMEMORY_DEFINE_CONCRETE_TRAIT(is_memory_category, component::rccl_data_tracker_t,
|
||||
true_type)
|
||||
|
||||
namespace tim
|
||||
{
|
||||
namespace component
|
||||
@@ -110,111 +105,5 @@ private:
|
||||
static toolset_ptr_t& get_tool_instance();
|
||||
static std::atomic<int64_t>& get_tool_count();
|
||||
};
|
||||
|
||||
/// \struct tim::component::rccl_comm_data
/// \brief Component exposing `audit` overloads for wrapped RCCL calls. Each overload
/// receives the arguments of the wrapped call so the amount of data communicated can
/// be accumulated into \ref rccl_data_tracker_t entries. The component stores no
/// value itself (`value_type` is `void`); `start()` and `stop()` are no-ops.
struct rccl_comm_data : base<rccl_comm_data, void>
{
    using value_type = void;
    using this_type  = rccl_comm_data;
    using base_type  = base<this_type, value_type>;
    using tracker_t  = tim::auto_tuple<rccl_data_tracker_t>;
    using data_type  = float;

    TIMEMORY_DEFAULT_OBJECT(rccl_comm_data)

    // Declared here, defined out-of-line.
    static void preinit();

    // Nothing to measure on start/stop: all work happens in the audit overloads.
    static void start() {}
    static void stop() {}

    /// Size in bytes of one element of \p datatype; 0 for any enumerator that is
    /// not listed below.
    static auto rccl_type_size(ncclDataType_t datatype)
    {
        switch(datatype)
        {
            case ncclInt8:
            case ncclUint8: return 1;
            case ncclFloat16: return 2;
            case ncclInt32:
            case ncclUint32:
            case ncclFloat32: return 4;
            case ncclInt64:
            case ncclUint64:
            case ncclFloat64: return 8;
            default: return 0;
        }
    }

    // ncclReduce
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, int root,
                      ncclComm_t, hipStream_t);

    // ncclSend
    static void audit(const gotcha_data& _data, audit::incoming, const void*,
                      size_t count, ncclDataType_t datatype, int peer, ncclComm_t,
                      hipStream_t);

    // ncclBcast
    // ncclRecv
    static void audit(const gotcha_data& _data, audit::incoming, void*, size_t count,
                      ncclDataType_t datatype, int root, ncclComm_t, hipStream_t);

    // ncclBroadcast
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, int root, ncclComm_t,
                      hipStream_t);

    // ncclAllReduce
    // ncclReduceScatter
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t,
                      hipStream_t);

    // ncclAllGather
    static void audit(const gotcha_data& _data, audit::incoming, const void*, void*,
                      size_t count, ncclDataType_t datatype, ncclComm_t, hipStream_t);

private:
    /// Accumulates \p value into the caller-owned tracker \p _t, then records the same
    /// value once more for every extra argument via add_secondary (each extra argument
    /// names an additional tracker entry).
    template <typename... Args>
    static void add(tracker_t& _t, data_type value, Args&&... args)
    {
        _t.store(std::plus<data_type>{}, value);
        TIMEMORY_FOLD_EXPRESSION(add_secondary(_t, std::forward<Args>(args), value));
    }

    /// Accumulates \p value into a scoped tracker named after the wrapped function
    /// (`_data.tool_id`); extra arguments are forwarded as secondary entries.
    template <typename... Args>
    static void add(const gotcha_data& _data, data_type value, Args&&... args)
    {
        tracker_t _t{ std::string_view{ _data.tool_id.c_str() } };
        add(_t, value, std::forward<Args>(args)...);
    }

    /// Secondary entry named after the wrapped function. The first parameter is
    /// unused; a fresh scoped tracker is created for the entry.
    template <typename... Args>
    static void add_secondary(tracker_t&, const gotcha_data& _data, data_type value,
                              Args&&... args)
    {
        // if(tim::settings::add_secondary())
        {
            tracker_t _s{ std::string_view{ _data.tool_id.c_str() } };
            // forward only the value and the remaining labels: passing `_data` in the
            // value position (as before) cannot convert to data_type and fails to
            // compile as soon as this overload is instantiated
            add(_s, value, std::forward<Args>(args)...);
        }
    }

    /// Accumulates \p value into a scoped tracker labeled \p _name; extra arguments
    /// are forwarded as secondary entries.
    template <typename... Args>
    static void add(std::string_view _name, data_type value, Args&&... args)
    {
        tracker_t _t{ _name };
        add(_t, value, std::forward<Args>(args)...);
    }

    /// Secondary entry labeled \p _name. The first parameter is unused; a fresh
    /// scoped tracker is created for the entry.
    template <typename... Args>
    static void add_secondary(tracker_t&, std::string_view _name, data_type value,
                              Args&&... args)
    {
        // if(tim::settings::add_secondary())
        {
            tracker_t _s{ _name };
            add(_s, value, std::forward<Args>(args)...);
        }
    }
};
|
||||
} // namespace component
|
||||
} // namespace tim
|
||||
|
||||
@@ -85,6 +85,9 @@
|
||||
perfetto::Category("ompt").SetDescription("OpenMP Tools regions"), \
|
||||
perfetto::Category("rccl").SetDescription( \
|
||||
"ROCm Communication Collectives Library (RCCL) regions"), \
|
||||
perfetto::Category("comm_data") \
|
||||
.SetDescription( \
|
||||
"MPI/RCCL counters for tracking amount of data sent or received"), \
|
||||
perfetto::Category("critical-trace").SetDescription("Combined critical traces"), \
|
||||
perfetto::Category("host-critical-trace") \
|
||||
.SetDescription("Host-side critical traces"), \
|
||||
|
||||
@@ -51,10 +51,7 @@ namespace rcclp
|
||||
{
|
||||
void
|
||||
configure()
|
||||
{
|
||||
comp::rccl_data_tracker_t::label() = "rccl_comm_data";
|
||||
comp::rccl_data_tracker_t::description() = "Tracks RCCL communication data";
|
||||
}
|
||||
{}
|
||||
|
||||
void
|
||||
setup()
|
||||
@@ -67,11 +64,17 @@ setup()
|
||||
if(!librccl_handle) fprintf(stderr, "%s\n", dlerror());
|
||||
dlerror(); // Clear any existing error
|
||||
|
||||
auto _data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", true);
|
||||
if(_data)
|
||||
comp::rccl_toolset_t::get_initializer() = [](comp::rccl_toolset_t& cb) {
|
||||
cb.initialize<comp::rccl_comm_data>();
|
||||
};
|
||||
auto _use_data = tim::get_env("OMNITRACE_RCCLP_COMM_DATA", get_use_timemory());
|
||||
if(!get_use_timemory())
|
||||
{
|
||||
trait::runtime_enabled<comp::comm_data>::set(false);
|
||||
trait::runtime_enabled<comp::comm_data_tracker_t>::set(false);
|
||||
}
|
||||
else
|
||||
{
|
||||
trait::runtime_enabled<comp::comm_data>::set(_use_data);
|
||||
trait::runtime_enabled<comp::comm_data_tracker_t>::set(_use_data);
|
||||
}
|
||||
|
||||
comp::configure_rcclp();
|
||||
global_id = comp::activate_rcclp();
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
#include "library/timemory.hpp"
|
||||
#include "library/utility.hpp"
|
||||
|
||||
#include <timemory/components/timing/backends.hpp>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace tracing
|
||||
@@ -49,6 +51,13 @@ get_timemory_hash_ids(int64_t _tid = threading::get_id());
|
||||
tim::hash_alias_ptr_t&
|
||||
get_timemory_hash_aliases(int64_t _tid = threading::get_id());
|
||||
|
||||
template <typename Tp = uint64_t>
|
||||
OMNITRACE_INLINE auto
|
||||
now()
|
||||
{
|
||||
return ::tim::get_clock_real_now<Tp, std::nano>();
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
bool debug_push = // NOLINT
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user