Remove reliance on MPI_Comm_rank (#56)
* Remove reliance on MPI_Comm_rank - read /proc/<PPID>/task/<PPID>/children of the parent process to deduce the rank - the old approach relied on the user calling MPI_Comm_rank(MPI_COMM_WORLD, ...) - if MPI_Comm_rank was called with subcommunicators only, multiple ranks would write to the same file * Tweak MPI example
This commit is contained in:
committato da
GitHub
parent
f27f062e88
commit
8eff363ed3
+19
-6
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include <cfloat>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
@@ -30,15 +32,12 @@ THE SOFTWARE.
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <type_traits>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
static std::mutex print_lock{};
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
std::string _name = {};
|
||||
|
||||
template <typename Tp, size_t N>
|
||||
@@ -105,10 +104,24 @@ main(int argc, char** argv)
|
||||
|
||||
printf("[%s] Number of iterations: %i\n", _name.c_str(), nitr);
|
||||
|
||||
MPI_Init(&argc, &argv);
|
||||
int _mpi_thread_provided;
|
||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &_mpi_thread_provided);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||
|
||||
auto _ppid = getppid();
|
||||
std::ifstream _ifs{ "/proc/" + std::to_string(_ppid) + "/task/" +
|
||||
std::to_string(_ppid) + "/children" };
|
||||
std::stringstream _ss{};
|
||||
while(_ifs)
|
||||
{
|
||||
std::string _s{};
|
||||
_ifs >> _s;
|
||||
_ss << _s << " ";
|
||||
}
|
||||
printf("[%s] RANK = %i, PID = %i, PPID = %i :: %s\n", _name.c_str(), rank, getpid(),
|
||||
getppid(), _ss.str().c_str());
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
for(int i = 0; i < nitr; ++i)
|
||||
{
|
||||
|
||||
@@ -61,6 +61,7 @@ set(library_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/mproc.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ptl.cpp
|
||||
@@ -90,6 +91,7 @@ set(library_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/debug.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/gpu.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/mproc.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ompt.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "library/components/omnitrace.hpp"
|
||||
#include "library/config.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/mproc.hpp"
|
||||
|
||||
#include <thread>
|
||||
#include <timemory/backends/mpi.hpp>
|
||||
@@ -170,6 +171,26 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
comp::activate_mpip<tim::component_tuple<omnitrace::component::omnitrace>,
|
||||
api::omnitrace>();
|
||||
}
|
||||
|
||||
auto _size = mproc::get_concurrent_processes().size();
|
||||
if(_size > 0)
|
||||
{
|
||||
m_size = _size;
|
||||
tim::mpi::set_size(_size);
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n", process::get_id(),
|
||||
tim::mpi::size(), m_size);
|
||||
|
||||
auto _rank = mproc::get_process_index();
|
||||
if(_rank >= 0)
|
||||
{
|
||||
m_rank = _rank;
|
||||
tim::mpi::set_rank(_rank);
|
||||
tim::settings::default_process_suffix() = _rank;
|
||||
get_perfetto_output_filename().clear();
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::rank(), m_rank);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Comm_") == 0)
|
||||
{
|
||||
@@ -177,12 +198,15 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
{
|
||||
if(m_rank_ptr)
|
||||
{
|
||||
m_rank = std::max<int>(*m_rank_ptr, m_rank);
|
||||
tim::mpi::set_rank(m_rank);
|
||||
tim::settings::default_process_suffix() = m_rank;
|
||||
get_perfetto_output_filename().clear();
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::rank(), m_rank);
|
||||
if(mproc::get_concurrent_processes().empty())
|
||||
{
|
||||
m_rank = std::max<int>(*m_rank_ptr, m_rank);
|
||||
tim::mpi::set_rank(m_rank);
|
||||
tim::settings::default_process_suffix() = m_rank;
|
||||
get_perfetto_output_filename().clear();
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::rank(), m_rank);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -194,10 +218,13 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval)
|
||||
{
|
||||
if(m_size_ptr)
|
||||
{
|
||||
m_size = std::max<int>(*m_size_ptr, m_size);
|
||||
tim::mpi::set_size(m_size);
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::size(), m_size);
|
||||
if(mproc::get_concurrent_processes().empty())
|
||||
{
|
||||
m_size = std::max<int>(*m_size_ptr, m_size);
|
||||
tim::mpi::set_size(m_size);
|
||||
OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n",
|
||||
process::get_id(), tim::mpi::size(), m_size);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "library/mproc.hpp"
|
||||
#include "library/debug.hpp"
|
||||
#include "library/timemory.hpp"
|
||||
|
||||
#include <fstream>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace mproc
|
||||
{
|
||||
std::set<int>
|
||||
get_concurrent_processes(int _ppid)
|
||||
{
|
||||
std::set<int> _children = {};
|
||||
if(_ppid > 0)
|
||||
{
|
||||
auto _inp = JOIN('/', "/proc", _ppid, "task", _ppid, "children");
|
||||
std::ifstream _ifs{ _inp };
|
||||
if(!_ifs)
|
||||
{
|
||||
OMNITRACE_VERBOSE_F(0, "Warning! File '%s' cannot be read\n", _inp.c_str());
|
||||
return _children;
|
||||
}
|
||||
|
||||
while(_ifs)
|
||||
{
|
||||
int _v = -1;
|
||||
_ifs >> _v;
|
||||
if(!_ifs.good() || _ifs.eof()) break;
|
||||
if(_v < 0) continue;
|
||||
_children.emplace(_v);
|
||||
}
|
||||
}
|
||||
return _children;
|
||||
}
|
||||
|
||||
int
|
||||
get_process_index(int _pid, int _ppid)
|
||||
{
|
||||
auto _children = get_concurrent_processes(_ppid);
|
||||
for(auto itr = _children.begin(); itr != _children.end(); ++itr)
|
||||
{
|
||||
if(*itr == _pid) return std::distance(_children.begin(), itr);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
} // namespace mproc
|
||||
} // namespace omnitrace
|
||||
@@ -0,0 +1,39 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <set>
|
||||
#include <unistd.h>
|
||||
|
||||
namespace omnitrace
|
||||
{
|
||||
namespace mproc
|
||||
{
|
||||
// get the concurrent processes from /proc/<PPID>/task/<PPID>/children
|
||||
std::set<int>
|
||||
get_concurrent_processes(int _ppid = getppid());
|
||||
|
||||
// get the zero-based position of <PID> within the ordered set of processes read from
// /proc/<PPID>/task/<PPID>/children (returns -1 if <PID> is not in that set)
int
|
||||
get_process_index(int _pid = getpid(), int _ppid = getppid());
|
||||
} // namespace mproc
|
||||
} // namespace omnitrace
|
||||
Fai riferimento in un nuovo problema
Block a user