From 8eff363ed346a8a33a4de0af154ecd555ff69ca4 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 20 Jun 2022 00:50:49 -0500 Subject: [PATCH] Remove reliance on MPI_Comm_rank (#56) * Remove reliance on MPI_Comm_rank - read /proc/<PPID>/task/<PPID>/children of the parent process to deduce the rank - Old format relied on user calling MPI_Comm_rank(MPI_COMM_WORLD, ...) - if MPI_Comm_rank was called with subcommunicators only, multiple ranks would write to the same file * Tweak mpi example --- examples/mpi/mpi.cpp | 25 +++++-- source/lib/omnitrace/CMakeLists.txt | 2 + .../library/components/mpi_gotcha.cpp | 47 +++++++++--- source/lib/omnitrace/library/mproc.cpp | 74 +++++++++++++++++++ source/lib/omnitrace/library/mproc.hpp | 39 ++++++++++ 5 files changed, 171 insertions(+), 16 deletions(-) create mode 100644 source/lib/omnitrace/library/mproc.cpp create mode 100644 source/lib/omnitrace/library/mproc.hpp diff --git a/examples/mpi/mpi.cpp b/examples/mpi/mpi.cpp index 073f07f43b..2299c7db16 100644 --- a/examples/mpi/mpi.cpp +++ b/examples/mpi/mpi.cpp @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include @@ -30,15 +32,12 @@ THE SOFTWARE. 
#include #include #include +#include #include #include +#include #include -static std::mutex print_lock{}; -using auto_lock_t = std::unique_lock; - -#include - std::string _name = {}; template @@ -105,10 +104,24 @@ main(int argc, char** argv) printf("[%s] Number of iterations: %i\n", _name.c_str(), nitr); - MPI_Init(&argc, &argv); + int _mpi_thread_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SINGLE, &_mpi_thread_provided); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); + auto _ppid = getppid(); + std::ifstream _ifs{ "/proc/" + std::to_string(_ppid) + "/task/" + + std::to_string(_ppid) + "/children" }; + std::stringstream _ss{}; + while(_ifs) + { + std::string _s{}; + _ifs >> _s; + _ss << _s << " "; + } + printf("[%s] RANK = %i, PID = %i, PPID = %i :: %s\n", _name.c_str(), rank, getpid(), + getppid(), _ss.str().c_str()); + MPI_Barrier(MPI_COMM_WORLD); for(int i = 0; i < nitr; ++i) { diff --git a/source/lib/omnitrace/CMakeLists.txt b/source/lib/omnitrace/CMakeLists.txt index dd897a5702..30e43b3310 100644 --- a/source/lib/omnitrace/CMakeLists.txt +++ b/source/lib/omnitrace/CMakeLists.txt @@ -61,6 +61,7 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.cpp ${CMAKE_CURRENT_LIST_DIR}/library/kokkosp.cpp ${CMAKE_CURRENT_LIST_DIR}/library/gpu.cpp + ${CMAKE_CURRENT_LIST_DIR}/library/mproc.cpp ${CMAKE_CURRENT_LIST_DIR}/library/ompt.cpp ${CMAKE_CURRENT_LIST_DIR}/library/perfetto.cpp ${CMAKE_CURRENT_LIST_DIR}/library/ptl.cpp @@ -90,6 +91,7 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/library/debug.hpp ${CMAKE_CURRENT_LIST_DIR}/library/dynamic_library.hpp ${CMAKE_CURRENT_LIST_DIR}/library/gpu.hpp + ${CMAKE_CURRENT_LIST_DIR}/library/mproc.hpp ${CMAKE_CURRENT_LIST_DIR}/library/ompt.hpp ${CMAKE_CURRENT_LIST_DIR}/library/perfetto.hpp ${CMAKE_CURRENT_LIST_DIR}/library/ptl.hpp diff --git a/source/lib/omnitrace/library/components/mpi_gotcha.cpp b/source/lib/omnitrace/library/components/mpi_gotcha.cpp index 
f401e4ebe2..09f2ec57a2 100644 --- a/source/lib/omnitrace/library/components/mpi_gotcha.cpp +++ b/source/lib/omnitrace/library/components/mpi_gotcha.cpp @@ -25,6 +25,7 @@ #include "library/components/omnitrace.hpp" #include "library/config.hpp" #include "library/debug.hpp" +#include "library/mproc.hpp" #include #include @@ -170,6 +171,26 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) comp::activate_mpip, api::omnitrace>(); } + + auto _size = mproc::get_concurrent_processes().size(); + if(_size > 0) + { + m_size = _size; + tim::mpi::set_size(_size); + OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n", process::get_id(), + tim::mpi::size(), m_size); + + auto _rank = mproc::get_process_index(); + if(_rank >= 0) + { + m_rank = _rank; + tim::mpi::set_rank(_rank); + tim::settings::default_process_suffix() = _rank; + get_perfetto_output_filename().clear(); + OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n", + process::get_id(), tim::mpi::rank(), m_rank); + } + } } else if(_retval == tim::mpi::success_v && _data.tool_id.find("MPI_Comm_") == 0) { @@ -177,12 +198,15 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) { if(m_rank_ptr) { - m_rank = std::max(*m_rank_ptr, m_rank); - tim::mpi::set_rank(m_rank); - tim::settings::default_process_suffix() = m_rank; - get_perfetto_output_filename().clear(); - OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n", - process::get_id(), tim::mpi::rank(), m_rank); + if(mproc::get_concurrent_processes().empty()) + { + m_rank = std::max(*m_rank_ptr, m_rank); + tim::mpi::set_rank(m_rank); + tim::settings::default_process_suffix() = m_rank; + get_perfetto_output_filename().clear(); + OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI rank: %i (%i)\n", + process::get_id(), tim::mpi::rank(), m_rank); + } } else { @@ -194,10 +218,13 @@ mpi_gotcha::audit(const gotcha_data_t& _data, audit::outgoing, int _retval) { if(m_size_ptr) { - m_size = std::max(*m_size_ptr, m_size); - 
tim::mpi::set_size(m_size); - OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n", - process::get_id(), tim::mpi::size(), m_size); + if(mproc::get_concurrent_processes().empty()) + { + m_size = std::max(*m_size_ptr, m_size); + tim::mpi::set_size(m_size); + OMNITRACE_BASIC_VERBOSE(0, "[pid=%i] MPI size: %i (%i)\n", + process::get_id(), tim::mpi::size(), m_size); + } } else { diff --git a/source/lib/omnitrace/library/mproc.cpp b/source/lib/omnitrace/library/mproc.cpp new file mode 100644 index 0000000000..4cad922233 --- /dev/null +++ b/source/lib/omnitrace/library/mproc.cpp @@ -0,0 +1,74 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "library/mproc.hpp" +#include "library/debug.hpp" +#include "library/timemory.hpp" + +#include +#include +#include +#include +#include + +namespace omnitrace +{ +namespace mproc +{ +std::set +get_concurrent_processes(int _ppid) +{ + std::set _children = {}; + if(_ppid > 0) + { + auto _inp = JOIN('/', "/proc", _ppid, "task", _ppid, "children"); + std::ifstream _ifs{ _inp }; + if(!_ifs) + { + OMNITRACE_VERBOSE_F(0, "Warning! File '%s' cannot be read\n", _inp.c_str()); + return _children; + } + + while(_ifs) + { + int _v = -1; + _ifs >> _v; + if(!_ifs.good() || _ifs.eof()) break; + if(_v < 0) continue; + _children.emplace(_v); + } + } + return _children; +} + +int +get_process_index(int _pid, int _ppid) +{ + auto _children = get_concurrent_processes(_ppid); + for(auto itr = _children.begin(); itr != _children.end(); ++itr) + { + if(*itr == _pid) return std::distance(_children.begin(), itr); + } + return -1; +} +} // namespace mproc +} // namespace omnitrace diff --git a/source/lib/omnitrace/library/mproc.hpp b/source/lib/omnitrace/library/mproc.hpp new file mode 100644 index 0000000000..eb1534a4e0 --- /dev/null +++ b/source/lib/omnitrace/library/mproc.hpp @@ -0,0 +1,39 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include + +namespace omnitrace +{ +namespace mproc +{ +// get the concurrent processes from /proc/<PPID>/task/<PPID>/children +std::set +get_concurrent_processes(int _ppid = getppid()); + +int +get_process_index(int _pid = getpid(), int _ppid = getppid()); +} // namespace mproc +} // namespace omnitrace