From df729548d0644e5b9a7ac479e25672f34fb43aa4 Mon Sep 17 00:00:00 2001 From: ajanicijamd Date: Wed, 19 Mar 2025 14:48:04 -0400 Subject: [PATCH] Fix a RCCL initialization to avoid a deadlock (#136) Also fixes: - crash while finalizing rocprof-sys-causal [ROCm/rocprofiler-systems commit: 26bb6042156dd7f59bb5e8d12062f90855691514] --- .../source/lib/rocprof-sys/api.cpp | 2 +- .../source/lib/rocprof-sys/api.hpp | 2 +- .../source/lib/rocprof-sys/library.cpp | 44 ++++++++++++++----- .../lib/rocprof-sys/library/causal/data.cpp | 1 - .../lib/rocprof-sys/library/causal/delay.cpp | 35 ++++++++++++--- .../lib/rocprof-sys/library/causal/delay.hpp | 1 + .../source/lib/rocprof-sys/library/perf.cpp | 11 ++++- 7 files changed, 77 insertions(+), 19 deletions(-) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp index ad2369ca59..bda609ef6e 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.cpp @@ -122,7 +122,7 @@ rocprofsys_init_library(void) extern "C" void rocprofsys_init_tooling(void) { - rocprofsys_init_tooling_hidden(); + rocprofsys_init_tooling_hidden(true); } extern "C" void diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp index 478f96ec58..55dfbbdd0a 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/api.hpp @@ -95,7 +95,7 @@ extern "C" // these are the real implementations for internal calling convention void rocprofsys_init_library_hidden(void) ROCPROFSYS_HIDDEN_API; - bool rocprofsys_init_tooling_hidden(void) ROCPROFSYS_HIDDEN_API; + bool rocprofsys_init_tooling_hidden(bool postinit = false) ROCPROFSYS_HIDDEN_API; void rocprofsys_init_hidden(const char*, bool, const char*) ROCPROFSYS_HIDDEN_API; void rocprofsys_finalize_hidden(void) ROCPROFSYS_HIDDEN_API; void rocprofsys_reset_preload_hidden(void) ROCPROFSYS_HIDDEN_API; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp index 32bd84f8ff..10391aefcc 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library.cpp @@ -403,16 +403,42 @@ rocprofsys_init_library_hidden() ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n"); } +// Initialize RCCL if: +// - postinit=true - so the code doesn't hang at the initialization stage +// - get_state() >= State::Init - so the code doesn't throw an exception +// - rccl_initialized=false - so we don't try to initialize RCCL twice +// - get_use_rcclp()=true - only if the environment is configured to use RCCL +static void +rccl_setup(bool postinit) +{ + // Flag used to avoid initializing RCCL twice + static bool rccl_initialized = false; + + if(postinit && (get_state() >= State::Init) && !rccl_initialized && get_use_rcclp()) + { + ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n"); + rcclp::setup(); + rccl_initialized = true; + } +} + +static void +rocprofsys_init_library_hidden_with_rccl(bool postinit) +{ + rocprofsys_init_library_hidden(); + rccl_setup(postinit); +} + //======================================================================================// extern "C" bool -rocprofsys_init_tooling_hidden() +rocprofsys_init_tooling_hidden(bool postinit) { if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true; if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true)) { - rocprofsys_init_library_hidden(); + rocprofsys_init_library_hidden_with_rccl(postinit); return false; } @@ -422,7 +448,11 @@ rocprofsys_init_tooling_hidden() ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s...\n", std::to_string(get_state()).c_str()); - if(get_state() != State::PreInit || get_state() == State::Init || _once) return false; + if(get_state() != State::PreInit || get_state() == State::Init || _once) + { + rccl_setup(postinit); + return false; + } _once = true; ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal); @@ -443,7 +473,7 @@ rocprofsys_init_tooling_hidden() ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "Calling rocprofsys_init_library()...\n"); - rocprofsys_init_library_hidden(); + rocprofsys_init_library_hidden_with_rccl(postinit); ROCPROFSYS_DEBUG_F("\n"); @@ -541,12 +571,6 @@ rocprofsys_init_tooling_hidden() ompt::setup(); } - if(get_use_rcclp()) - { - ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n"); - rcclp::setup(); - } - if(get_use_perfetto()) { ROCPROFSYS_VERBOSE_F(1, "Starting Perfetto...\n"); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/data.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/data.cpp index 9153b5c70d..93e2bfbc01 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/data.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/data.cpp @@ -855,7 +855,6 @@ sample_selection(size_t _nitr, size_t _wait_ns) while(eligible_pc_idx.load(std::memory_order_relaxed) == 0) { - if(get_state() >= State::Finalized) return selected_entry{}; std::this_thread::yield(); std::this_thread::sleep_for(std::chrono::nanoseconds{ _wait_ns }); } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.cpp index db76f83927..c0f75aaf8a 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.cpp @@ -105,6 +105,8 @@ delay::setup() void delay::process() { + if(!is_local_available()) return; + if(causal::experiment::is_active()) { if(get_global() < get_local()) @@ -130,6 +132,8 @@ delay::process() void delay::credit() { + if(!is_local_available()) return; + auto _diff = get_global() - get_local(); if(_diff > 0) { @@ -140,6 +144,8 @@ delay::credit() void delay::preblock() { + if(!is_local_available()) return; + auto _diff = get_global() - get_local(); if(_diff > 0) { @@ -150,6 +156,7 @@ delay::preblock() void delay::postblock(int64_t _preblock_global_delay_value) { + if(!is_local_available()) return; get_local() += (get_global() - _preblock_global_delay_value); } @@ -168,18 +175,36 @@ delay::get_global() return _v; } -int64_t& -delay::get_local(int64_t _tid) +static void +thr_init() { - auto& _data = get_delay_data(); static thread_local auto _thr_init = []() { using thread_data_t = thread_data, delay>; thread_data_t::construct(construct_on_thread{ threading::get_id() }, - get_global().load()); + delay::get_global().load()); return true; }(); + (void) _thr_init; // To make compiler happy. +} + +bool +delay::is_local_available() +{ + thr_init(); + auto& _data = get_delay_data(); + return _data != nullptr; +} + +int64_t& +delay::get_local(int64_t _tid) +{ + thr_init(); + auto& _data = get_delay_data(); + if(_data == nullptr) + { + throw std::runtime_error("No data: get_delay_data() returned nullptr"); + } return _data->at(_tid); - (void) _thr_init; } uint64_t diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.hpp index 4501f7c634..54e528061f 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/causal/delay.hpp @@ -53,6 +53,7 @@ struct delay : comp::empty_base static std::atomic& get_global(); static int64_t& get_local(int64_t _tid = threading::get_id()); + static bool is_local_available(); static int64_t get(int64_t _tid = threading::get_id()); static uint64_t compute_total_delay(uint64_t); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/perf.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/perf.cpp index dd5a2601bc..e57c0edc37 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/perf.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/perf.cpp @@ -654,7 +654,16 @@ get_instances() std::unique_ptr& get_instance(int64_t _tid) { - auto& _data = get_instances(); + static auto nullInstance = std::unique_ptr{ nullptr }; + auto& _data = get_instances(); + + // If get_instances() returned an empty object, we have to return a reference to a + // static null instance, or else we will crash. + if(_data == nullptr) + { + return nullInstance; + } + if(static_cast(_tid) >= _data->size()) { ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);