Fix a RCCL initialization to avoid a deadlock (#136)
Also fixes:
- crash while finalizing rocprof-sys-causal
[ROCm/rocprofiler-systems commit: 26bb604215]
This commit is contained in:
@@ -122,7 +122,7 @@ rocprofsys_init_library(void)
|
||||
extern "C" void
|
||||
rocprofsys_init_tooling(void)
|
||||
{
|
||||
rocprofsys_init_tooling_hidden();
|
||||
rocprofsys_init_tooling_hidden(true);
|
||||
}
|
||||
|
||||
extern "C" void
|
||||
|
||||
@@ -95,7 +95,7 @@ extern "C"
|
||||
|
||||
// these are the real implementations for internal calling convention
|
||||
void rocprofsys_init_library_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
bool rocprofsys_init_tooling_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
bool rocprofsys_init_tooling_hidden(bool postinit = false) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_init_hidden(const char*, bool, const char*) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_finalize_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
void rocprofsys_reset_preload_hidden(void) ROCPROFSYS_HIDDEN_API;
|
||||
|
||||
@@ -403,16 +403,42 @@ rocprofsys_init_library_hidden()
|
||||
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "\n");
|
||||
}
|
||||
|
||||
// Initialize RCCL if:
|
||||
// - postinit=true - so the code doesn't hang at the initialization stage
|
||||
// - get_state() >= State::Init - so the code doesn't throw an exception
|
||||
// - rccl_initialized=false - so we don't try to initialize RCCL twice
|
||||
// - get_use_rcclp()=true - only if the environment is configured to use RCCL
|
||||
static void
|
||||
rccl_setup(bool postinit)
|
||||
{
|
||||
// Flag used to avoid initializing RCCL twice
|
||||
static bool rccl_initialized = false;
|
||||
|
||||
if(postinit && (get_state() >= State::Init) && !rccl_initialized && get_use_rcclp())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n");
|
||||
rcclp::setup();
|
||||
rccl_initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
rocprofsys_init_library_hidden_with_rccl(bool postinit)
|
||||
{
|
||||
rocprofsys_init_library_hidden();
|
||||
rccl_setup(postinit);
|
||||
}
|
||||
|
||||
//======================================================================================//
|
||||
|
||||
extern "C" bool
|
||||
rocprofsys_init_tooling_hidden()
|
||||
rocprofsys_init_tooling_hidden(bool postinit)
|
||||
{
|
||||
if(get_env("ROCPROFSYS_MONOCHROME", false, false)) tim::log::monochrome() = true;
|
||||
|
||||
if(!tim::get_env("ROCPROFSYS_INIT_TOOLING", true))
|
||||
{
|
||||
rocprofsys_init_library_hidden();
|
||||
rocprofsys_init_library_hidden_with_rccl(postinit);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -422,7 +448,11 @@ rocprofsys_init_tooling_hidden()
|
||||
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init, "State is %s...\n",
|
||||
std::to_string(get_state()).c_str());
|
||||
|
||||
if(get_state() != State::PreInit || get_state() == State::Init || _once) return false;
|
||||
if(get_state() != State::PreInit || get_state() == State::Init || _once)
|
||||
{
|
||||
rccl_setup(postinit);
|
||||
return false;
|
||||
}
|
||||
_once = true;
|
||||
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
@@ -443,7 +473,7 @@ rocprofsys_init_tooling_hidden()
|
||||
ROCPROFSYS_CONDITIONAL_BASIC_PRINT_F(_debug_init,
|
||||
"Calling rocprofsys_init_library()...\n");
|
||||
|
||||
rocprofsys_init_library_hidden();
|
||||
rocprofsys_init_library_hidden_with_rccl(postinit);
|
||||
|
||||
ROCPROFSYS_DEBUG_F("\n");
|
||||
|
||||
@@ -541,12 +571,6 @@ rocprofsys_init_tooling_hidden()
|
||||
ompt::setup();
|
||||
}
|
||||
|
||||
if(get_use_rcclp())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Setting up RCCLP...\n");
|
||||
rcclp::setup();
|
||||
}
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE_F(1, "Starting Perfetto...\n");
|
||||
|
||||
@@ -855,7 +855,6 @@ sample_selection(size_t _nitr, size_t _wait_ns)
|
||||
|
||||
while(eligible_pc_idx.load(std::memory_order_relaxed) == 0)
|
||||
{
|
||||
if(get_state() >= State::Finalized) return selected_entry{};
|
||||
std::this_thread::yield();
|
||||
std::this_thread::sleep_for(std::chrono::nanoseconds{ _wait_ns });
|
||||
}
|
||||
|
||||
@@ -105,6 +105,8 @@ delay::setup()
|
||||
void
|
||||
delay::process()
|
||||
{
|
||||
if(!is_local_available()) return;
|
||||
|
||||
if(causal::experiment::is_active())
|
||||
{
|
||||
if(get_global() < get_local())
|
||||
@@ -130,6 +132,8 @@ delay::process()
|
||||
void
|
||||
delay::credit()
|
||||
{
|
||||
if(!is_local_available()) return;
|
||||
|
||||
auto _diff = get_global() - get_local();
|
||||
if(_diff > 0)
|
||||
{
|
||||
@@ -140,6 +144,8 @@ delay::credit()
|
||||
void
|
||||
delay::preblock()
|
||||
{
|
||||
if(!is_local_available()) return;
|
||||
|
||||
auto _diff = get_global() - get_local();
|
||||
if(_diff > 0)
|
||||
{
|
||||
@@ -150,6 +156,7 @@ delay::preblock()
|
||||
void
|
||||
delay::postblock(int64_t _preblock_global_delay_value)
|
||||
{
|
||||
if(!is_local_available()) return;
|
||||
get_local() += (get_global() - _preblock_global_delay_value);
|
||||
}
|
||||
|
||||
@@ -168,18 +175,36 @@ delay::get_global()
|
||||
return _v;
|
||||
}
|
||||
|
||||
int64_t&
|
||||
delay::get_local(int64_t _tid)
|
||||
static void
|
||||
thr_init()
|
||||
{
|
||||
auto& _data = get_delay_data();
|
||||
static thread_local auto _thr_init = []() {
|
||||
using thread_data_t = thread_data<identity<int64_t>, delay>;
|
||||
thread_data_t::construct(construct_on_thread{ threading::get_id() },
|
||||
get_global().load());
|
||||
delay::get_global().load());
|
||||
return true;
|
||||
}();
|
||||
(void) _thr_init; // To make compiler happy.
|
||||
}
|
||||
|
||||
bool
|
||||
delay::is_local_available()
|
||||
{
|
||||
thr_init();
|
||||
auto& _data = get_delay_data();
|
||||
return _data != nullptr;
|
||||
}
|
||||
|
||||
int64_t&
|
||||
delay::get_local(int64_t _tid)
|
||||
{
|
||||
thr_init();
|
||||
auto& _data = get_delay_data();
|
||||
if(_data == nullptr)
|
||||
{
|
||||
throw std::runtime_error("No data: get_delay_data() returned nullptr");
|
||||
}
|
||||
return _data->at(_tid);
|
||||
(void) _thr_init;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
|
||||
@@ -53,6 +53,7 @@ struct delay : comp::empty_base
|
||||
|
||||
static std::atomic<int64_t>& get_global();
|
||||
static int64_t& get_local(int64_t _tid = threading::get_id());
|
||||
static bool is_local_available();
|
||||
|
||||
static int64_t get(int64_t _tid = threading::get_id());
|
||||
static uint64_t compute_total_delay(uint64_t);
|
||||
|
||||
@@ -654,7 +654,16 @@ get_instances()
|
||||
std::unique_ptr<perf_event>&
|
||||
get_instance(int64_t _tid)
|
||||
{
|
||||
auto& _data = get_instances();
|
||||
static auto nullInstance = std::unique_ptr<perf_event>{ nullptr };
|
||||
auto& _data = get_instances();
|
||||
|
||||
// If get_instances() returned an empty object, we have to return a reference to a
|
||||
// static null instance, or else we will crash.
|
||||
if(_data == nullptr)
|
||||
{
|
||||
return nullInstance;
|
||||
}
|
||||
|
||||
if(static_cast<size_t>(_tid) >= _data->size())
|
||||
{
|
||||
ROCPROFSYS_SCOPED_THREAD_STATE(ThreadState::Internal);
|
||||
|
||||
مرجع در شماره جدید
Block a user