From 7b00d3a89b43bdd084454843db0c8412f44414dd Mon Sep 17 00:00:00 2001 From: habajpai-amd Date: Fri, 19 Dec 2025 11:56:40 +0530 Subject: [PATCH] fix: prevent double-free crash during process exit in amd-smi (#2213) --- .../source/lib/core/gpu.cpp | 16 +++++++-- .../lib/rocprof-sys/library/amd_smi.cpp | 5 +-- .../library/components/exit_gotcha.cpp | 36 ++++++++++++++++++- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/projects/rocprofiler-systems/source/lib/core/gpu.cpp b/projects/rocprofiler-systems/source/lib/core/gpu.cpp index 66770179c6..57c37f2dbf 100644 --- a/projects/rocprofiler-systems/source/lib/core/gpu.cpp +++ b/projects/rocprofiler-systems/source/lib/core/gpu.cpp @@ -31,8 +31,6 @@ } \ } // namespace ::tim::cereal -#include "common/defines.h" - #if !defined(ROCPROFSYS_USE_ROCM) # define ROCPROFSYS_USE_ROCM 0 #endif @@ -43,6 +41,7 @@ #include +#include #include #include "core/agent_manager.hpp" @@ -90,6 +89,17 @@ _amdsmi_is_initialized() return initialized; } +void +prevent_amdsmi_library_unload() +{ + static bool _initialized = false; + if(_initialized) return; + _initialized = true; + + dlopen("libamd_smi.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE); + dlopen("librocm_smi64.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE); +} + bool amdsmi_init() { @@ -100,6 +110,8 @@ amdsmi_init() ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS)); get_processor_handles(); _amdsmi_is_initialized() = true; // Mark as initialized + + prevent_amdsmi_library_unload(); } catch(std::exception& _e) { ROCPROFSYS_BASIC_VERBOSE(1, "Exception thrown initializing amd-smi: %s\n", diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp index c8869e6cf3..f802022afa 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp @@ -1265,10 +1265,7 @@ shutdown() try { - if(data::shutdown()) - { - ROCPROFSYS_AMD_SMI_CALL(amdsmi_shut_down()); - } + data::shutdown(); } catch(std::runtime_error& _e) { ROCPROFSYS_VERBOSE(0, "Exception thrown when shutting down amd-smi: %s\n", diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp index 656697d73b..23ceeeb89a 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp @@ -25,7 +25,6 @@ #include "core/config.hpp" #include "core/debug.hpp" #include "core/state.hpp" -#include "core/timemory.hpp" #include "library/runtime.hpp" #include @@ -33,7 +32,9 @@ #include #include +#include #include +#include namespace rocprofsys { @@ -105,6 +106,39 @@ void exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) const { _exit_info = { true, _data.tool_id.find("quick") != std::string::npos, _ec }; + + if(config::get_use_amd_smi()) + { + threading::clear_callbacks(); + + if(get_state() < ::rocprofsys::State::Finalized && !is_child_process()) + { + if(config::settings_are_configured()) + { + ROCPROFSYS_VERBOSE(0, "finalizing %s before calling %s(%i)...\n", + get_exe_name().c_str(), _data.tool_id.c_str(), _ec); + } + else + { + ROCPROFSYS_BASIC_VERBOSE(0, "finalizing %s before calling %s(%i)...\n", + get_exe_name().c_str(), _data.tool_id.c_str(), + _ec); + } + + rocprofsys_finalize(); + } + + if(config::settings_are_configured()) + { + ROCPROFSYS_VERBOSE( + 0, "calling _exit(%i) in %s to avoid AMD SMI cleanup issues...\n", _ec, + get_exe_name().c_str()); + } + + std::fflush(nullptr); + _exit(_ec); + } + invoke_exit_gotcha(_data, _func, _ec); }