From 1883f736adb177395a33d6682a7ed465533460ce Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Thu, 15 Jan 2026 17:43:47 +0100 Subject: [PATCH] Fix double-free crash when librocm_smi64.so and libamd_smi.so are loaded together (#2531) Problem: When a TheRock-based PyTorch package is installed along with amdsmi, importing torch causes a double-free crash on exit (GitHub issue ROCm/TheRock#2269). Root cause: Both librocm_smi64.so and libamd_smi.so export the C++ static member 'amd::smi::Device::devInfoTypesStrings'. When the libraries are loaded with RTLD_GLOBAL, the dynamic linker resolves libamd_smi.so's reference to this symbol to the one in librocm_smi64.so. This causes: 1. librocm_smi64.so registers its destructor for devInfoTypesStrings 2. libamd_smi.so also registers a destructor, but for the SAME address 3. On exit, both destructors run on the same object -> double-free Fix: Change devInfoTypesStrings from a class static member to a file-local static variable. This ensures the symbol has internal linkage and is not exported, preventing the symbol collision. Changes: - rocm_smi_device.h: Remove static member declaration - rocm_smi_device.cc: Change from 'Device::devInfoTypesStrings' to file-local 'static const std::map<...> devInfoTypesStrings' - rocm_smi.cc: Remove the global alias to the (now removed) class member Tested on gfx1151. `import torch` crashed on exit before the fix, and doesn't crash after the fix. 
--- projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h | 2 +- projects/rocm-smi-lib/src/rocm_smi.cc | 1 - projects/rocm-smi-lib/src/rocm_smi_device.cc | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index d00d037178..627af3625b 100644 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -244,7 +244,7 @@ class Device { rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics); AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics(); - static const std::map devInfoTypesStrings; + void set_smi_device_id(uint32_t i) { m_device_id = i; } void set_smi_partition_id(uint32_t i) { m_partition_id = i; } static const char* get_type_string(DevInfoTypes type); diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index e7e2151bec..190fcb8366 100644 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -84,7 +84,6 @@ using amd::smi::monitorTypesToString; using amd::smi::getRSMIStatusString; using amd::smi::AMDGpuMetricsUnitType_t; using amd::smi::AMDGpuMetricTypeId_t; -auto &devInfoTypesStrings = amd::smi::Device::devInfoTypesStrings; static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3F; diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index cf8cbf7ced..c03166b150 100644 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -379,8 +379,7 @@ static const std::map kDevInfoVarTypeToRSMIVariant = { {kDevDFCountersAvailable, RSMI_EVNT_GRP_XGMI} }; -const std::map -Device::devInfoTypesStrings = { +static const std::map devInfoTypesStrings = { {kDevPerfLevel, "kDevPerfLevel"}, {kDevOverDriveLevel, 
"kDevOverDriveLevel"}, {kDevMemOverDriveLevel, "kDevMemOverDriveLevel"},