[SWDEV-509389] AMD-SMI crash when multiple threads call SMI APIs (#53)

Calling rsmi_dev_gpu_metrics_info_get() concurrently from multiple threads causes a crash.

Code changes related to the following:
  * API implementation changes

Change-Id: I1f1fb39c1125569ec5d534b37fd6f68c8829eef7

Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
Authored-by: Oliveira, Daniel <daniel.oliveira@amd.com>
This commit is contained in:
Arif, Maisam
2025-01-21 14:00:15 -06:00
committato da GitHub
parent 834993e1c3
commit 03a2368655
2 changed files with 57 additions and 34 deletions
@@ -31,9 +31,12 @@
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <map>
#include <memory>
#include <mutex>
#include <thread>
#include <type_traits>
#include <tuple>
#include <variant>
@@ -938,20 +941,18 @@ class GpuMetricsBase_t {
virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; }
virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; }
virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() {
return m_metrics_dynamic_tbl;
return m_base_metrics_dynamic_tbl;
}
protected:
AMDGpuDynamicMetricsTbl_t m_metrics_dynamic_tbl;
AMDGpuDynamicMetricsTbl_t m_base_metrics_dynamic_tbl;
uint64_t m_metrics_timestamp;
uint32_t m_device_id;
uint32_t m_partition_id;
};
using GpuMetricsBasePtr = std::shared_ptr<GpuMetricsBase_t>;
using AMDGpuMetricFactories_t = const std::map<AMDGpuMetricVersionFlags_t, GpuMetricsBasePtr>;
class GpuMetricsBase_v11_t final : public GpuMetricsBase_t {
public:
virtual ~GpuMetricsBase_v11_t() = default;
+52 -30
Vedi File
@@ -869,10 +869,7 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -1117,6 +1114,12 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -1126,10 +1129,7 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -1357,6 +1357,12 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -1474,10 +1480,7 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -1694,6 +1697,12 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -1803,10 +1812,7 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -2008,6 +2014,12 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -3014,10 +3026,7 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -3263,6 +3272,12 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -3397,10 +3412,7 @@ rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -3624,6 +3636,12 @@ rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -3734,10 +3752,7 @@ rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() {
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
if (!m_metrics_dynamic_tbl.empty()) {
m_metrics_dynamic_tbl.clear();
}
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
//
// Note: Any metric treatment/changes (if any) should happen before they
// get written to internal/external tables.
@@ -3948,6 +3963,12 @@ rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() {
<< " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(),
m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
return status_code;
}
@@ -4692,8 +4713,8 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
CHK_SUPPORT_NAME_ONLY(smu)
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
std::ostringstream ostrstream;
std::ostringstream ss;
thread_local std::ostringstream ostrstream;
thread_local std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(ss);
@@ -4717,6 +4738,7 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
rsmi_dev_partition_id_get(dv_ind, &partition_id);
dev->set_smi_partition_id(partition_id);
dev->dev_log_gpu_metrics(ostrstream);
const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics();
if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__