[SWDEV-509389] AMD-SMI crash when multiple threads call SMI APIs (#53)
Calling rsmi_dev_gpu_metrics_info_get() from a multi-threaded application causes a crash.

Code changes related to the following:
* API implementation changes

Change-Id: I1f1fb39c1125569ec5d534b37fd6f68c8829eef7
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
Authored-by: Oliveira, Daniel <daniel.oliveira@amd.com>
This commit is contained in:
@@ -31,9 +31,12 @@
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <type_traits>
|
||||
#include <tuple>
|
||||
#include <variant>
|
||||
@@ -938,20 +941,18 @@ class GpuMetricsBase_t {
|
||||
// Stores the given `device_id` in the `m_device_id` member.
virtual void set_device_id(uint32_t device_id) {
  m_device_id = device_id;
}
|
||||
// Stores the given `partition_id` in the `m_partition_id` member.
virtual void set_partition_id(uint32_t partition_id) {
  m_partition_id = partition_id;
}
|
||||
virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() {
|
||||
return m_metrics_dynamic_tbl;
|
||||
return m_base_metrics_dynamic_tbl;
|
||||
}
|
||||
|
||||
protected:
|
||||
AMDGpuDynamicMetricsTbl_t m_metrics_dynamic_tbl;
|
||||
AMDGpuDynamicMetricsTbl_t m_base_metrics_dynamic_tbl;
|
||||
uint64_t m_metrics_timestamp;
|
||||
uint32_t m_device_id;
|
||||
uint32_t m_partition_id;
|
||||
|
||||
};
|
||||
using GpuMetricsBasePtr = std::shared_ptr<GpuMetricsBase_t>;
|
||||
using AMDGpuMetricFactories_t = const std::map<AMDGpuMetricVersionFlags_t, GpuMetricsBasePtr>;
|
||||
|
||||
|
||||
class GpuMetricsBase_v11_t final : public GpuMetricsBase_t {
|
||||
public:
|
||||
virtual ~GpuMetricsBase_v11_t() = default;
|
||||
|
||||
@@ -869,10 +869,7 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -1117,6 +1114,12 @@ rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -1126,10 +1129,7 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -1357,6 +1357,12 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -1474,10 +1480,7 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -1694,6 +1697,12 @@ rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -1803,10 +1812,7 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -2008,6 +2014,12 @@ rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -3014,10 +3026,7 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -3263,6 +3272,12 @@ rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -3397,10 +3412,7 @@ rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -3624,6 +3636,12 @@ rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -3734,10 +3752,7 @@ rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (!m_metrics_dynamic_tbl.empty()) {
|
||||
m_metrics_dynamic_tbl.clear();
|
||||
}
|
||||
|
||||
auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
@@ -3948,6 +3963,12 @@ rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() {
|
||||
<< " |";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Copy to base class
|
||||
std::copy(m_metrics_dynamic_tbl.begin(),
|
||||
m_metrics_dynamic_tbl.end(),
|
||||
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
|
||||
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
@@ -4692,8 +4713,8 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
|
||||
CHK_SUPPORT_NAME_ONLY(smu)
|
||||
|
||||
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
|
||||
std::ostringstream ostrstream;
|
||||
std::ostringstream ss;
|
||||
thread_local std::ostringstream ostrstream;
|
||||
thread_local std::ostringstream ss;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
@@ -4717,6 +4738,7 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
|
||||
rsmi_dev_partition_id_get(dv_ind, &partition_id);
|
||||
dev->set_smi_partition_id(partition_id);
|
||||
dev->dev_log_gpu_metrics(ostrstream);
|
||||
|
||||
const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics();
|
||||
if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
|
||||
Reference in a new issue
Block a user