[SWDEV-546543] Fix segfault in gpu_metrics

Signed-off-by: adapryor <Adam.pryor@amd.com>


[ROCm/amdsmi commit: d25c01e802]
Этот коммит содержится в:
adapryor
2025-08-22 13:36:03 -05:00
коммит произвёл Pham, Gabriel
родитель a68cd9612a
Коммит 17f9feb94e
3 изменённых файлов: 31 добавлений и 21 удалений
+2
Просмотреть файл
@@ -1068,7 +1068,9 @@ class GpuMetricsBase_t {
virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0;
/// Records the supplied device id in m_device_id.
virtual void set_device_id(uint32_t device_id) {
  m_device_id = device_id;
}
/// Records the supplied partition id in m_partition_id.
virtual void set_partition_id(uint32_t partition_id) {
  m_partition_id = partition_id;
}
static std::mutex s_base_tbl_mu;
/// Returns a by-value copy of m_base_metrics_dynamic_tbl.
///
/// The copy is made while s_base_tbl_mu is held, so the caller receives a
/// snapshot that it owns and can read without further locking.
virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() {
  std::lock_guard<std::mutex> guard(s_base_tbl_mu);
  // Take the snapshot explicitly while the guard is alive; it is returned
  // by value after this point.
  AMDGpuDynamicMetricsTbl_t snapshot = m_base_metrics_dynamic_tbl;
  return snapshot;
}
+18 -12
Просмотреть файл
@@ -1110,6 +1110,7 @@ namespace {
GpuMetricsCache g_gpu_metrics_cache;
// Keep 1 cache map, with an entry for each gpu
std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
std::mutex g_gpu_metrics_cache_map_mu;
static const std::chrono::milliseconds kGpuMetricsCacheDuration(
read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 1)
);
@@ -1132,20 +1133,25 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
// is the issue, so should remain.
const std::string key = path_ + "/device/" + kDevAttribNameMap.at(type)
+ "#" + std::to_string(b_size);
auto& cache = g_gpu_metrics_cache_map[key];
GpuMetricsCache* cache_ptr = nullptr;
{
std::lock_guard<std::mutex> map_lk(g_gpu_metrics_cache_map_mu);
cache_ptr = &g_gpu_metrics_cache_map[key]; // safe now
}
// Only cache for kDevGpuMetrics
if (type == DevInfoTypes::kDevGpuMetrics) {
std::lock_guard<std::mutex> lock(cache.mtx);
std::lock_guard<std::mutex> lock(cache_ptr->mtx);
auto now = std::chrono::steady_clock::now();
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache.last_read);
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache_ptr->last_read);
if (!cache.data.empty() &&
if (!cache_ptr->data.empty() &&
kGpuMetricsCacheDuration > std::chrono::milliseconds::zero() &&
last_read_delta < kGpuMetricsCacheDuration &&
cache.data.size() == b_size) {
cache_ptr->data.size() == b_size) {
std::memcpy(p_binary_data, cache.data.data(), b_size);
std::memcpy(p_binary_data, cache_ptr->data.data(), b_size);
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
ss << "Returned cached DevInfoBinary for DevInfoType ("
@@ -1206,12 +1212,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
if (type == DevInfoTypes::kDevGpuMetrics &&
kGpuMetricsCacheDuration > std::chrono::milliseconds::zero()) {
auto now = std::chrono::steady_clock::now();
auto& cache = g_gpu_metrics_cache_map[key];
std::lock_guard<std::mutex> lock(cache.mtx);
cache.data.assign(
reinterpret_cast<uint8_t*>(p_binary_data),
reinterpret_cast<uint8_t*>(p_binary_data) + b_size);
cache.last_read = now;
std::lock_guard<std::mutex> lock(cache_ptr->mtx);
cache_ptr->data.assign(
reinterpret_cast<uint8_t*>(p_binary_data),
reinterpret_cast<uint8_t*>(p_binary_data) + b_size);
cache_ptr->last_read = now;
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
ss << "Successfully Cached GPU Metrics binaryData = " << p_binary_data
+11 -9
Просмотреть файл
@@ -62,6 +62,8 @@ using namespace amd::smi;
namespace amd::smi
{
std::mutex GpuMetricsBase_t::s_base_tbl_mu;
constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev)
{
return static_cast<uint16_t>((format_rev << 8 | content_rev));
@@ -509,11 +511,9 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() {
<< stringfy_metric_header_version(disjoin_metrics_version(gpu_metrics_version)) << " |";
LOG_TRACE(ss);
// firmware_timestamp is at 10ns resolution
ss << __PRETTY_FUNCTION__ << " | ======= Changes ======= "
<< " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
<< " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
// firmware_timestamp is at 10ns resolution; leave as-is.
ss << __PRETTY_FUNCTION__ << " | firmware_timestamp (10ns) = "
<< m_gpu_metrics_tbl.m_firmware_timestamp;
LOG_DEBUG(ss);
};
@@ -744,10 +744,12 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() {
<< " | Returning = " << getRSMIStatusString(status_code) << " |";
LOG_TRACE(ss);
// Copy to base class
std::copy(m_metrics_dynamic_tbl.begin(), m_metrics_dynamic_tbl.end(),
std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl,
GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end()));
{
std::lock_guard<std::mutex> lk(s_base_tbl_mu);
// Copy to base class
this->m_base_metrics_dynamic_tbl = m_metrics_dynamic_tbl;
}
return status_code;
}