diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index ea9e1fa5e5..c2fd0585d6 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -1068,7 +1068,9 @@ class GpuMetricsBase_t { virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0; virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; } virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; } + static std::mutex s_base_tbl_mu; virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() { + std::lock_guard lk(s_base_tbl_mu); return m_base_metrics_dynamic_tbl; } diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 9185cbb581..d98bf766bc 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -1110,6 +1110,7 @@ namespace { GpuMetricsCache g_gpu_metrics_cache; // Keep 1 cache map, with an entry for each gpu std::unordered_map g_gpu_metrics_cache_map; + std::mutex g_gpu_metrics_cache_map_mu; static const std::chrono::milliseconds kGpuMetricsCacheDuration( read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 1) ); @@ -1132,20 +1133,25 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, // is the issue, so should remain. const std::string key = path_ + "/device/" + kDevAttribNameMap.at(type) + "#" + std::to_string(b_size); - auto& cache = g_gpu_metrics_cache_map[key]; + + GpuMetricsCache* cache_ptr = nullptr; + { + std::lock_guard map_lk(g_gpu_metrics_cache_map_mu); + cache_ptr = &g_gpu_metrics_cache_map[key]; // safe now + } // Only cache for kDevGpuMetrics if (type == DevInfoTypes::kDevGpuMetrics) { - std::lock_guard lock(cache.mtx); + std::lock_guard lock(cache_ptr->mtx); auto now = std::chrono::steady_clock::now(); - auto last_read_delta = std::chrono::duration_cast(now - cache.last_read); + auto last_read_delta = std::chrono::duration_cast(now - cache_ptr->last_read); - if (!cache.data.empty() && + if (!cache_ptr->data.empty() && kGpuMetricsCacheDuration > std::chrono::milliseconds::zero() && last_read_delta < kGpuMetricsCacheDuration && - cache.data.size() == b_size) { + cache_ptr->data.size() == b_size) { - std::memcpy(p_binary_data, cache.data.data(), b_size); + std::memcpy(p_binary_data, cache_ptr->data.data(), b_size); if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { ss << "Returned cached DevInfoBinary for DevInfoType (" @@ -1206,12 +1212,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, if (type == DevInfoTypes::kDevGpuMetrics && kGpuMetricsCacheDuration > std::chrono::milliseconds::zero()) { auto now = std::chrono::steady_clock::now(); - auto& cache = g_gpu_metrics_cache_map[key]; - std::lock_guard lock(cache.mtx); - cache.data.assign( - reinterpret_cast(p_binary_data), - reinterpret_cast(p_binary_data) + b_size); - cache.last_read = now; + + std::lock_guard lock(cache_ptr->mtx); + cache_ptr->data.assign( + reinterpret_cast(p_binary_data), + reinterpret_cast(p_binary_data) + b_size); + cache_ptr->last_read = now; if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { ss << "Successfully Cached GPU Metrics binaryData = " << p_binary_data diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index 0722f898df..3bac9726b5 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -62,6 +62,8 @@ using namespace amd::smi; namespace amd::smi { +std::mutex GpuMetricsBase_t::s_base_tbl_mu; + constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev) { return static_cast((format_rev << 8 | content_rev)); @@ -509,11 +511,9 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() { << stringfy_metric_header_version(disjoin_metrics_version(gpu_metrics_version)) << " |"; LOG_TRACE(ss); - // firmware_timestamp is at 10ns resolution - ss << __PRETTY_FUNCTION__ << " | ======= Changes ======= " - << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp - << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); - m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + // firmware_timestamp is at 10ns resolution; leave as-is. + ss << __PRETTY_FUNCTION__ << " | firmware_timestamp (10ns) = " + << m_gpu_metrics_tbl.m_firmware_timestamp; LOG_DEBUG(ss); }; @@ -744,10 +744,12 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() { << " | Returning = " << getRSMIStatusString(status_code) << " |"; LOG_TRACE(ss); - // Copy to base class - std::copy(m_metrics_dynamic_tbl.begin(), m_metrics_dynamic_tbl.end(), - std::inserter(GpuMetricsBase_t::m_base_metrics_dynamic_tbl, - GpuMetricsBase_t::m_base_metrics_dynamic_tbl.end())); + + { + std::lock_guard lk(s_base_tbl_mu); + // Copy to base class + this->m_base_metrics_dynamic_tbl = m_metrics_dynamic_tbl; + } return status_code; }