From 4303644f905799b7ca6cfe68e7fd05390213c859 Mon Sep 17 00:00:00 2001
From: "Pryor, Adam"
Date: Sun, 13 Jul 2025 09:56:29 -0500
Subject: [PATCH] Add gpu metrics cache (#541)

* Add gpu metrics caching defaulted to 100ms
* AMDSMI_GPU_METRICS_CACHE_MS is used to set the caching rate limits

---------

Signed-off-by: Maisam Arif
[ROCm/amdsmi commit: 42096c1398daf0d36d20bb2f00e6c994faa6bc13]
---
Review revision of the original patch:
  - Guard g_gpu_metrics_cache_map with its own mutex. operator[] may insert
    into the map, which is a data race when several threads read metrics.
  - Build the cache key and touch the map only for kDevGpuMetrics reads, and
    reuse the slot on the store path instead of looking it up a second time.
  - Drop the unused g_gpu_metrics_cache global.
  - Fix the read_env_ms() comment: it falls back on `def`, not on 100 ms.
NOTE(review): this copy of the patch had lost everything between angle
brackets. The operands of the context #include lines and the e-mail addresses
are left as found; the two added headers are presumed to be <chrono> and
<mutex> because the added code needs them - confirm against the upstream
commit. Indentation of context lines is likewise reconstructed.

 .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)
 mode change 100644 => 100755 projects/amdsmi/rocm_smi/src/rocm_smi_device.cc

diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc
old mode 100644
new mode 100755
index 79dbc55e6c..1681b5d1be
--- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc
+++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc
@@ -27,6 +27,7 @@
 
 #include
 #include
+#include <chrono>
 #include
 #include
 #include
@@ -34,6 +35,7 @@
 #include
 #include
 #include
+#include <mutex>
 #include
 #include
 #include
@@ -1080,11 +1082,78 @@ const char* Device::get_type_string(DevInfoTypes type) {
   return "Unknown";
 }
 
+namespace {
+
+// Reads a millisecond count from the environment variable `name`.
+// Returns `def` when the variable is unset or std::stoi() cannot parse it
+// (no leading number, or out of int range); negative values clamp to 0.
+int read_env_ms(const char* name, int def) {
+  if (const char* s = std::getenv(name)) {
+    try {
+      return std::max(0, std::stoi(s));
+    } catch (...) {
+      // Unusable value: fall through to the caller-supplied default.
+    }
+  }
+  return def;
+}
+
+// Last raw read of one sysfs attribute; mtx guards data and last_read.
+struct GpuMetricsCache {
+  std::vector<uint8_t> data;
+  std::chrono::steady_clock::time_point last_read;
+  std::mutex mtx;
+};
+
+// One slot per "<attribute path>#<read size>" key, i.e. per GPU and size.
+std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
+// Serializes lookups/insertions on the map. Slots are never erased, so a
+// reference obtained under this lock remains valid after it is released.
+std::mutex g_gpu_metrics_cache_map_mtx;
+const std::chrono::milliseconds kGpuMetricsCacheDuration(
+    read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 100));
+
+// Returns the cache slot for `key`, creating an empty one on first use.
+GpuMetricsCache& gpu_metrics_cache_slot(const std::string& key) {
+  std::lock_guard<std::mutex> lock(g_gpu_metrics_cache_map_mtx);
+  return g_gpu_metrics_cache_map[key];
+}
+
+}  // namespace
+
+
 int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
                               void *p_binary_data) {
   auto sysfs_path = path_;
   std::ostringstream ss;
 
+  // b_size is either 4 (header-only read) or 3872 (full read). Header reads
+  // are cached as well: touching sysfs at all appears to be the costly part,
+  // so caching only the full read would not be enough.
+  GpuMetricsCache* cache = nullptr;
+  if (type == DevInfoTypes::kDevGpuMetrics) {
+    cache = &gpu_metrics_cache_slot(path_ + "/device/" +
+                                    kDevAttribNameMap.at(type) + "#" +
+                                    std::to_string(b_size));
+    std::lock_guard<std::mutex> lock(cache->mtx);
+    const auto age = std::chrono::duration_cast<std::chrono::milliseconds>(
+        std::chrono::steady_clock::now() - cache->last_read);
+
+    // Serve the stored bytes while the last read is still fresh.
+    if (!cache->data.empty() && cache->data.size() == b_size &&
+        age < kGpuMetricsCacheDuration) {
+      std::memcpy(p_binary_data, cache->data.data(), b_size);
+
+      if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
+        ss << "Returned cached DevInfoBinary for DevInfoType ("
+           << get_type_string(type) << ")";
+        LOG_INFO(ss);
+      }
+
+      return 0;
+    }
+  }
+
   FILE *ptr;
   sysfs_path += "/device/";
   sysfs_path += kDevAttribNameMap.at(type);
@@ -1128,6 +1197,15 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
     logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16);
     LOG_INFO(ss);
   }
+
+  // Remember what was just read so calls inside the cache window reuse it.
+  if (cache != nullptr) {
+    const auto* bytes = static_cast<const uint8_t*>(p_binary_data);
+    std::lock_guard<std::mutex> lock(cache->mtx);
+    cache->data.assign(bytes, bytes + b_size);
+    cache->last_read = std::chrono::steady_clock::now();
+  }
+
   return 0;
 }
 