Add gpu metrics cache (#541)

* Add GPU metrics caching with a default cache lifetime of 100 ms
* The AMDSMI_GPU_METRICS_CACHE_MS environment variable sets the cache lifetime in milliseconds

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: 42096c1398]
This commit is contained in:
Pryor, Adam
2025-07-13 09:56:29 -05:00
committed by GitHub
vanhempi 6531fdd0fb
commit 4303644f90
+69
Näytä tiedosto
@@ -27,6 +27,7 @@
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <fstream>
@@ -34,6 +35,7 @@
#include <iterator>
#include <map>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <type_traits>
@@ -1080,11 +1082,66 @@ const char* Device::get_type_string(DevInfoTypes type) {
return "Unknown";
}
namespace {
// Reads a non-negative integer (interpreted by the caller as milliseconds)
// from the environment variable `name`.
//
// Returns `def` when the variable is unset or when std::stoi cannot convert
// its value (no leading number, or a value outside the range of int).
// Negative values are clamped to 0. std::stoi stops at the first character
// it cannot parse, so a value such as "42ms" yields 42.
static int read_env_ms(const char* name, int def) {
  const char* raw_value = std::getenv(name);
  if (raw_value == nullptr) {
    return def;
  }

  try {
    const int parsed = std::stoi(raw_value);
    return std::max(0, parsed);
  } catch (...) {
    // Best effort: any conversion failure falls back to the caller's default.
    return def;
  }
}
// One cached snapshot of a binary device-info read.
struct GpuMetricsCache {
  // Raw bytes of the most recent read; empty until something is stored.
  std::vector<uint8_t> data{};
  // steady_clock timestamp recorded when `data` was last stored.
  std::chrono::steady_clock::time_point last_read{};
  // Held by readers and writers while they access `data` and `last_read`.
  std::mutex mtx{};
};
GpuMetricsCache g_gpu_metrics_cache;
// Keep 1 cache map, with an entry for each gpu
std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
static const std::chrono::milliseconds kGpuMetricsCacheDuration(
read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 100)
);
}
int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
void *p_binary_data) {
auto sysfs_path = path_;
std::ostringstream ss;
// Size will either be 4, or 3872. When 4, it's only reading from the header.
// If this header read is inconsequential, we could only cache full read.
// However, it seems reading the sysfs in any capacity is the issue, so should remain.
const std::string key = path_ + "/device/" + kDevAttribNameMap.at(type) + "#" + std::to_string(b_size);
auto& cache = g_gpu_metrics_cache_map[key];
// Only cache for kDevGpuMetrics
if (type == DevInfoTypes::kDevGpuMetrics) {
std::lock_guard<std::mutex> lock(cache.mtx);
auto now = std::chrono::steady_clock::now();
if (!cache.data.empty() &&
std::chrono::duration_cast<std::chrono::milliseconds>(now - cache.last_read) < kGpuMetricsCacheDuration &&
cache.data.size() == b_size) {
std::memcpy(p_binary_data, cache.data.data(), b_size);
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
ss << "Returned cached DevInfoBinary for DevInfoType ("
<< get_type_string(type) << ")";
LOG_INFO(ss);
}
return 0;
}
}
FILE *ptr;
sysfs_path += "/device/";
sysfs_path += kDevAttribNameMap.at(type);
@@ -1128,6 +1185,18 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16);
LOG_INFO(ss);
}
// Cache metric data
if (type == DevInfoTypes::kDevGpuMetrics) {
auto now = std::chrono::steady_clock::now();
auto& cache = g_gpu_metrics_cache_map[key];
std::lock_guard<std::mutex> lock(cache.mtx);
cache.data.assign(
reinterpret_cast<uint8_t*>(p_binary_data),
reinterpret_cast<uint8_t*>(p_binary_data) + b_size);
cache.last_read = now;
}
return 0;
}