[SWDEV-559082] Add asic info cache (#756)

Signed-off-by: adapryor <Adam.pryor@amd.com>

[ROCm/amdsmi commit: cba4c871d3]
Этот коммит содержится в:
Pryor, Adam
2025-10-08 21:48:08 -05:00
коммит произвёл GitHub
родитель 0aae5d381d
Коммит 5127c923b9
9 изменённых файлов: 103 добавлений и 5 удалений
+6 -2
Просмотреть файл
@@ -42,8 +42,12 @@ except ImportError as e:
# from amdsmi import amdsmi_exception
# Set the environment variable for GPU metrics cache duration
cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100")
logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", cache_ms)
gpu_metrics_cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100")
logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", gpu_metrics_cache_ms)
# Set the environment variable for ASIC cache duration
asic_info_cache_ms = os.environ.setdefault("AMDSMI_ASIC_INFO_CACHE_MS", "10000") # 10 seconds
logging.debug("AMDSMI_ASIC_INFO_CACHE_MS = %sms", asic_info_cache_ms)
try:
from amdsmi_init import *
+1
Просмотреть файл
@@ -87,6 +87,7 @@ Current Variables:
```{note}
AMDSMI_GPU_METRICS_CACHE_MS - Controls the internal GPU metrics cache duration (ms). Default 100, set to 0 to disable.
AMDSMI_ASIC_INFO_CACHE_MS - Controls the internal GPU ASIC info cache duration (ms). Default 10000, set to 0 to disable.
```
(cmds)=
+6
Просмотреть файл
@@ -27,6 +27,12 @@ control the internal GPU metrics cache duration (ms).
Default 1, set to 0 to disable.
```
```{note}
The environment variable ``AMDSMI_ASIC_INFO_CACHE_MS`` may be set to
control the internal GPU ASIC info cache duration (ms).
Default 10000 ms, set to 0 to disable.
```
```{seealso}
Refer to the [C++ library API reference](../reference/amdsmi-cpp-api.md).
```
+5 -1
Просмотреть файл
@@ -44,7 +44,11 @@ The environment variable ``AMDSMI_GPU_METRICS_CACHE_MS`` may be set to
control the internal GPU metrics cache duration (ms).
Default 1, set to 0 to disable.
You can apply it in one of two ways:
The environment variable ``AMDSMI_ASIC_INFO_CACHE_MS`` may be set to
control the internal GPU ASIC info cache duration (ms).
Default 10000 ms, set to 0 to disable.
You can apply them in one of two ways:
1. In Python code (before the AMDSMI library loads):
```
+13
Просмотреть файл
@@ -118,6 +118,19 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
uint32_t device_index,
amdsmi_processor_handle *processor_handle);
/**
 * @brief Read a millisecond duration from an integer environment variable.
 *
 * @details Looks up @p name with getenv and parses its value as an int.
 * If the variable is not set, or its value cannot be parsed as an int,
 * @p def is returned unchanged. A successfully parsed negative value is
 * clamped to 0.
 *
 * @param[in] name NUL-terminated name of the environment variable
 *
 * @param[in] def value returned when the variable is unset or unparsable
 *
 * @return the parsed value (never negative), or @p def on failure
 */
int read_env_ms(const char* name, int def);
// Variable template that evaluates to false for every type argument.
// NOTE(review): presumably exists so a static_assert can be made dependent
// on a template parameter — the use site is not visible here; confirm.
template<typename>
constexpr bool is_dependent_false_v = false;
+1 -2
Просмотреть файл
@@ -1096,7 +1096,7 @@ namespace {
try {
return std::max(0, std::stoi(s));
} catch (...) {
// Ignore error, fallback on 100 ms default
// Ignore error, fallback to passed in def
}
}
return def;
@@ -1108,7 +1108,6 @@ namespace {
std::mutex mtx;
};
GpuMetricsCache g_gpu_metrics_cache;
// Keep 1 cache map, with an entry for each gpu
std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
std::mutex g_gpu_metrics_cache_map_mu;
+57
Просмотреть файл
@@ -1572,6 +1572,22 @@ amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle,
return AMDSMI_STATUS_SUCCESS;
}
// If similar caches are implemented in the future, make this generic and move it
namespace {
struct AsicInfoCache {
amdsmi_asic_info_t info{};
std::chrono::steady_clock::time_point last_read;
bool valid = false;
std::mutex mtx;
};
std::unordered_map<std::string, AsicInfoCache> g_asic_info_cache_map;
std::mutex g_asic_info_cache_map_mu;
static const std::chrono::milliseconds kAsicInfoCacheDuration(
read_env_ms("AMDSMI_ASIC_INFO_CACHE_MS", 10000)
);
}
amdsmi_status_t
amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info) {
AMDSMI_CHECK_INIT();
@@ -1607,6 +1623,33 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
}
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
// ---- ASIC info cache ----
const std::string key = gpu_device->get_gpu_path();
AsicInfoCache* cache_ptr = nullptr;
{
std::lock_guard<std::mutex> map_lk(g_asic_info_cache_map_mu);
cache_ptr = &g_asic_info_cache_map[key];
}
{
std::lock_guard<std::mutex> lk(cache_ptr->mtx);
auto now = std::chrono::steady_clock::now();
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache_ptr->last_read);
if (cache_ptr->valid &&
kAsicInfoCacheDuration > std::chrono::milliseconds::zero() &&
last_read_delta < kAsicInfoCacheDuration) {
*info = cache_ptr->info;
ss << "Returned cached ASIC info for key=" << key
<< " (age=" << last_read_delta.count() << "ms)";
LOG_INFO(ss);
return AMDSMI_STATUS_SUCCESS;
}
}
/**
* For other sysfs related information, get from rocm-smi
*/
@@ -1803,6 +1846,20 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
<< std::hex << info->target_graphics_version << "\n"
<< " | Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, true);
LOG_INFO(ss);
// ---- Store cache success ----
if (status == AMDSMI_STATUS_SUCCESS &&
kAsicInfoCacheDuration > std::chrono::milliseconds::zero()) {
auto now = std::chrono::steady_clock::now();
std::lock_guard<std::mutex> lk(cache_ptr->mtx);
cache_ptr->info = *info;
cache_ptr->last_read = now;
cache_ptr->valid = true;
ss << "Successfully Cached ASIC info for key=" << key;
LOG_INFO(ss);
}
return AMDSMI_STATUS_SUCCESS;
}
+11
Просмотреть файл
@@ -988,6 +988,17 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
return AMDSMI_STATUS_API_FAILED;
}
/**
 * Read a non-negative millisecond value from the environment variable `name`.
 *
 * Returns `def` when `name` is null, the variable is unset, or its value is
 * not a well-formed int (empty, non-numeric, out of range, or followed by
 * anything other than whitespace). A well-formed negative value is clamped
 * to 0.
 */
int read_env_ms(const char* name, int def) {
    if (name == nullptr) {
        return def;
    }
    const char* raw = std::getenv(name);
    if (raw == nullptr) {
        return def;
    }
    try {
        const std::string text(raw);
        std::size_t consumed = 0;
        const int parsed = std::stoi(text, &consumed);
        // std::stoi stops at the first character it cannot use, so a value
        // such as "100abc" would otherwise be silently accepted as 100.
        // Only trailing whitespace is tolerated after the number.
        if (text.find_first_not_of(" \t\r\n", consumed) != std::string::npos) {
            return def;
        }
        return std::max(0, parsed);
    } catch (...) {
        // std::stoi throws on empty, non-numeric or out-of-range input;
        // fall back to the caller-supplied default.
    }
    return def;
}
struct CperFileCtx {
amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR;
std::unique_ptr<char[]> buffer;
+3
Просмотреть файл
@@ -29,6 +29,9 @@ import sys
# Metrics cache set to 1 by default, uncomment to change
# os.environ["AMDSMI_GPU_METRICS_CACHE_MS"] = "1"
# ASIC info cache set to 10000 by default, uncomment to change
# os.environ["AMDSMI_ASIC_INFO_CACHE_MS"] = "10000"
try:
from amdsmi import *
except ImportError as e: