[SWDEV-559082] Add asic info cache (#756)
Signed-off-by: adapryor <Adam.pryor@amd.com>
[ROCm/amdsmi commit: cba4c871d3]
Этот коммит содержится в:
@@ -42,8 +42,12 @@ except ImportError as e:
|
||||
# from amdsmi import amdsmi_exception
|
||||
|
||||
# Set the environment variable for GPU metrics cache duration
|
||||
cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100")
|
||||
logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", cache_ms)
|
||||
gpu_metrics_cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100")
|
||||
logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", gpu_metrics_cache_ms)
|
||||
|
||||
# Set the environment variable for ASIC cache duration
|
||||
asic_info_cache_ms = os.environ.setdefault("AMDSMI_ASIC_INFO_CACHE_MS", "10000") # 10 seconds
|
||||
logging.debug("AMDSMI_ASIC_INFO_CACHE_MS = %sms", asic_info_cache_ms)
|
||||
|
||||
try:
|
||||
from amdsmi_init import *
|
||||
|
||||
@@ -87,6 +87,7 @@ Current Variables:
|
||||
|
||||
```{note}
|
||||
AMDSMI_GPU_METRICS_CACHE_MS - Controls the internal GPU metrics cache duration (ms). Default 100, set to 0 to disable.
|
||||
AMDSMI_ASIC_INFO_CACHE_MS - Controls the internal GPU asic info cache duration (ms). Default 10000, set to 0 to disable.
|
||||
```
|
||||
|
||||
(cmds)=
|
||||
|
||||
@@ -27,6 +27,12 @@ control the internal GPU metrics cache duration (ms).
|
||||
Default 1, set to 0 to disable.
|
||||
```
|
||||
|
||||
```{note}
|
||||
The environment variable ``AMDSMI_ASIC_INFO_CACHE_MS`` may be set to
|
||||
control the internal GPU asic info cache duration (ms).
|
||||
Default 10000 ms, set to 0 to disable.
|
||||
```
|
||||
|
||||
```{seealso}
|
||||
Refer to the [C++ library API reference](../reference/amdsmi-cpp-api.md).
|
||||
```
|
||||
|
||||
@@ -44,7 +44,11 @@ The environment variable ``AMDSMI_GPU_METRICS_CACHE_MS`` may be set to
|
||||
control the internal GPU metrics cache duration (ms).
|
||||
Default 1, set to 0 to disable.
|
||||
|
||||
You can apply it in one of two ways:
|
||||
The environment variable ``AMDSMI_ASIC_INFO_CACHE_MS`` may be set to
|
||||
control the internal GPU asic info cache duration (ms).
|
||||
Default 10000 ms, set to 0 to disable.
|
||||
|
||||
You can apply them in one of two ways:
|
||||
|
||||
1. In Python code (before the AMDSMI library loads):
|
||||
```
|
||||
|
||||
@@ -118,6 +118,19 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
|
||||
uint32_t device_index,
|
||||
amdsmi_processor_handle *processor_handle);
|
||||
|
||||
/**
|
||||
* @brief Get an int environment variable, or return a default if it does not exist
|
||||
*
|
||||
* @details Calls getenv with @p name and parses the value as an int,
|
||||
* clamping negative values to 0. On any error, returns the default int @p def
|
||||
*
|
||||
* @param[in] name a const char* containing ENV var name
|
||||
*
|
||||
* @param[in] def default int in case of error
|
||||
*
|
||||
* @retval int of environment variable
|
||||
*/
|
||||
int read_env_ms(const char* name, int def);
|
||||
|
||||
template<typename>
|
||||
constexpr bool is_dependent_false_v = false;
|
||||
|
||||
@@ -1096,7 +1096,7 @@ namespace {
|
||||
try {
|
||||
return std::max(0, std::stoi(s));
|
||||
} catch (...) {
|
||||
// Ignore error, fallback on 100 ms default
|
||||
// Ignore error, fallback to passed in def
|
||||
}
|
||||
}
|
||||
return def;
|
||||
@@ -1108,7 +1108,6 @@ namespace {
|
||||
std::mutex mtx;
|
||||
};
|
||||
|
||||
GpuMetricsCache g_gpu_metrics_cache;
|
||||
// Keep 1 cache map, with an entry for each gpu
|
||||
std::unordered_map<std::string, GpuMetricsCache> g_gpu_metrics_cache_map;
|
||||
std::mutex g_gpu_metrics_cache_map_mu;
|
||||
|
||||
@@ -1572,6 +1572,22 @@ amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle,
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// If similar caches are implemented in the future, make this generic and move it
|
||||
namespace {
|
||||
struct AsicInfoCache {
|
||||
amdsmi_asic_info_t info{};
|
||||
std::chrono::steady_clock::time_point last_read;
|
||||
bool valid = false;
|
||||
std::mutex mtx;
|
||||
};
|
||||
|
||||
std::unordered_map<std::string, AsicInfoCache> g_asic_info_cache_map;
|
||||
std::mutex g_asic_info_cache_map_mu;
|
||||
static const std::chrono::milliseconds kAsicInfoCacheDuration(
|
||||
read_env_ms("AMDSMI_ASIC_INFO_CACHE_MS", 10000)
|
||||
);
|
||||
}
|
||||
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
@@ -1607,6 +1623,33 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
}
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
|
||||
|
||||
// ---- ASIC info cache ----
|
||||
const std::string key = gpu_device->get_gpu_path();
|
||||
|
||||
AsicInfoCache* cache_ptr = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> map_lk(g_asic_info_cache_map_mu);
|
||||
cache_ptr = &g_asic_info_cache_map[key];
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(cache_ptr->mtx);
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache_ptr->last_read);
|
||||
|
||||
if (cache_ptr->valid &&
|
||||
kAsicInfoCacheDuration > std::chrono::milliseconds::zero() &&
|
||||
last_read_delta < kAsicInfoCacheDuration) {
|
||||
|
||||
*info = cache_ptr->info;
|
||||
|
||||
ss << "Returned cached ASIC info for key=" << key
|
||||
<< " (age=" << last_read_delta.count() << "ms)";
|
||||
LOG_INFO(ss);
|
||||
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For other sysfs related information, get from rocm-smi
|
||||
*/
|
||||
@@ -1803,6 +1846,20 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
<< std::hex << info->target_graphics_version << "\n"
|
||||
<< " | Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, true);
|
||||
LOG_INFO(ss);
|
||||
|
||||
// ---- Store cache success ----
|
||||
if (status == AMDSMI_STATUS_SUCCESS &&
|
||||
kAsicInfoCacheDuration > std::chrono::milliseconds::zero()) {
|
||||
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
std::lock_guard<std::mutex> lk(cache_ptr->mtx);
|
||||
cache_ptr->info = *info;
|
||||
cache_ptr->last_read = now;
|
||||
cache_ptr->valid = true;
|
||||
|
||||
ss << "Successfully Cached ASIC info for key=" << key;
|
||||
LOG_INFO(ss);
|
||||
}
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -988,6 +988,17 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
|
||||
return AMDSMI_STATUS_API_FAILED;
|
||||
}
|
||||
|
||||
/**
 * @brief Read a non-negative integer (milliseconds) from an environment variable.
 *
 * @details The entire value must be a base-10 integer; leading whitespace
 * (accepted by std::stoi) and trailing spaces/tabs are tolerated. Values with
 * any other trailing characters, such as "100ms", "1e4" or "0x10", are
 * rejected rather than being silently truncated to their numeric prefix.
 * Negative values are clamped to 0.
 *
 * @param[in] name environment variable name; a null pointer yields @p def
 * @param[in] def  value returned when the variable is unset, empty,
 *                 malformed, or does not fit in an int
 *
 * @retval the parsed value clamped to >= 0, or @p def on any error
 */
int read_env_ms(const char* name, int def) {
  // std::getenv(nullptr) is undefined behavior, so guard before the lookup.
  const char* raw = (name != nullptr) ? std::getenv(name) : nullptr;
  if (raw == nullptr) {
    return def;
  }

  try {
    const std::string text(raw);
    std::size_t pos = 0;
    const int value = std::stoi(text, &pos);

    // std::stoi stops at the first character it cannot use; anything left
    // over other than blanks means the value was not a plain integer.
    while (pos < text.size() && (text[pos] == ' ' || text[pos] == '\t')) {
      ++pos;
    }
    if (pos != text.size()) {
      return def;
    }

    return std::max(0, value);
  } catch (...) {
    // Deliberate best-effort: std::stoi throws on non-numeric or out-of-range
    // input; a bad environment value must never abort the library, so fall
    // through to the caller-supplied default.
  }
  return def;
}
|
||||
|
||||
struct CperFileCtx {
|
||||
amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR;
|
||||
std::unique_ptr<char[]> buffer;
|
||||
|
||||
@@ -29,6 +29,9 @@ import sys
|
||||
# Metrics cache set to 1 by default, uncomment to change
|
||||
# os.environ["AMDSMI_GPU_METRICS_CACHE_MS"] = "1"
|
||||
|
||||
# ASIC info cache set to 10000 by default, uncomment to change
|
||||
# os.environ["AMDSMI_ASIC_INFO_CACHE_MS"] = "10000"
|
||||
|
||||
try:
|
||||
from amdsmi import *
|
||||
except ImportError as e:
|
||||
|
||||
Ссылка в новой задаче
Block a user