diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index 19ec2734b9..daaeb6b644 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -42,8 +42,12 @@ except ImportError as e: # from amdsmi import amdsmi_exception # Set the environment variable for GPU metrics cache duration -cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100") -logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", cache_ms) +gpu_metrics_cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100") +logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", gpu_metrics_cache_ms) + +# Set the environment variable for ASIC cache duration +asic_info_cache_ms = os.environ.setdefault("AMDSMI_ASIC_INFO_CACHE_MS", "10000") # 10 seconds +logging.debug("AMDSMI_ASIC_INFO_CACHE_MS = %sms", asic_info_cache_ms) try: from amdsmi_init import * diff --git a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md index 53dc9b315e..b041b05027 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md +++ b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md @@ -87,6 +87,7 @@ Current Variables: ```{note} AMDSMI_GPU_METRICS_CACHE_MS - Controls the internal GPU metrics cache duration (ms). Default 100, set to 0 to disable. +AMDSMI_ASIC_INFO_CACHE_MS - Controls the internal GPU ASIC info cache duration (ms). Default 10000, set to 0 to disable. ``` (cmds)= diff --git a/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md b/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md index 766bb90090..5c4ab54296 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md +++ b/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md @@ -27,6 +27,12 @@ control the internal GPU metrics cache duration (ms). Default 1, set to 0 to disable. ``` +```{note} +The environment variable ``AMDSMI_ASIC_INFO_CACHE_MS`` may be set to +control the internal GPU ASIC info cache duration (ms). 
+Default 10000 ms, set to 0 to disable. +``` + ```{seealso} Refer to the [C++ library API reference](../reference/amdsmi-cpp-api.md). ``` diff --git a/projects/amdsmi/docs/how-to/amdsmi-py-lib.md b/projects/amdsmi/docs/how-to/amdsmi-py-lib.md index d054b31035..2fc1833f7c 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-py-lib.md +++ b/projects/amdsmi/docs/how-to/amdsmi-py-lib.md @@ -44,7 +44,11 @@ The environment variable ``AMDSMI_GPU_METRICS_CACHE_MS`` may be set to control the internal GPU metrics cache duration (ms). Default 1, set to 0 to disable. -You can apply it in one of two ways: +The environment variable ``AMDSMI_ASIC_INFO_CACHE_MS`` may be set to +control the internal GPU ASIC info cache duration (ms). +Default 10000 ms, set to 0 to disable. + +You can apply them in one of two ways: 1. In Python code (before the AMDSMI library loads): ``` diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h index 8e7246e8af..a3be8194f3 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h @@ -118,6 +118,19 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index( uint32_t device_index, amdsmi_processor_handle *processor_handle); +/** + * @brief Get an int environment var or return default if it does not exist + * + * @details Given a const char* @p name and a default int @p def, + * call getenv with name. 
On any error, return default int + * + * @param[in] name a const char* containing ENV var name + * + * @param[in] def default int in case of error + * + * @retval int of environment variable + */ +int read_env_ms(const char* name, int def); template constexpr bool is_dependent_false_v = false; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 2630dc03e6..09a5fb964c 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -1096,7 +1096,7 @@ namespace { try { return std::max(0, std::stoi(s)); } catch (...) { - // Ignore error, fallback on 100 ms default + // Ignore error, fallback to passed in def } } return def; @@ -1108,7 +1108,6 @@ namespace { std::mutex mtx; }; - GpuMetricsCache g_gpu_metrics_cache; // Keep 1 cache map, with an entry for each gpu std::unordered_map g_gpu_metrics_cache_map; std::mutex g_gpu_metrics_cache_map_mu; diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 89f053a5e7..1bd160dc18 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1572,6 +1572,22 @@ amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle, return AMDSMI_STATUS_SUCCESS; } +// If similar caches are implemented in the future, make this generic and move it +namespace { + struct AsicInfoCache { + amdsmi_asic_info_t info{}; + std::chrono::steady_clock::time_point last_read; + bool valid = false; + std::mutex mtx; + }; + + std::unordered_map g_asic_info_cache_map; + std::mutex g_asic_info_cache_map_mu; + static const std::chrono::milliseconds kAsicInfoCacheDuration( + read_env_ms("AMDSMI_ASIC_INFO_CACHE_MS", 10000) + ); +} + amdsmi_status_t amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info) { AMDSMI_CHECK_INIT(); @@ -1607,6 +1623,33 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, 
amdsmi_asic_i } SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) + // ---- ASIC info cache ---- + const std::string key = gpu_device->get_gpu_path(); + + AsicInfoCache* cache_ptr = nullptr; + { + std::lock_guard map_lk(g_asic_info_cache_map_mu); + cache_ptr = &g_asic_info_cache_map[key]; + } + { + std::lock_guard lk(cache_ptr->mtx); + auto now = std::chrono::steady_clock::now(); + auto last_read_delta = std::chrono::duration_cast(now - cache_ptr->last_read); + + if (cache_ptr->valid && + kAsicInfoCacheDuration > std::chrono::milliseconds::zero() && + last_read_delta < kAsicInfoCacheDuration) { + + *info = cache_ptr->info; + + ss << "Returned cached ASIC info for key=" << key + << " (age=" << last_read_delta.count() << "ms)"; + LOG_INFO(ss); + + return AMDSMI_STATUS_SUCCESS; + } + } + /** * For other sysfs related information, get from rocm-smi */ @@ -1803,6 +1846,20 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i << std::hex << info->target_graphics_version << "\n" << " | Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, true); LOG_INFO(ss); + + // ---- Store cache success ---- + if (status == AMDSMI_STATUS_SUCCESS && + kAsicInfoCacheDuration > std::chrono::milliseconds::zero()) { + + auto now = std::chrono::steady_clock::now(); + std::lock_guard lk(cache_ptr->mtx); + cache_ptr->info = *info; + cache_ptr->last_read = now; + cache_ptr->valid = true; + + ss << "Successfully Cached ASIC info for key=" << key; + LOG_INFO(ss); + } return AMDSMI_STATUS_SUCCESS; } diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 2c7c4be9ba..f399d9f4e2 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -988,6 +988,17 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index( return AMDSMI_STATUS_API_FAILED; } +int read_env_ms(const char* name, int def) { + if (const char* s = std::getenv(name)) { + try { + return 
std::max(0, std::stoi(s)); + } catch (...) { + // Ignore error, fallback to passed in def + } + } + return def; +} + struct CperFileCtx { amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR; std::unique_ptr buffer; diff --git a/projects/amdsmi/tools/amdsmi_quick_start.py b/projects/amdsmi/tools/amdsmi_quick_start.py index d6f7a164e9..6d4ffe8094 100644 --- a/projects/amdsmi/tools/amdsmi_quick_start.py +++ b/projects/amdsmi/tools/amdsmi_quick_start.py @@ -29,6 +29,9 @@ import sys # Metrics cache set to 1 by default, uncomment to change # os.environ["AMDSMI_GPU_METRICS_CACHE_MS"] = "1" +# ASIC info cache set to 10000 by default, uncomment to change +# os.environ["AMDSMI_ASIC_INFO_CACHE_MS"] = "10000" + try: from amdsmi import * except ImportError as e: