From 32a1ef90cd5274c4d298bb5bd7fc0556996f38cd Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Tue, 5 Aug 2025 19:58:37 -0500 Subject: [PATCH] Documentation updates for AMDSMI_GPU_METRICS_CACHE_MS (#564) Signed-off-by: Maisam Arif [ROCm/amdsmi commit: 2dc2e12a97f2cec4015cce307c4c0783f8dae56e] --- projects/amdsmi/amdsmi_cli/amdsmi_cli.py | 4 ++++ .../amdsmi/docs/how-to/amdsmi-cli-tool.md | 14 +++++++++++ projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md | 6 +++++ projects/amdsmi/docs/how-to/amdsmi-py-lib.md | 24 +++++++++++++++++++ .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 8 ++++++- projects/amdsmi/tools/amdsmi_quick_start.py | 3 +++ 6 files changed, 58 insertions(+), 1 deletion(-) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py index e9b2e7566e..94a6ee2222 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_cli.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_cli.py @@ -41,6 +41,10 @@ except ImportError as e: # from amdsmi import amdsmi_interface # from amdsmi import amdsmi_exception +# Set the environment variable for GPU metrics cache duration +cache_ms = os.environ.setdefault("AMDSMI_GPU_METRICS_CACHE_MS", "100") +logging.debug("AMDSMI_GPU_METRICS_CACHE_MS = %sms", cache_ms) + try: from amdsmi_init import * from amdsmi_helpers import AMDSMIHelpers diff --git a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md index edb975ea5d..3d5011182f 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md +++ b/projects/amdsmi/docs/how-to/amdsmi-cli-tool.md @@ -75,6 +75,20 @@ usage information. See [Commands](#cmds). For more detailed version information, use `amd-smi version`. ``` +Environment variables: + +You can set one or more variables in front of any `amd-smi` invocation. For example: + +```shell-session +AMDSMI_GPU_METRICS_CACHE_MS=200 amd-smi metric +``` + +Current Variables: + +```{note} +AMDSMI_GPU_METRICS_CACHE_MS - Controls the internal GPU metrics cache duration (ms). 
Default: 100 ms; set to 0 to disable caching. +``` + (cmds)= ## Commands diff --git a/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md b/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md index eb67a2b455..065db85681 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md +++ b/projects/amdsmi/docs/how-to/amdsmi-cpp-lib.md @@ -21,6 +21,12 @@ variable to the directory containing ``librocm_smi64.so`` (usually ``/opt/rocm/lib``) or by passing the ``-lamd_smi`` flag to the compiler. ``` +```{note} +The environment variable ``AMDSMI_GPU_METRICS_CACHE_MS`` may be set to +control the internal GPU metrics cache duration (ms). +Default: 1 ms; set to 0 to disable caching. +``` + ```{seealso} Refer to the [C++ library API reference](../reference/amdsmi-cpp-api.md). ``` diff --git a/projects/amdsmi/docs/how-to/amdsmi-py-lib.md b/projects/amdsmi/docs/how-to/amdsmi-py-lib.md index 1799899304..d054b31035 100644 --- a/projects/amdsmi/docs/how-to/amdsmi-py-lib.md +++ b/projects/amdsmi/docs/how-to/amdsmi-py-lib.md @@ -39,6 +39,30 @@ variable to the directory containing ``librocm_smi64.so`` (usually ``/opt/rocm/lib``) or by passing the ``-lamd_smi`` flag to the compiler. ``` +```{note} +The environment variable ``AMDSMI_GPU_METRICS_CACHE_MS`` may be set to +control the internal GPU metrics cache duration (ms). +Default: 1 ms; set to 0 to disable caching. + +You can apply it in one of two ways: + +1. In Python code (before the AMDSMI library loads): +``` + +```python +import os +os.environ["AMDSMI_GPU_METRICS_CACHE_MS"] = "200" +from amdsmi import * +``` + +```{note} +2. On the shell when invoking Python: +``` + +```shell +AMDSMI_GPU_METRICS_CACHE_MS=200 python tools/amdsmi_quick_start.py +``` + To get started, the `amdsmi` folder should be copied and placed next to the importing script. 
Import it as follows: diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 80457273e1..28c7c0267d 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -1104,7 +1104,7 @@ namespace { // Keep 1 cache map, with an entry for each gpu std::unordered_map g_gpu_metrics_cache_map; static const std::chrono::milliseconds kGpuMetricsCacheDuration( - read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 100) + read_env_ms("AMDSMI_GPU_METRICS_CACHE_MS", 1) ); } @@ -1113,6 +1113,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, auto sysfs_path = path_; std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | AMDSMI_GPU_METRICS_CACHE_MS = " + << kGpuMetricsCacheDuration.count() + << " ms"; + LOG_DEBUG(ss); + // Size will either be 4, or 3872+. When 4, it's only reading from the header. // If this header read is inconsequential, we could only cache full read. // However, it seems reading the gpu_metrics sysfs in any capacity diff --git a/projects/amdsmi/tools/amdsmi_quick_start.py b/projects/amdsmi/tools/amdsmi_quick_start.py index 2440fbeff0..d6f7a164e9 100644 --- a/projects/amdsmi/tools/amdsmi_quick_start.py +++ b/projects/amdsmi/tools/amdsmi_quick_start.py @@ -26,6 +26,9 @@ import logging import signal import sys +# Metrics cache set to 1 by default, uncomment to change +# os.environ["AMDSMI_GPU_METRICS_CACHE_MS"] = "1" + try: from amdsmi import * except ImportError as e: