From 7bb45aba1c033389a58a98c9bb44cea10c7007b3 Mon Sep 17 00:00:00 2001 From: David Galiffi Date: Mon, 31 Mar 2025 11:07:50 -0400 Subject: [PATCH] Additional AMD-SMI Updates (#149) - Check AMDSMI header version to fix compilation failure with v2.0 header change - Fix ROCM-SMI references in documentation and tests - Check AMDSMI library version at runtime and output in logs - Fix a possible exception occurring when an in-flight sample is outstanding while the component is shutting down. --- CHANGELOG.md | 4 +++ docs/how-to/configuring-runtime-options.rst | 12 ++++---- docs/install/install.rst | 2 +- source/lib/core/categories.hpp | 2 +- source/lib/core/components/fwd.hpp | 2 +- source/lib/rocprof-sys/library/amd_smi.cpp | 32 ++++++++++++++++++++- tests/rocprof-sys-decode-tests.cmake | 4 +-- 7 files changed, 46 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 259b832b7c..bee1eac71d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs. - Added profiling and metric collection capabilities for VCN engine activity, JPEG engine activity and API tracing for rocDecode, rocJPEG and VA-APIs. +### Changed + +- Replaced ROCm-SMI backend with AMD-SMI backend for collecting GPU metrics. + ### Resolved issues - Fixed application hang when enabling the RCCL backend diff --git a/docs/how-to/configuring-runtime-options.rst b/docs/how-to/configuring-runtime-options.rst index 6396dbb48d..5006d24d67 100644 --- a/docs/how-to/configuring-runtime-options.rst +++ b/docs/how-to/configuring-runtime-options.rst @@ -220,20 +220,20 @@ The following example: Exploring GPU Metrics --------------------- -ROCm Systems Profiler supports GPU metrics collection, sampling, and API tracing via `ROCprofiler-SDK `_ and `ROCm-SMI `_. +ROCm Systems Profiler supports GPU metrics collection, sampling, and API tracing via `ROCprofiler-SDK `_ and `AMD-SMI `_. 
ROCprofiler-SDK supports application tracing to provide a big picture of the GPU application execution and kernel profiling to provide low-level hardware details from the performance counters. -The ROCm-SMI library offers a unified tool for managing, monitoring, and retrieving information about the system's drivers and GPUs. +The AMD-SMI library offers a unified tool for managing, monitoring, and retrieving information about the system's drivers and GPUs. -Sampling GPU metrics like utilization, temperature, power consumption, memory usage, etc., can be configured with ``ROCPROFSYS_ROCM_SMI_METRICS``. -The ``ROCPROFSYS_USE_ROCM_SMI`` setting should be enabled for GPU metric collection. +Sampling GPU metrics like utilization, temperature, power consumption, memory usage, etc., can be configured with ``ROCPROFSYS_AMD_SMI_METRICS``. +The ``ROCPROFSYS_USE_AMD_SMI`` setting should be enabled for GPU metric collection. For example, the following is a valid configuration: .. code-block:: shell - ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage + ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage -Supported values for ``ROCPROFSYS_ROCM_SMI_METRICS`` are: ``busy``, ``temp``, ``power``, ``vcn_activity``, ``mem_usage``, ``jpeg_activity``. +Supported values for ``ROCPROFSYS_AMD_SMI_METRICS`` are: ``busy``, ``temp``, ``power``, ``vcn_activity``, ``mem_usage``, ``jpeg_activity``. API tracing is configured with the ``ROCPROFSYS_ROCM_DOMAINS`` setting. The domains are used to filter the events that are captured during profiling. Supported values for this setting are those supported by ROCprofiler-SDK, which are returned by the API ``get_callback_tracing_names()`` and ``get_buffer_tracing_names()``. See the `ROCprofiler-SDK developer API documentation `_ to learn more about ROCprofiler-SDK APIs. 
diff --git a/docs/install/install.rst b/docs/install/install.rst index 16889a9960..db649bd936 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -175,7 +175,7 @@ Optional third-party packages * `ROCm `_ * HIP - * ROCm SMI Lib for GPU monitoring + * AMD SMI Lib for GPU monitoring * ROCprofiler SDK for GPU hardware counters and ROCm tracing * `PAPI `_ diff --git a/source/lib/core/categories.hpp b/source/lib/core/categories.hpp index 9e85b0c114..fe28aa548e 100644 --- a/source/lib/core/categories.hpp +++ b/source/lib/core/categories.hpp @@ -102,7 +102,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGOR ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm RocDecode API") ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocjpeg_api, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, "rocm_rocjpeg_api", "ROCm RocJPEG API") -ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "rocm_smi", "rocm-smi data") +ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "amd_smi", "AMD-SMI data") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_gfx_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, "device_busy_gfx", "Busy percentage of GFX engine on a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_umc_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, "device_busy_umc", "Busy percentage of UMC engin on a GPU device") ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_mm_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_MM, "device_busy_mm", "Busy percentage of MM engine on a GPU device") diff --git a/source/lib/core/components/fwd.hpp b/source/lib/core/components/fwd.hpp index d2e65faa0b..fef44998c3 100644 --- a/source/lib/core/components/fwd.hpp +++ b/source/lib/core/components/fwd.hpp @@ -217,7 +217,7 @@ 
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn, "Derived from sampling") TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_jpeg, "sampling_gpu_jpeg", - "GPU JPEG Utilization (% activity) via ROCm-SMI", + "GPU JPEG Utilization (% activity) via AMD SMI", "Derived from sampling") // statistics type diff --git a/source/lib/rocprof-sys/library/amd_smi.cpp b/source/lib/rocprof-sys/library/amd_smi.cpp index 2ec9e57715..6e2eb152c1 100644 --- a/source/lib/rocprof-sys/library/amd_smi.cpp +++ b/source/lib/rocprof-sys/library/amd_smi.cpp @@ -83,6 +83,22 @@ is_initialized() return _v; } +amdsmi_version_t& +get_version() +{ + static amdsmi_version_t _v = {}; + + if(_v.major == 0 && _v.minor == 0) + { + auto _err = amdsmi_get_lib_version(&_v); + if(_err != AMDSMI_STATUS_SUCCESS) + ROCPROFSYS_THROW( + "amdsmi_get_lib_version failed. No version information available."); + } + + return _v; +} + void check_error(const char* _file, int _line, amdsmi_status_t _code, bool* _option = nullptr) { @@ -157,8 +173,15 @@ data::sample(uint32_t _dev_id) ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).temp, amdsmi_get_temp_metric, sample_handle, AMDSMI_TEMPERATURE_TYPE_JUNCTION, AMDSMI_TEMP_CURRENT, &m_temp); +#if(AMDSMI_LIB_VERSION_MAJOR == 2 && AMDSMI_LIB_VERSION_MINOR == 0) || \ + (AMDSMI_LIB_VERSION_MAJOR == 25 && AMDSMI_LIB_VERSION_MINOR == 2) + // This was a transient change in the AMD SMI API. It was never officially released. 
+ ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).power, amdsmi_get_power_info, + sample_handle, 0, &m_power) +#else ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).power, amdsmi_get_power_info, sample_handle, &m_power) +#endif ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).mem_usage, amdsmi_get_gpu_memory_usage, sample_handle, AMDSMI_MEM_TYPE_VRAM, &m_mem_usage); ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).vcn_activity, @@ -220,6 +243,8 @@ config() void sample() { + auto_lock_t _lk{ type_mutex() }; + for(auto itr : data::device_list) { if(amd_smi::get_state() != State::Active) continue; @@ -255,7 +280,6 @@ data::setup() bool data::shutdown() { - ROCPROFSYS_DEBUG("Shutting down amd-smi...\n"); amd_smi::set_state(State::Finalized); return true; } @@ -453,6 +477,11 @@ setup() return; } + amdsmi_version_t _version = get_version(); + ROCPROFSYS_VERBOSE_F(0, "AMD SMI version: %u.%u.%u.%u - str: %s.\n", _version.year, + _version.major, _version.minor, _version.release, + _version.build); + data::device_count = gpu::get_processor_count(); auto _devices_v = get_sampling_gpus(); @@ -561,6 +590,7 @@ shutdown() auto_lock_t _lk{ type_mutex() }; if(!is_initialized()) return; + ROCPROFSYS_VERBOSE_F(1, "Shutting down amd-smi...\n"); try { diff --git a/tests/rocprof-sys-decode-tests.cmake b/tests/rocprof-sys-decode-tests.cmake index 76ce10cf1a..adc76c2846 100644 --- a/tests/rocprof-sys-decode-tests.cmake +++ b/tests/rocprof-sys-decode-tests.cmake @@ -7,12 +7,12 @@ set(_video_decode_environment "${_base_environment}" "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy,rocdecode_api" - "ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage" + "ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage" "ROCPROFSYS_SAMPLING_CPUS=none") set(_jpeg_decode_environment "${_base_environment}" "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy,rocjpeg_api" - "ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,jpeg_activity,mem_usage" + 
"ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,jpeg_activity,mem_usage" "ROCPROFSYS_SAMPLING_CPUS=none") check_gpu("MI300" MI300_DETECTED)