Additional AMD-SMI Updates (#149)
- Check AMDSMI header version to fix compilation failure with v2.0 header change - Fix ROCM-SMI references in documentation and tests - Check AMDSMI library version at runtime and output in logs - Fix a possible exception occurring when an in-flight sample is outstanding while the component is shutting down.
Este commit está contenido en:
@@ -8,6 +8,10 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs.
|
||||
|
||||
- Added profiling and metric collection capabilities for VCN engine activity, JPEG engine activity and API tracing for rocDecode, rocJPEG and VA-APIs.
|
||||
|
||||
### Changed
|
||||
|
||||
- Replaced ROCm-SMI backend with AMD-SMI backend for collecting GPU metrics.
|
||||
|
||||
### Resolved issues
|
||||
|
||||
- Fixed an application hang when enabling the RCCL backend.
|
||||
|
||||
@@ -220,20 +220,20 @@ The following example:
|
||||
Exploring GPU Metrics
|
||||
---------------------
|
||||
|
||||
ROCm Systems Profiler supports GPU metrics collection, sampling, and API tracing via `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `ROCm-SMI <https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/>`_.
|
||||
ROCm Systems Profiler supports GPU metrics collection, sampling, and API tracing via `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`_ and `AMD-SMI <https://rocm.docs.amd.com/projects/amdsmi/en/latest/>`_.
|
||||
ROCprofiler-SDK supports application tracing to provide a big picture of the GPU application execution and kernel profiling to provide low-level hardware details from the performance counters.
|
||||
The ROCm-SMI library offers a unified tool for managing, monitoring, and retrieving information about the system's drivers and GPUs.
|
||||
The AMD-SMI library offers a unified tool for managing, monitoring, and retrieving information about the system's drivers and GPUs.
|
||||
|
||||
Sampling GPU metrics like utilization, temperature, power consumption, memory usage, etc., can be configured with ``ROCPROFSYS_ROCM_SMI_METRICS``.
|
||||
The ``ROCPROFSYS_USE_ROCM_SMI`` setting should be enabled for GPU metric collection.
|
||||
Sampling GPU metrics like utilization, temperature, power consumption, memory usage, etc., can be configured with ``ROCPROFSYS_AMD_SMI_METRICS``.
|
||||
The ``ROCPROFSYS_USE_AMD_SMI`` setting should be enabled for GPU metric collection.
|
||||
|
||||
For example, the following is a valid configuration:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage
|
||||
ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage
|
||||
|
||||
Supported values for ``ROCPROFSYS_ROCM_SMI_METRICS`` are: ``busy``, ``temp``, ``power``, ``vcn_activity``, ``mem_usage``, ``jpeg_activity``.
|
||||
Supported values for ``ROCPROFSYS_AMD_SMI_METRICS`` are: ``busy``, ``temp``, ``power``, ``vcn_activity``, ``mem_usage``, ``jpeg_activity``.
|
||||
|
||||
API tracing is configured with the ``ROCPROFSYS_ROCM_DOMAINS`` setting. The domains are used to filter the events that are captured during profiling.
|
||||
Supported values for this setting are those supported by ROCprofiler-SDK, which are returned by the ``get_callback_tracing_names()`` and ``get_buffer_tracing_names()`` APIs. See the `ROCprofiler-SDK developer API documentation <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/_doxygen/html/namespacerocprofiler_1_1sdk.html>`_ to learn more about ROCprofiler-SDK APIs.
|
||||
|
||||
@@ -175,7 +175,7 @@ Optional third-party packages
|
||||
* `ROCm <https://rocm.docs.amd.com/projects/install-on-linux/en/latest>`_
|
||||
|
||||
* HIP
|
||||
* ROCm SMI Lib for GPU monitoring
|
||||
* AMD SMI Lib for GPU monitoring
|
||||
* ROCprofiler SDK for GPU hardware counters and ROCm tracing
|
||||
|
||||
* `PAPI <https://icl.utk.edu/papi/>`_
|
||||
|
||||
@@ -102,7 +102,7 @@ ROCPROFSYS_DEFINE_CATEGORY(category, rocm_counter_collection, ROCPROFSYS_CATEGOR
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_marker_api, ROCPROFSYS_CATEGORY_ROCM_MARKER_API, "rocm_marker_api", "ROCTx labels")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocdecode_api, ROCPROFSYS_CATEGORY_ROCM_ROCDECODE_API, "rocm_rocdecode_api", "ROCm RocDecode API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rocjpeg_api, ROCPROFSYS_CATEGORY_ROCM_ROCJPEG_API, "rocm_rocjpeg_api", "ROCm RocJPEG API")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "rocm_smi", "rocm-smi data")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi, ROCPROFSYS_CATEGORY_AMD_SMI, "amd_smi", "AMD-SMI data")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_gfx_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_GFX, "device_busy_gfx", "Busy percentage of GFX engine on a GPU device")
|
||||
// UMC busy category; description wording matches the sibling GFX and MM busy categories.
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_umc_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_UMC, "device_busy_umc", "Busy percentage of UMC engine on a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_mm_busy, ROCPROFSYS_CATEGORY_AMD_SMI_BUSY_MM, "device_busy_mm", "Busy percentage of MM engine on a GPU device")
|
||||
|
||||
@@ -217,7 +217,7 @@ TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_vcn,
|
||||
"Derived from sampling")
|
||||
TIMEMORY_METADATA_SPECIALIZATION(rocprofsys::component::sampling_gpu_jpeg,
|
||||
"sampling_gpu_jpeg",
|
||||
"GPU JPEG Utilization (% activity) via ROCm-SMI",
|
||||
"GPU JPEG Utilization (% activity) via AMD SMI",
|
||||
"Derived from sampling")
|
||||
|
||||
// statistics type
|
||||
|
||||
@@ -83,6 +83,22 @@ is_initialized()
|
||||
return _v;
|
||||
}
|
||||
|
||||
// Returns the AMD SMI library version, querying it lazily and caching the result.
//
// The version lives in a function-local static. While the cached major and minor
// fields are both zero, amdsmi_get_lib_version() is called to populate it. If that
// call does not return AMDSMI_STATUS_SUCCESS, ROCPROFSYS_THROW is invoked with a
// message naming the failing call, and the cached value is left unmodified.
//
// Returns a reference to the cached amdsmi_version_t.
amdsmi_version_t&
get_version()
{
    static amdsmi_version_t _v = {};

    if(_v.major == 0 && _v.minor == 0)
    {
        // Query into a temporary so a failed call cannot leave partial data cached.
        amdsmi_version_t _queried = {};
        if(amdsmi_get_lib_version(&_queried) != AMDSMI_STATUS_SUCCESS)
        {
            ROCPROFSYS_THROW(
                "amdsmi_get_lib_version failed. No version information available.");
        }
        _v = _queried;
    }

    return _v;
}
|
||||
|
||||
void
|
||||
check_error(const char* _file, int _line, amdsmi_status_t _code, bool* _option = nullptr)
|
||||
{
|
||||
@@ -157,8 +173,15 @@ data::sample(uint32_t _dev_id)
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).temp, amdsmi_get_temp_metric,
|
||||
sample_handle, AMDSMI_TEMPERATURE_TYPE_JUNCTION,
|
||||
AMDSMI_TEMP_CURRENT, &m_temp);
|
||||
#if(AMDSMI_LIB_VERSION_MAJOR == 2 && AMDSMI_LIB_VERSION_MINOR == 0) || \
|
||||
(AMDSMI_LIB_VERSION_MAJOR == 25 && AMDSMI_LIB_VERSION_MINOR == 2)
|
||||
// This was a transient change in the AMD SMI API. It was never officially released.
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).power, amdsmi_get_power_info,
|
||||
sample_handle, 0, &m_power)
|
||||
#else
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).power, amdsmi_get_power_info,
|
||||
sample_handle, &m_power)
|
||||
#endif
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).mem_usage, amdsmi_get_gpu_memory_usage,
|
||||
sample_handle, AMDSMI_MEM_TYPE_VRAM, &m_mem_usage);
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).vcn_activity,
|
||||
@@ -220,6 +243,8 @@ config()
|
||||
void
|
||||
sample()
|
||||
{
|
||||
auto_lock_t _lk{ type_mutex<category::amd_smi>() };
|
||||
|
||||
for(auto itr : data::device_list)
|
||||
{
|
||||
if(amd_smi::get_state() != State::Active) continue;
|
||||
@@ -255,7 +280,6 @@ data::setup()
|
||||
bool
|
||||
data::shutdown()
|
||||
{
|
||||
ROCPROFSYS_DEBUG("Shutting down amd-smi...\n");
|
||||
amd_smi::set_state(State::Finalized);
|
||||
return true;
|
||||
}
|
||||
@@ -453,6 +477,11 @@ setup()
|
||||
return;
|
||||
}
|
||||
|
||||
amdsmi_version_t _version = get_version();
|
||||
ROCPROFSYS_VERBOSE_F(0, "AMD SMI version: %u.%u.%u.%u - str: %s.\n", _version.year,
|
||||
_version.major, _version.minor, _version.release,
|
||||
_version.build);
|
||||
|
||||
data::device_count = gpu::get_processor_count();
|
||||
|
||||
auto _devices_v = get_sampling_gpus();
|
||||
@@ -561,6 +590,7 @@ shutdown()
|
||||
auto_lock_t _lk{ type_mutex<category::amd_smi>() };
|
||||
|
||||
if(!is_initialized()) return;
|
||||
ROCPROFSYS_VERBOSE_F(1, "Shutting down amd-smi...\n");
|
||||
|
||||
try
|
||||
{
|
||||
|
||||
@@ -7,12 +7,12 @@
|
||||
set(_video_decode_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy,rocdecode_api"
|
||||
"ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage"
|
||||
"ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,vcn_activity,mem_usage"
|
||||
"ROCPROFSYS_SAMPLING_CPUS=none")
|
||||
set(_jpeg_decode_environment
|
||||
"${_base_environment}"
|
||||
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api,kernel_dispatch,memory_copy,rocjpeg_api"
|
||||
"ROCPROFSYS_ROCM_SMI_METRICS=busy,temp,power,jpeg_activity,mem_usage"
|
||||
"ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,jpeg_activity,mem_usage"
|
||||
"ROCPROFSYS_SAMPLING_CPUS=none")
|
||||
|
||||
check_gpu("MI300" MI300_DETECTED)
|
||||
|
||||
Referencia en una nueva incidencia
Block a user