From b6a187aed12e6017980e733aef9fa88639d2d4ab Mon Sep 17 00:00:00 2001 From: Saurabh Verma <103202491+sauverma93@users.noreply.github.com> Date: Wed, 15 Oct 2025 19:31:36 +0530 Subject: [PATCH] migrate aqlprofile docs 7.0.1 from standalone repo (#1379) This PR migrates the aqlprofile/docs folder from standalone repo to monorepo Link to the docs branch: https://github.com/ROCm/aqlprofile/commits/docs/7.0.1 --------- Co-authored-by: Matt Williams Co-authored-by: pbhandar-amd <138039281+pbhandar-amd@users.noreply.github.com> --- .readthedocs.yaml | 18 ++ docs/conf.py | 61 ++++ docs/examples/pmc-workflow.rst | 109 +++++++ docs/examples/sqtt-workflow.rst | 93 ++++++ docs/index.rst | 44 +++ docs/install/aqlprofile-install.rst | 77 +++++ docs/license.rst | 29 ++ docs/reference/api-list.rst | 112 ++++++++ docs/reference/glossary.rst | 109 +++++++ docs/reference/supported-architectures.rst | 79 ++++++ docs/sphinx/_toc.yml | 37 +++ docs/sphinx/_toc.yml.in | 37 +++ docs/sphinx/requirements.in | 1 + docs/sphinx/requirements.txt | 314 +++++++++++++++++++++ docs/what-is-aqlprofile.rst | 60 ++++ 15 files changed, 1180 insertions(+) create mode 100644 .readthedocs.yaml create mode 100644 docs/conf.py create mode 100644 docs/examples/pmc-workflow.rst create mode 100644 docs/examples/sqtt-workflow.rst create mode 100644 docs/index.rst create mode 100644 docs/install/aqlprofile-install.rst create mode 100644 docs/license.rst create mode 100644 docs/reference/api-list.rst create mode 100644 docs/reference/glossary.rst create mode 100644 docs/reference/supported-architectures.rst create mode 100644 docs/sphinx/_toc.yml create mode 100644 docs/sphinx/_toc.yml.in create mode 100644 docs/sphinx/requirements.in create mode 100644 docs/sphinx/requirements.txt create mode 100644 docs/what-is-aqlprofile.rst diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..dea9d45a14 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,18 @@ +# Read the Docs configuration 
file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +sphinx: + configuration: docs/conf.py + +formats: [htmlzip] + +python: + install: + - requirements: docs/sphinx/requirements.txt + +build: + os: ubuntu-22.04 + tools: + python: "3.10" diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000..82602ec9fa --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,61 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import re + +''' +html_theme is usually unchanged (rocm_docs_theme). +flavor defines the site header display, select the flavor for the corresponding portals +flavor options: rocm, rocm-docs-home, rocm-blogs, rocm-ds, instinct, ai-developer-hub, local, generic +''' +html_theme = "rocm_docs_theme" +html_theme_options = {"flavor": "rocm-docs-home"} + + +# This section turns on/off article info +setting_all_article_info = True +all_article_info_os = ["linux"] +all_article_info_author = "" + +# Dynamically extract component version +# with open('../CMakeLists.txt', encoding='utf-8') as f: +# pattern = r'.*\brocm_setup_version\(VERSION\s+([0-9.]+)[^0-9.]+' # Update according to each component's CMakeLists.txt +# match = re.search(pattern, +# f.read()) +# if not match: +# raise ValueError("VERSION not found!") +version_number = "1.0" + +# for PDF output on Read the Docs +project = "AQLprofile" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number + +external_toc_path = "./sphinx/_toc.yml" # Defines Table of Content structure definition path + +''' +Doxygen Settings +Ensure Doxyfile is located at docs/doxygen. 
+If the component does not need doxygen, delete this section for optimal build time +''' +#doxygen_root = "doxygen" +#doxysphinx_enabled = False +# doxygen_project = { +# "name": "doxygen", +# "path": "doxygen/xml", +#} + +# Add more additional packages accordingly +extensions = [ + "rocm_docs", +# "rocm_docs.doxygen", +] + +html_title = f"{project} {version_number} documentation" + +external_projects_current_project = "AQLprofile" diff --git a/docs/examples/pmc-workflow.rst b/docs/examples/pmc-workflow.rst new file mode 100644 index 0000000000..32d827639f --- /dev/null +++ b/docs/examples/pmc-workflow.rst @@ -0,0 +1,109 @@ +.. meta:: + :description: A typical workflow for collecting PMC data + :keywords: AQLprofile, ROCm, API, how-to, PMC + +********************************************************** +Performance Monitor Control (PMC) workflow with AQLprofile +********************************************************** + +This page describes a typical workflow for collecting PMC data using AQLprofile (as integrated in `ROCprofiler-SDK `__). +This workflow relies on creating a profile object, generating command packets, and iterating over output buffers: + +1. **Intercept kernel dispatch**: The SDK intercepts kernel dispatch packets submitted to the GPU queue. +2. **Create a profile object**: A profile/session object is created, specifying the agent (GPU), events (counters), and output buffers. +3. **Generate command packets**: Start, stop, and read command packets are generated and injected into the queue around the kernel dispatch. +4. **Submit packets and run the kernel**: The kernel and profiling packets are submitted to the GPU queue for execution. +5. **Collect the output buffer**: After execution, the output buffer is read back from the GPU. +6. **Iterate and extract the results**: The SDK iterates over the output buffer to extract and report counter results. + +The SDK abstracts queue interception and packet management so tool developers can focus on results. 
+ +Key API code snippets +===================== + +These API snippets use the legacy interfaces from ``hsa_ven_amd_aqlprofile.h``. These are provided for understanding purposes only. +For new development, refer to the updated APIs in ``aql_profile_v2.h``. + +.. note:: + + The ROCprofiler-SDK is migrating to these newer interfaces in ``aql_profile_v2.h``. You should use the APIs in ``aql_profile_v2.h`` to stay up-to-date. + +Define the events and profile +----------------------------- + +.. code:: cpp + + // Select events (counters) to collect + hsa_ven_amd_aqlprofile_event_t events[] = { + { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 2 }, // Example: SQ block, instance 0, counter 2 + { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 3 } + }; + + // Create profile object + hsa_ven_amd_aqlprofile_profile_t profile = { + .agent = agent, // hsa_agent_t + .type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, + .events = events, + .event_count = sizeof(events)/sizeof(events[0]), + .parameters = nullptr, + .parameter_count = 0, + .output_buffer = {output_ptr, output_size}, + .command_buffer = {cmd_ptr, cmd_size} + }; + + +Validate events +--------------- + +.. code:: cpp + + bool valid = false; + hsa_ven_amd_aqlprofile_validate_event(agent, &events[0], &valid); + if (!valid) { + // Handle invalid event + } + + +Generate command packets +------------------------- + +.. code:: cpp + + hsa_ext_amd_aql_pm4_packet_t start_pkt, stop_pkt, read_pkt; + hsa_ven_amd_aqlprofile_start(&profile, &start_pkt); + hsa_ven_amd_aqlprofile_stop(&profile, &stop_pkt); + hsa_ven_amd_aqlprofile_read(&profile, &read_pkt); + + +Submit packets and run the kernel +--------------------------------- + +.. code:: cpp + + // Pseudocode: inject packets into HSA queue + queue->Submit(&start_pkt); + queue->Submit(&kernel_pkt); + queue->Submit(&stop_pkt); + queue->Submit(&read_pkt); + + +Iterate and extract results +---------------------------- + +.. 
code:: cpp + + hsa_ven_amd_aqlprofile_iterate_data( + &profile, + [](hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* user_data) -> hsa_status_t { + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { + printf("Event: block %d, id %d, value: %llu\n", + info_data->pmc_data.event.block_name, + info_data->pmc_data.event.counter_id, + info_data->pmc_data.result); + } + return HSA_STATUS_SUCCESS; + }, + nullptr + ); diff --git a/docs/examples/sqtt-workflow.rst b/docs/examples/sqtt-workflow.rst new file mode 100644 index 0000000000..f7babb980c --- /dev/null +++ b/docs/examples/sqtt-workflow.rst @@ -0,0 +1,93 @@ +.. meta:: + :description: A typical workflow for collecting detailed instruction-level traces + :keywords: AQLprofile, ROCm, API, how-to, SQTT + +*********************************************** +SQ Thread Trace (SQTT) workflow with AQLprofile +*********************************************** + +The SQ Thread Trace workflow focuses on collecting detailed instruction-level traces. +This workflow relies on creating a profile object, generating command packets, and iterating over output buffers: + +1. **Intercept the kernel dispatch**: The SDK intercepts the kernel dispatch. +2. **Create a SQTT profile object**: A profile object is created for SQTT, specifying trace parameters and output buffers. +3. **Generate SQTT command packets**: Start, stop, and read packets for SQTT are generated and injected into the queue. +4. **Submit packets and run the kernel**: The kernel and SQTT packets are submitted for execution. +5. **Collect the trace buffer**: The trace output buffer is collected after execution. +6. **Iterate and decode trace data**: The SDK iterates over the trace buffer and decodes the SQTT data for analysis. + +The SDK abstracts queue interception and packet management so tool developers can focus on results. 
+ +Key API code snippets +===================== + +These API snippets use the legacy interfaces from ``hsa_ven_amd_aqlprofile.h``. These are provided for understanding purposes only. +For new development, refer to the updated APIs in ``aql_profile_v2.h``. + +In the `ROCprofiler-SDK `__ codebase, these APIs are wrapped and orchestrated in the ``aql``, ``hsa``, and ``thread_trace`` folders for queue interception, packet construction, and result iteration. + +.. note:: + + The ROCprofiler-SDK is migrating to these newer interfaces in ``aql_profile_v2.h``. You should use the APIs in ``aql_profile_v2.h`` to stay up-to-date. + +Define parameters and profile +------------------------------ + +.. code:: cpp + + hsa_ven_amd_aqlprofile_parameter_t params[] = { + { HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, 0x1000000} // 16 MB buffer + }; + + hsa_ven_amd_aqlprofile_profile_t profile = { + .agent = agent, + .type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE, + .events = nullptr, + .event_count = 0, + .parameters = params, + .parameter_count = sizeof(params)/sizeof(params[0]), + .output_buffer = {trace_ptr, trace_size}, + .command_buffer = {cmd_ptr, cmd_size} + }; + + +Generate SQTT start/stop packets +--------------------------------- + +.. code:: cpp + + hsa_ext_amd_aql_pm4_packet_t sqtt_start_pkt, sqtt_stop_pkt; + hsa_ven_amd_aqlprofile_start(&profile, &sqtt_start_pkt); + hsa_ven_amd_aqlprofile_stop(&profile, &sqtt_stop_pkt); + + +Submit packets and run the kernel +--------------------------------- + +.. code:: cpp + + queue->Submit(&sqtt_start_pkt); + queue->Submit(&kernel_pkt); + queue->Submit(&sqtt_stop_pkt); + + +Iterate and decode trace data +----------------------------- + +.. 
code:: cpp + + hsa_ven_amd_aqlprofile_iterate_data( + &profile, + [](hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* user_data) -> hsa_status_t { + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { + // info_data->trace_data.ptr, info_data->trace_data.size + decode_trace(info_data->trace_data.ptr, info_data->trace_data.size); + } + return HSA_STATUS_SUCCESS; + }, + nullptr + ); + + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000..6cda481711 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,44 @@ +.. meta:: + :description: AQLprofile is an open source library that enables advanced GPU profiling and tracing on AMD platforms. + :keywords: AQLprofile, ROCm, tool, Instinct, accelerator, AMD + +.. _index: + +************************ +AQLprofile documentation +************************ + +The Architected Queuing Language profiling library (AQLprofile) is an +open source library that enables advanced GPU profiling and tracing on +AMD platforms. + +This documentation provides a comprehensive overview of the AQLprofile library. + +If you're new to AQLprofile, see :doc:`What is AQLprofile? `. + +AQLprofile is open source and hosted at `AQLprofile on GitHub `_. + +.. grid:: 2 + :gutter: 3 + + .. grid-item-card:: Install + + * :doc:`Install AQLprofile ` + + .. grid-item-card:: Examples + + * :doc:`Performance Monitor Control (PMC) workflow ` + * :doc:`SQ Thread Trace (SQTT) workflow ` + + .. grid-item-card:: Reference + + * :doc:`Glossary ` + * :doc:`Supported architectures and counter blocks ` + * :doc:`APIs ` + + +To contribute to the documentation, refer to +`Contributing to ROCm `_. + +You can find licensing information on the +`Licensing `_ page. diff --git a/docs/install/aqlprofile-install.rst b/docs/install/aqlprofile-install.rst new file mode 100644 index 0000000000..06d9469861 --- /dev/null +++ b/docs/install/aqlprofile-install.rst @@ -0,0 +1,77 @@ +.. 
meta:: + :description: AQLprofile installation process + :keywords: AQLprofile, ROCm, install + +****************** +Install AQLprofile +****************** + +Learn how to build AQLprofile with a script or with CMake, then install the library with a command. + +Prerequisites +============= + +Before you begin, ensure these tools and dependencies are installed: + +* ROCm stack +* ``rocm-llvm-dev`` (required to build tests) + + +Build AQLprofile +================ + +You can build AQLprofile using either the provided build script (recommended for most users) or by manually invoking CMake for custom builds. + + +Option 1: Use the build script (Recommended) +-------------------------------------------- + +This configures and builds the project with the default settings: + +.. code:: bash + + ./build.sh + + +Option 2: Use CMake for custom builds +------------------------------------- + +For more control over the build process, you can set the CMake options manually: + +.. code:: bash + + # Set the CMAKE_PREFIX_PATH to point to hsa-runtime includes path and hsa-runtime library path + export CMAKE_PREFIX_PATH=<path to hsa-runtime includes>:<path to hsa-runtime library> + # For example, if ROCm is installed at /opt/rocm: + # export CMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa + + export CMAKE_BUILD_TYPE=<debug|release> # release by default + + cd /path/to/aqlprofile + mkdir build + cd build + cmake .. + make -j + + +Enable debug tracing (Optional) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To enable debug tracing, set this environment variable before running CMake: + +.. code:: bash + + export CMAKE_DEBUG_TRACE=1 + +This enables verbose debug output of the command packets while this library executes. + + +Install the AQLprofile libraries +================================ + +Once your build is successful, install the AQLprofile libraries with: + +.. 
code:: bash + + cd build + sudo make install diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 0000000000..c66c93b2ef --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,29 @@ +.. meta:: + :description: The standard MIT license for AQLprofile + :keywords: AQLprofile, ROCm, license + +******* +License +******* + +MIT License + +Copyright (c) 2017-2025 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/docs/reference/api-list.rst b/docs/reference/api-list.rst new file mode 100644 index 0000000000..6bd29e58fa --- /dev/null +++ b/docs/reference/api-list.rst @@ -0,0 +1,112 @@ +.. meta:: + :description: A description of the APIs used with AQLprofile + :keywords: AQLprofile, ROCm, APIs + +AQLprofile APIs +=============== + +Learn about the typical APIs used in AQLprofile. 
+ +The APIs in ``aql_profile_v2.h`` are designed for use with `ROCprofiler-SDK `__, and are actively maintained and recommended for all new development. + +.. note:: + + The APIs in ``hsa_ven_amd_aqlprofile.h`` are used by legacy tools such as ``rocprof`` and ``rocprofv2``. You should use the new ``aql_profile_v2.h`` APIs instead. + +From header ``aql_profile_v2.h`` +-------------------------------- + ++------------------------------------+------------------------------------------------------------------------------------------+ +| API Name | Purpose | ++====================================+==========================================================================================+ +| ``aqlprofile_register_agent`` | Registers an agent for profiling using basic agent info. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_register_agent_info`` | Registers an agent for profiling using extended agent info and versioning. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_get_pmc_info`` | Retrieves information about PMC profiles (for example, buffer sizes, counter data). | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_validate_pmc_event`` | Checks if a given PMC event is valid for the specified agent. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_pmc_create_packets`` | Creates AQL packets (start, stop, read) for PMC profiling and returns a handle. 
| ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_pmc_delete_packets`` | Deletes PMC profiling packets and releases associated resources. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_pmc_iterate_data`` | Iterates over PMC profiling results using a callback. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_att_create_packets`` | Creates AQL packets (start, stop) for Advanced Thread Trace (SQTT) and returns a handle. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_att_delete_packets`` | Deletes ATT profiling packets and releases associated resources. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_att_iterate_data`` | Iterates over thread trace (SQTT) results using a callback. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_iterate_event_ids`` | Iterates over all possible event coordinate IDs and names using a callback. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_iterate_event_coord`` | Iterates over all event coordinates for a given agent and event using a callback. | ++------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_att_codeobj_marker`` | Creates a marker packet for code object events in thread trace workflows. 
| ++------------------------------------+------------------------------------------------------------------------------------------+ + +Callback Typedefs +~~~~~~~~~~~~~~~~~ + ++------------------------------------------+------------------------------------------------------------------------------------------+ +| Callback Typedef Name | Purpose | ++==========================================+==========================================================================================+ +| ``aqlprofile_memory_alloc_callback_t`` | Callback for allocating memory buffers for profiles (PMC/ATT). | ++------------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_memory_dealloc_callback_t`` | Callback for deallocating memory buffers allocated for profiles. | ++------------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_memory_copy_t`` | Callback for copying memory (used internally by the profiler). | ++------------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_pmc_data_callback_t`` | Used with ``aqlprofile_pmc_iterate_data`` to process each PMC profiling result. | ++------------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_att_data_callback_t`` | Used with ``aqlprofile_att_iterate_data`` to process each thread trace (SQTT) result. | ++------------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_eventname_callback_t`` | Used with ``aqlprofile_iterate_event_ids`` to process event coordinate IDs and names. 
| ++------------------------------------------+------------------------------------------------------------------------------------------+ +| ``aqlprofile_coordinate_callback_t`` | Used with ``aqlprofile_iterate_event_coord`` to process event coordinate information. | ++------------------------------------------+------------------------------------------------------------------------------------------+ + +From header ``hsa_ven_amd_aqlprofile.h`` (Legacy) +------------------------------------------------- + ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| API Name | Purpose | ++==================================================+==========================================================================================+ +| ``hsa_ven_amd_aqlprofile_validate_event`` | Checks if a given event (counter) is valid for the specified GPU agent. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_start`` | Populates an AQL packet with commands to start profiling (PMC or SQTT). | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_stop`` | Populates an AQL packet with commands to stop profiling. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_read`` | Populates an AQL packet with commands to read profiling results from the GPU. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_legacy_get_pm4`` | Converts an AQL packet to a PM4 packet blob (for legacy devices). 
| ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_att_marker`` | Inserts a marker (correlation ID) into the ATT (thread trace) buffer. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_get_info`` | Retrieves various profile information, such as buffer sizes or collected data. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_iterate_data`` | Iterates over the profiling output data (PMC results or SQTT trace) using a callback. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_error_string`` | Returns a human-readable error string for the last error. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_iterate_event_ids`` | Iterates over all possible event IDs and names for the agent. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_iterate_event_coord`` | Iterates over all event coordinates for a given agent and event. | ++--------------------------------------------------+------------------------------------------------------------------------------------------+ + +.. 
_callback-typedefs-1: + +Callback Typedefs +~~~~~~~~~~~~~~~~~ + ++---------------------------------------------------+------------------------------------------------------------------------------------------------+ +| Callback Typedef Name | Purpose | ++===================================================+================================================================================================+ +| ``hsa_ven_amd_aqlprofile_data_callback_t`` | Used with ``hsa_ven_amd_aqlprofile_iterate_data`` to process each profiling result (PMC/SQTT). | ++---------------------------------------------------+------------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_eventname_callback_t`` | Used with ``hsa_ven_amd_aqlprofile_iterate_event_ids`` to process event IDs and names. | ++---------------------------------------------------+------------------------------------------------------------------------------------------------+ +| ``hsa_ven_amd_aqlprofile_coordinate_callback_t`` | Used with ``hsa_ven_amd_aqlprofile_iterate_event_coord`` to process event coordinate info. | ++---------------------------------------------------+------------------------------------------------------------------------------------------------+ diff --git a/docs/reference/glossary.rst b/docs/reference/glossary.rst new file mode 100644 index 0000000000..fb8c2790a2 --- /dev/null +++ b/docs/reference/glossary.rst @@ -0,0 +1,109 @@ +.. meta:: + :description: Defined concepts commonly used in AQLprofile + :keywords: AQLprofile, ROCm + +AQLprofile glossary +=================== + +Learn the definitions of concepts commonly used in AQLprofile. + +Agents +------ + +Agents represent computational devices (CPUs, GPUs) in the Heterogeneous +System Architecture (HSA) runtime. In AQLprofile, agents are discovered +via HSA APIs and encapsulated in the ``AgentInfo`` structure. 
Each agent +contains metadata including device type, name, compute unit count, and +memory pools. + +Agents are enumerated using HSA API ``hsa_iterate_agents``, and their +properties are queried via another HSA API, ``hsa_agent_get_info``. +Agents are used to target specific GPUs for profiling, and to allocate +resources such as command buffers and memory pools. + +Counters and events +------------------- + +Performance counters are special circuits on the hardware that count +specific GPU events (for example, cycles, instructions, cache hits). Events +specify which counters to collect, identified by block name, block +index, and counter ID. + +- Events are described using ``hsa_ven_amd_aqlprofile_event_t`` + structures. +- Events are grouped into profiles and collected during profiling + sessions. + +.. code:: cpp + + const hsa_ven_amd_aqlprofile_event_t events_arr1[] = { + {HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 2 /*CYCLES*/}, + {HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 3 /*BUSY_CYCLES*/}, + // ... + }; + +Counter blocks +-------------- + +Counter blocks correspond to hardware units on the GPU (for example, SQ, TCC, +TCP). Each block exposes a set of counters/events. + +- Block names (for example, ``HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ``) map to + specific hardware blocks. +- Events specify both the block and the counter within that block. + +Command buffers +--------------- + +Command buffers are memory regions that store AQL packets and PM4 +commands, which control GPU profiling operations. They're allocated per +agent, and must meet alignment and size requirements dictated by the +hardware. + +Command packets +--------------- + +Command packets are AQL or PM4 packets that encode profiling commands +for the GPU. They're constructed and written into command buffers. + +They're built using AQLprofile APIs or helper functions and submitted to +the GPU via HSA queues. + +.. 
code:: cpp + + bool Queue::Submit(hsa_ext_amd_aql_pm4_packet_t* packet) { + // Write packet to queue and signal doorbell + } + +Output buffer +------------- + +Output buffers are memory regions that store outputs such as counter +values and thread trace tokens. They're allocated using HSA memory pools +associated with the agent. + +Profile object +-------------- + +The profile object encapsulates all information required to perform a +profiling session. It's represented by the +``hsa_ven_amd_aqlprofile_profile_t`` struct, which includes the agent, +event type, list of events, command buffer, and additional parameters. + +Profile objects are constructed by specifying the agent, event type +(PMC, SQTT), events to collect, and associated buffers. They're passed +to AQLprofile APIs to start, stop, and read profiling data. + +.. code:: cpp + + hsa_ven_amd_aqlprofile_profile_t *profile = + new hsa_ven_amd_aqlprofile_profile_t{ + agent_info->dev_id, + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, + events, + num_events, + NULL, + 0, + 0, + 0}; + diff --git a/docs/reference/supported-architectures.rst b/docs/reference/supported-architectures.rst new file mode 100644 index 0000000000..7113616944 --- /dev/null +++ b/docs/reference/supported-architectures.rst @@ -0,0 +1,79 @@ +.. meta:: + :description: A list of the supported architectures and counter blocks used with AQLprofile + :keywords: AQLprofile, ROCm, architectures, GFX + +Supported architectures and counter blocks in AQLprofile +======================================================== + +The AQLprofile library supports profiling and tracing GPU workloads +across multiple architectures. + +.. note:: + + The GFX versions (GFX9XX, GFX10XX, GFX11XX, GFX12XX) refer to the architecture families of the hardware for that version. See `System requirements (Linux) `__ for more info. 
+ +Here's a summary of the counter blocks supported for each architecture: + ++-------------+------+--------+--------+--------+------+------+-----+ +| Counter | GFX9 | GFX908 | GFX90A | GFX942 | GFX10| GFX11|GFX12| +| Block Name | | | | | | | | ++=============+======+========+========+========+======+======+=====+ +| ATC | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| ATC_L2 | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| CHA | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| CHC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| CPC | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| CPF | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| CPG | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GCEA | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GCR | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GDS | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GL1A | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GL1C | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GL2A | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GL2C | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GRBM | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GRBMH | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | 
++-------------+------+--------+--------+--------+------+------+-----+ +| GRBM_SE | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| GUS | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| MC_VM_L2 | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| RPB | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| SDMA | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| SPI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| SQ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| SQ_CS | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| TA | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| TCA | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| TCC | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+------+------+-----+ +| TCP | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ +| TD | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+------+------+-----+ + diff --git a/docs/sphinx/_toc.yml b/docs/sphinx/_toc.yml new file mode 100644 index 0000000000..c32685fb78 --- /dev/null +++ b/docs/sphinx/_toc.yml @@ -0,0 +1,37 @@ +defaults: + numbered: False + maxdepth: 6 +root: index +subtrees: + + - entries: + - file: what-is-aqlprofile.rst + + - caption: Install + entries: + - file: install/aqlprofile-install.rst + title: Install AQLprofile + + - caption: Examples + entries: + - file: examples/pmc-workflow.rst + title: Performance Monitor Control workflow + - file: 
examples/sqtt-workflow.rst + title: SQ Thread Trace workflow + + - caption: Reference + entries: + - file: reference/glossary.rst + title: Glossary + - file: reference/supported-architectures.rst + title: Supported architectures and counter blocks + - file: reference/api-list.rst + title: APIs + + - caption: About + entries: + - file: license.rst + title: License + + + diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in new file mode 100644 index 0000000000..c32685fb78 --- /dev/null +++ b/docs/sphinx/_toc.yml.in @@ -0,0 +1,37 @@ +defaults: + numbered: False + maxdepth: 6 +root: index +subtrees: + + - entries: + - file: what-is-aqlprofile.rst + + - caption: Install + entries: + - file: install/aqlprofile-install.rst + title: Install AQLprofile + + - caption: Examples + entries: + - file: examples/pmc-workflow.rst + title: Performance Monitor Control workflow + - file: examples/sqtt-workflow.rst + title: SQ Thread Trace workflow + + - caption: Reference + entries: + - file: reference/glossary.rst + title: Glossary + - file: reference/supported-architectures.rst + title: Supported architectures and counter blocks + - file: reference/api-list.rst + title: APIs + + - caption: About + entries: + - file: license.rst + title: License + + + diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in new file mode 100644 index 0000000000..513e0efb35 --- /dev/null +++ b/docs/sphinx/requirements.in @@ -0,0 +1 @@ +rocm-docs-core[api_reference]==1.23.0 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt new file mode 100644 index 0000000000..6f43bc7a74 --- /dev/null +++ b/docs/sphinx/requirements.txt @@ -0,0 +1,314 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements.in +# +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==1.0.0 + # via sphinx +asttokens==3.0.0 + # via stack-data +attrs==25.3.0 + # via + # jsonschema + # jupyter-cache + # 
referencing +babel==2.17.0 + # via + # pydata-sphinx-theme + # sphinx +beautifulsoup4==4.13.5 + # via pydata-sphinx-theme +breathe==4.36.0 + # via rocm-docs-core +certifi==2025.8.3 + # via requests +cffi==2.0.0 + # via + # cryptography + # pynacl +charset-normalizer==3.4.3 + # via requests +click==8.2.1 + # via + # click-log + # doxysphinx + # jupyter-cache + # sphinx-external-toc +click-log==0.4.0 + # via doxysphinx +comm==0.2.3 + # via ipykernel +contourpy==1.3.2 + # via matplotlib +cryptography==45.0.7 + # via pyjwt +cycler==0.12.1 + # via matplotlib +debugpy==1.8.16 + # via ipykernel +decorator==5.2.1 + # via ipython +docutils==0.21.2 + # via + # myst-parser + # pydata-sphinx-theme + # sphinx +doxysphinx==3.3.12 + # via rocm-docs-core +exceptiongroup==1.3.0 + # via ipython +executing==2.2.1 + # via stack-data +fastjsonschema==2.21.2 + # via + # nbformat + # rocm-docs-core +fonttools==4.59.2 + # via matplotlib +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via rocm-docs-core +greenlet==3.2.4 + # via sqlalchemy +idna==3.10 + # via requests +imagesize==1.4.1 + # via sphinx +importlib-metadata==8.7.0 + # via + # jupyter-cache + # myst-nb +ipykernel==6.30.1 + # via myst-nb +ipython==8.37.0 + # via + # ipykernel + # myst-nb +jedi==0.19.2 + # via ipython +jinja2==3.1.6 + # via + # myst-parser + # sphinx +jsonschema==4.25.1 + # via nbformat +jsonschema-specifications==2025.9.1 + # via jsonschema +jupyter-cache==1.0.1 + # via myst-nb +jupyter-client==8.6.3 + # via + # ipykernel + # nbclient +jupyter-core==5.8.1 + # via + # ipykernel + # jupyter-client + # nbclient + # nbformat +kiwisolver==1.4.9 + # via matplotlib +libsass==0.22.0 + # via doxysphinx +lxml==5.2.1 + # via doxysphinx +markdown-it-py==3.0.0 + # via + # mdit-py-plugins + # myst-parser +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.10.6 + # via doxysphinx +matplotlib-inline==0.1.7 + # via + # ipykernel + # ipython +mdit-py-plugins==0.5.0 + # via myst-parser +mdurl==0.1.2 + # via markdown-it-py 
+mpire==2.10.2 + # via doxysphinx +myst-nb==1.3.0 + # via rocm-docs-core +myst-parser==4.0.1 + # via myst-nb +nbclient==0.10.2 + # via + # jupyter-cache + # myst-nb +nbformat==5.10.4 + # via + # jupyter-cache + # myst-nb + # nbclient +nest-asyncio==1.6.0 + # via ipykernel +numpy==1.26.4 + # via + # contourpy + # doxysphinx + # matplotlib +packaging==25.0 + # via + # ipykernel + # matplotlib + # sphinx +parso==0.8.5 + # via jedi +pexpect==4.9.0 + # via ipython +pillow==11.3.0 + # via matplotlib +platformdirs==4.4.0 + # via jupyter-core +prompt-toolkit==3.0.52 + # via ipython +psutil==7.0.0 + # via ipykernel +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pycparser==2.23 + # via cffi +pydata-sphinx-theme==0.16.1 + # via + # rocm-docs-core + # sphinx-book-theme +pygithub==2.8.1 + # via rocm-docs-core +pygments==2.19.2 + # via + # accessible-pygments + # ipython + # mpire + # pydata-sphinx-theme + # sphinx +pyjson5==1.6.9 + # via doxysphinx +pyjwt[crypto]==2.10.1 + # via pygithub +pynacl==1.6.0 + # via pygithub +pyparsing==3.2.3 + # via + # doxysphinx + # matplotlib +python-dateutil==2.9.0.post0 + # via + # jupyter-client + # matplotlib +pyyaml==6.0.2 + # via + # jupyter-cache + # myst-nb + # myst-parser + # rocm-docs-core + # sphinx-external-toc +pyzmq==27.1.0 + # via + # ipykernel + # jupyter-client +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +requests==2.32.5 + # via + # pygithub + # sphinx +rocm-docs-core[api-reference]==1.23.0 + # via -r requirements.in +rpds-py==0.27.1 + # via + # jsonschema + # referencing +six==1.17.0 + # via python-dateutil +smmap==5.0.2 + # via gitdb +snowballstemmer==3.0.1 + # via sphinx +soupsieve==2.8 + # via beautifulsoup4 +sphinx==8.1.3 + # via + # breathe + # myst-nb + # myst-parser + # pydata-sphinx-theme + # rocm-docs-core + # sphinx-book-theme + # sphinx-copybutton + # sphinx-design + # sphinx-external-toc + # sphinx-notfound-page +sphinx-book-theme==1.1.3 + # via rocm-docs-core 
+sphinx-copybutton==0.5.2 + # via rocm-docs-core +sphinx-design==0.6.1 + # via rocm-docs-core +sphinx-external-toc==1.0.1 + # via rocm-docs-core +sphinx-notfound-page==1.1.0 + # via rocm-docs-core +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +sqlalchemy==2.0.43 + # via jupyter-cache +stack-data==0.6.3 + # via ipython +tabulate==0.9.0 + # via jupyter-cache +tomli==2.2.1 + # via sphinx +tornado==6.5.2 + # via + # ipykernel + # jupyter-client +tqdm==4.67.1 + # via mpire +traitlets==5.14.3 + # via + # ipykernel + # ipython + # jupyter-client + # jupyter-core + # matplotlib-inline + # nbclient + # nbformat +typing-extensions==4.15.0 + # via + # beautifulsoup4 + # exceptiongroup + # ipython + # myst-nb + # pydata-sphinx-theme + # pygithub + # referencing + # sqlalchemy +urllib3==2.5.0 + # via + # pygithub + # requests +wcwidth==0.2.13 + # via prompt-toolkit +zipp==3.23.0 + # via importlib-metadata diff --git a/docs/what-is-aqlprofile.rst b/docs/what-is-aqlprofile.rst new file mode 100644 index 0000000000..99bed62abd --- /dev/null +++ b/docs/what-is-aqlprofile.rst @@ -0,0 +1,60 @@ +.. meta:: + :description: AQLprofile is an open source library that enables advanced GPU profiling and tracing on AMD platforms. + :keywords: AQLprofile, ROCm, tool, Instinct, accelerator, AMD + +What is AQLprofile? +=================== + +The Architected Queuing Language profiling library (AQLprofile) is an +open source library that enables advanced GPU profiling and tracing on +AMD platforms. It works in conjunction with +`ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`__ to +support profiling methods such as :doc:`performance counters +(PMC) <examples/pmc-workflow>` and :doc:`SQ thread trace +(SQTT) <examples/sqtt-workflow>`.
AQLprofile provides the +foundational mechanisms for constructing AQL packets and managing +profiling operations across multiple AMD GPU architecture families. The +development of AQLprofile is aligned with ROCprofiler-SDK, ensuring +compatibility and feature support for new GPU architectures and +profiling requirements. + +AQLprofile builds on concepts from the Heterogeneous System Architecture +(HSA) and the AQL, which define the foundations for GPU command +processing and profiling on AMD platforms. For more information, see: + +- `HSA Platform System Architecture + Specification <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-SysArch-1.2.pdf>`__ +- `HSA Runtime Programmer's Reference + Specification <http://hsafoundation.com/wp-content/uploads/2021/02/HSA-Runtime-1.2.pdf>`__ + +Features +-------- + +- Profiling AQL packets for GPU workloads. +- Performance counters and SQ thread traces. +- Support for GFX9XX, GFX10XX, GFX11XX, and GFX12XX architecture families. +- Verbose tracing and error logging capabilities. +- Thread trace binary data generated by AQLprofile can be decoded using + `rocprof-trace-decoder <https://github.com/ROCm/rocprof-trace-decoder>`__. + +Who should use this library? +---------------------------- + +- **End users**: If you want to profile AMD GPUs, use + `ROCprofiler-SDK <https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/index.html>`__ or + tools that depend on it. You do *not* need to use AQLprofile + directly. +- **Developers/integrators**: If you're building profiling tools, + custom workflows, or need to extend profiling capabilities, you may + use AQLprofile directly as a backend. + +How does AQLprofile fit into the ROCm profiling stack? +------------------------------------------------------ + +Here's the typical workflow: + +Application → ROCprofiler-SDK ⇄ **AQLprofile** ⇄ ROCprofiler-SDK → HSA/ROCR/KFD → AMD GPU hardware + +- **AQLprofile** generates profiling command packets (AQL/PM4) tailored to the GPU architecture. It doesn't interact with hardware or drivers directly. It only produces the packets and buffer requirements requested by ``ROCprofiler-SDK``. + +- **ROCprofiler-SDK** provides a higher-level API and user-facing tools, using AQLprofile internally.
It manages profiling sessions, submits packets to the GPU via `ROCr <https://github.com/ROCm/ROCR-Runtime>`_/HSA/KFD, and collects results.